# This is the notebook for the main project of Advanced Learning Model:

- **./data/train** contains the data needed for training
- **./data/test** contains the data needed for testing
- **./main** will contain the main script used to create and train the model

### Reading the training features and concatenating them

In [None]:
def get_training():
    """Load the training sequences and labels stored under ``./data/train``.

    The three feature files ``Xtr0.csv`` .. ``Xtr2.csv`` are read in order and
    the first column of every row becomes one sample (no row is treated as a
    header).  The three label files ``Ytr0.csv`` .. ``Ytr2.csv`` are read in
    the same order; their first row is skipped as a header and the second
    column of every remaining row is parsed as an int.  Blank rows are ignored
    in both kinds of file.

    Returns:
        tuple: ``(clean_features, clean_labels)`` where ``clean_features`` is a
        list of sequences (each a list of single characters) and
        ``clean_labels`` is a list of ints.

    Raises:
        OSError: if one of the six files cannot be opened.
        ValueError: if a label cell cannot be parsed as an int.
    """
    import csv

    n_parts = 3  # the data set is shipped as parts 0, 1 and 2

    clean_features = []
    for part in range(n_parts):
        # newline='' lets the csv module do its own line-ending handling.
        with open('./data/train/Xtr%d.csv' % part, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for a blank line (e.g. a trailing
                    # newline at end of file); there is no sample to read.
                    continue
                clean_features.append(list(row[0].replace("\n", "")))

    print("N. samples: " + str(len(clean_features)))

    clean_labels = []
    for part in range(n_parts):
        with open('./data/train/Ytr%d.csv' % part, 'r', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # drop the header row (no-op on an empty file)
            for row in reader:
                if not row:
                    continue
                clean_labels.append(int(row[1].replace("\n", "")))

    print("N. labels: " + str(len(clean_labels)))
    return (clean_features, clean_labels)

### Reading Test Set

In [None]:
def get_test():
    """Load the test sequences stored under ``./data/test``.

    The files ``Xte0.csv`` .. ``Xte2.csv`` are read in order; the first column
    of every row becomes one sample (no row is treated as a header).  Blank
    rows are ignored.

    Returns:
        list: one entry per sample, each a list of single characters.

    Raises:
        OSError: if one of the three files cannot be opened.
    """
    import csv

    clean_features_test = []

    for part in range(3):  # the test set is shipped as parts 0, 1 and 2
        # newline='' lets the csv module do its own line-ending handling.
        with open('./data/test/Xte%d.csv' % part, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for a blank line (e.g. a trailing
                    # newline at end of file); there is no sample to read.
                    continue
                clean_features_test.append(list(row[0].replace("\n", "")))

    print("N. test samples: " + str(len(clean_features_test)))
    return clean_features_test

### Some useful functions:
- **get_frequencies(X)** returns 4-dimensional features holding the count of each nucleotide (countA, countC, countG, countT)
- **get_encoded(X)** returns features of dimension 100, replacing the nucleotide at each position with its numeric code (A=1, C=2, G=3, T=4)
- **get_triplet_space(X, n)** returns the spectrum feature mapping that counts the subsequences of length n
- **get_triplet_space_boost(X, n)** returns the spectrum feature mapping for the subsequences of length n, n-1, ..., 1
- **get_nonzero_labels(y)** returns the labels as -1/+1
- **get_zero_labels(y)** returns the labels as 0/1

In [None]:
def get_frequencies(clean_features):
    """Count the symbols of every sequence.

    Args:
        clean_features: sequence of samples, each a sequence of single
            characters.

    Returns:
        list: one ``[countA, countC, countG, countT]`` list per sample.  Any
        symbol that is not "A", "C" or "G" is counted in the last slot.
    """
    tracked = ("A", "C", "G")
    freqCount = []

    for sequence in clean_features:
        tally = [
            sum(1 for symbol in sequence if symbol == base)
            for base in tracked
        ]
        # Whatever is left over after A, C and G falls into the T slot.
        tally.append(len(sequence) - sum(tally))
        freqCount.append(tally)

    return freqCount

In [None]:
def get_encoded(clean_features):
    """Replace every symbol of every sequence with a numeric code.

    Args:
        clean_features: sequence of samples, each a sequence of single
            characters.

    Returns:
        list: one list of ints per sample, using A=1, C=2, G=3 and 4 for
        every other symbol.
    """
    codes = {"A": 1, "C": 2, "G": 3}
    fallback = 4  # used for "T" and for anything that is not A, C or G

    return [
        [codes.get(symbol, fallback) for symbol in sequence]
        for sequence in clean_features
    ]

In [None]:
def get_nplet_list(n=3):
    """Enumerate every string of length ``n`` over the alphabet ``ACTG``.

    Args:
        n: length of the generated strings.

    Returns:
        list: the ``4 ** n`` strings, in the order produced by
        ``itertools.product('ACTG', repeat=n)``.
    """
    from itertools import product

    return [''.join(letters) for letters in product('ACTG', repeat=n)]

In [None]:
def get_triplet_space_boost(clean_features, n=3):
    """Map sequences to the counts of all their substrings of length 1..n.

    The columns are the strings returned by ``get_nplet_list(1)``, then
    ``get_nplet_list(2)``, ... up to ``get_nplet_list(n)``, concatenated in
    that order.  Entry ``[j, c]`` is the number of (overlapping) occurrences of
    column string ``c`` in sample ``j``.

    Args:
        clean_features: sequence of samples; each sample is a string or a
            sequence of single-character strings.
        n: maximum substring length.

    Returns:
        numpy.ndarray: float array of shape
        ``(len(clean_features), 4 + 4**2 + ... + 4**n)``.

    Raises:
        ValueError: if a sample contains a substring that is not one of the
            generated column strings (i.e. a symbol outside ``ACTG``).
    """
    import numpy as np

    kmers = []
    for k in range(1, n + 1):
        kmers = kmers + get_nplet_list(k)

    # Strings of different lengths never collide, so one dict indexes all
    # columns; it replaces a linear list.index() scan per window.
    column_of = {kmer: col for col, kmer in enumerate(kmers)}

    new_features = np.zeros((len(clean_features), len(kmers)))

    for row, sequence in enumerate(clean_features):
        # Count the windows of every length 1..n, so that the columns
        # allocated for the shorter strings are actually filled in.
        for k in range(1, n + 1):
            for start in range(len(sequence) - k + 1):
                kmer = "".join(sequence[start:start + k])
                try:
                    col = column_of[kmer]
                except KeyError:
                    # Same exception type list.index() raised before.
                    raise ValueError("%r is not in list" % kmer) from None
                new_features[row][col] += 1

    return new_features

In [None]:
def get_triplet_space(clean_features, n=3):
    """Map sequences to the counts of their substrings of length ``n``.

    The columns are the strings returned by ``get_nplet_list(n)``, in that
    order.  Entry ``[j, c]`` is the number of (overlapping) occurrences of
    column string ``c`` in sample ``j``; samples shorter than ``n`` yield an
    all-zero row.

    Args:
        clean_features: sequence of samples; each sample is a string or a
            sequence of single-character strings.
        n: substring length.

    Returns:
        numpy.ndarray: float array of shape ``(len(clean_features), 4 ** n)``.

    Raises:
        ValueError: if a sample contains a length-``n`` substring that is not
            one of the generated column strings (i.e. a symbol outside
            ``ACTG``).
    """
    import numpy as np

    # One dict lookup per window instead of two linear list.index() scans
    # over all 4**n column strings.
    column_of = {kmer: col for col, kmer in enumerate(get_nplet_list(n))}

    new_features = np.zeros((len(clean_features), len(column_of)))

    for row, sequence in enumerate(clean_features):
        for start in range(len(sequence) - n + 1):
            # Slicing + join works for both str samples and lists of chars.
            kmer = "".join(sequence[start:start + n])
            try:
                col = column_of[kmer]
            except KeyError:
                # Same exception type list.index() raised before.
                raise ValueError("%r is not in list" % kmer) from None
            new_features[row][col] += 1

    return new_features

In [None]:
def get_nonzero_labels(clean_labels):
    """Convert labels to the -1/+1 convention.

    Args:
        clean_labels: iterable of labels.

    Returns:
        list: ``1`` for every label equal to 1, ``-1`` for every other label.
    """
    return [1 if label == 1 else -1 for label in clean_labels]

In [None]:
def get_zero_labels(clean_labels):
    """Convert labels to the 0/1 convention.

    Args:
        clean_labels: iterable of labels.

    Returns:
        list: ``1`` for every label equal to 1, ``0`` for every other label.
    """
    return [1 if label == 1 else 0 for label in clean_labels]

#### Mapping

In [None]:
# Load the raw training sequences with their labels, and the raw test sequences.
clean_features, clean_labels = get_training()
clean_features_test = get_test()
# Map every sequence to its vector of length-7 substring counts
# (one column per string of get_nplet_list(7), i.e. 4**7 columns).
triplet_features = get_triplet_space(clean_features, n=7)
X_report_test = get_triplet_space(clean_features_test, n=7)
# Re-encode the training labels as -1/+1 (label 1 -> +1, anything else -> -1).
non_zero_labels = get_nonzero_labels(clean_labels)

#### Cell used during development to randomly split the training set into train/test sets

In [None]:
# NOTE(review): the wildcard import brings every public name of
# sklearn.model_selection into the notebook namespace; only train_test_split
# is used in this cell.
from sklearn.model_selection import *
# Hold out 20% of the mapped training data for local evaluation.  No
# random_state is passed, so the split is presumably different on every run.
X_train, X_test, y_train, y_test = train_test_split(triplet_features, non_zero_labels, test_size=0.2)

### Kernel class implementation

In [None]:
class Kernel:
    """Wrapper that turns a kernel name into a callable ``f(x, y)``.

    Supported names and the value they compute for two vectors ``x`` and ``y``:

    - ``"linear"``:     ``np.inner(x, y)``
    - ``"rbf"``:        ``exp(-gamma * ||x - y||**2)``
    - ``"quadratic"``:  ``(gamma * (offset + x.y)) ** 2``
    - ``"polynomial"``: ``(gamma * (offset + x.y)) ** degree``
    - ``"gaussian"``:   ``exp(-||x - y||**2 / (2 * sigma**2))``

    The returned callables read ``gamma``, ``degree``, ``sigma`` and ``offset``
    from the instance each time they are called.

    Args:
        kernel: name of the kernel (see the list above).
        gamma: scale factor used by the rbf, quadratic and polynomial kernels.
        degree: exponent of the polynomial kernel.
        sigma: bandwidth of the gaussian kernel.
        offset: additive constant of the quadratic and polynomial kernels.

    Raises:
        ValueError: if ``kernel`` is not one of the supported names.
    """

    def __init__(self, kernel="rbf", gamma=0.1, degree=3, sigma=5.0, offset=0.0):
        self._kernel = kernel
        self._gamma = gamma
        self._degree = degree
        self._sigma = sigma
        self._offset = offset
        self._function = self._kfunction(kernel)

    def kernel_function(self):
        """Return the callable ``f(x, y)`` selected at construction time."""
        return self._function

    def _kfunction(self, kernel):
        """Build the callable for ``kernel``; raise ValueError if unknown."""

        if kernel == "linear":  # Linear Kernel

            def f(x, y):
                return np.inner(x, y)
            return f

        elif kernel == "rbf":  # Radial Basis Function Kernel

            def f(x, y):
                exponent = - self._gamma * np.linalg.norm(x-y) ** 2
                return np.exp(exponent)
            return f

        elif kernel == "quadratic":  # Quadratic Kernel

            def f(x, y):
                return (self._gamma * (self._offset + np.dot(x, y))) ** 2
            return f

        elif kernel == "polynomial":  # Polynomial Kernel

            def f(x, y):
                return (self._gamma * (self._offset + np.dot(x, y))) ** self._degree
            return f

        elif kernel == "gaussian":  # real Gaussian Kernel

            def f(x, y):
                # Use np.linalg like the rbf branch, so this kernel does not
                # depend on a separate ``from numpy import linalg``.
                return np.exp(-np.linalg.norm(x-y)**2 / (2 * (self._sigma ** 2)))
            return f

        else:
            # Raise instead of print + exit(): terminating the interpreter
            # from inside a constructor cannot be handled by the caller.
            raise ValueError("[ERROR] The required kernel is not implemented.")

### SVM class implementation

In [None]:
import numpy as np
from numpy import linalg
import cvxopt
import cvxopt.solvers


class SVM:
    """Binary kernel SVM trained by solving the dual problem with cvxopt.

    ``fit`` stores the support vectors (``self.sv``), their labels
    (``self.sv_y``), their Lagrange multipliers (``self.a``), the bias
    (``self.b``) and, for the linear kernel only, the weight vector
    (``self.w``).  ``predict`` returns the sign of the decision value.
    """

    # init function for the SVM classifier
    def __init__(self, kernel='linear', gamma=0.1, degree=3, C=1.0, offset=0.0, sigma=5.0):
        """Build the kernel function and store the regularisation constant.

        kernel, gamma, degree, offset and sigma are forwarded to ``Kernel``.
        C is the upper bound on the multipliers; ``None`` removes the upper
        bound (only ``a >= 0`` is enforced in ``fit``).
        """
        self._kernel_name = kernel
        self._offset = offset
        self._sigma=sigma
        self.kernel = Kernel(kernel, gamma=gamma, degree=degree, offset=offset, sigma=sigma).kernel_function()
        self.C = C
        # Normalise C to float so the constraint vector h is built from floats.
        if self.C is not None: self.C = float(self.C)



    def fit(self, X, y):
        """Train on samples X (2-D, one row per sample) and labels y.

        NOTE(review): the formulation below presumably expects y in {-1, +1};
        confirm against the caller.
        """
        X=np.array([np.array(xi) for xi in X])
        y=np.array([np.array(yi) for yi in y])
        n_samples, n_features = X.shape

        # Computation of the gram matrix
        # (one kernel evaluation per ordered pair of samples)
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i,j] = self.kernel(X[i], X[j])

        # Arguments for cvxopt.solvers.qp, whose documented form is
        #   minimise 1/2 a^T P a + q^T a   subject to   G a <= h,  A a = b.
        P = cvxopt.matrix(np.outer(y,y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        # Equality constraint: sum_i a_i * y_i = 0 (y as a 1 x n row of doubles).
        A = cvxopt.matrix(y, (1,n_samples), 'd')
        b = cvxopt.matrix(0.0)

        if self.C is None:
            # Only -a <= 0, i.e. a >= 0.
            G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            # Stack -a <= 0 on top of a <= C, i.e. 0 <= a <= C.
            tmp1 = np.diag(np.ones(n_samples) * -1)
            tmp2 = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((tmp1, tmp2)))
            tmp1 = np.zeros(n_samples)
            tmp2 = np.ones(n_samples) * self.C
            h = cvxopt.matrix(np.hstack((tmp1, tmp2)))

        # Obtaining Lagrange multipliers
        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        a = np.ravel(solution['x'])

        # Support vectors have "non zero" lagrange multipliers -> threshold = 1e-5
        sv = a > 1e-5
        # Row indices of the support vectors inside the full Gram matrix K.
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print("%d support vectors out of %d points" % (len(self.a), n_samples))

        # Bias: average, over the support vectors n, of
        #   y_n - sum_m a_m * y_m * K[n, m]   (m ranging over support vectors).
        # NOTE(review): the final division fails if no multiplier exceeds the
        # threshold (len(self.a) == 0).
        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv])
        self.b /= len(self.a)

        if self._kernel_name == 'linear':
            # Explicit weight vector w = sum_n a_n * y_n * x_n, so that
            # project() can use a single dot product.
            self.w = np.zeros(n_features)
            for n in range(len(self.a)):
                self.w += self.a[n] * self.sv_y[n] * self.sv[n]
        else:
            # No explicit weight vector; project() uses the kernel expansion.
            self.w = None

    def predict(self, X):
        ''' Computes the SVM prediction on the given samples X: the sign of
        the decision value returned by project(X). '''
        return np.sign(self.project(X))

    def project(self, X):
        ''' Function used for the prediction: returns the decision value of
        every sample in X (before taking the sign). '''
        if self.w is not None:
            # Linear kernel: X . w + b
            return np.dot(X, self.w) + self.b
        else:
            # Kernel expansion: sum over support vectors of a * y * k(x, sv).
            y_predict = np.zeros(len(X))
            for i in range(len(X)):
                s = 0
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                    s += a * sv_y * self.kernel(X[i], sv)
                y_predict[i] = s
            return y_predict + self.b
    

In [None]:
# Train an SVM with the RBF kernel (all other hyper-parameters at their
# defaults) on the development training split.
clf = SVM(kernel="rbf")
clf.fit(X_train, y_train)

### Cell used to test during development (test/train)

In [None]:
# Predict the held-out split and count the predictions that equal the
# corresponding entry of y_test.
y_predict = clf.predict(X_test)    
correct = np.sum(y_predict == y_test)

# Report the number of correct predictions and the resulting accuracy.
print("[RESULTS] %d out of %d predictions correct, accuracy: %f" % (correct, len(y_predict), correct/len(y_predict)))

### Write the test labels to a file (default: result.csv)

In [None]:
def write_test_labels(clf, X_test, filename="result.csv"):
    """Predict ``X_test`` with ``clf`` and write the labels to a CSV file.

    The predictions are converted with ``get_zero_labels`` (1 stays 1, every
    other value becomes 0) and written with the header ``Id,Bound``; ``Id`` is
    the 0-based position of the sample in ``X_test``.

    Args:
        clf: object exposing ``predict(X)``.
        X_test: samples to predict, in the representation ``clf`` expects.
        filename: path of the CSV file to create (overwritten if it exists).
    """
    import csv

    y_predict_test = clf.predict(X_test)
    y_predict_test = get_zero_labels(y_predict_test)

    # newline='' is required by the csv module: without it every row ends in
    # "\r\r\n" on platforms that translate "\n" on write.
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Bound']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for sample_id, bound in enumerate(y_predict_test):
            writer.writerow({'Id': sample_id, 'Bound': bound})

    print("Done, find results on " + filename)

In [None]:
# Predict the challenge test set with the trained classifier and export it.
# NOTE(review): the file name says "quadratic", but the classifier built in
# this notebook uses kernel="rbf" - confirm which one is intended.
write_test_labels(clf, X_report_test, filename="results_s7654321_quadratic_standard.csv")