In [1]:
from itertools import product
from tqdm import tqdm
ngrams = lambda a, n: list(zip(*[a[i:] for i in range(n)]))  # function that extract all the n grams in a given sequence


def Combinations(proteins, n):
    return list(product(proteins, repeat=n))


def get_spectrum_embeddings(Seq, combinations, n):
    kmers = ngrams(Seq, n)
    embedding = np.zeros(len(combinations))
    for ngram in kmers:
        index = combinations.index(ngram)
        embedding[index] += 1
    return embedding


def get_mismatch_embeddings(Seq, combinations, n):
    proteins = ['A', 'C', 'G', 'T']
    decompose_seq = ngrams(Seq, n)
    embedding = np.zeros(len(combinations))
    for kmer in decompose_seq:
        index = combinations.index(kmer)
        embedding[index] += 1
        kmer_seq = list(kmer)
        for ind, cur_protein in enumerate(kmer_seq):
            for protein in proteins:
                if protein != cur_protein:
                    mismatch_kmer = list(kmer_seq)
                    mismatch_kmer[ind] = protein
                    mismatch_kmer = tuple(mismatch_kmer)
                    index_ = combinations.index(mismatch_kmer)
                    embedding[index_] += 0.3
    return embedding


def get_gram_matrix(X1, X2=[]):

    n2 = len(X2)
    n1 = len(X1)
    if n2 == 0:
        gram_matrix = X1 @ X1.T
        gram_matrix_copy = X1 @ X1.T
        gram_matrix = gram_matrix.astype(np.float32)
        for i in range(n1):
            for j in range(n1):
                gram_matrix[i, j] /= (gram_matrix_copy[i, i] * gram_matrix_copy[j, j]) ** 0.5
        print('Gram Matrix Computed for X1')
        return gram_matrix
    else:
        gram_matrix = X1 @ X2.T
        gram_matrix = gram_matrix.astype(np.float32)
        gram_X1 = X1 @ X1.T
        gram_X2 = X2 @ X2.T

        for i in range(n1):
            for j in range(n2):
                gram_matrix[i, j] /= (gram_X2[j, j] * gram_X1[i, i]) ** 0.5
        print('Gram Matrix Computed for X2')
        return gram_matrix

In [2]:

def get_gram_matrix(X1, X2=[]):

    n2 = len(X2)
    n1 = len(X1)
    if n2 == 0:
        gram_matrix = X1 @ X1.T
        gram_matrix_copy = X1 @ X1.T
        gram_matrix = gram_matrix.astype(np.float32)
        for i in range(n1):
            for j in range(n1):
                gram_matrix[i, j] /= (gram_matrix_copy[i, i] * gram_matrix_copy[j, j]) ** 0.5
        print('Gram Matrix Computed for X1')
        return gram_matrix
    else:
        gram_matrix = X1 @ X2.T
        gram_matrix = gram_matrix.astype(np.float32)
        gram_X1 = X1 @ X1.T
        gram_X2 = X2 @ X2.T

        for i in range(n1):
            for j in range(n2):
                gram_matrix[i, j] /= (gram_X2[j, j] * gram_X1[i, i]) ** 0.5
        print('Gram Matrix Computed for X2')
        return gram_matrix

In [3]:
import pandas as pd
import numpy as np
import os

X_train_0 = (pd.read_csv('data/Xtr0.csv',header=None).values).tolist()
Y_train_0 = (pd.read_csv('data/Ytr0.csv',sep=',',index_col=0).values)
X_train_0 = (np.array(X_train_0)[1:,1]).tolist()
X_test_0 = (pd.read_csv('data/Xte0.csv',header=None).values).tolist()
X_test_0 = (np.array(X_test_0)[1:,1]).tolist()
Y_train_0[Y_train_0 == 0] = -1
from sklearn.model_selection import train_test_split



In [4]:
kernels_0 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings]}

In [5]:
gram_matrices_0 = {}
for key in kernels_0.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset0.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset0.npy'
    length = kernels_0[key][0]
    embedding_func = kernels_0[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if os.path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_0 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_0 = np.empty([len(X_train_0), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_0))):
            train_embeddings_0[i, :] = embedding_func(Seq=X_train_0[i], combinations=DNA_combinations, n=length)
        gram_train_0 = get_gram_matrix(train_embeddings_0)
        np.save(train_filename, gram_train_0)
    if os.path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_0 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_0 = np.empty([len(X_test_0), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_0))):
            test_embeddings_0[i, :] = embedding_func(Seq=X_test_0[i], combinations=DNA_combinations, n=length)
        gram_test_0 = get_gram_matrix(train_embeddings_0, test_embeddings_0)
        np.save(test_filename, gram_test_0)
    gram_matrices_0[key] = {'train': gram_train_0,
                           'test': gram_test_0}

gram_matrices/train_mismatch_6_dataset0.npy  already exists !
gram_matrices/test_mismatch_6_dataset0.npy  already exists !
gram_matrices/train_mismatch_7_dataset0.npy  already exists !
gram_matrices/test_mismatch_7_dataset0.npy  already exists !
gram_matrices/train_spectrum_7_dataset0.npy  already exists !
gram_matrices/test_spectrum_7_dataset0.npy  already exists !
gram_matrices/train_spectrum_6_dataset0.npy  already exists !
gram_matrices/test_spectrum_6_dataset0.npy  already exists !


In [6]:
import cvxopt
cvxopt.solvers.options['show_progress'] = False

class SVMC:

    def __init__(self, c=1, min_sv=1e-4):
        self.alpha_ = None
        self.c = c  # corresponds to (1/2*lambda)
        # if y_train is not None: self.C = float(self.C)
        self.min_sv = min_sv

    def fit(self, kernel_train, label):
        n = label.shape[0]
        p = kernel_train.shape[1]
        diag = np.zeros((n, n))
        np.fill_diagonal(diag, label)

        # P = kernel_train.copy()
        # for i in range(n):
        #     P[i, :] = P[i, :]*label[i]

        P = np.dot(diag, np.dot(kernel_train, diag))
        Pcvx = cvxopt.matrix(P)

        Pcvx = cvxopt.matrix(np.outer(label,label) * kernel_train)
        qcvx = cvxopt.matrix(np.ones(n) * -1)

        if self.c is None:
            G = cvxopt.matrix(np.diag(np.ones(n) * -1))
            h = cvxopt.matrix(np.zeros(n))
        else:
            Ginf = np.diag(np.ones(n) * -1)
            Gsup = np.identity(n)
            G = cvxopt.matrix(np.vstack((Ginf, Gsup)))
            hinf = np.zeros(n)
            hsup = np.ones(n) * self.c
            h = cvxopt.matrix(np.hstack((hinf, hsup)))

        A = label.transpose()
        A = A.astype('double')
        Acvx = cvxopt.matrix(A)
        bcvx = cvxopt.matrix(0.0)

        # Solve QP problem using cvxopt solver for qp problems
        u = cvxopt.solvers.qp(Pcvx, qcvx, G, h, Acvx, bcvx)

        # take Lagrange multipliers, and the solution of the dual problem
        alpha = np.ravel(u['x'])

        sv = alpha > self.min_sv
        ind = np.arange(len(alpha))[sv]

        self.alpha_ = alpha[sv]
        self.sv = np.argwhere(sv == True)
        self.sv_label = label[sv]
        print("%d support vectors out of %d points" % (len(self.alpha_), n))

        # Bias value/intercept
        self.b = 0 * 1.0
        # self.b = self.b.astype(np.float64)
        for i in range(len(self.alpha_)):
            self.b += self.sv_label[i]
            self.b -= np.sum(self.alpha_ * self.sv_label[:, 0] * kernel_train[sv, ind[i]])
        self.b /= len(self.alpha_)

    def get_coef(self):
        return list(self.alpha_)

    def predict(self, kernel_test):
        
        y_predict = np.zeros(kernel_test.shape[1])

        for i in range(kernel_test.shape[1]):
            y_predict[i] = sum(alpha * sv_label * kernel_test[sv, i] for alpha, sv, sv_label in
                               zip(self.alpha_, self.sv, self.sv_label[:, 0]))
        return y_predict + self.b

        prediction = np.sign(y_predict + self.b)

        return prediction

    def predict_class(self, kernel_test):
        
        prediction = np.array(self.predict(kernel_test) >= 0, dtype=int)
        prediction[prediction == 0] = -1
        return prediction

In [7]:
from sklearn.preprocessing import StandardScaler
class KernelPCA():
    
    def __init__(self, n_components):
        
        self.number_components = n_components
    
    # @staticmethod
    def get_wanted_eigenvectors_eigenvalues(self, w, v, ):
        
        L = [(w[i], v[i, :]) for i in range(w.shape[0])]
        L = sorted(L, key=lambda x: x[0], reverse=True)
        return np.array([L[i][0] for i in range(self.number_components)]),\
               np.array([L[i][1] for i in range(self.number_components)])


    def fit_transform(self, K, eps=1e-6):
        
        n = K.shape[0]
        U = (1/n) * np.ones((n, n))
        centred_K = (np.eye(n) - U)@K@(np.eye(n) - U)
        
        w, v = np.linalg.eig(centred_K)
        w = np.array(list(map(lambda x: x.real if x.real > 0 else eps, w)))
        v = np.real(v)
        w, v = self.get_wanted_eigenvectors_eigenvalues(w, v)
        
        alpha = v/np.sqrt(w[:, None])
        self.alpha = alpha
        
        return K @ alpha.T
    def transform(self, X):

        return X@self.alpha.T

In [8]:
# With PCA
list_c = [0.5, 1, 1.5, 2, 2.5, 3] 
sv = 1e-4
lambda_log_reg = 1
tolerance = 0.001
list_kernels = list(gram_matrices_0.keys())
list_of_prediction_test_0 = []
NComponents = [100*i for i in range(1, 11)]
from sklearn.metrics import accuracy_score
for kernel in ['mismatch_6']:
    ValAcc = np.zeros((len(NComponents), len(list_c)))
    for i in range(len(NComponents)):
        for j in range(len(list_c)):
            
            n_component = NComponents[i]
            c = list_c[j]

            print(f'Predicting with {kernel} and n_component={n_component}')
            gram_train_0 = gram_matrices_0[kernel]['train']
            
            pca = KernelPCA(n_components=n_component)
            gram_train_0 = pca.fit_transform(gram_train_0)
            gram_train_0 = gram_train_0@gram_train_0.T

            test_size = 0.2
            list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
            gram_train = gram_train_0[list_train, :][:, list_train]
            gram_val = gram_train_0[list_train, :][:, list_val]
            y_train_split_0 = Y_train_0[list_train]
            y_val_split_0 = Y_train_0[list_val]

            svm_test = SVMC(c=c, min_sv=sv)

            svm_test.fit(gram_train, y_train_split_0)
            y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
            val_acc = accuracy_score(y_val_split_0.reshape(-1), y_val_pred)
            ValAcc[i, j] = val_acc
            print('Val Accuracy =', val_acc)
    
    best_ncomponent_arg, best_c_arg = np.unravel_index(np.argmax(ValAcc, axis=None), ValAcc.shape) 
    best_ncomponent = NComponents[best_ncomponent_arg]
    best_c = list_c[best_c_arg]
    print('--------------------------------------------------')
    print('For the kernel {} the best accuracy is {} with n_components={} and c={}'.format(kernel, np.max(ValAcc), best_ncomponent, best_c))
    print('--------------------------------------------------')

    svm_test = SVMC(c=best_c, min_sv=sv)
    pca = KernelPCA(n_components=best_ncomponent)

    gram_train_0 = gram_matrices_0[kernel]['train']
    gram_train_0_pca = pca.fit_transform(gram_train_0)
    gram_train_0 = gram_train_0_pca@gram_train_0_pca.T
    svm_test.fit(gram_train_0, Y_train_0)
    
    gram_test_0 = gram_matrices_0[kernel]['test'].T
    gram_test_0_pca = pca.transform(gram_test_0)
    gram_test_0 = gram_train_0_pca@gram_test_0_pca.T

    y_test_pred_0 = svm_test.predict_class(gram_test_0)
    y_test_pred_0[y_test_pred_0 == -1] = 0
    list_of_prediction_test_0.append(y_test_pred_0)

y_pred_0 = np.array(np.array(list_of_prediction_test_0).mean(axis=0).reshape((-1,))>0.5,dtype=int)


Predicting with mismatch_6 and n_component=100
1533 support vectors out of 1600 points
Val Accuracy = 0.6425
Predicting with mismatch_6 and n_component=100
1479 support vectors out of 1600 points
Val Accuracy = 0.61
Predicting with mismatch_6 and n_component=100
1449 support vectors out of 1600 points
Val Accuracy = 0.61
Predicting with mismatch_6 and n_component=100
1433 support vectors out of 1600 points
Val Accuracy = 0.625
Predicting with mismatch_6 and n_component=100
1419 support vectors out of 1600 points
Val Accuracy = 0.63
Predicting with mismatch_6 and n_component=100
1394 support vectors out of 1600 points
Val Accuracy = 0.6025
Predicting with mismatch_6 and n_component=200
1447 support vectors out of 1600 points
Val Accuracy = 0.6475
Predicting with mismatch_6 and n_component=200
1384 support vectors out of 1600 points
Val Accuracy = 0.6025
Predicting with mismatch_6 and n_component=200
1371 support vectors out of 1600 points
Val Accuracy = 0.62
Predicting with mismatch_6 a

# Part 1

In [9]:

X_train_1 = (pd.read_csv('data/Xtr1.csv',header=None).values).tolist()
Y_train_1 = (pd.read_csv('data/Ytr1.csv',sep=',',index_col=0).values)
X_train_1 = (np.array(X_train_1)[1:,1]).tolist()
X_test_1 = (pd.read_csv('data/Xte1.csv',header=None).values).tolist()
X_test_1 = (np.array(X_test_1)[1:,1]).tolist()
Y_train_1[Y_train_1 == 0] = -1
from sklearn.model_selection import train_test_split

In [10]:
kernels_1 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings]}

gram_matrices_1 = {}
for key in kernels_1.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset1.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset1.npy'
    length = kernels_1[key][0]
    embedding_func = kernels_1[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if os.path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_1 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_1 = np.empty([len(X_train_1), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_1))):
            train_embeddings_1[i, :] = embedding_func(Seq=X_train_1[i], combinations=DNA_combinations, n=length)
        gram_train_1 = get_gram_matrix(train_embeddings_1)
        np.save(train_filename, gram_train_1)
    if os.path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_1 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_1 = np.empty([len(X_test_1), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_1))):
            test_embeddings_1[i, :] = embedding_func(Seq=X_test_1[i], combinations=DNA_combinations, n=length)
        gram_test_1 = get_gram_matrix(train_embeddings_1, test_embeddings_1)
        np.save(test_filename, gram_test_1)
    gram_matrices_1[key] = {'train': gram_train_1,
                           'test': gram_test_1}

gram_matrices/train_mismatch_6_dataset1.npy  already exists !
gram_matrices/test_mismatch_6_dataset1.npy  already exists !
gram_matrices/train_mismatch_7_dataset1.npy  already exists !
gram_matrices/test_mismatch_7_dataset1.npy  already exists !
gram_matrices/train_spectrum_7_dataset1.npy  already exists !
gram_matrices/test_spectrum_7_dataset1.npy  already exists !
gram_matrices/train_spectrum_6_dataset1.npy  already exists !
gram_matrices/test_spectrum_6_dataset1.npy  already exists !


In [11]:
# With PCA
list_c = [0.5, 1, 1.5, 2, 2.5, 3]  
sv = 1e-4
lambda_log_reg = 1
tolerance = 0.001
list_kernels = list(gram_matrices_1.keys())
list_of_prediction_test_1 = []
NComponents = [100*i for i in range(1, 11)]
from sklearn.metrics import accuracy_score
for kernel in ['mismatch_6']:
    ValAcc = np.zeros((len(NComponents), len(list_c)))
    for i in range(len(NComponents)):
        for j in range(len(list_c)):
            
            n_component = NComponents[i]
            c = list_c[j]

            print(f'Predicting with {kernel} and n_component={n_component}')
            gram_train_1 = gram_matrices_1[kernel]['train']
            
            pca = KernelPCA(n_components=n_component)
            gram_train_1 = pca.fit_transform(gram_train_1)
            gram_train_1 = gram_train_1@gram_train_1.T

            test_size = 0.2
            list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
            gram_train = gram_train_1[list_train, :][:, list_train]
            gram_val = gram_train_1[list_train, :][:, list_val]
            y_train_split_1 = Y_train_1[list_train]
            y_val_split_1 = Y_train_1[list_val]

            svm_test = SVMC(c=c, min_sv=sv)

            svm_test.fit(gram_train, y_train_split_1)
            y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
            val_acc = accuracy_score(y_val_split_1.reshape(-1), y_val_pred)
            ValAcc[i, j] = val_acc
            print('Val Accuracy =', val_acc)
    
    best_ncomponent_arg, best_c_arg = np.unravel_index(np.argmax(ValAcc, axis=None), ValAcc.shape) 
    best_ncomponent = NComponents[best_ncomponent_arg]
    best_c = list_c[best_c_arg]
    print('--------------------------------------------------')
    print('For the kernel {} the best accuracy is {} with n_components={} and c={}'.format(kernel, np.max(ValAcc), best_ncomponent, best_c))
    print('--------------------------------------------------')

    svm_test = SVMC(c=best_c, min_sv=sv)
    pca = KernelPCA(n_components=best_ncomponent)

    gram_train_1 = gram_matrices_1[kernel]['train']
    gram_train_1_pca = pca.fit_transform(gram_train_1)
    gram_train_1 = gram_train_1_pca@gram_train_1_pca.T
    svm_test.fit(gram_train_1, Y_train_1)
    
    gram_test_1 = gram_matrices_1[kernel]['test'].T
    gram_test_1_pca = pca.transform(gram_test_1)
    gram_test_1 = gram_train_1_pca@gram_test_1_pca.T

    y_test_pred_1 = svm_test.predict_class(gram_test_1)
    y_test_pred_1[y_test_pred_1 == -1] = 0
    list_of_prediction_test_1.append(y_test_pred_1)

y_pred_1 = np.array(np.array(list_of_prediction_test_1).mean(axis=0).reshape((-1,))>0.5,dtype=int)


Predicting with mismatch_6 and n_component=100
1577 support vectors out of 1600 points
Val Accuracy = 0.575
Predicting with mismatch_6 and n_component=100
1552 support vectors out of 1600 points
Val Accuracy = 0.59
Predicting with mismatch_6 and n_component=100
1539 support vectors out of 1600 points
Val Accuracy = 0.5525
Predicting with mismatch_6 and n_component=100
1535 support vectors out of 1600 points
Val Accuracy = 0.57
Predicting with mismatch_6 and n_component=100
1515 support vectors out of 1600 points
Val Accuracy = 0.5575
Predicting with mismatch_6 and n_component=100
1510 support vectors out of 1600 points
Val Accuracy = 0.63
Predicting with mismatch_6 and n_component=200
1510 support vectors out of 1600 points
Val Accuracy = 0.535
Predicting with mismatch_6 and n_component=200
1492 support vectors out of 1600 points
Val Accuracy = 0.595
Predicting with mismatch_6 and n_component=200
1460 support vectors out of 1600 points
Val Accuracy = 0.59
Predicting with mismatch_6 and

# PART 2

In [12]:

X_train_2 = (pd.read_csv('data/Xtr2.csv',header=None).values).tolist()
Y_train_2 = (pd.read_csv('data/Ytr2.csv',sep=',',index_col=0).values)
X_train_2 = (np.array(X_train_2)[1:,1]).tolist()
X_test_2 = (pd.read_csv('data/Xte2.csv',header=None).values).tolist()
X_test_2 = (np.array(X_test_2)[1:,1]).tolist()
Y_train_2[Y_train_2 == 0] = -1
from sklearn.model_selection import train_test_split

In [13]:
kernels_2 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings]}

gram_matrices_2 = {}
for key in kernels_2.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset2.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset2.npy'
    length = kernels_2[key][0]
    embedding_func = kernels_2[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if os.path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_2 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_2 = np.empty([len(X_train_2), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_2))):
            train_embeddings_2[i, :] = embedding_func(Seq=X_train_2[i], combinations=DNA_combinations, n=length)
        gram_train_2 = get_gram_matrix(train_embeddings_2)
        np.save(train_filename, gram_train_2)
    if os.path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_2 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_2 = np.empty([len(X_test_2), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_2))):
            test_embeddings_2[i, :] = embedding_func(Seq=X_test_2[i], combinations=DNA_combinations, n=length)
        gram_test_2 = get_gram_matrix(train_embeddings_2, test_embeddings_2)
        np.save(test_filename, gram_test_2)
    gram_matrices_2[key] = {'train': gram_train_2,
                           'test': gram_test_2}

gram_matrices/train_mismatch_6_dataset2.npy  already exists !
gram_matrices/test_mismatch_6_dataset2.npy  already exists !
gram_matrices/train_mismatch_7_dataset2.npy  already exists !
gram_matrices/test_mismatch_7_dataset2.npy  already exists !
gram_matrices/train_spectrum_7_dataset2.npy  already exists !
gram_matrices/test_spectrum_7_dataset2.npy  already exists !
gram_matrices/train_spectrum_6_dataset2.npy  already exists !
gram_matrices/test_spectrum_6_dataset2.npy  already exists !


In [14]:
# With PCA
list_c = [0.5, 1, 1.5, 2, 2.5, 3]  
sv = 1e-4
lambda_log_reg = 1
tolerance = 0.001
list_kernels = list(gram_matrices_2.keys())
list_of_prediction_test_2 = []
NComponents = [100*i for i in range(1, 11)]
from sklearn.metrics import accuracy_score
for kernel in ['mismatch_6']:
    ValAcc = np.zeros((len(NComponents), len(list_c)))
    for i in range(len(NComponents)):
        for j in range(len(list_c)):
            
            n_component = NComponents[i]
            c = list_c[j]

            print(f'Predicting with {kernel} and n_component={n_component}')
            gram_train_2 = gram_matrices_2[kernel]['train']
            
            pca = KernelPCA(n_components=n_component)
            gram_train_2 = pca.fit_transform(gram_train_2)
            gram_train_2 = gram_train_2@gram_train_2.T

            test_size = 0.2
            list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
            gram_train = gram_train_2[list_train, :][:, list_train]
            gram_val = gram_train_2[list_train, :][:, list_val]
            y_train_split_2 = Y_train_2[list_train]
            y_val_split_2 = Y_train_2[list_val]

            svm_test = SVMC(c=c, min_sv=sv)

            svm_test.fit(gram_train, y_train_split_2)
            y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
            val_acc = accuracy_score(y_val_split_2.reshape(-1), y_val_pred)
            ValAcc[i, j] = val_acc
            print('Val Accuracy =', val_acc)
    
    best_ncomponent_arg, best_c_arg = np.unravel_index(np.argmax(ValAcc, axis=None), ValAcc.shape) 
    best_ncomponent = NComponents[best_ncomponent_arg]
    best_c = list_c[best_c_arg]
    print('--------------------------------------------------')
    print('For the kernel {} the best accuracy is {} with n_components={} and c={}'.format(kernel, np.max(ValAcc), best_ncomponent, best_c))
    print('--------------------------------------------------')

    svm_test = SVMC(c=best_c, min_sv=sv)
    pca = KernelPCA(n_components=best_ncomponent)

    gram_train_2 = gram_matrices_2[kernel]['train']
    gram_train_2_pca = pca.fit_transform(gram_train_2)
    gram_train_2 = gram_train_2_pca@gram_train_2_pca.T
    svm_test.fit(gram_train_2, Y_train_2)
    
    gram_test_2 = gram_matrices_2[kernel]['test'].T
    gram_test_2_pca = pca.transform(gram_test_2)
    gram_test_2 = gram_train_2_pca@gram_test_2_pca.T

    y_test_pred_2 = svm_test.predict_class(gram_test_2)
    y_test_pred_2[y_test_pred_2 == -1] = 0
    list_of_prediction_test_2.append(y_test_pred_2)

y_pred_2 = np.array(np.array(list_of_prediction_test_2).mean(axis=0).reshape((-1,))>0.5,dtype=int)

Predicting with mismatch_6 and n_component=100
1400 support vectors out of 1600 points
Val Accuracy = 0.655
Predicting with mismatch_6 and n_component=100
1320 support vectors out of 1600 points
Val Accuracy = 0.705
Predicting with mismatch_6 and n_component=100
1286 support vectors out of 1600 points
Val Accuracy = 0.6825
Predicting with mismatch_6 and n_component=100
1261 support vectors out of 1600 points
Val Accuracy = 0.72
Predicting with mismatch_6 and n_component=100
1260 support vectors out of 1600 points
Val Accuracy = 0.6975
Predicting with mismatch_6 and n_component=100
1216 support vectors out of 1600 points
Val Accuracy = 0.6375
Predicting with mismatch_6 and n_component=200
1255 support vectors out of 1600 points
Val Accuracy = 0.665
Predicting with mismatch_6 and n_component=200
1214 support vectors out of 1600 points
Val Accuracy = 0.6775
Predicting with mismatch_6 and n_component=200
1184 support vectors out of 1600 points
Val Accuracy = 0.715
Predicting with mismatch_

In [15]:
y_pred = list(y_pred_0) + list(y_pred_1) + list(y_pred_2)
with open("outputs/ensembling_kernels_16.csv", 'w') as f:
    f.write('Id,Bound\n')
    for i in range(len(y_pred)):
        f.write(str(i)+','+str(y_pred[i])+'\n')