In [1]:
from itertools import product
from tqdm import tqdm
ngrams = lambda a, n: list(zip(*[a[i:] for i in range(n)]))  # function that extract all the n grams in a given sequence


def Combinations(proteins, n):
    return list(product(proteins, repeat=n))


def get_spectrum_embeddings(Seq, combinations, n):
    kmers = ngrams(Seq, n)
    embedding = np.zeros(len(combinations))
    for ngram in kmers:
        index = combinations.index(ngram)
        embedding[index] += 1
    return embedding


def get_mismatch_embeddings(Seq, combinations, n):
    proteins = ['A', 'C', 'G', 'T']
    decompose_seq = ngrams(Seq, n)
    embedding = np.zeros(len(combinations))
    for kmer in decompose_seq:
        index = combinations.index(kmer)
        embedding[index] += 1
        kmer_seq = list(kmer)
        for ind, cur_protein in enumerate(kmer_seq):
            for protein in proteins:
                if protein != cur_protein:
                    mismatch_kmer = list(kmer_seq)
                    mismatch_kmer[ind] = protein
                    mismatch_kmer = tuple(mismatch_kmer)
                    index_ = combinations.index(mismatch_kmer)
                    embedding[index_] += 0.3
    return embedding


def get_gram_matrix(X1, X2=[]):

    n2 = len(X2)
    n1 = len(X1)
    if n2 == 0:
        gram_matrix = X1 @ X1.T
        gram_matrix_copy = X1 @ X1.T
        gram_matrix = gram_matrix.astype(np.float32)
        for i in range(n1):
            for j in range(n1):
                gram_matrix[i, j] /= (gram_matrix_copy[i, i] * gram_matrix_copy[j, j]) ** 0.5
        print('Gram Matrix Computed for X1')
        return gram_matrix
    else:
        gram_matrix = X1 @ X2.T
        gram_matrix = gram_matrix.astype(np.float32)
        gram_X1 = X1 @ X1.T
        gram_X2 = X2 @ X2.T

        for i in range(n1):
            for j in range(n2):
                gram_matrix[i, j] /= (gram_X2[j, j] * gram_X1[i, i]) ** 0.5
        print('Gram Matrix Computed for X2')
        return gram_matrix

In [2]:

def get_gram_matrix(X1, X2=[]):

    n2 = len(X2)
    n1 = len(X1)
    if n2 == 0:
        gram_matrix = X1 @ X1.T
        gram_matrix_copy = X1 @ X1.T
        gram_matrix = gram_matrix.astype(np.float32)
        for i in range(n1):
            for j in range(n1):
                gram_matrix[i, j] /= (gram_matrix_copy[i, i] * gram_matrix_copy[j, j]) ** 0.5
        print('Gram Matrix Computed for X1')
        return gram_matrix
    else:
        gram_matrix = X1 @ X2.T
        gram_matrix = gram_matrix.astype(np.float32)
        gram_X1 = X1 @ X1.T
        gram_X2 = X2 @ X2.T

        for i in range(n1):
            for j in range(n2):
                gram_matrix[i, j] /= (gram_X2[j, j] * gram_X1[i, i]) ** 0.5
        print('Gram Matrix Computed for X2')
        return gram_matrix

In [3]:
import pandas as pd
import numpy as np
import os

X_train_0 = (pd.read_csv('data/Xtr0.csv',header=None).values).tolist()
Y_train_0 = (pd.read_csv('data/Ytr0.csv',sep=',',index_col=0).values)
X_train_0 = (np.array(X_train_0)[1:,1]).tolist()
X_test_0 = (pd.read_csv('data/Xte0.csv',header=None).values).tolist()
X_test_0 = (np.array(X_test_0)[1:,1]).tolist()
Y_train_0[Y_train_0 == 0] = -1
from sklearn.model_selection import train_test_split



In [18]:
kernels_0 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings],
            'spectrum_8': [8, get_spectrum_embeddings],
            'mismatch_8': [8, get_mismatch_embeddings]}

In [19]:
from os import path
gram_matrices_0 = {}
for key in kernels_0.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset0.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset0.npy'
    length = kernels_0[key][0]
    embedding_func = kernels_0[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_0 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_0 = np.empty([len(X_train_0), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_0))):
            train_embeddings_0[i, :] = embedding_func(Seq=X_train_0[i], combinations=DNA_combinations, n=length)
        gram_train_0 = get_gram_matrix(train_embeddings_0)
        np.save(train_filename, gram_train_0)
    if path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_0 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_0 = np.empty([len(X_test_0), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_0))):
            test_embeddings_0[i, :] = embedding_func(Seq=X_test_0[i], combinations=DNA_combinations, n=length)
        gram_test_0 = get_gram_matrix(train_embeddings_0, test_embeddings_0)
        np.save(test_filename, gram_test_0)
    gram_matrices_0[key] = {'train': gram_train_0,
                           'test': gram_test_0}

  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]

gram_matrices/train_mismatch_6_dataset0.npy  already exists !
gram_matrices/test_mismatch_6_dataset0.npy  already exists !
gram_matrices/train_mismatch_7_dataset0.npy  already exists !
gram_matrices/test_mismatch_7_dataset0.npy  already exists !
gram_matrices/train_spectrum_7_dataset0.npy  already exists !
gram_matrices/test_spectrum_7_dataset0.npy  already exists !
gram_matrices/train_spectrum_6_dataset0.npy  already exists !
gram_matrices/test_spectrum_6_dataset0.npy  already exists !
Creating  gram_matrices/train_spectrum_8_dataset0.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:15<00:00, 10.22it/s]
  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_spectrum_8_dataset0.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:37<00:00, 10.27it/s]
  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]

Gram Matrix Computed for X2
Creating  gram_matrices/train_mismatch_8_dataset0.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [47:37<00:00,  1.43s/it]
  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_mismatch_8_dataset0.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [24:13<00:00,  1.45s/it]


Gram Matrix Computed for X2


In [7]:
import cvxopt
class SVMC:

    def __init__(self, c=1, min_sv=1e-4):
        self.alpha_ = None
        self.c = c  # corresponds to (1/2*lambda)
        # if y_train is not None: self.C = float(self.C)
        self.min_sv = min_sv

    def fit(self, kernel_train, label):
        n = label.shape[0]
        diag = np.zeros((n, n))
        np.fill_diagonal(diag, label)
        P = np.dot(diag, np.dot(kernel_train, diag))
        Pcvx = cvxopt.matrix(P)

        # Pcvx = cvxopt.matrix(np.outer(label,label) * kernel_train)
        qcvx = cvxopt.matrix(np.ones(n) * -1)

        if self.c is None:
            G = cvxopt.matrix(np.diag(np.ones(n) * -1))
            h = cvxopt.matrix(np.zeros(n))
        else:
            Ginf = np.diag(np.ones(n) * -1)
            Gsup = np.identity(n)
            G = cvxopt.matrix(np.vstack((Ginf, Gsup)))
            hinf = np.zeros(n)
            hsup = np.ones(n) * self.c
            h = cvxopt.matrix(np.hstack((hinf, hsup)))

        A = label.transpose()
        A = A.astype('double')
        Acvx = cvxopt.matrix(A)
        bcvx = cvxopt.matrix(0.0)

        # Solve QP problem using cvxopt solver for qp problems
        u = cvxopt.solvers.qp(Pcvx, qcvx, G, h, Acvx, bcvx)

        # take Lagrange multipliers, and the solution of the dual problem
        alpha = np.ravel(u['x'])

        sv = alpha > self.min_sv
        ind = np.arange(len(alpha))[sv]

        self.alpha_ = alpha[sv]
        self.sv = np.argwhere(sv == True)
        self.sv_label = label[sv]
        print("%d support vectors out of %d points" % (len(self.alpha_), n))

        # Bias value/intercept
        self.b = 0 * 1.0;
        # self.b = self.b.astype(np.float64)
        for i in range(len(self.alpha_)):
            self.b += self.sv_label[i]
            self.b -= np.sum(self.alpha_ * self.sv_label[:, 0] * kernel_train[sv, ind[i]])
        self.b /= len(self.alpha_)

    def get_coef(self):
        return list(self.alpha_)

    def predict(self, kernel_test):
        
        y_predict = np.zeros(kernel_test.shape[1])

        for i in range(kernel_test.shape[1]):
            y_predict[i] = sum(alpha * sv_label * kernel_test[sv, i] for alpha, sv, sv_label in
                               zip(self.alpha_, self.sv, self.sv_label[:, 0]))
        return y_predict + self.b

        prediction = np.sign(y_predict + self.b)

        return prediction

    def predict_class(self, kernel_test):
        
        prediction = np.array(self.predict(kernel_test) >= 0, dtype=int)
        prediction[prediction == 0] = -1
        return prediction

In [27]:
import random
c = 0.5
sv = 1e-4
lambda_log_reg = 1
tolerance = 0.001
list_kernels = list(gram_matrices_0.keys())
list_of_prediction_test_0 = []
from sklearn.metrics import accuracy_score
for i in range(13):
    gram_train_0 = np.zeros((2000, 2000))
    gram_test_0 = np.zeros((2000, 1000))
    sampled_kernels = random.sample(list_kernels, 5)
    for kernel in sampled_kernels:
        print(f'Using {kernel}')
        gram_train_0 += gram_matrices_0[kernel]['train']
        gram_test_0 += gram_matrices_0[kernel]['test']
    gram_train_0 /= 5
    gram_test_0 /= 5
    test_size = 0.2
    list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
    gram_train = gram_train_0[list_train, :][:, list_train]
    gram_val = gram_train_0[list_train, :][:, list_val]
    y_train_split_0 = Y_train_0[list_train]
    y_val_split_0 = Y_train_0[list_val]

    svm_test = SVMC(c=c, min_sv=sv)
    svm_test.fit(gram_train, y_train_split_0)
    y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
    print('Val Accuracy =', accuracy_score(y_val_split_0.reshape(-1), y_val_pred))
    y_test_pred_0 = svm_test.predict_class(gram_test_0[list_train, :])
    y_test_pred_0[y_test_pred_0 == -1] = 0
    list_of_prediction_test_0.append(y_test_pred_0)

y_pred_0 = np.array(np.array(list_of_prediction_test_0).mean(axis=0).reshape((-1,))>0.5,dtype=int)


Using mismatch_6
Using spectrum_6
Using spectrum_8
Using mismatch_7
Using spectrum_7
     pcost       dcost       gap    pres   dres
 0: -6.1356e+02 -1.9034e+03  1e+04  5e+00  2e-15
 1: -4.2125e+02 -1.5381e+03  1e+03  1e-01  1e-15
 2: -4.6272e+02 -5.9710e+02  1e+02  1e-02  1e-15
 3: -5.1695e+02 -5.4427e+02  3e+01  2e-03  1e-15
 4: -5.2799e+02 -5.3308e+02  5e+00  1e-04  1e-15
 5: -5.3004e+02 -5.3053e+02  5e-01  8e-06  1e-15
 6: -5.3026e+02 -5.3028e+02  2e-02  3e-07  1e-15
 7: -5.3027e+02 -5.3027e+02  8e-04  8e-09  1e-15
 8: -5.3027e+02 -5.3027e+02  2e-05  1e-10  1e-15
Optimal solution found.
1475 support vectors out of 1600 points
Val Accuracy = 0.675
Using mismatch_8
Using spectrum_7
Using spectrum_6
Using spectrum_8
Using mismatch_6
     pcost       dcost       gap    pres   dres
 0: -5.9549e+02 -1.9242e+03  1e+04  5e+00  2e-15
 1: -4.1501e+02 -1.5581e+03  1e+03  2e-01  1e-15
 2: -4.5382e+02 -5.8887e+02  1e+02  1e-02  1e-15
 3: -5.0877e+02 -5.4011e+02  3e+01  2e-03  1e-15
 4: -5.2183e

 4: -5.2106e+02 -5.2725e+02  6e+00  2e-04  2e-15
 5: -5.2360e+02 -5.2425e+02  7e-01  8e-06  2e-15
 6: -5.2388e+02 -5.2391e+02  3e-02  3e-07  2e-15
 7: -5.2389e+02 -5.2389e+02  1e-03  9e-09  2e-15
 8: -5.2389e+02 -5.2389e+02  5e-05  2e-10  2e-15
Optimal solution found.
1436 support vectors out of 1600 points
Val Accuracy = 0.65


In [28]:

X_train_1 = (pd.read_csv('data/Xtr1.csv',header=None).values).tolist()
Y_train_1 = (pd.read_csv('data/Ytr1.csv',sep=',',index_col=0).values)
X_train_1 = (np.array(X_train_1)[1:,1]).tolist()
X_test_1 = (pd.read_csv('data/Xte1.csv',header=None).values).tolist()
X_test_1 = (np.array(X_test_1)[1:,1]).tolist()
Y_train_1[Y_train_1 == 0] = -1
from sklearn.model_selection import train_test_split

In [29]:
kernels_1 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings],
            'spectrum_8': [8, get_spectrum_embeddings],
            'mismatch_8': [8, get_mismatch_embeddings]}

gram_matrices_1 = {}
for key in kernels_1.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset1.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset1.npy'
    length = kernels_1[key][0]
    embedding_func = kernels_1[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_1 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_1 = np.empty([len(X_train_1), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_1))):
            train_embeddings_1[i, :] = embedding_func(Seq=X_train_1[i], combinations=DNA_combinations, n=length)
        gram_train_1 = get_gram_matrix(train_embeddings_1)
        np.save(train_filename, gram_train_1)
    if path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_1 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_1 = np.empty([len(X_test_1), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_1))):
            test_embeddings_1[i, :] = embedding_func(Seq=X_test_1[i], combinations=DNA_combinations, n=length)
        gram_test_1 = get_gram_matrix(train_embeddings_1, test_embeddings_1)
        np.save(test_filename, gram_test_1)
    gram_matrices_1[key] = {'train': gram_train_1,
                           'test': gram_test_1}

  0%|                                                                                 | 2/2000 [00:00<01:57, 16.98it/s]

gram_matrices/train_mismatch_6_dataset1.npy  already exists !
gram_matrices/test_mismatch_6_dataset1.npy  already exists !
gram_matrices/train_mismatch_7_dataset1.npy  already exists !
gram_matrices/test_mismatch_7_dataset1.npy  already exists !
gram_matrices/train_spectrum_7_dataset1.npy  already exists !
gram_matrices/test_spectrum_7_dataset1.npy  already exists !
gram_matrices/train_spectrum_6_dataset1.npy  already exists !
gram_matrices/test_spectrum_6_dataset1.npy  already exists !
Creating  gram_matrices/train_spectrum_8_dataset1.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [02:02<00:00, 16.39it/s]
  0%|▏                                                                                | 2/1000 [00:00<01:05, 15.31it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_spectrum_8_dataset1.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:59<00:00, 16.71it/s]
  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]

Gram Matrix Computed for X2
Creating  gram_matrices/train_mismatch_8_dataset1.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [48:01<00:00,  1.44s/it]
  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_mismatch_8_dataset1.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [24:33<00:00,  1.47s/it]


Gram Matrix Computed for X2


In [30]:
c = 0.5
sv = 1e-4
tolerance = 0.001
list_kernels = list(gram_matrices_1.keys())
list_of_prediction_test_1 = []
from sklearn.metrics import accuracy_score
for i in range(13):
    gram_train_1 = np.zeros((2000, 2000))
    gram_test_1 = np.zeros((2000, 1000))
    sampled_kernels = random.sample(list_kernels, 5)
    for kernel in sampled_kernels:
        print(f'Using {kernel}')
        gram_train_1 += gram_matrices_1[kernel]['train']
        gram_test_1 += gram_matrices_1[kernel]['test']
    gram_train_1 /= 5
    gram_test_1 /= 5
    test_size = 0.2
    list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
    gram_train = gram_train_1[list_train, :][:, list_train]
    gram_val = gram_train_1[list_train, :][:, list_val]
    y_train_split_1 = Y_train_1[list_train]
    y_val_split_1 = Y_train_1[list_val]

    svm_test = SVMC(c=c, min_sv=sv)
    svm_test.fit(gram_train, y_train_split_1)
    y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
    print('Val Accuracy =', accuracy_score(y_val_split_1.reshape(-1), y_val_pred))
    y_test_pred_1 = svm_test.predict_class(gram_test_1[list_train, :])
    y_test_pred_1[y_test_pred_1 == -1] = 0
    list_of_prediction_test_1.append(y_test_pred_1)

y_pred_1 = np.array(np.array(list_of_prediction_test_1).mean(axis=0).reshape((-1,))>0.5,dtype=int)


Using mismatch_6
Using mismatch_8
Using spectrum_8
Using spectrum_6
Using mismatch_7
     pcost       dcost       gap    pres   dres
 0: -6.0377e+02 -1.7779e+03  1e+04  4e+00  2e-15
 1: -4.3045e+02 -1.4094e+03  1e+03  1e-14  2e-15
 2: -4.8744e+02 -5.9479e+02  1e+02  3e-15  2e-15
 3: -5.3198e+02 -5.5210e+02  2e+01  2e-14  1e-15
 4: -5.4108e+02 -5.4367e+02  3e+00  2e-14  2e-15
 5: -5.4235e+02 -5.4249e+02  1e-01  1e-14  2e-15
 6: -5.4243e+02 -5.4243e+02  6e-03  2e-14  2e-15
 7: -5.4243e+02 -5.4243e+02  2e-04  3e-14  2e-15
Optimal solution found.
1537 support vectors out of 1600 points
Val Accuracy = 0.6225
Using spectrum_8
Using spectrum_7
Using mismatch_6
Using spectrum_6
Using mismatch_8
     pcost       dcost       gap    pres   dres
 0: -5.8872e+02 -1.7200e+03  9e+03  4e+00  2e-15
 1: -4.3200e+02 -1.3536e+03  9e+02  4e-15  1e-15
 2: -4.8838e+02 -5.8801e+02  1e+02  9e-16  1e-15
 3: -5.2827e+02 -5.5038e+02  2e+01  9e-15  1e-15
 4: -5.3801e+02 -5.4129e+02  3e+00  2e-14  1e-15
 5: -5.3957

In [31]:

X_train_2 = (pd.read_csv('data/Xtr2.csv',header=None).values).tolist()
Y_train_2 = (pd.read_csv('data/Ytr2.csv',sep=',',index_col=0).values)
X_train_2 = (np.array(X_train_2)[1:,1]).tolist()
X_test_2 = (pd.read_csv('data/Xte2.csv',header=None).values).tolist()
X_test_2 = (np.array(X_test_2)[1:,1]).tolist()
Y_train_2[Y_train_2 == 0] = -1
from sklearn.model_selection import train_test_split

In [32]:
kernels_2 = {'mismatch_6': [6, get_mismatch_embeddings],
             'mismatch_7': [7, get_mismatch_embeddings],
            'spectrum_7': [7, get_spectrum_embeddings],
            'spectrum_6': [6, get_spectrum_embeddings],
            'spectrum_8': [8, get_spectrum_embeddings],
            'mismatch_8': [8, get_mismatch_embeddings]}

gram_matrices_2 = {}
for key in kernels_2.keys():
    train_filename = 'gram_matrices/train_' + key + '_dataset2.npy'
    test_filename = 'gram_matrices/test_' + key + '_dataset2.npy'
    length = kernels_2[key][0]
    embedding_func = kernels_2[key][1]
    DNA_combinations = Combinations(proteins=['A', 'C', 'G', 'T'], n=length)
    
    if path.exists(train_filename):
        print(train_filename, ' already exists !')
        gram_train_2 = np.load(train_filename)
    else:
        print('Creating ', train_filename)
        train_embeddings_2 = np.empty([len(X_train_2), len(DNA_combinations)])
        for i in tqdm(range(len(X_train_2))):
            train_embeddings_2[i, :] = embedding_func(Seq=X_train_2[i], combinations=DNA_combinations, n=length)
        gram_train_2 = get_gram_matrix(train_embeddings_2)
        np.save(train_filename, gram_train_2)
    if path.exists(test_filename):
        print(test_filename, ' already exists !')
        gram_test_2 = np.load(test_filename)
    else:
        print('Creating ', test_filename)
        test_embeddings_2 = np.empty([len(X_test_2), len(DNA_combinations)])
        for i in tqdm(range(len(X_test_2))):
            test_embeddings_2[i, :] = embedding_func(Seq=X_test_2[i], combinations=DNA_combinations, n=length)
        gram_test_2 = get_gram_matrix(train_embeddings_2, test_embeddings_2)
        np.save(test_filename, gram_test_2)
    gram_matrices_2[key] = {'train': gram_train_2,
                           'test': gram_test_2}

  0%|                                                                                 | 1/2000 [00:00<03:21,  9.93it/s]

gram_matrices/train_mismatch_6_dataset2.npy  already exists !
gram_matrices/test_mismatch_6_dataset2.npy  already exists !
gram_matrices/train_mismatch_7_dataset2.npy  already exists !
gram_matrices/test_mismatch_7_dataset2.npy  already exists !
gram_matrices/train_spectrum_7_dataset2.npy  already exists !
gram_matrices/test_spectrum_7_dataset2.npy  already exists !
gram_matrices/train_spectrum_6_dataset2.npy  already exists !
gram_matrices/test_spectrum_6_dataset2.npy  already exists !
Creating  gram_matrices/train_spectrum_8_dataset2.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [03:08<00:00, 10.60it/s]
  0%|▏                                                                                | 2/1000 [00:00<01:02, 15.91it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_spectrum_8_dataset2.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:59<00:00, 16.82it/s]
  0%|                                                                                         | 0/2000 [00:00<?, ?it/s]

Gram Matrix Computed for X2
Creating  gram_matrices/train_mismatch_8_dataset2.npy


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [48:07<00:00,  1.44s/it]
  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

Gram Matrix Computed for X1
Creating  gram_matrices/test_mismatch_8_dataset2.npy


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [24:27<00:00,  1.47s/it]


Gram Matrix Computed for X2


In [33]:
c = 0.5
sv = 1e-4
tolerance = 0.001
list_kernels = list(gram_matrices_2.keys())
list_of_prediction_test_2 = []
from sklearn.metrics import accuracy_score
for i in range(13):
    gram_train_2 = np.zeros((2000, 2000))
    gram_test_2 = np.zeros((2000, 1000))
    sampled_kernels = random.sample(list_kernels, 5)
    for kernel in sampled_kernels:
        print(f'Using {kernel}')
        gram_train_2 += gram_matrices_2[kernel]['train']
        gram_test_2 += gram_matrices_2[kernel]['test']
    gram_train_2 /= 5
    gram_test_2 /= 5
    test_size = 0.2
    list_train, list_val = train_test_split(list(range(2000)), test_size=test_size)
    gram_train = gram_train_2[list_train, :][:, list_train]
    gram_val = gram_train_2[list_train, :][:, list_val]
    y_train_split_2 = Y_train_2[list_train]
    y_val_split_2 = Y_train_2[list_val]

    svm_test = SVMC(c=c, min_sv=sv)
    svm_test.fit(gram_train, y_train_split_2)
    y_val_pred = svm_test.predict_class(gram_val).reshape(-1)
    print('Val Accuracy =', accuracy_score(y_val_split_2.reshape(-1), y_val_pred))
    y_test_pred_2 = svm_test.predict_class(gram_test_2[list_train, :])
    y_test_pred_2[y_test_pred_2 == -1] = 0
    list_of_prediction_test_2.append(y_test_pred_2)

y_pred_2 = np.array(np.array(list_of_prediction_test_2).mean(axis=0).reshape((-1,))>0.5,dtype=int)


Using spectrum_7
Using spectrum_6
Using mismatch_6
Using mismatch_7
Using spectrum_8
     pcost       dcost       gap    pres   dres
 0: -4.9861e+02 -1.8494e+03  1e+04  5e+00  2e-15
 1: -3.5339e+02 -1.4810e+03  1e+03  2e-01  1e-15
 2: -3.8142e+02 -5.2165e+02  1e+02  2e-02  1e-15
 3: -4.2647e+02 -4.6470e+02  4e+01  3e-03  1e-15
 4: -4.4041e+02 -4.4619e+02  6e+00  2e-04  1e-15
 5: -4.4250e+02 -4.4340e+02  9e-01  2e-05  1e-15
 6: -4.4286e+02 -4.4293e+02  7e-02  1e-06  1e-15
 7: -4.4289e+02 -4.4290e+02  1e-02  2e-07  1e-15
 8: -4.4289e+02 -4.4289e+02  3e-04  2e-09  1e-15
Optimal solution found.
1335 support vectors out of 1600 points
Val Accuracy = 0.7575
Using mismatch_7
Using mismatch_8
Using spectrum_8
Using spectrum_7
Using mismatch_6
     pcost       dcost       gap    pres   dres
 0: -4.9269e+02 -1.8287e+03  1e+04  5e+00  2e-15
 1: -3.5459e+02 -1.4631e+03  1e+03  2e-01  1e-15
 2: -3.8236e+02 -5.2576e+02  1e+02  2e-02  1e-15
 3: -4.2547e+02 -4.6752e+02  4e+01  3e-03  1e-15
 4: -4.4041

 3: -4.1428e+02 -4.5718e+02  4e+01  4e-03  9e-16
 4: -4.2965e+02 -4.3790e+02  8e+00  4e-04  1e-15
 5: -4.3263e+02 -4.3383e+02  1e+00  3e-05  1e-15
 6: -4.3308e+02 -4.3321e+02  1e-01  3e-06  1e-15
 7: -4.3313e+02 -4.3314e+02  3e-03  4e-08  1e-15
 8: -4.3314e+02 -4.3314e+02  9e-05  6e-10  1e-15
Optimal solution found.
1368 support vectors out of 1600 points
Val Accuracy = 0.71


In [34]:
y_pred = list(y_pred_0) + list(y_pred_1) + list(y_pred_2)
with open("outputs/summing_6kernels_13models_ensembled.csv", 'w') as f:
    f.write('Id,Bound\n')
    for i in range(len(y_pred)):
        f.write(str(i)+','+str(y_pred[i])+'\n')