In [2]:
import os
os.chdir('/content/drive/MyDrive/MVA/KKML')

# Kernel Methods: Challenge

Julia Linhart, Roman Castagné, Louis Bouvier

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from functools import partial
from scipy.spatial import distance_matrix
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
import cvxpy as cp
import warnings
import time
from itertools import product
from numba import jit

from utils import run_model, write_csv


warnings.filterwarnings("ignore", category=DeprecationWarning)

# I) Preprocessing

In [3]:
data_folder = 'data' # 'machine-learning-with-kernel-methods-2021'

X_train_1 = pd.read_csv(f'{data_folder}/Xtr2_mat100.csv', sep = ' ', index_col=False, header=None)
y_train_1 = pd.read_csv(f'{data_folder}/Ytr2.csv')

In [4]:
y_train_1.describe()

Unnamed: 0,Id,Bound
count,2000.0,2000.0
mean,4999.5,0.4985
std,577.494589,0.500123
min,4000.0,0.0
25%,4499.75,0.0
50%,4999.5,0.0
75%,5499.25,1.0
max,5999.0,1.0


In [5]:
y_train_1 = np.array(y_train_1)[:,1]

In [6]:
X_train_1.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.010565,0.010201,0.010375,0.011587,0.011609,0.010707,0.009359,0.011957,0.009571,0.010582,0.009424,0.009793,0.012848,0.012092,0.011196,0.010364,0.009875,0.010962,0.010185,0.008342,0.010734,0.010038,0.011554,0.008995,0.010283,0.008647,0.008886,0.008826,0.007821,0.009761,0.008533,0.011864,0.009299,0.010641,0.00956,0.008929,0.010217,0.009641,0.00988,0.010038,...,0.009511,0.010614,0.011957,0.009641,0.011772,0.0095,0.008783,0.010005,0.01087,0.009147,0.013565,0.010587,0.009793,0.010908,0.0095,0.009772,0.009103,0.010147,0.008587,0.010538,0.010897,0.008913,0.00863,0.00838,0.009016,0.011478,0.008832,0.009989,0.010587,0.008625,0.007951,0.009457,0.008554,0.009283,0.008261,0.009614,0.011141,0.009777,0.008217,0.008565
std,0.012278,0.010723,0.011467,0.011453,0.012182,0.010478,0.009789,0.012444,0.013805,0.013652,0.012934,0.011163,0.027178,0.01816,0.0112,0.010356,0.010089,0.019951,0.010631,0.00992,0.011238,0.010962,0.011475,0.009723,0.010922,0.009933,0.009622,0.009861,0.010099,0.010628,0.009945,0.010829,0.010358,0.01046,0.011039,0.009612,0.010705,0.012258,0.020208,0.011266,...,0.010436,0.011172,0.012915,0.010912,0.011305,0.016977,0.014644,0.012108,0.0118,0.009647,0.011868,0.011752,0.013102,0.010237,0.009652,0.009687,0.011871,0.010457,0.012348,0.01101,0.011005,0.010695,0.009248,0.010494,0.009279,0.011204,0.010571,0.015973,0.009745,0.011904,0.009605,0.009701,0.00935,0.009741,0.012341,0.010338,0.010863,0.010402,0.009709,0.009283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,...,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087
75%,0.01087,0.021739,0.021739,0.021739,0.021739,0.021739,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.021739,0.021739,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087,...,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.021739,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087
max,0.086957,0.065217,0.097826,0.065217,0.065217,0.054348,0.054348,0.076087,0.097826,0.184783,0.108696,0.065217,0.23913,0.141304,0.076087,0.054348,0.065217,0.141304,0.054348,0.065217,0.076087,0.108696,0.076087,0.054348,0.065217,0.065217,0.054348,0.054348,0.065217,0.065217,0.097826,0.065217,0.065217,0.065217,0.119565,0.043478,0.054348,0.141304,0.206522,0.065217,...,0.065217,0.086957,0.119565,0.065217,0.065217,0.119565,0.23913,0.076087,0.086957,0.065217,0.065217,0.076087,0.086957,0.065217,0.043478,0.054348,0.086957,0.054348,0.097826,0.065217,0.076087,0.065217,0.043478,0.065217,0.043478,0.065217,0.086957,0.108696,0.065217,0.097826,0.054348,0.065217,0.054348,0.054348,0.086957,0.065217,0.076087,0.065217,0.065217,0.043478


In [7]:
X_train_1 = np.array(X_train_1)
X_train_1 = (X_train_1 - X_train_1.mean(axis=0))/X_train_1.std(axis=0)

# II) First linear models of the mat100 input

## A) Logistic regression

In [None]:
from utils import run_model

run_model('logreg')

  inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))
  inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))


Accuracy on train set 0: 0.62
Accuracy on test set 0 : 0.56
Accuracy on train set 1: 0.60
Accuracy on test set 1 : 0.59
Accuracy on train set 2: 0.70
Accuracy on test set 2 : 0.66


In [None]:
from utils import write_csv

ids = np.arange(all_y_eval.shape[0])
filename = "results/submission_log_reg.csv"

# write_csv(ids, all_y_eval, filename)

## B) Ridge regression

In [None]:
run_model('rr')

Accuracy on train set 0: 0.65
Accuracy on test set 0 : 0.60
Accuracy on train set 1: 0.64
Accuracy on test set 1 : 0.57
Accuracy on train set 2: 0.73
Accuracy on test set 2 : 0.69


# III) Kernel baselines 

## A) Gaussian Kernel

### a) Kernel Ridge Regression

In [None]:
## run kernel ridge regression with gaussian kernel
run_model('krr', kernel='gaussian', prop_test=0.2, use_grid_search=True)

{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.5789473684210527}
Accuracy on train set 0: 1.00
Accuracy on test set 0 : 0.57
{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.6578947368421053}
Accuracy on train set 1: 1.00
Accuracy on test set 1 : 0.59
{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.5}
Accuracy on train set 2: 1.00
Accuracy on test set 2 : 0.67


### b) Kernel SVM

In [None]:
## run kernel SVM with gaussian kernel
run_model('ksvm', kernel='gaussian', prop_test=0.2)

{'kernel': 'gaussian', 'lamb': 1e-07, 'sigma': 100.0}
Accuracy on train set 0: 0.99
Accuracy on test set 0 : 0.62
{'kernel': 'gaussian', 'lamb': 1e-10, 'sigma': 1.0}
Accuracy on train set 1: 1.00
Accuracy on test set 1 : 0.60
{'kernel': 'gaussian', 'lamb': 1e-07, 'sigma': 100.0}
Accuracy on train set 2: 0.99
Accuracy on test set 2 : 0.72


## B) Spectrum kernel

In [None]:
from kernels import Spectrum_kernel

In [None]:
# Example when using a precomputed kernel
K = []
for name in [0, 1, 2]:
    X    = np.array(pd.read_csv(f'{data_folder}/Xtr{name}.csv')['seq'])
    X_ev = np.array(pd.read_csv(f'{data_folder}/Xte{name}.csv')['seq'])
    
    t0 = time.time()
    K_tr = Spectrum_kernel(X, X, k=6)
    print(f"Time to compute train kernel : {time.time() - t0}")
    K_te = Spectrum_kernel(X, X_ev, k=6)
    
    K.append({"train": K_tr, "eval": K_te})

Time to compute train kernel : 0.7150859832763672
Time to compute train kernel : 0.6163516044616699
Time to compute train kernel : 0.6303155422210693


In [None]:
## run kernel SVM with gaussian kernel
run_model('ksvm', kernel='spectrum', K=K, sequence=True, prop_test=0.2)

### a) Kernel SVM


## C) Substring kernel

In [None]:
from kernels import substring_similarity, substring_kernel

In [None]:
# Run similarity between two strings
t0 = time.time()
# k = substring_similarity("ATGCATGATGCATG", "ATGCATCATGATGT", 3, 1.)
# k = substring_similarity("ATGC", "ATGC", 3, 0.7)
k = substring_similarity("cat", "cat", 1, 0.7)
k_expected = 2 * 0.7 ** 4 + 0.7 ** 6
print(f"Time to compute : {time.time() - t0:.4f}s")
print(f"Value : {k}")
print(f"Expected value : {k_expected}")

Time to compute : 2.0617s
Value : 0.9799999999999999
Expected value : 0.5978489999999999


In [None]:
# Run kernel computation between N strings
X = pd.read_csv(f'{data_folder}/Xtr0.csv', sep = ',').to_numpy()
X = X[:100,1]
t0 = time.time()
K = substring_kernel(X, X, k=5, lambd=0.7)
print(f"Time to compute K : {time.time() - t0:.2f}s")
# print(K)

Time to compute K : 76.18s


## D) Mismatch kernel

In [None]:
from kernels import Mismatch_kernel

In [None]:
# Example when using a precomputed kernel
K = []
for name in [0, 1, 2]:
    X    = np.array(pd.read_csv(f'{data_folder}/Xtr{name}.csv')['seq'])
    X_ev = np.array(pd.read_csv(f'{data_folder}/Xte{name}.csv')['seq'])
    
    t0 = time.time()
    K_tr = Mismatch_kernel(X, X, k=7, m=3)
    print(f"Time to compute train kernel : {time.time() - t0}")
    K_te = Mismatch_kernel(X, X_ev, k=7, m=3)
    
    K.append({"train": K_tr, "eval": K_te})

Time to compute train kernel : 11390.832644939423
Time to compute train kernel : 289.070552110672
Time to compute train kernel : 269.14104986190796


In [None]:
y_eval = run_model('ksvm', kernel='mismatch', K=K, sequence=True, prop_test=0.001)

Accuracy on train set 0: 0.87
Accuracy on test set 0 : 0.50
Accuracy on train set 1: 0.88
Accuracy on test set 1 : 0.50
Accuracy on train set 2: 0.91
Accuracy on test set 2 : 0.00


In [None]:
ids = np.arange(y_eval.shape[0])
filename = "results/submission_mismatch_7_3.csv"
write_csv(ids, y_eval, filename)

## E) Fisher kernel

In [35]:
from utils import init_model, run_model

In [36]:
from kernels import spectrum_matrix, emission_probs, Fisher_kernel

In [37]:
data_folder = 'data'
X = pd.read_csv(f'{data_folder}/Xtr0.csv')
X = np.array(X['seq'])
y = pd.read_csv(f'{data_folder}/Ytr0.csv')
y = np.array(y['Bound'])
df = pd.DataFrame({'Sequence':X,'Label':y})

In [38]:
X_pos = np.array(df[df['Label']==1]['Sequence'])

In [39]:
k=4
spect_X_pos = spectrum_matrix(X_pos,k)
chars = np.array([''.join(s) for s in product(["A", "T", "G", "C"], repeat=k)])
em_probs = emission_probs(spect_X_pos,chars)

In [45]:
#precompute kernel 
K = []
for name in [0, 1, 2]:
    X    = np.array(pd.read_csv(f'{data_folder}/Xtr{name}.csv')['seq'])
    X_ev = np.array(pd.read_csv(f'{data_folder}/Xte{name}.csv')['seq'])
    
    t0 = time.time()
    K_tr = Fisher_kernel(X, X,X_pos, k=5)
    print(f"Time to compute train kernel : {time.time() - t0}")
    K_te = Fisher_kernel(X, X_ev, X_pos, k=4)
    
    K.append({"train": K_tr, "eval": K_te})


Time to compute train kernel : 1213.7264323234558
Time to compute train kernel : 1207.4278769493103
Time to compute train kernel : 1215.6039595603943


In [41]:
for name in [0, 1, 2]:
  K[name]['train'] = K_tr+10e-8*np.eye(K_tr.shape[0])





In [42]:
np.save('fisher_kernels_k4.npy',K)

In [43]:
default_params = {'lamb': 15, 'sigma': 1.2, 'k': [4,5, 6]}

In [44]:
y_eval = run_model('ksvm', kernel='fisher', K=K, sequence=True, prop_test=0.001,default_params=default_params)

Accuracy on train set 0: 0.53
Accuracy on test set 0 : 0.50
Accuracy on train set 1: 0.52
Accuracy on test set 1 : 0.50
Accuracy on train set 2: 0.52
Accuracy on test set 2 : 0.50


In [28]:
K_tr = K[0]['train']
np.max(K_tr)

4.512322310369321

In [29]:
eig = np.linalg.eig(K_tr)[0]

In [30]:
np.min(np.real(eig))

1.9999997152304172e-07

In [72]:
-100000000/100000

-1000.0