# Spectrum kernel with Ridge Regression

In [14]:
import sys
sys.path.append('General')
sys.path.append('Logistic regression')

import pandas as pd
import numpy as np
from time import time

from kernel_functions import kernel_spectrum
from spectrum_toolbox import preindexation,Spectrum_embedding

from Kernel_logistic_regression import KernelLogisticRegression

from sklearn.model_selection import KFold
import scipy.sparse as sp

import matplotlib.pyplot as plt

from time import time

In [2]:
def GridSearch_spectrum(X,y,hyperparameters,K = 6):
    n_lengths = len(hyperparameters['lengths'])
    n_lambdas = len(hyperparameters['lambdas'])
    
    scores_mean = np.zeros((n_lengths,n_lambdas))
    scores_std = np.zeros((n_lengths,n_lambdas))
    
    kf = KFold(n_splits = K)
    
    params = dict()
    
    for i in range(n_lengths):
        params['k'] = hyperparameters['lengths'][i]
        preindex = preindexation(params['k'])
        X_emb = Spectrum_embedding(X,params['k'],preindex = preindex)
        
        for j in range(n_lambdas):
            l = hyperparameters['lambdas'][j]
            acc = []
            
            for train_idx,test_idx in kf.split(X):
                model = KernelLogisticRegression("spectrum",params)
                Xtrain,Xtest = X_emb[train_idx,:],X_emb[test_idx,:]
                ytrain,ytest = y[train_idx],y[test_idx]
                print('l')
                start = time()
                model.fit(Xtrain,ytrain,reg = l)
                print(start-time())
                ypred = model.predict(Xtest)

                acc.append((ypred==ytest).mean())
            scores_mean[i,j] = np.array(acc).mean()
            scores_std[i,j] = np.array(acc).std()
        del(preindex)
    return(scores_mean,scores_std)

## K = 0

In [3]:
X = pd.read_csv('data/Xtr0.csv')
y = pd.read_csv('data/ytr0.csv')
X.set_index('Id',inplace = True)

X.head()

Unnamed: 0_level_0,seq
Id,Unnamed: 1_level_1
0,TCCTGTGCACATCTGCACCCCTGTTGTGGCCACAAAATGATCCGGC...
1,TTAAGTGTATATCTAATAATTTTTTTGCCTACATTCCTGTGTTACC...
2,GTGCTCAATTAGTTGCCTACAAATAGTAGCCTGGCACAGTGTAAGC...
3,CACCTGGAAAATACAAACAGGCGCAAGAAGAGTTAACCCACAGATC...
4,AAATCACTGCCTATCCTTGGGCCAAAAGGTTTCTACAGGAAGCTGC...


In [4]:
hyperparameters = dict()
hyperparameters['lengths'] = np.arange(2,3)
hyperparameters['lambdas'] = np.logspace(-5,5,2)

In [5]:
start = time()
mean,std = GridSearch_spectrum(X['seq'].to_numpy(),y['Bound'].to_numpy(),hyperparameters)
print(time()-start)

l
-157.84230494499207
l


SolverError: Solver 'ECOS' failed. Try another solver, or solve with verbose=True for more information.

In [None]:
for k in range(len(hyperparameters['lengths'])):
    plt.semilogx(hyperparameters['lambdas'],mean[k,:],label = "$k = {0}$".format(hyperparameters['lengths'][k]))
plt.legend()


## K = 1

In [None]:
X = pd.read_csv('data/Xtr1.csv')
y = pd.read_csv('data/ytr1.csv')
X.set_index('Id',inplace = True)

X.head()

In [None]:
from spectrum_toolbox import preindexation,Spectrum_embedding

start = time()
mean,std = GridSearch_spectrum(X['seq'].to_numpy(),y['Bound'].to_numpy(),hyperparameters)
print(time()-start)

In [None]:
for k in range(len(hyperparameters['lengths'])):
    plt.semilogx(hyperparameters['lambdas'],mean[k,:],label = "$k = {0}$".format(hyperparameters['lengths'][k]))
plt.legend()


## K = 2

In [None]:
X = pd.read_csv('data/Xtr2.csv')
y = pd.read_csv('data/ytr2.csv')
X.set_index('Id',inplace = True)

X.head()

In [None]:
start = time()
mean,std = GridSearch_spectrum(X['seq'].to_numpy(),y['Bound'].to_numpy(),hyperparameters)
print(time()-start)

In [None]:
for k in range(len(hyperparameters['lengths'])):
    plt.semilogx(hyperparameters['lambdas'],mean[k,:],label = "$k = {0}$".format(hyperparameters['lengths'][k]))
plt.legend()


In [7]:
import cvxpy as cp

np.random.seed(1)
n = 2000
m = 2000
def sigmoid(z):
  return 1/(1 + np.exp(-z))

beta_true = np.array([1, 0.5, -0.5] + [0]*(n - 3))
X = (np.random.random((m, n)) - 0.5)*10
Y = np.round(sigmoid(X @ beta_true + np.random.randn(m)*0.5))

X_test = (np.random.random((2*m, n)) - 0.5)*10
Y_test = np.round(sigmoid(X_test @ beta_true + np.random.randn(2*m)*0.5))

In [9]:
beta = cp.Variable(n)
lambd = cp.Parameter(nonneg=True)
log_likelihood = cp.sum(
    cp.multiply(Y, X @ beta) - cp.logistic(X @ beta)
)
problem = cp.Problem(cp.Maximize(log_likelihood/n - lambd * cp.norm(beta, 1)))

In [10]:
def error(scores, labels):
  scores[scores > 0] = 1
  scores[scores <= 0] = 0
  return np.sum(np.abs(scores - labels)) / float(np.size(labels))

In [13]:
trials = 1
train_error = np.zeros(trials)
test_error = np.zeros(trials)
lambda_vals = np.logspace(-1, 0, trials)
beta_vals = []
for i in range(trials):
    lambd.value = lambda_vals[i]
    problem.solve()
    train_error[i] = error( (X @ beta).value, Y)
    test_error[i] = error( (X_test @ beta).value, Y_test)
    beta_vals.append(beta.value)