In [2]:
import os
os.chdir('/content/drive/MyDrive/MVA/KKML')

# Kernel Methods: Challenge

Julia Linhart, Roman Castagné, Louis Bouvier

Preliminary functions:

In [1]:
def write_csv(ids, labels, filename):
    """
    inputs:
        - ids: list of ids, should be an increasing list of integers
        - labels: list of corresponding labels, either 0 or 1
        - file: string containing the name that should be given to the submission file    
    """
    df = pd.DataFrame({"Id": ids, "Bound": labels})
    df["Bound"] = df["Bound"].replace([-1], 0)
    df.to_csv(filename, sep=',', index=False)

# I) Preprocessing

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from functools import partial
from scipy.spatial import distance_matrix
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
import cvxpy as cp
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
data_folder = 'data' # 'machine-learning-with-kernel-methods-2021'

X_train_1 = pd.read_csv(f'{data_folder}/Xtr2_mat100.csv', sep = ' ', index_col=False, header=None)
y_train_1 = pd.read_csv(f'{data_folder}/Ytr2.csv')

In [11]:
y_train_1.describe()

Unnamed: 0,Id,Bound
count,2000.0,2000.0
mean,4999.5,0.4985
std,577.494589,0.500123
min,4000.0,0.0
25%,4499.75,0.0
50%,4999.5,0.0
75%,5499.25,1.0
max,5999.0,1.0


In [12]:
y_train_1 = np.array(y_train_1)[:,1]

In [15]:
X_train_1.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.010565,0.010201,0.010375,0.011587,0.011609,0.010707,0.009359,0.011957,0.009571,0.010582,0.009424,0.009793,0.012848,0.012092,0.011196,0.010364,0.009875,0.010962,0.010185,0.008342,0.010734,0.010038,0.011554,0.008995,0.010283,0.008647,0.008886,0.008826,0.007821,0.009761,0.008533,0.011864,0.009299,0.010641,0.00956,0.008929,0.010217,0.009641,0.00988,0.010038,...,0.009511,0.010614,0.011957,0.009641,0.011772,0.0095,0.008783,0.010005,0.01087,0.009147,0.013565,0.010587,0.009793,0.010908,0.0095,0.009772,0.009103,0.010147,0.008587,0.010538,0.010897,0.008913,0.00863,0.00838,0.009016,0.011478,0.008832,0.009989,0.010587,0.008625,0.007951,0.009457,0.008554,0.009283,0.008261,0.009614,0.011141,0.009777,0.008217,0.008565
std,0.012278,0.010723,0.011467,0.011453,0.012182,0.010478,0.009789,0.012444,0.013805,0.013652,0.012934,0.011163,0.027178,0.01816,0.0112,0.010356,0.010089,0.019951,0.010631,0.00992,0.011238,0.010962,0.011475,0.009723,0.010922,0.009933,0.009622,0.009861,0.010099,0.010628,0.009945,0.010829,0.010358,0.01046,0.011039,0.009612,0.010705,0.012258,0.020208,0.011266,...,0.010436,0.011172,0.012915,0.010912,0.011305,0.016977,0.014644,0.012108,0.0118,0.009647,0.011868,0.011752,0.013102,0.010237,0.009652,0.009687,0.011871,0.010457,0.012348,0.01101,0.011005,0.010695,0.009248,0.010494,0.009279,0.011204,0.010571,0.015973,0.009745,0.011904,0.009605,0.009701,0.00935,0.009741,0.012341,0.010338,0.010863,0.010402,0.009709,0.009283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,...,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087
75%,0.01087,0.021739,0.021739,0.021739,0.021739,0.021739,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.021739,0.021739,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087,...,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.021739,0.01087,0.021739,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.021739,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.021739,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087
max,0.086957,0.065217,0.097826,0.065217,0.065217,0.054348,0.054348,0.076087,0.097826,0.184783,0.108696,0.065217,0.23913,0.141304,0.076087,0.054348,0.065217,0.141304,0.054348,0.065217,0.076087,0.108696,0.076087,0.054348,0.065217,0.065217,0.054348,0.054348,0.065217,0.065217,0.097826,0.065217,0.065217,0.065217,0.119565,0.043478,0.054348,0.141304,0.206522,0.065217,...,0.065217,0.086957,0.119565,0.065217,0.065217,0.119565,0.23913,0.076087,0.086957,0.065217,0.065217,0.076087,0.086957,0.065217,0.043478,0.054348,0.086957,0.054348,0.097826,0.065217,0.076087,0.065217,0.043478,0.065217,0.043478,0.065217,0.086957,0.108696,0.065217,0.097826,0.054348,0.065217,0.054348,0.054348,0.086957,0.065217,0.076087,0.065217,0.065217,0.043478


In [16]:
X_train_1 = np.array(X_train_1)
print(X_train_1.shape)
X_train_1 = (X_train_1 - X_train_1.mean(axis=0))/X_train_1.std(axis=0)

(2000, 100)


In [17]:
print(y_train_1.shape)

(2000,)


# II) First linear models of the mat100 input

## A) Logistic regression

In [18]:
def g(z):
    """
    input:
    - z (any size): an array-like element
    ouput:
    - the element-wize application of the sigmoïd function on z
    """
    return 1/(1+np.exp(-z))

In [19]:
def compute_loss(X,y,w,b):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the opposite of the log-likelihood of the Logistic Regression model computed with respect to
    the points (X,y) and the parameters w,b
    """
    X_tilde = np.hstack([X, np.ones((X.shape[0], 1))])
    w_tilde = np.hstack((w,b))
    return -np.sum(y * np.log(g(w_tilde@X_tilde.T)) + (1-y) * np.log(1-g(w_tilde@X_tilde.T)), axis=1)

In [20]:
def compute_grad(X,y,w,b):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the gradient of the loss of the Logistic Regression model computed 
    with respect to (w,b) = w_tilde having the points (X,y) 
    """
    X_tilde = np.hstack([X, np.ones((X.shape[0], 1))])
    w_tilde = np.hstack((w,b))
    return -X_tilde.T @ (y - g(w_tilde@X_tilde.T).reshape(-1,))

In [21]:
def compute_hess(X,y,w,b):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the hessian of the loss of the Logistic Regression model computed 
    with respect to (w,b) = w_tilde having the points (X,y) 
    """
    X_tilde = np.hstack([X, np.ones((X.shape[0], 1))])
    w_tilde = np.hstack((w,b))    
    temp = (g(w_tilde @ X_tilde.T) * (g(w_tilde @ X_tilde.T) - 1)).reshape(-1,)
    return -X_tilde.T @ np.diag(temp) @ X_tilde

In [22]:
def backtracking(X,y,w,b,delta,grad,alpha=0.1,beta=0.7):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    - delta (size n): direction of the search
    - grad (size n): value of the gradient at point (w,b)
    - alpha: factor of the slope of the line in the backtracking line search
    - beta: factor of reduction of the step length
    
    outputs:
    - t: the step length for the Newton step on the objective function
    computed with backtracking line search towards delta"""
        
    t = 1
    while(compute_loss(X, y, w+t*delta[:-1], b+t*delta[-1])>
            compute_loss(X,y,w,b) + alpha*t*grad.T @ delta):
        t = beta*t
    return t

In [23]:
def Newton(X, y, w0, b0, eps=pow(10,-1)):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    - w0 (size: 1xd): the initial weights of the affine mapping of x
    - b0 (size: 1x1): the initial constant of the affine mapping of x
    output:
    - the paramer vector w_tilde_hat = (w_hat, b_hat) which maximizes the log-likelihood of 
    the sample (X,y) in the Logistic Regression model (or minimizes the loss)
    - the cached values of the loss evaluated along training
    """
    w_, b_ = w0, b0
    grad = compute_grad(X, y, w0, b0)
    hess = compute_hess(X, y, w0, b0)
    
#     inv_hess = np.linalg.inv(compute_hess(X,y,w0,b0))
    inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))
    dec_2 = grad.T@inv_hess@grad
    Loss_hist = [compute_loss(X,y,w0,b0)]
    while dec_2/2 > eps: # condition on the Newton decrement
        grad = compute_grad(X,y,w_,b_)
        hess = compute_hess(X,y,w_,b_)
        
#         inv_hess = np.linalg.inv(compute_hess(X,y,w_,b_))
        inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))
        dec_2 = grad.T@inv_hess@grad
        delta = - inv_hess@grad
        t_bt = backtracking(X, y, w_, b_, delta, grad)
        w_ = w_ + t_bt*delta[:-1]
        b_ = b_ + t_bt*delta[-1]
        Loss_hist.append(compute_loss(X,y,w_,b_))
    return w_, b_, Loss_hist

In [24]:
def predict_LogReg(x,w,b):
    """
    inputs:
    - x (size 1xd): a point in R^d
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
     - the predicted class for the associated y given the
    Logistic Regression parameters
    """    
    return (w.T@x + b > 0).astype("int")

In [25]:
class LogisticRegressor(BaseEstimator, ClassifierMixin):
    
    def __init__(self, lamb=1.):
        """
        This class implements methods for fitting and predicting with a LogesticRegression for classification 
        inputs:
        - lamb : the regularisation parameter
        """
        self.lamb = lamb
    
    def fit(self, X, y):
        """
        inputs:
        - X (size: Nxd): the points we want to classify
        - y (size: Nx1): the values of the classes
        outputs:
        - the value of MLE estimation (w_hat, b_hat) in the Linear regression model
        """
        w0, b0 = np.random.randn(1, 100)*0.07, np.zeros((1,1))
        self.w_, self.b_, _ = Newton(X, y, w0, b0)
        
        return self
        
    def predict(self, X):
        """
        inputs:
        - X (size Nxd): a point in R^d
        - w (size: 1xd): the weights of the affine mapping of x
        - b (size: 1x1): the constant of the affine mapping of x
        output:
         - the predicted class for the associated y given the
        Linear Regression parameters
        """    
        return (self.w_@X.T + self.b_ > 1/2).astype("int")
        
    def score(self, X, y):
        """
        inputs:
        - X (size Nxd): the points in R^d we want to classify
        - y (size Nx1): the labels of the points
        """
        y_pred = self.predict(X)
        return np.sum(y_pred == y)/y.shape[0]

In [38]:
dim = 100
Nb_samples = 2000
prop_test = 0.05

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices]
    X_te = X[te_indices]
    
    y_tr = y[tr_indices]
    y_te = y[te_indices]
    
    assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
    assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
    
    # Fitting
    logreg = LogisticRegressor()
    
    logreg.fit(X_tr, y_tr)

    
    print(f"Accuracy on train set {name}: {logreg.score(X_tr, y_tr):.2f}")
    print(f"Accuracy on test set {name} : {logreg.score(X_te, y_te):.2f}")   
    
    # Prediction on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = logreg.predict(X_eval)
    all_y_eval.append(y_eval)
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)



Accuracy on train set 0: 0.62
Accuracy on test set 0 : 0.56
Accuracy on train set 1: 0.60
Accuracy on test set 1 : 0.59
Accuracy on train set 2: 0.70
Accuracy on test set 2 : 0.66


In [39]:
ids = np.arange(all_y_eval.shape[0])
filename = "results/submission_log_reg.csv"

# write_csv(ids, all_y_eval, filename)

## B) Ridge regression

In [40]:
def compute_RR_MLE(X,y,lamb):
    """
    inputs:
    - X (size: Nxd): the points we want to classify
    - y (size: Nx1): the values of the classes
    outputs:
    - the value of MLE estimation (w_hat, b_hat) in the Linear regression model
    """
    X_tilde = np.vstack((X,np.ones(X.shape[1])))
    temp = np.linalg.inv(X_tilde@X_tilde.T + lamb*X.shape[1]*np.eye(1+X.shape[0]))@X_tilde@y.T
    return temp[:-1], temp[-1]

In [41]:
def predict_RR(x,w,b):
    """
    inputs:
    - x (size 1xd): a point in R^d
    - w (size: 1xd): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
     - the predicted class for the associated y given the
    Linear Regression parameters
    """    
    return (w.T@x+b>1/2).astype("int")

In [42]:
class RidgeRegressor(BaseEstimator, ClassifierMixin):
    
    def __init__(self, lamb=1.):
        """
        This class implements methods for fitting and predicting with a RidgeRegressor used for classification 
        (by thresholding the value regressed).
        inputs:
        - lamb : the regularisation parameter
        """
        self.lamb = lamb
    
    def fit(self, X, y):
        """
        inputs:
        - X (size: Nxd): the points we want to classify
        - y (size: Nx1): the values of the classes
        outputs:
        - the value of MLE estimation (w_hat, b_hat) in the Linear regression model
        """
        X_tilde = np.hstack((X, np.ones((X.shape[0], 1))))
        temp = np.linalg.inv(X_tilde.T @ X_tilde + self.lamb * X.shape[0] * np.eye(X_tilde.shape[1])) @ (X_tilde.T @ y)
        self.w_ = temp[:-1]
        self.b_ = temp[-1]

        return self
        
    def predict(self, X):
        """
        inputs:
        - x (size Nxd): a point in R^d
        - w (size: 1xd): the weights of the affine mapping of x
        - b (size: 1x1): the constant of the affine mapping of x
        output:
         - the predicted class for the associated y given the
        Linear Regression parameters
        """    
        return (self.w_@X.T + self.b_ > 1/2).astype("int")
        
    def score(self, X, y):
        """
        inputs:
        - X (size Nxd): the points in R^d we want to classify
        - y (size Nx1): the labels of the points
        """
        y_pred = self.predict(X)
        return np.sum(y_pred == y)/y.shape[0]

In [43]:
dim = 100
Nb_samples = 2000
prop_test = 0.05
lamb = 0.1

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    # Data processing
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices]
    X_te = X[te_indices]
    
    y_tr = y[tr_indices]
    y_te = y[te_indices]
    
    assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
    assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
    
    # Fitting the classifier
    params = {'lamb': np.linspace(0.001, 0.1, 20)}
    rr = GridSearchCV(RidgeRegressor(), params)
#     rr = RidgeRegressor(lamb=lamb)

    rr.fit(X_tr, y_tr)
    
    print(rr.best_params_)
    
    print(f"Accuracy on train set {name}: {rr.score(X_tr, y_tr):.2f}")
    print(f"Accuracy on test set {name} : {rr.score(X_te, y_te):.2f}")
    
    # Prediction on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = rr.predict(X_eval)
    all_y_eval.append(y_eval)
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

{'lamb': 0.021842105263157895}
Accuracy on train set 0: 0.65
Accuracy on test set 0 : 0.60
{'lamb': 0.001}
Accuracy on train set 1: 0.64
Accuracy on test set 1 : 0.57
{'lamb': 0.04268421052631579}
Accuracy on train set 2: 0.73
Accuracy on test set 2 : 0.69


# III) Kernel baselines 

## A) Kernels

In [102]:
def Gaussian_kernel(X1, X2, sig):
    """inputs:
    - X1 (size N1xd): a set of points
    - X2 (size N2xd): another one  
    - sig (float): the std of the kernel
    ouput:
    - the associated (N1xN2) Gaussian kernel
    """
    return np.exp(-distance_matrix(X1,X2)/(2*sig**2))

In [98]:
from itertools import product

def spectrum(x,k):
  l = len(x)
  spectrum_x = np.array([x[i:(i + k)] for i in range(l - k + 1)])
  return np.array(spectrum_x)

def Spectrum_kernel(X1, X2, k):
  """inputs:
    - X1 (size N1xd): a set of sequences
    - X2 (size N2xd): another one  
    - k (int): the length of the substrings 
    ouput:
    - the associated (N1xN2) Spectrum kernel
    """
  # substrings: all possible combinations of A,T,G,C of length k
  A_k = [''.join(s) for s in product(["A", "T", "G", "C"], repeat=k)]

  # nb of occurances of the elements of A_k in the k-spectrum of X1 (resp. X2)
  phi_spect_X1 = np.array([[np.sum(spectrum(x,k)==u) for u in A_k] for x in X1])
  phi_spect_X2 = np.array([[np.sum(spectrum(x,k)==u) for u in A_k] for x in X2])
  
  return phi_spect_X1 @ phi_spect_X2.T


## B) Algorithms
### 1. Kernel Ridge Regression

In [45]:
def compute_KRR_MLE(X, y, lamb, sig=10):
    """
    inputs:
    - X (size: N_trxd): the points of the training set
    - y (size: N_trx1): the values of the classes
    outputs:
    - the value of MLE estimation (w_hat, b_hat) in the kernel ridge regression model
    """
    K = Gaussian_kernel(X, X, sig=sig)
    alpha = np.linalg.inv(K+lamb*X.shape[1]*np.eye(X.shape[1]))@y.T
    return alpha

In [46]:
def predict_KRR(X_tr, X_te, alpha, sig=10):
    """
    inputs:
    - X_tr (size N_trxd): the points of the training set
    - X_te (size N_texd): the points of the test set we want to classify
    - w (size: 1xd): the weights of the affine mapping 
    - b (size: 1x1): the constant of the affine mapping
    output:
     - the predicted class for the associated y_te given the
    Linear Regression parameters
    """    
    K_te_tr = Gaussian_kernel(X_tr, X_te, sig=sig)
    return 2*(alpha.T@K_te_tr>0).astype("int")-1

In [47]:
class KernelRidgeRegressor(BaseEstimator, ClassifierMixin):
    
    def __init__(self, lamb=1., sigma=1., kernel='gaussian'):
        """
        This class implements methods for fitting and predicting with a KernelRidgeRegressor used for classification 
        (by thresholding the value regressed). Any kernel can be used. 
        inputs:
        - lamb : the regularisation parameter 
        - sigma : the parameter of the Gaussian kernel (if Gaussian kernel selected)
        - kernel : the kernel we consider
        """
        self.lamb = lamb
        self.sigma = sigma
        self.kernel = kernel
        if self.kernel == 'gaussian':
            self.kernel_ = partial(Gaussian_kernel, sig=sigma)
        else:
            raise NotImplementedError(f"Kernel {self.kernel} is not implemented yet")
    
    def fit(self, X, y):
        """
        inputs:
        - X (size: N_trxd): the points of the training set
        - y (size: N_trx1): the values of the classes
        """
        # We keep values of training in memory for prediction
        self.X_tr_ = np.copy(X)
        K = self.kernel_(X, X, sig=self.sigma)
        self.alpha_ = np.linalg.inv(K+self.lamb*X.shape[0]*np.eye(X.shape[0]))@y
        
        return self
        
    def predict(self, X):
        """
        inputs:
        - X (size N_texd): the points in R^d we want to classify
        output:
         - the predicted class for the associated y given the
        Linear Regression parameters
        """
        K_tr_te = self.kernel_(self.X_tr_, X, sig=self.sigma)
        
        return 2 * (self.alpha_.T@K_tr_te > 0).reshape(-1, ).astype("int") - 1
        
    def score(self, X, y):
        """
        inputs:
        - X (size N_texd): the points in R^d we want to classify
        - y (size N_tex1): the labels of the points
        """
        y_pred = self.predict(X)
        
        return np.sum(y_pred == y)/y.shape[0]

In [48]:
dim = 100
Nb_samples = 2000
prop_test = 0.2
lamb = 0.5
sigma = 1.2

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    # Data Processing
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    y[y==0] = -1
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices]
    X_te = X[te_indices]
    
    y_tr = y[tr_indices]
    y_te = y[te_indices]
    
    assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
    assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
    
    # Fitting
    params = {'lamb': np.linspace(0.1, 2, 2), 'sigma': np.linspace(0.5, 2, 20), 'kernel': ['gaussian']}
    krr = GridSearchCV(KernelRidgeRegressor(), params)
#     krr = KernelRidgeRegressor()
    
    krr.fit(X_tr,y_tr)
    
    print(krr.best_params_)
    
    print(f"Accuracy on train set {name}: {krr.score(X_tr, y_tr):.2f}")
    print(f"Accuracy on test set {name} : {krr.score(X_te, y_te):.2f}")
    
    # Prediction on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = krr.predict(X_eval)
    all_y_eval.append(y_eval)
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.5789473684210527}
Accuracy on train set 0: 1.00
Accuracy on test set 0 : 0.57
{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.6578947368421053}
Accuracy on train set 1: 1.00
Accuracy on test set 1 : 0.59
{'kernel': 'gaussian', 'lamb': 0.1, 'sigma': 0.5}
Accuracy on train set 2: 1.00
Accuracy on test set 2 : 0.67


### 2. Kernel SVM

In [117]:
class KernelSVM(BaseEstimator, ClassifierMixin):
    
    def __init__(self, lamb=1., sigma=1., k = 3, kernel='gaussian'):
        """
        This class implements methods for fitting and predicting with a KernelRidgeRegressor used for classification 
        (by thresholding the value regressed). Any kernel can be used. 
        inputs:
        - lamb : the regularisation parameter 
        - sigma : the parameter of the Gaussian kernel (if Gaussian kernel selected)
        - kernel : the kernel we consider
        """
        self.lamb = lamb
        self.sigma = sigma
        self.k = k
        self.kernel = kernel
        if self.kernel == 'gaussian':
            self.kernel_ = partial(Gaussian_kernel, sig=sigma)
        elif self.kernel == 'spectrum':
            self.kernel_ = partial(Spectrum_kernel, k=k)
        else:
            raise NotImplementedError(f"Kernel {self.kernel} is not implemented yet")
    
    def fit(self, X, y):
        """
        inputs:
        - X (size: N_trxd): the points of the training set
        - y (size: N_trx1): the values of the classes
        """
        # We keep values of training in memory for prediction
        N_tr = X.shape[0]
        self.X_tr_ = np.copy(X)

        if self.kernel == 'gaussian':
          K = self.kernel_(X, X, sig=self.sigma)
        elif self.kernel == 'spectrum':
          K = self.kernel_(X, X, k=self.k)
        # Define QP and solve it with cvxpy
        alpha = cp.Variable(N_tr)
        objective = cp.Maximize(2*alpha.T@y - cp.quad_form(alpha, K))
        constraints = [0 <= cp.multiply(y,alpha), cp.multiply(y,alpha) <= 1/(2*self.lamb*N_tr)]
        prob = cp.Problem(objective, constraints)

        # The optimal objective value is returned by `prob.solve()`.
        result = prob.solve()
        # The optimal value for x is stored in `x.value`.
        self.alpha_ = alpha.value
        
        return self
        
    def predict(self, X):
        """
        inputs:
        - X (size N_texd): the points in R^d we want to classify
        output:
         - the predicted class for the associated y given the
        Linear Regression parameters
        """
        if self.kernel == 'gaussian':
          K_tr_te = self.kernel_(self.X_tr_, X, sig=self.sigma)
        elif self.kernel == 'spectrum':
          K_tr_te = self.kernel_(self.X_tr_, X, k=self.k)
        
        return 2 * (self.alpha_.T@K_tr_te > 0).reshape(-1, ).astype("int") - 1
        
    def score(self, X, y):
        """
        inputs:
        - X (size N_texd): the points in R^d we want to classify
        - y (size N_tex1): the labels of the points
        """
        y_pred = self.predict(X)
        
        return np.sum(y_pred == y)/y.shape[0]

#### Gaussian Kernel SVM

In [None]:
dim = 100
Nb_samples = 2000
prop_test = 0.2
lamb = 0.5
sigma = 1.2

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    # Data Processing
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    y[y==0] = -1
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices]
    X_te = X[te_indices]
    
    y_tr = y[tr_indices]
    y_te = y[te_indices]
    
    assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
    assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
    
    # Fitting
    params = {'lamb': np.logspace(-10., -7., 4), 'sigma': np.logspace(-1., 2., 4), 'kernel': ['gaussian']}
    ksvm = GridSearchCV(KernelSVM(), params)
#     krr = KernelRidgeRegressor()
    
    ksvm.fit(X_tr,y_tr)
    
    print(ksvm.best_params_)
    
    print(f"Accuracy on train set {name}: {ksvm.score(X_tr, y_tr):.2f}")
    print(f"Accuracy on test set {name} : {ksvm.score(X_te, y_te):.2f}")
    
    # Prediction on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = ksvm.predict(X_eval)
    all_y_eval.append(y_eval)
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

#### Spectrum Kernel SVM

In [120]:
## Kernel SVM with Spectrum kernel

dim = 100
Nb_samples = 2000
prop_test = 0.2
lamb = 0.5
sigma = 1.2
k = 3

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    # Data Processing
    df = pd.read_csv(f'{data_folder}/Xtr{name}.csv')
    X = np.array(df['seq'])
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    y[y==0] = -1
    

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices]
    X_te = X[te_indices]
    
    y_tr = y[tr_indices]
    y_te = y[te_indices]
    
    assert X_tr.shape[0] + X_te.shape[0] == X.shape[0]
    assert y_tr.shape[0] + y_te.shape[0] == y.shape[0]
    
    # Fitting
    # params = {'lamb': np.logspace(-10., -7., 4), 'k': np.array([3,4,5,6]), 'kernel': ['spectrum']}
    # ksvm = GridSearchCV(KernelSVM(), params)
#     krr = KernelRidgeRegressor()
    ksvm = KernelSVM(lamb = lamb, k=k, kernel='spectrum')
    
    ksvm.fit(X_tr,y_tr)
    
    print(ksvm.best_params_)
    
    print(f"Accuracy on train set {name}: {ksvm.score(X_tr, y_tr):.2f}")
    print(f"Accuracy on test set {name} : {ksvm.score(X_te, y_te):.2f}")
    
    # Prediction on the new set
    X_eval = np.array(pd.read_csv(f'{data_folder}/Xte{name}.csv')['seq'])
    y_eval = ksvm.predict(X_eval)
    all_y_eval.append(y_eval)
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

DCPError: ignored

In [122]:
from itertools import product

Xtr_0 = pd.read_csv('data/Xtr0.csv')
X_train_0 = np.array(Xtr_0['seq'])

SPK = Spectrum_kernel(X_train_0,X_train_0,k=3)
np.real(np.linalg.eig(SPK)[0])

array([ 3.67373760e+05,  4.32026053e+04,  3.23499785e+04, ...,
        7.12260801e-14, -1.66362557e-14,  2.67708609e-14])

In [None]:
ids = np.arange(all_y_eval.shape[0])
filename = "results/submission_gaussian_svm.csv"

# write_csv(ids, all_y_eval, filename)