# Kernel Methods: Challenge

Julia Linhart, Roman Castagné, Louis Bouvier

Preliminary functions:

In [20]:
def write_csv(ids, labels, filename):
    """
    inputs:
        - ids: list of ids, should be an increasing list of integers
        - labels: list of corresponding labels, either 0 or 1
        - file: string containing the name that should be given to the submission file    
    """
    df = pd.DataFrame({"Id": ids, "Bound": labels})
    df.to_csv(filename, sep=',', index=False)

# I) Preprocessing

In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial import distance_matrix

In [22]:
data_folder = 'data' # 'machine-learning-with-kernel-methods-2021'

X_train_1 = pd.read_csv(f'{data_folder}/Xtr2_mat100.csv', sep = ' ', index_col=False, header=None)
y_train_1 = pd.read_csv(f'{data_folder}/Ytr2.csv')

In [23]:
y_train_1.describe()

Unnamed: 0,Id,Bound
count,2000.0,2000.0
mean,4999.5,0.4985
std,577.494589,0.500123
min,4000.0,0.0
25%,4499.75,0.0
50%,4999.5,0.0
75%,5499.25,1.0
max,5999.0,1.0


In [24]:
y_train_1 = np.array(y_train_1)[:,1]

In [25]:
X_train_1.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.010565,0.010201,0.010375,0.011587,0.011609,0.010707,0.009359,0.011957,0.009571,0.010582,...,0.007951,0.009457,0.008554,0.009283,0.008261,0.009614,0.011141,0.009777,0.008217,0.008565
std,0.012278,0.010723,0.011467,0.011453,0.012182,0.010478,0.009789,0.012444,0.013805,0.013652,...,0.009605,0.009701,0.00935,0.009741,0.012341,0.010338,0.010863,0.010402,0.009709,0.009283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,...,0.01087,0.01087,0.01087,0.01087,0.0,0.01087,0.01087,0.01087,0.01087,0.01087
75%,0.01087,0.021739,0.021739,0.021739,0.021739,0.021739,0.01087,0.021739,0.01087,0.021739,...,0.01087,0.01087,0.01087,0.01087,0.01087,0.01087,0.021739,0.01087,0.01087,0.01087
max,0.086957,0.065217,0.097826,0.065217,0.065217,0.054348,0.054348,0.076087,0.097826,0.184783,...,0.054348,0.065217,0.054348,0.054348,0.086957,0.065217,0.076087,0.065217,0.065217,0.043478


In [26]:
X_train_1 = np.array(X_train_1)
print(X_train_1.shape)
X_train_1 = (X_train_1 - X_train_1.mean(axis=0))/X_train_1.std(axis=0)

(2000, 100)


In [27]:
print(y_train_1.shape)

(2000,)


# II) First linear models of the mat100 input

## A) Logistic regression

In [28]:
def g(z):
    """
    input:
    - z (any size): an array-like element
    ouput:
    - the element-wize application of the sigmoïd function on z
    """
    return 1/(1+np.exp(-z))

In [29]:
def compute_loss(X,y,w,b):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the opposite of the log-likelihood of the Logistic Regression model computed with respect to
    the points (X,y) and the parameters w,b
    """
    X_tilde = np.vstack((X,np.ones(X.shape[1])))
    w_tilde = np.vstack((w,b))
    return -np.sum(y*np.log(g(w_tilde.T@X_tilde)) + (1-y)*np.log(1-g(w_tilde.T@X_tilde)), axis=1)

In [30]:
def compute_grad(X,y,w,b):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the gradient of the loss of the Logistic Regression model computed 
    with respect to (w,b) = w_tilde having the points (X,y) 
    """
    X_tilde = np.vstack((X,np.ones(X.shape[1])))
    w_tilde = np.vstack((w,b))    
    return -X_tilde@(y-g(w_tilde.T@X_tilde)).reshape(-1,1)

In [31]:
def compute_hess(X,y,w,b):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
    - the hessian of the loss of the Logistic Regression model computed 
    with respect to (w,b) = w_tilde having the points (X,y) 
    """
    X_tilde = np.vstack((X,np.ones(X.shape[1])))
    w_tilde = np.vstack((w,b))    
    temp = (g(w_tilde.T@X_tilde)*(g(w_tilde.T@X_tilde)-1)).reshape(-1,)
    return -X_tilde@np.diag(temp)@X_tilde.T

In [32]:
def backtracking(X,y,w,b,delta,grad,alpha=0.1,beta=0.7):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    - delta (size n): direction of the search
    - grad (size n): value of the gradient at point (w,b)
    - alpha: factor of the slope of the line in the backtracking line search
    - beta: factor of reduction of the step length
    
    outputs:
    - t: the step length for the Newton step on the objective function
    computed with backtracking line search towards delta"""
        
    t = 1
    while(compute_loss(X,y,w+t*delta[:-1],b+t*delta[-1])>
            compute_loss(X,y,w,b) + alpha*t*grad.T@delta):
        t = beta*t
    return t

In [33]:
def Newton(X,y,w0,b0,eps=pow(10,-1)):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    - w0 (size: dx1): the initial weights of the affine mapping of x
    - b0 (size: 1x1): the initial constant of the affine mapping of x
    output:
    - the paramer vector w_tilde_hat = (w_hat, b_hat) which maximizes the log-likelihood of 
    the sample (X,y) in the Logistic Regression model (or minimizes the loss)
    - the cached values of the loss evaluated along training
    """
    w_,b_ = w0,b0
    grad = compute_grad(X,y,w0,b0)
    hess = compute_hess(X,y,w0,b0)
    print(hess.shape)
#     inv_hess = np.linalg.inv(compute_hess(X,y,w0,b0))
    inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))
    dec_2 = grad.T@inv_hess@grad
    Loss_hist = [compute_loss(X,y,w0,b0)]
    while dec_2/2>eps:## condition on the Newton decrement
        grad = compute_grad(X,y,w_,b_)
        hess = compute_hess(X,y,w_,b_)
#         inv_hess = np.linalg.inv(compute_hess(X,y,w_,b_))
        inv_hess, _, _, _ = np.linalg.lstsq(hess, np.eye(hess.shape[0]))
        dec_2 = grad.T@inv_hess@grad
        delta = -inv_hess@grad
        t_bt = backtracking(X,y,w_,b_,delta,grad)
        w_ = w_ + t_bt*delta[:-1]
        b_ = b_ + t_bt*delta[-1]
        Loss_hist.append(compute_loss(X,y,w_,b_))
    return w_, b_, Loss_hist

In [34]:
def predict_LogReg(x,w,b):
    """
    inputs:
    - x (size dx1): a point in R^d
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
     - the predicted class for the associated y given the
    Logistic Regression parameters
    """    
    return (w.T@x+b>0).astype("int")

In [37]:
dim = 100
Nb_samples = 2000
prop_test = 0.05
Train_indices = np.random.choice(a=Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
mask_train = np.zeros(Nb_samples, dtype=bool)
np.put_along_axis(arr=mask_train, indices=Train_indices, values=True, axis=0)

X_tr = X_train_1[mask_train,:].T
X_te = X_train_1[np.logical_not(mask_train),:].T
print(X_tr.shape)
print(X_te.shape)
y_tr = y_train_1[mask_train].reshape(1,-1)
y_te = y_train_1[np.logical_not(mask_train)].reshape(1,-1)
print(y_tr.shape)
print(y_te.shape)

(100, 1900)
(100, 100)
(1, 1900)
(1, 100)


In [38]:
## compute the corresponding MLE on train set
w0, b0 = np.random.randn(100,1)*0.07, np.zeros((1,1))## we initialize parameters
w_hat, b_hat, _ = Newton(X_tr,y_tr,w0,b0)
    
## assess the convergence of the Newton Method
#print("w_hat = {}".format(w_hat))
#print("b_hat = {}".format(b_hat))
    
## predict on the two sets
y_predicted_train = predict_LogReg(X_tr,w_hat,b_hat)## prediction on train set
mis_class_err_train = np.sum(y_predicted_train!=y_tr)/y_tr.shape[1]
y_predicted_test = predict_LogReg(X_te,w_hat,b_hat)## prediction on train set
mis_class_err_test = np.sum(y_predicted_test!=y_te)/y_te.shape[1]
print("Misclassification error:")
print("On train set: {:.2f}%".format(100*mis_class_err_train))
print("On test set: {:.2f}%".format(100*mis_class_err_test))

(101, 101)
Misclassification error:
On train set: 26.53%
On test set: 32.00%




In [39]:
dim = 100
Nb_samples = 2000
prop_test = 0.05

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]
#     mask_train = np.zeros(Nb_samples, dtype=bool)
#     mask_train[tr_indices] = 1
#     np.put_along_axis(arr=mask_train, indices=tr_indices, values=True, axis=0)

    X_tr = X[tr_indices].T
    X_te = X[te_indices].T
    
    assert X_tr.shape[1] + X_te.shape[1] == X.shape[0]
#     print(X_tr.shape)
#     print(X_te.shape)
    y_tr = y[tr_indices].reshape(1,-1)
    y_te = y[te_indices].reshape(1,-1)
    assert y_tr.shape[1] + y_te.shape[1] == y.shape[0]
#     print(y_tr.shape)
#     print(y_te.shape)
    print(y.sum() / y.shape[0])
    
    ## compute the corresponding MLE on train set
    w0, b0 = np.random.randn(100,1)*0.07, np.zeros((1,1)) # we initialize parameters
    w_hat, b_hat, _ = Newton(X_tr,y_tr,w0,b0)

    ## assess the convergence of the Newton Method
#     print("w_hat = {}".format(w_hat))
#     print("b_hat = {}".format(b_hat))

    ## predict on the two sets
    y_predicted_train = predict_LogReg(X_tr,w_hat,b_hat) # prediction on train set
    mis_class_err_train = np.sum(y_predicted_train!=y_tr)/y_tr.shape[1]
    y_predicted_test = predict_LogReg(X_te,w_hat,b_hat) # prediction on test set
    mis_class_err_test = np.sum(y_predicted_test!=y_te)/y_te.shape[1]
    print(f"Misclassification error on set {name}:")
    print("On train set: {:.2f}%".format(100*mis_class_err_train))
    print("On test set: {:.2f}%".format(100*mis_class_err_test))
    
    # predict on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = predict_LogReg(X_eval.T,w_hat,b_hat)
    all_y_eval.append(y_eval)
    print(y_eval.sum() / y_eval.shape[1])
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

0.481
(101, 101)
Misclassification error on set 0:
On train set: 35.21%
On test set: 38.00%
0.459
0.5005
(101, 101)




Misclassification error on set 1:
On train set: 36.89%
On test set: 36.00%
0.499
0.4985
(101, 101)
Misclassification error on set 2:
On train set: 28.11%
On test set: 21.00%
0.497


In [100]:
ids = np.arange(all_y_eval.shape[0])
filename = "results/submission_log_reg.csv"

write_csv(ids, all_y_eval, filename)

## B) Ridge regression

In [49]:
def compute_RR_MLE(X,y,lamb):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    outputs:
    - the value of MLE estimation (w_hat, b_hat) in the Linear regression model
    """
    X_tilde = np.vstack((X,np.ones(X.shape[1])))
    temp = np.linalg.inv(X_tilde@X_tilde.T +lamb*X.shape[1]*np.eye(1+X.shape[0]))@X_tilde@y.T
    return temp[:-1], temp[-1]

In [54]:
def predict_RR(x,w,b):
    """
    inputs:
    - x (size dx1): a point in R^d
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
     - the predicted class for the associated y given the
    Linear Regression parameters
    """    
    return (w.T@x+b>1/2).astype("int")

In [81]:
dim = 100
Nb_samples = 2000
prop_test = 0.05

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices].T
    X_te = X[te_indices].T
    
    assert X_tr.shape[1] + X_te.shape[1] == X.shape[0]
#     print(X_tr.shape)
#     print(X_te.shape)
    y_tr = y[tr_indices].reshape(1,-1)
    y_te = y[te_indices].reshape(1,-1)
    assert y_tr.shape[1] + y_te.shape[1] == y.shape[0]
#     print(y_tr.shape)
#     print(y_te.shape)
    print(y.sum() / y.shape[0])
    
    ## compute the corresponding MLE on train set
    w_hat, b_hat = compute_RR_MLE(X_tr,y_tr,0.1)

    ## assess the convergence of the Newton Method
#     print("w_hat = {}".format(w_hat))
#     print("b_hat = {}".format(b_hat))

    ## predict on the two sets
    y_predicted_train = predict_RR(X_tr,w_hat,b_hat) # prediction on train set
    mis_class_err_train = np.sum(y_predicted_train!=y_tr)/y_tr.shape[1]
    y_predicted_test = predict_RR(X_te,w_hat,b_hat) # prediction on test set
    mis_class_err_test = np.sum(y_predicted_test!=y_te)/y_te.shape[1]
    print(f"Misclassification error on set {name}:")
    print("On train set: {:.2f}%".format(100*mis_class_err_train))
    print("On test set: {:.2f}%".format(100*mis_class_err_test))
    
    # predict on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = predict_RR(X_eval.T,w_hat,b_hat)
    all_y_eval.append(y_eval)
    print(y_eval.sum() / y_eval.shape[1])
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

0.481
Misclassification error on set 0:
On train set: 36.00%
On test set: 42.00%
0.364
0.5005
Misclassification error on set 1:
On train set: 36.68%
On test set: 44.00%
0.381
0.4985
Misclassification error on set 2:
On train set: 27.68%
On test set: 37.00%
0.423


# III) Kernel baselines 

## A) Kernel Ridge Regression

In [141]:
def Gaussian_kernel(X1, X2, sig):
    """inputs:
    - X1 (size dxN1): a set of points
    - X2 (size dxN2): another one  
    - sig (float): the std of the kernel
    ouput:
    - the associated (N1xN2) Gaussian kernel
    """
    return np.exp(-distance_matrix(X1.T,X2.T)/(2*sig**2))

In [142]:
def compute_KRR_MLE(X, y, lamb, sig=10):
    """
    inputs:
    - X (size: dxN): the points we want to classify
    - y (size: 1xN): the values of the classes
    outputs:
    - the value of MLE estimation (w_hat, b_hat) in the Linear regression model
    """
    K = Gaussian_kernel(X, X, sig=sig)
    alpha = np.linalg.inv(K+lamb*X.shape[1]*np.eye(X.shape[1]))@y.T
    return alpha

In [144]:
def predict_KRR(X_tr, X_te, alpha, sig=10):
    """
    inputs:
    - X_tr (size dx1): a point in R^d
    - w (size: dx1): the weights of the affine mapping of x
    - b (size: 1x1): the constant of the affine mapping of x
    output:
     - the predicted class for the associated y given the
    Linear Regression parameters
    """    
    K_te_tr = Gaussian_kernel(X_tr, X_te, sig=sig)
    return 2*(alpha.T@K_te_tr>0).astype("int")-1

In [155]:
dim = 100
Nb_samples = 2000
prop_test = 0.2
lamb = 0.5
sigma = 1.2

all_y_eval = []

np.random.seed(1)
for name in [0, 1, 2]:
    X = pd.read_csv(f'{data_folder}/Xtr{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    y = pd.read_csv(f'{data_folder}/Ytr{name}.csv')
    y = y["Bound"].to_numpy()
    y[y==0] = -1
    
    mean, std = X.mean(axis=0), X.std(axis=0)
    print("Sigma = {}".format(sigma))
    X = (X - mean)/std

    tr_indices = np.random.choice(Nb_samples, size=int((1-prop_test)*Nb_samples), replace=False)
    te_indices = [idx for idx in range(Nb_samples) if idx not in tr_indices]

    X_tr = X[tr_indices].T
    X_te = X[te_indices].T
    
    assert X_tr.shape[1] + X_te.shape[1] == X.shape[0]
#     print(X_tr.shape)
#     print(X_te.shape)
    y_tr = y[tr_indices].reshape(1,-1)
    y_te = y[te_indices].reshape(1,-1)
    assert y_tr.shape[1] + y_te.shape[1] == y.shape[0]
#     print(y_tr.shape)
#     print(y_te.shape)
    print(y.sum() / y.shape[0])
    
    ## compute the corresponding MLE on train set
    alpha_hat = compute_KRR_MLE(X_tr,y_tr,lamb=lamb,sig=sigma)

    ## assess the convergence of the Newton Method
#     print("w_hat = {}".format(w_hat))
#     print("b_hat = {}".format(b_hat))

    ## predict on the two sets
    y_predicted_train = predict_KRR(X_tr, X_tr, alpha_hat, sig=sigma) # prediction on train set
    mis_class_err_train = np.sum(y_predicted_train!=y_tr)/y_tr.shape[1]
    y_predicted_test = predict_KRR(X_tr, X_te, alpha_hat, sig=sigma) # prediction on test set
    mis_class_err_test = np.sum(y_predicted_test!=y_te)/y_te.shape[1]
    print(f"Misclassification error on set {name}:")
    print("On train set: {:.2f}%".format(100*mis_class_err_train))
    print("On test set: {:.2f}%".format(100*mis_class_err_test))
    
    # predict on the new set
    X_eval = pd.read_csv(f'{data_folder}/Xte{name}_mat100.csv', sep = ' ', index_col=False, header=None).to_numpy()
    X_eval = (X_eval - mean)/std
    y_eval = predict_LogReg(X_eval.T,w_hat,b_hat)
    all_y_eval.append(y_eval)
    print(y_eval.sum() / y_eval.shape[1])
    
all_y_eval = np.hstack(all_y_eval).reshape(-1)

Sigma = 1.2
-0.038
Misclassification error on set 0:
On train set: 6.06%
On test set: 50.25%
0.99
Sigma = 1.2
0.001
Misclassification error on set 1:
On train set: 0.25%
On test set: 46.75%
0.994
Sigma = 1.2
-0.003
Misclassification error on set 2:
On train set: 2.50%
On test set: 36.50%
0.973
