In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile
from implementations import *
from helpers import *
from processing import *
from cross_validation import *

import seaborn as sns
%load_ext autoreload
%autoreload 2

In [None]:
# Locations of the zipped CSV datasets, relative to the notebook's working directory.
path_dataset_train, path_dataset_test = './data/train.csv.zip', './data/test.csv.zip'

# load_data yields three values per file; judging by the names they are
# (labels, features, ids) -- TODO confirm against helpers.load_data.
(y_train, x_train, ids_train) = load_data(path_dataset_train)
# The first value returned for the test file is discarded.
(_, x_test, ids_test) = load_data(path_dataset_test)

## Cross Validation for Gradient descent


In [9]:
# --- Search grid for gradient descent -------------------------------------
# min_Qs / max_Qs and nb_division_test are forwarded untouched to the grid
# search; presumably they control preprocessing -- see the processing module.
nb_division_test = 6
k_fold = 5
max_iters = 900
poly = np.linspace(2, 5, num=4, dtype=int)
min_Qs = np.linspace(0, 20, num=3)
max_Qs = np.linspace(80, 100, num=3)
gammas = np.logspace(-6, -1, num=4)


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to mean_squared_error_gd; ``lambda_`` is accepted but unused.
    ``max_iters`` is read from the notebook namespace at call time.
    """
    return mean_squared_error_gd(y, x, initial_w, max_iters, gamma)


result_gd, best_params_gd = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
    gammas=gammas,
)

Polynomial expension : 2, min_Q : 0, max_Q : 90
Polynomial expension : 2, min_Q : 0, max_Q : 100
Polynomial expension : 2, min_Q : 10, max_Q : 80
Polynomial expension : 2, min_Q : 10, max_Q : 90
Polynomial expension : 2, min_Q : 10, max_Q : 100
Polynomial expension : 2, min_Q : 20, max_Q : 80
Polynomial expension : 2, min_Q : 20, max_Q : 90


KeyboardInterrupt: 

## Cross Validation for Stochastic Gradient descent

In [None]:
# --- Search grid for stochastic gradient descent ---------------------------
nb_division_test = 6
k_fold = 5
max_iters = 1000
poly = np.linspace(2, 9, num=8, dtype=int)
min_Qs = np.linspace(0, 20, num=3)
max_Qs = np.linspace(80, 100, num=3)
gammas = np.logspace(-6, 0, num=7)


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to mean_squared_error_sgd; ``lambda_`` is accepted but unused.
    ``max_iters`` is read from the notebook namespace at call time.
    """
    return mean_squared_error_sgd(y, x, initial_w, max_iters, gamma)


result_sgd, best_params_sgd = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
    gammas=gammas,
)

## Cross Validation for Least Squares


In [None]:
# --- Search grid for least squares -----------------------------------------
# Least squares has no step size or regularisation strength, so neither
# gammas nor lambdas is passed to the grid search.
nb_division_test = 6
k_fold = 4
poly = np.linspace(2, 7, num=6, dtype=int)
min_Qs = np.linspace(0, 25, num=6)
max_Qs = np.linspace(75, 100, num=6)


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to least_squares; ``lambda_``, ``gamma`` and ``initial_w``
    are accepted but unused.
    """
    return least_squares(y, x)


result_ls, best_params_ls = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
)

## Cross validation for Ridge Regression 

In [None]:
# --- Search grid for ridge regression --------------------------------------
nb_division_test = 6
k_fold = 4
poly = np.linspace(2, 9, num=8, dtype=int)
min_Qs = np.linspace(0, 25, num=6)
max_Qs = np.linspace(75, 100, num=6)
lambdas = np.logspace(-6, 0, num=10)


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to ridge_regression; ``gamma`` and ``initial_w`` are
    accepted but unused.
    """
    return ridge_regression(y, x, lambda_)


result_rr, best_params_rr = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
    lambdas=lambdas,
)

# Cross validation for classification 

### Cross validation for logistic regression

In [None]:
# --- Search grid for logistic regression -----------------------------------
nb_division_test = 6
min_Qs = np.linspace(0, 25, num=6)
max_Qs = np.linspace(75, 100, num=6)
poly = np.linspace(2, 9, 8, dtype=int)
max_iters = 700
threshold_gd = 1e-8
gammas = np.logspace(-6, 0, 7)
# Set explicitly so this cell does not depend on a value left over from
# whichever cell happened to run before it. 4 is the value it inherited
# from the ridge-regression cell on a top-to-bottom run.
k_fold = 4


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to logistic_regression with full-batch updates
    (``stochastic=False``); ``lambda_`` is accepted but unused.
    ``max_iters`` and ``threshold_gd`` are read from the notebook
    namespace at call time.
    """
    return logistic_regression(y, x, initial_w, max_iters, gamma, threshold_gd, stochastic=False)


result_lr, best_params_lr = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
    gammas=gammas,
)

### Cross validation for regularized logistic regression

In [None]:
# --- Search grid for regularized logistic regression -----------------------
nb_division_test = 6
poly = np.linspace(2, 9, 8, dtype=int)
min_Qs = np.linspace(0, 25, num=6)
max_Qs = np.linspace(75, 100, num=6)
max_iter = 800
threshold_gd = 1e-8
# Candidate step sizes and regularisation strengths. Note that the wrapper
# below has parameters with the same names, which shadow these arrays
# inside its body.
gamma = np.logspace(-7, 0, 8)
lambda_ = np.logspace(-7, 0, 8)
k_fold = 4


def f_train(y, x, lambda_, gamma, initial_w):
    """Training wrapper with the signature the grid search expects.

    Delegates to reg_logistic_regression with full-batch updates
    (``stochastic=False``). ``max_iter`` and ``threshold_gd`` are read
    from the notebook namespace at call time.
    """
    return reg_logistic_regression(y, x, lambda_, initial_w, max_iter, gamma, threshold_gd, stochastic=False)


# Stored under *_rlr names so that the plain logistic-regression results
# (result_lr / best_params_lr) are not overwritten.
result_rlr, best_params_rlr = grid_search_with_cross_validation(
    y_train, x_train, ids_train, nb_division_test,
    poly, min_Qs, max_Qs, k_fold, f_train,
    gammas=gamma, lambdas=lambda_,
)

# Get statistics for parameters and function 
With the best parameters found for each model, we now want to compare the results obtained with the 6 models.

To do that, we run the cross-validation several times for each model, so that we obtain not only a single value (the mean) but also the variability of that value across runs. We visualize these distributions with boxplots.

In [None]:
# Configuration for the final comparison of the six models.
#
# Every table below has one row per model, in the same order as `f_trains`:
#   0: SGD, 1: GD, 2: least squares, 3: ridge, 4: logistic, 5: reg. logistic
# Each row holds 6 values (presumably one per data subset, matching
# nb_division_test -- TODO confirm against statistics_on_best_params).
nb_division_test = 6
min_Qs = [
    [20,10,20,20,0,20],
    [10,10,10,10,10,10],
    [10,10,10,10,10,10],
    [10,10,10,10,10,10],
    [10,10,10,10,10,10],
    [10,10,10,10,10,10]
]
max_Qs =  [
    [80,80,90,90,90,90],
    [90,90,90,90,90,90],
    [90,90,90,90,90,90],
    [90,90,90,90,90,90],
    [90,90,90,90,90,90],
    [90,90,90,90,90,90]
]
poly_exps = [
    [2,2,2,2,2,2],
    [2,2,2,2,2,2],
    [2,2,2,2,2,2],
    [2,2,2,2,2,2],
    [2,2,2,2,2,2],
    [2,2,2,2,2,2]
]
# Regularisation strengths. -1 marks models whose training wrapper ignores
# lambda_ (SGD, GD, least squares, logistic). The table needs one row per
# model: with fewer rows than models, lambdas[i] raises IndexError in the
# statistics loop.
lambdas = [
    [-1,-1,-1,-1,-1,-1],
    [-1,-1,-1,-1,-1,-1],
    [-1,-1,-1,-1,-1,-1],
    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    [-1,-1,-1,-1,-1,-1],
    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
]
# Step sizes. -1 marks models whose training wrapper ignores gamma
# (least squares, ridge).
gammas = [
    [0.001, 0.001, 0.001, 0.001, 0.001, 0.001],
    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    [-1,-1,-1,-1,-1,-1],
    [-1,-1,-1,-1,-1,-1],
    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
]
max_iter = 1000
threshold_gd = 1e-4
nb_iters = 10
k_fold = 4
stoch = True

# Training wrappers sharing the signature (y, x, lambda_, gamma, initial_w).
# All of them use the `max_iter` defined in this cell, so the cell does not
# depend on a `max_iters` value left over from an earlier grid-search cell.
f_train_sgd = lambda y,x,lambda_,gamma, initial_w : mean_squared_error_sgd(y, x, initial_w, max_iter, gamma)
f_train_gd = lambda y,x,lambda_,gamma, initial_w : mean_squared_error_gd(y, x, initial_w, max_iter, gamma)
f_train_ls = lambda y,x,lambda_ ,gamma, initial_w: least_squares(y,x)
f_train_rr = lambda y,x,lambda_,gamma, initial_w : ridge_regression(y,x,lambda_)
f_train_lr = lambda y,x, lambda_,gamma, initial_w : \
            logistic_regression(y, x, initial_w, max_iter, gamma, threshold_gd, stochastic = stoch)
f_train_rlr = lambda y,x,lambda_,gamma, initial_w : \
            reg_logistic_regression(y, x, lambda_, initial_w, max_iter, gamma, threshold_gd, stochastic = stoch)

# The order here defines the row order of every table above.
f_trains = [f_train_sgd, f_train_gd, f_train_ls, f_train_rr, f_train_lr, f_train_rlr]
losses_te_s = []

In [None]:
# Tick labels for the box plot; the order must match `f_trains`.
model_names = ['SGD', 'GD', 'Least squares', 'Ridge', 'Logistic', 'Reg. logistic']

# Start from an empty list so that re-running this cell does not append a
# second copy of every model's results.
losses_te_s = []
for i, f_train in enumerate(f_trains):
    print(i)
    # Only the second returned value is kept; judging by its name it holds
    # the test losses -- TODO confirm against statistics_on_best_params.
    _, losses_te, _ = statistics_on_best_params(
        x_train, ids_train, y_train, lambdas[i], nb_iters, k_fold,
        f_train, gammas[i], poly_exps[i], min_Qs[i], max_Qs[i])
    losses_te_s.append(losses_te)

plt.boxplot(losses_te_s)
# Alternative kept from the original, for the case where each loss entry is nested:
#plt.boxplot([[loss[0][0] for loss in losses ] for losses in losses_te_s])
# Boxes are drawn at positions 1..N by default, so label those ticks.
plt.xticks(range(1, len(model_names) + 1), model_names)
plt.ylabel('Test loss')
plt.title('Cross-validation test loss per model')
plt.show()