# Hyperparameter Tuning for Model3

This notebook shows one process of hyperparameter tuning.  
We first prepare the datsets by first leaving 20% for testing at the end and separating the remaining 80% into 4 sets according to the categorical feature.  
We then run grid search with 4-fold cross-validation on:  
- Ridge regression for degree and lambda
- Logistic regression for degree and gamma
- Regularized logistic regression for degree, gamma and lambda

Each algorithm is run on all 4 subsets of the datset.
We then compare the three algorithms with the best parameters on the test set to decide which one is better.

In [136]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [143]:
from implementations import *
from cross_validation import *
from helpers import *
from run import *

In [144]:
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)
seed = 1
train_percentage = 0.8

#Take 20% out for testing
y_train, tX_train, ids_train, y_test, tX_test, ids_test = partition(y, tX, ids, train_percentage, seed)

## Dropping columns with ```nan```

In [145]:
#Preparing train set.
#First change y vector from {-1, 1} to {0,1}
y_train_0 =y_train.copy()
y_train_0[y_train_0 == -1] = 0

#Drop columns with nans
tX_train_drop_invalid = tX_train.copy()
tX_train_drop_invalid = tX_train_drop_invalid[:, ~np.any(tX_train_drop_invalid == -999., axis=0)]

##############################################################################
#Preparing Test set
#First change y vector from {-1, 1} to {0,1}
y_test_0 =y_test.copy()
y_test_0[y_test_0 == -1] = 0
#Drop columns with nans
tX_test_drop_invalid = tX_test.copy()
tX_test_drop_invalid = tX_test_drop_invalid[:, ~np.any(tX_train == -999., axis=0)]

In [146]:
#Separate into 4 train datasets according to categorical feature
tX_0, tX_1, tX_2, tX_3, y_0, y_1, y_2, y_3, ids_0, ids_1, ids_2, ids_3 =\
    separate_data(tX_train_drop_invalid, y_train_0, ids_train)

## Cross validation with Ridge regression, Logistic regression and Regularized logistic regression

### Ridge regression tuning

In [83]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
K=4
training_errors, validation_errors =\
lambda_degree_ridge_cv(y_0, tX_0, lambdas,degrees, K,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))

Degree = 1
(79874, 18)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(79874, 35)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(79874, 52)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(79874, 69)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(79874, 86)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(79874, 103)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(79874, 120)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(79874, 137)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(79874, 154)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(79874, 171)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(79874, 188)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(79874, 205)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(79874, 222)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 14
(79874, 239)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 15
(79874, 256)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(13, 0)
0.059493051908617166


In [84]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
K=4
training_errors, validation_errors =\
lambda_degree_ridge_cv(y_1, tX_1, lambdas,degrees, K,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))

Degree = 1
(62094, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(62094, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(62094, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(62094, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(62094, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(62094, 109)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(62094, 127)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(62094, 145)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(62094, 163)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(62094, 181)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(62094, 199)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(62094, 217)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(62094, 235)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 14
(62094, 253)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 15
(62094, 271)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(14, 0)
0.08189628227018213


In [82]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
K=4
training_errors, validation_errors =\
lambda_degree_ridge_cv(y_2, tX_2, lambdas,degrees, K,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))

Degree = 1
(40314, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(40314, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(40314, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(40314, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(40314, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(40314, 109)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(40314, 127)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(40314, 145)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(40314, 163)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(40314, 181)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(40314, 199)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(40314, 217)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(40314, 235)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(11, 0)
0.08415136316578789


In [85]:
lambdas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
K=4
training_errors, validation_errors =\
lambda_degree_ridge_cv(y_3, tX_3, lambdas,degrees, K,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))

Degree = 1
(17718, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(17718, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(17718, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(17718, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 5
(17718, 91)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 6
(17718, 109)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 7
(17718, 127)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 8
(17718, 145)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 9
(17718, 163)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 10
(17718, 181)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 11
(17718, 199)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 12
(17718, 217)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 13
(17718, 235)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 14
(17718, 253)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 15
(17718, 271)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(14, 0)
0.0814350243920801


### Logistic regression

In [88]:
lambdas = np.array([0])
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000

training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_0, tX_0, "LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(79923, 18)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(79923, 35)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(79923, 52)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(79923, 69)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 5)
1.5223211012187239


In [89]:
lambdas = np.array([0])
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_1, tX_1, "LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(61985, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(61985, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(61985, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(61985, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 5)
1.3540926797478172


In [90]:
lambdas = np.array([0])
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_2, tX_2, "LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(40333, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(40333, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(40333, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(40333, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 6)
0.7095592721709326


In [91]:
lambdas = np.array([0])
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000
training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_3, tX_3, "LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(17759, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(17759, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(17759, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(17759, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 0, 5)
1.0762818409576567


### Regularized logistic regression

In [93]:
lambdas = np.logspace(-8, -1, 8)
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000

training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_0, tX_0, "REGULARIZED_LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(79923, 18)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(79923, 35)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(79923, 52)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(79923, 69)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 5)
0.7322578540182074


In [94]:
lambdas = np.logspace(-8, -1, 8)
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000

training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_1, tX_1, "REGULARIZED_LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(61985, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(61985, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(61985, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(61985, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 5)
0.5302130309828004


In [95]:
lambdas = np.logspace(-8, -1, 8)
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000

training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_2, tX_2, "REGULARIZED_LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(40333, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(40333, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(40333, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(40333, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 6)
0.3186083884677893


In [96]:
lambdas = np.logspace(-8, -1, 8)
gammas = np.logspace(-8, -1, 8)
degrees = np.array([1,2,3,4])
K=4
max_iters = 10000

training_errors, validation_errors =\
lambda_gamma_degree_sgd_cv(y_3, tX_3, "REGULARIZED_LOGISTIC_REGRESSION",lambdas, gammas, degrees, K,max_iters,1,1)

val_error_deg_lambda = np.mean(validation_errors, axis=0)
np.nanargmin(val_error_deg_lambda)
idx_min = np.unravel_index(np.nanargmin(val_error_deg_lambda), val_error_deg_lambda.shape)
print(idx_min)
print(val_error_deg_lambda[idx_min])
print("Best degree {}".format(degrees[idx_min[0]]))
print("Best lambda {}".format(lambdas[idx_min[1]]))
print("Best gamma {}".format(gammas[idx_min[2]]))

Degree = 1
(17759, 19)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 2
(17759, 37)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 3
(17759, 55)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
Degree = 4
(17759, 73)
Fold = 1
Fold = 2
Fold = 3
Fold = 4
(0, 7, 5)
0.5256940557782919


## Create best models and compare on test set

### Ridge regression

In [103]:
# Expand and standadrdize the 4 train sets
tx_0_expanded, tx_1_expanded, tx_2_expanded, tx_3_expanded, means, stds =\
standardize_train(tX_0, tX_1, tX_2, tX_3, [14,15,12,15])

In [104]:
#Separate into 4 test datasets and standardize them with the means and variances of the train sets
list_tX_test, list_y_test, list_ids_test =\
    prepare_test(tX_test_drop_invalid, y_test_0, means, stds, ids_test,[14,15,12,15])

In [99]:
(w_0, loss_0) = ridge_regression(y_0, tx_0_expanded, 1e-8)
loss = compute_mse_loss(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0)
acc = get_accuracy(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0, 0.5)

print(loss, acc)

159.81115118485693 0.836168084042021


In [40]:
(w_1, loss_1) = ridge_regression(y_1, tx_1_expanded, 1e-8)
loss = compute_mse_loss(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1)
acc = get_accuracy(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1, 0.5)

print(loss, acc)

0.09070264549620927 0.7625811427469632


In [47]:
(w_2, loss_2) = ridge_regression(y_2, tx_2_expanded, 1e-8)
loss = compute_mse_loss(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2)
acc = get_accuracy(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2, 0.5)

print(loss, acc)

0.10194207006528733 0.7617957395978499


In [65]:
(w_3, loss_3) = ridge_regression(y_3, tx_3_expanded, 1e-8)
loss = compute_mse_loss(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3)
acc = get_accuracy(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3, 0.5)

print(loss, acc)

30.83312731500105 0.7661748013620885


In [66]:
weights = [w_0, w_1, w_2, w_3]

In [62]:
a = predict_4sets(list_tX, list_ids, weights)

array([[100000,     -1],
       [100000,     -1],
       [100007,      1],
       ...,
       [349990,     -1],
       [349997,     -1],
       [349998,      1]])

In [86]:
correct=0
n_total = 0
for i in range(len(list_tX_test)):
    curr_pred = list_tX_test[i]@weights[i]
    curr_predicted_class = [0 if p<0.5 else 1 for p in curr_pred]
    n = len(list_y[i])
    n_total += n
    for j in range(n):
        if curr_predicted_class[j] == list_y_test[i][j]:
            correct+=1

  
print("Test accuracy {}".format(correct/n_total))

0.79216


### Logistic regression

In [105]:
# Expand and standadrdize the 4 train sets
tx_0_expanded, tx_1_expanded, tx_2_expanded, tx_3_expanded, means, stds =\
standardize_train(tX_0, tX_1, tX_2, tX_3, [1,1,1,1])

In [106]:
#Separate into 4 test datasets and standardize them with the means and variances of the train sets
list_tX_test, list_y_test, list_ids_test =\
    prepare_test(tX_test_drop_invalid, y_test_0, means, stds, ids_test, [1,1,1,1])

In [117]:
initial_w = np.ones((1, tx_0_expanded.shape[1]))
(w_0, loss_0) = logistic_regression(y_0, tx_0_expanded, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0)
acc = get_accuracy(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0, 0.5)

print(loss, acc)

1.4282561538454663 0.7838919459729865


In [118]:
initial_w = np.ones((1, tx_1_expanded.shape[1]))

In [119]:
(w_1, loss_1) = logistic_regression(y_1, tx_1_expanded, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1)
acc = get_accuracy(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1, 0.5)

print(loss, acc)

1.3867306455519959 0.6719583520791824


In [120]:
(w_2, loss_2) = logistic_regression(y_2, tx_2_expanded, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2)
acc = get_accuracy(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2, 0.5)

print(loss, acc)

1.113285508398752 0.643838343619351


In [121]:
(w_3, loss_3) = logistic_regression(y_3, tx_3_expanded, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3)
acc = get_accuracy(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3, 0.5)

print(loss, acc)

1.0759227820434205 0.6767309875141885


In [122]:
weights = [w_0,w_1,w_2,w_3]

In [123]:
a = predict_4sets(list_tX_test, list_ids_test, weights)

array([[100000,      1],
       [100005,     -1],
       [100007,     -1],
       ...,
       [349990,     -1],
       [349997,     -1],
       [349998,      1]])

In [124]:
correct=0
n_total = 0
for i in range(len(list_tX_test)):
    curr_pred = list_tX_test[i]@weights[i]
    curr_predicted_class = [0 if p<0.5 else 1 for p in curr_pred]
    n = len(list_y[i])
    n_total += n
    for j in range(n):
        if curr_predicted_class[j] == list_y_test[i][j]:
            correct+=1

  
print("Test accuracy {}".format(correct/n_total))

0.71148


### Regularized logistic regression

In [125]:
# Expand and standadrdize the 4 train sets
tx_0_expanded, tx_1_expanded, tx_2_expanded, tx_3_expanded, means, stds =\
standardize_train(tX_0, tX_1, tX_2, tX_3, [1,1,1,1])

In [126]:
#Separate into 4 test datasets and standardize them with the means and variances of the train sets
list_tX_test, list_y_test, list_ids_test =\
    prepare_test(tX_test_drop_invalid, y_test_0, means, stds, ids_test, [1,1,1,1])

In [127]:
initial_w = np.ones((1, tx_0_expanded.shape[1]))
(w_0, loss_0) = reg_logistic_regression(y_0, tx_0_expanded, 0.1, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0)
acc = get_accuracy(list_y_test[0].reshape(-1,1), list_tX_test[0], w_0, 0.5)

print(loss, acc)

0.6864129811995152 0.7826413206603302


In [128]:
initial_w = np.ones((1, tx_1_expanded.shape[1]))


In [135]:
(w_1, loss_1) = reg_logistic_regression(y_1, tx_1_expanded, 0.1, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1)
acc = get_accuracy(list_y_test[1].reshape(-1,1), list_tX_test[1], w_1, 0.5)

print(loss, acc)

0.5068415467335277 0.6785783148017225


In [130]:
(w_2, loss_2) = reg_logistic_regression(y_2, tx_2_expanded, 0.1, initial_w, 10000, 0.01)
loss = compute_mse_loss(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2)
acc = get_accuracy(list_y_test[2].reshape(-1,1), list_tX_test[2], w_2, 0.5)

print(loss, acc)

0.34919907357333063 0.6285088592474617


In [131]:
(w_3, loss_3) = reg_logistic_regression(y_3, tx_3_expanded, 0.1, initial_w, 10000, 0.001)
loss = compute_mse_loss(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3)
acc = get_accuracy(list_y_test[3].reshape(-1,1), list_tX_test[3], w_3, 0.5)

print(loss, acc)

0.5190600039569986 0.6951191827468786


In [132]:
weights = [w_0,w_1,w_2,w_3]
a = predict_4sets(list_tX_test, list_ids_test, weights)

array([[100000,     -1],
       [100005,     -1],
       [100007,     -1],
       ...,
       [349990,     -1],
       [349997,     -1],
       [349998,      1]])

In [133]:
correct=0
n_total = 0
for i in range(len(list_tX_test)):
    curr_pred = list_tX_test[i]@weights[i]
    curr_predicted_class = [0 if p<0.5 else 1 for p in curr_pred]
    n = len(list_y[i])
    n_total += n
    for j in range(n):
        if curr_predicted_class[j] == list_y_test[i][j]:
            correct+=1

  
print("Test accuracy {}".format(correct/n_total))

0.70898


# Make prediction

In [68]:
DATA_TEST_PATH = '../data/test.csv'
y_test_aiCrowd, tX_test_aiCrowd, ids_test_aiCrowd = load_csv_data(DATA_TEST_PATH, sub_sample=False)

In [81]:
tX_test_aiCrowd_drop = tX_test_aiCrowd.copy()
tX_test_aiCrowd_drop = tX_test_aiCrowd_drop[:, ~np.any(tX_train == -999., axis=0)]

In [82]:
list_tX_test_ai, list_y_test_ai, list_ids_test_ai =\
    prepare_test(tX_test_aiCrowd_drop, y_test_aiCrowd, means, stds, ids_test_aiCrowd)

In [84]:
a = predict_4sets(list_tX_test_ai, list_y_test_ai, list_ids_test_ai, weights)

(568238, 2)

In [85]:
create_csv_submission(a[:,0], a[:,1], "test_model3_co")