## 1. Load data

In [1]:
import numpy as np
from helpers import load_csv_data

data_path = "./data/train.csv"
test_path = "./data/test.csv"
sample_path = "./data/sample-submission.csv"

In [2]:
labels, features, ids = load_csv_data(data_path, sub_sample=False)

print(features.shape, labels.shape, ids.shape)
print(features[0], labels[0], ids[0])

(250000, 30) (250000,) (250000,)
[ 1.38470e+02  5.16550e+01  9.78270e+01  2.79800e+01  9.10000e-01
  1.24711e+02  2.66600e+00  3.06400e+00  4.19280e+01  1.97760e+02
  1.58200e+00  1.39600e+00  2.00000e-01  3.26380e+01  1.01700e+00
  3.81000e-01  5.16260e+01  2.27300e+00 -2.41400e+00  1.68240e+01
 -2.77000e-01  2.58733e+02  2.00000e+00  6.74350e+01  2.15000e+00
  4.44000e-01  4.60620e+01  1.24000e+00 -2.47500e+00  1.13497e+02] 1.0 100000


## 2. Preprocess data

In [3]:
from preprocessing import Preprocessor

prep = Preprocessor()

In [4]:
np.isnan(features).sum()

0

## 3. Train model with all features

In [5]:
# split data
from helpers import train_test_split
features = prep.remove_outlier_features(features)
features = prep.process(features, poly_degree=5)
features_tr, features_te, labels_tr, labels_te = train_test_split(labels, features, 0.8, seed=432)
# features_tr = prep.remove_outlier_features(features_tr)
# features_te = prep.remove_outlier_features(features_te)
# features_tr = prep.process_train(features_tr, apply_mapping=False, poly_degree=5)
# features_te = prep.process_test(features_te)

print(features_tr.shape, labels_tr.shape)
print(features_te.shape, labels_te.shape)

0
#TRAIN# (250000, 11) # (250000, 30)
(200000, 162) (200000,)
(50000, 162) (50000,)


In [7]:
print(features_tr.max())
print(features_tr.min())


1.0
0.0


### 3.1 Least square GD

In [8]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import mean_squared_error_gd

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 100
# gammas = [1e-1, 3e-2, 1e-2, 3e-3, 1e-3]
gammas = np.logspace(-2,-1, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = mean_squared_error_gd(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"gamma: {gamma} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.4f}]")


gamma: 0.01 	train: [loss=0.40128, acc=0.6938]        	test: [loss=0.40070, accuracy=0.6938]
gamma: 0.01778279410038923 	train: [loss=0.38843, acc=0.6968]        	test: [loss=0.38793, accuracy=0.6968]
gamma: 0.03162277660168379 	train: [loss=0.37589, acc=0.7099]        	test: [loss=0.37548, accuracy=0.7099]
gamma: 0.05623413251903491 	train: [loss=0.36454, acc=0.7192]        	test: [loss=0.36425, accuracy=0.7192]
gamma: 0.1 	train: [loss=0.35446, acc=0.7251]        	test: [loss=0.35435, accuracy=0.7251]


In [9]:
# Compute accuracy on test set

gamma = 3e-2
max_iter = 500
initial_w = np.zeros((features_tr.shape[1]))
w, l_tr = mean_squared_error_gd(labels_tr, features_tr, initial_w, max_iter, gamma)

acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

train acc=0.72828, test acc=0.72784


### 3.2 Least Square SGD

In [11]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import mean_squared_error_sgd

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 3
gammas = [1e-2, 3e-3, 1e-3, 3e-4, 1e-4]
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = mean_squared_error_sgd(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"gamma: {gamma:.4f} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.4f}]")


gamme: 0.0100 	train: [loss=0.33470, acc=0.7572]        	test: [loss=0.33509, accuracy=0.7572]
gamme: 0.0030 	train: [loss=0.33267, acc=0.7523]        	test: [loss=0.33319, accuracy=0.7523]
gamme: 0.0010 	train: [loss=0.33819, acc=0.7421]        	test: [loss=0.33878, accuracy=0.7421]
gamme: 0.0003 	train: [loss=0.34642, acc=0.7312]        	test: [loss=0.34730, accuracy=0.7312]
gamme: 0.0001 	train: [loss=0.35332, acc=0.7238]        	test: [loss=0.35428, accuracy=0.7238]


In [12]:
# Compute accuracy on test set

gamma = 3e-4
max_iter = 3
initial_w = np.zeros((features_tr.shape[1]))
w, l_tr = mean_squared_error_sgd(labels_tr, features_tr, initial_w, max_iter, gamma)

acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

train acc=0.73146, test acc=0.73176


### 3.3 Least Squares with normal equation

In [13]:
from implementations import least_squares

# No hyperparameter to chosse

w, l_tr = least_squares(labels_tr, features_tr)
l_te = compute_loss(labels_te, features_te, w)
acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train loss={l_tr:.5f}, test loss={l_te:.5f}")
print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

LinAlgError: Singular matrix

### 3.4 Ridge Regression with normal equation

In [12]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import ridge_regression

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 3
lambdas = np.logspace(-5, -1, 10)
mse = np.zeros((len(lambdas), 2))


for i, lambda_ in enumerate(lambdas):

    w, l_tr = ridge_regression(y_tr, x_tr, lambda_)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"lambda_: {lambda_:.6f} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.5f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.5f}]")


lambda_: 0.000010 	train: [loss=0.30227, acc=0.78663]        	test: [loss=0.30360, accuracy=0.78663]
lambda_: 0.000028 	train: [loss=0.30406, acc=0.78443]        	test: [loss=0.30514, accuracy=0.78443]
lambda_: 0.000077 	train: [loss=0.30688, acc=0.78092]        	test: [loss=0.30774, accuracy=0.78092]
lambda_: 0.000215 	train: [loss=0.31094, acc=0.77630]        	test: [loss=0.31162, accuracy=0.77630]
lambda_: 0.000599 	train: [loss=0.31669, acc=0.76818]        	test: [loss=0.31728, accuracy=0.76818]
lambda_: 0.001668 	train: [loss=0.32413, acc=0.75800]        	test: [loss=0.32465, accuracy=0.75800]
lambda_: 0.004642 	train: [loss=0.33209, acc=0.74900]        	test: [loss=0.33253, accuracy=0.74900]
lambda_: 0.012915 	train: [loss=0.34141, acc=0.73650]        	test: [loss=0.34163, accuracy=0.73650]
lambda_: 0.035938 	train: [loss=0.35487, acc=0.72545]        	test: [loss=0.35478, accuracy=0.72545]
lambda_: 0.100000 	train: [loss=0.37297, acc=0.70998]        	test: [loss=0.37263, accuracy

In [None]:
# TODO this model is clearly underfitting, need features engineering before word on the model

### 3.5 Logistic Regression

In [28]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
x_tr, x_te, y_tr, y_te = train_test_split(labels_tr01, features_tr, ratio=0.8, seed=42)

max_iter = 300
gammas = np.logspace(-0.04, -0.3, 5)
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = logistic_regression(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss_logistic(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(sigmoid(x_tr @ w), logistic=True, zero_one=True))
    acc_te = accuracy_score(y_te, make_prediction(sigmoid(x_te @ w), logistic=True, zero_one=True))

    print(f"gamma: {gamma:.4f} \ttrain: [loss={l_tr:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={l_te:.5f}, accuracy={acc_te:.4f}]")


gamma: 0.9120 	train: [loss=0.55508, acc=0.7200]        	test: [loss=0.55655, accuracy=0.7200]
gamma: 0.7852 	train: [loss=0.50150, acc=0.7480]        	test: [loss=0.50207, accuracy=0.7480]
gamma: 0.6761 	train: [loss=0.50324, acc=0.7463]        	test: [loss=0.50377, accuracy=0.7463]
gamma: 0.5821 	train: [loss=0.50502, acc=0.7440]        	test: [loss=0.50551, accuracy=0.7440]
gamma: 0.5012 	train: [loss=0.50692, acc=0.7420]        	test: [loss=0.50736, accuracy=0.7420]


In [27]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
labels_te01 = 0.5 + labels_te / 2.

max_iter = 300
gamma = 0.7

initial_w = np.zeros((x_tr.shape[1]))

w, l_tr = logistic_regression(labels_tr, features_tr, initial_w, max_iter, gamma)
l_te = compute_loss_logistic(labels_te, features_te, w)

acc_tr = accuracy_score(labels_tr01, make_prediction(sigmoid(features_tr @ w), logistic=True, zero_one=True))
acc_te = accuracy_score(labels_te01, make_prediction(sigmoid(features_te @ w), logistic=True, zero_one=True))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

  return 1.0 / (1 + np.exp(-t))


train acc=0.65716, test acc=0.65802


### 3.6 Regularized Logisic Regression

In [29]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import reg_logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
x_tr, x_te, y_tr, y_te = train_test_split(labels_tr01, features_tr, ratio=0.8, seed=42)

max_iter = 300
lambda_ = 0.001
lambdas = np.logspace(-1,-4, 5)
gamma = 0.08
gammas = np.logspace(0, -1, 5)
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):
    for i, lambda_ in enumerate(lambdas):

        w, l_tr = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iter, gamma)
        l_te = compute_loss_logistic(y_te, x_te, w)

        mse[i,:] = [l_tr, l_te]
        acc_tr = accuracy_score(y_tr, make_prediction(sigmoid(x_tr @ w), logistic=True, zero_one=True))
        acc_te = accuracy_score(y_te, make_prediction(sigmoid(x_te @ w), logistic=True, zero_one=True))

        print(f"gamma: {gamma:.4f}, lambda_: {lambda_:.4f}\
            \ttrain: [loss={l_tr:.5f}, acc={acc_te:.4f}]\
            \ttest: [loss={l_te:.5f}, accuracy={acc_te:.4f}]")


gamma: 1.0000, lambda_: 0.1000            	train: [loss=0.80562, acc=0.5553]            	test: [loss=0.76570, accuracy=0.5553]
gamma: 1.0000, lambda_: 0.0178            	train: [loss=0.80194, acc=0.5828]            	test: [loss=0.76254, accuracy=0.5828]
gamma: 1.0000, lambda_: 0.0032            	train: [loss=0.70622, acc=0.6608]            	test: [loss=0.67664, accuracy=0.6608]
gamma: 1.0000, lambda_: 0.0006            	train: [loss=0.63472, acc=0.6887]            	test: [loss=0.62592, accuracy=0.6887]
gamma: 1.0000, lambda_: 0.0001            	train: [loss=0.61164, acc=0.6973]            	test: [loss=0.61138, accuracy=0.6973]
gamma: 0.5623, lambda_: 0.1000            	train: [loss=0.61133, acc=0.6958]            	test: [loss=0.59015, accuracy=0.6958]
gamma: 0.5623, lambda_: 0.0178            	train: [loss=0.57238, acc=0.7171]            	test: [loss=0.54822, accuracy=0.7171]
gamma: 0.5623, lambda_: 0.0032            	train: [loss=0.53356, acc=0.7339]            	test: [loss=0.51645, a

In [None]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import reg_logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
labels_te01 = 0.5 + labels_te / 2.

max_iter = 300
gamma = 0.7

initial_w = np.zeros((x_tr.shape[1]))

w, l_tr = reg_logistic_regression(labels_tr, features_tr, initial_w, max_iter, gamma)
l_te = compute_loss_logistic(labels_te, features_te, w)

acc_tr = accuracy_score(labels_tr01, make_prediction(sigmoid(features_tr @ w), logistic=True, zero_one=True))
acc_te = accuracy_score(labels_te01, make_prediction(sigmoid(features_te @ w), logistic=True, zero_one=True))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")
# The results are not as good as in training, possibly caused by outliers in the test set.

## 4. Predict for Submission

In [5]:
# Load test data
_ , features_submit, ids_submit = load_csv_data(test_path, sub_sample=False)

In [6]:
# Choose a model and train it on the whole dataset
# TODO

# Make prediction
pred = None # TODO


# EXample with MSE GD
from implementations import mean_squared_error_gd
gamma = 3e-2
max_iter = 2000
initial_w = np.zeros((features.shape[1]))
w, l_tr = mean_squared_error_gd(labels, features, initial_w, max_iter, gamma)

from helpers import make_prediction
pred = make_prediction(features_submit @ w)

In [7]:
# Write to file 

from helpers import create_csv_submission

create_csv_submission(ids_submit, pred, sample_path)