## 1. Load data

In [1]:
import numpy as np
from helpers import load_csv_data

data_path = "./data/train.csv"
test_path = "./data/test.csv"
sample_path = "./data/sample-submission.csv"

In [2]:
labels, features, ids = load_csv_data(data_path, sub_sample=False)

print(features.shape, labels.shape, ids.shape)
print(features[0], labels[0], ids[0])

(250000, 30) (250000,) (250000,)
[ 1.38470e+02  5.16550e+01  9.78270e+01  2.79800e+01  9.10000e-01
  1.24711e+02  2.66600e+00  3.06400e+00  4.19280e+01  1.97760e+02
  1.58200e+00  1.39600e+00  2.00000e-01  3.26380e+01  1.01700e+00
  3.81000e-01  5.16260e+01  2.27300e+00 -2.41400e+00  1.68240e+01
 -2.77000e-01  2.58733e+02  2.00000e+00  6.74350e+01  2.15000e+00
  4.44000e-01  4.60620e+01  1.24000e+00 -2.47500e+00  1.13497e+02] 1.0 100000


## 2. Preprocess data

In [3]:
from preprocessing import Preprocessor

prep = Preprocessor()

In [45]:
np.isnan(features).sum()

0

## 3. Train model with all features

In [4]:
# split data
from helpers import train_test_split
features_tr, features_te, labels_tr, labels_te = train_test_split(labels, features, 0.8, seed=432)
# features_tr, labels_tr = prep.remove_outlier(labels_tr, features_tr)
features_tr = prep.process_train(features_tr)
features_te = prep.process_train(features_te)

print(features_tr.shape, labels_tr.shape)
print(features_te.shape, labels_te.shape)

(200000, 31) (200000,)
(50000, 31) (50000,)


### 3.1 Least square GD

In [5]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import mean_squared_error_gd

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 100
gammas = [1e-1, 3e-2, 1e-2, 3e-3, 1e-3]
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = mean_squared_error_gd(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"gamme: {gamma} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.4f}]")


gamme: 0.1 	train: [loss=0.34346, acc=0.7411]        	test: [loss=0.34310, accuracy=0.7411]
gamme: 0.03 	train: [loss=0.35539, acc=0.7310]        	test: [loss=0.35507, accuracy=0.7310]
gamme: 0.01 	train: [loss=0.37450, acc=0.7190]        	test: [loss=0.37410, accuracy=0.7190]
gamme: 0.003 	train: [loss=0.41859, acc=0.6857]        	test: [loss=0.41781, accuracy=0.6857]
gamme: 0.001 	train: [loss=0.45683, acc=0.6550]        	test: [loss=0.45618, accuracy=0.6550]


In [7]:
# Compute accuracy on test set

gamma = 3e-2
max_iter = 500
initial_w = np.zeros((features_tr.shape[1]))
w, l_tr = mean_squared_error_gd(labels_tr, features_tr, initial_w, max_iter, gamma)

acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

train acc=0.74337, test acc=0.74242


### 3.2 Least Square SGD

In [31]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import mean_squared_error_sgd

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 3
gammas = [1e-2, 3e-3, 1e-3, 3e-4, 1e-4]
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = mean_squared_error_sgd(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"gamme: {gamma:.4f} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.4f}]")


gamme: 0.0100 	train: [loss=0.43122, acc=0.7182]        	test: [loss=0.43349, accuracy=0.7182]
gamme: 0.0030 	train: [loss=0.36289, acc=0.7353]        	test: [loss=0.36262, accuracy=0.7353]
gamme: 0.0010 	train: [loss=0.34932, acc=0.7411]        	test: [loss=0.34955, accuracy=0.7411]
gamme: 0.0003 	train: [loss=0.34299, acc=0.7455]        	test: [loss=0.34309, accuracy=0.7455]
gamme: 0.0001 	train: [loss=0.34105, acc=0.7439]        	test: [loss=0.34109, accuracy=0.7439]


In [33]:
# Compute accuracy on test set

gamma = 3e-4
max_iter = 3
initial_w = np.zeros((features_tr.shape[1]))
w, l_tr = mean_squared_error_sgd(labels_tr, features_tr, initial_w, max_iter, gamma)

acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

train acc=0.74216, test acc=0.74156


### 3.3 Least Squares with normal equation

In [35]:
from implementations import least_squares

# No hyperparameter to chosse

w, l_tr = least_squares(labels_tr, features_tr)
l_te = compute_loss(labels_te, features_te, w)
acc_tr = accuracy_score(labels_tr, make_prediction(features_tr @ w))
acc_te = accuracy_score(labels_te, make_prediction(features_te @ w))

print(f"train loss={l_tr:.5f}, test loss={l_te:.5f}")
print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")

train loss=0.33958, test loss=1.23150
train acc=0.74506, test acc=0.54426


### 3.4 Ridge Regression with normal equation

In [44]:
from helpers import train_test_split, compute_loss, accuracy_score,make_prediction
from implementations import ridge_regression

x_tr, x_te, y_tr, y_te = train_test_split(labels_tr, features_tr, ratio=0.8, seed=42)

max_iter = 3
lambdas = np.logspace(-5, -1, 10)
mse = np.zeros((len(lambdas), 2))


for i, lambda_ in enumerate(lambdas):

    w, l_tr = ridge_regression(y_tr, x_tr, lambda_)
    l_te = compute_loss(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(x_tr @ w))
    acc_te = accuracy_score(y_te, make_prediction(x_te @ w))

    print(f"lambda_: {lambda_:.6f} \ttrain: [loss={mse[i,0]:.5f}, acc={acc_te:.5f}]\
        \ttest: [loss={mse[i,1]:.5f}, accuracy={acc_te:.5f}]")


lambda_: 0.000010 	train: [loss=0.33998, acc=0.74262]        	test: [loss=0.34019, accuracy=0.74262]
lambda_: 0.000028 	train: [loss=0.34010, acc=0.74235]        	test: [loss=0.34028, accuracy=0.74235]
lambda_: 0.000077 	train: [loss=0.34016, acc=0.74228]        	test: [loss=0.34032, accuracy=0.74228]
lambda_: 0.000215 	train: [loss=0.34019, acc=0.74233]        	test: [loss=0.34034, accuracy=0.74233]
lambda_: 0.000599 	train: [loss=0.34021, acc=0.74230]        	test: [loss=0.34033, accuracy=0.74230]
lambda_: 0.001668 	train: [loss=0.34024, acc=0.74228]        	test: [loss=0.34031, accuracy=0.74228]
lambda_: 0.004642 	train: [loss=0.34041, acc=0.74213]        	test: [loss=0.34038, accuracy=0.74213]
lambda_: 0.012915 	train: [loss=0.34136, acc=0.74250]        	test: [loss=0.34114, accuracy=0.74250]
lambda_: 0.035938 	train: [loss=0.34518, acc=0.74067]        	test: [loss=0.34474, accuracy=0.74067]
lambda_: 0.100000 	train: [loss=0.35472, acc=0.73420]        	test: [loss=0.35425, accuracy

In [None]:
# TODO this model is clearly underfitting, need features engineering before word on the model

### 3.5 Logistic Regression

In [18]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
x_tr, x_te, y_tr, y_te = train_test_split(labels_tr01, features_tr, ratio=0.8, seed=42)

max_iter = 300
gammas = np.logspace(0.2, -1, 5)
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = logistic_regression(y_tr, x_tr, initial_w, max_iter, gamma)
    l_te = compute_loss_logistic(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(sigmoid(x_tr @ w), logistic=True, zero_one=True))
    acc_te = accuracy_score(y_te, make_prediction(sigmoid(x_te @ w), logistic=True, zero_one=True))

    print(f"gamme: {gamma:.4f} \ttrain: [loss={l_tr:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={l_te:.5f}, accuracy={acc_te:.4f}]")


gamme: 1.5849 	train: [loss=0.70072, acc=0.6973]        	test: [loss=0.69790, accuracy=0.6973]
gamme: 0.7943 	train: [loss=0.49838, acc=0.7490]        	test: [loss=0.49725, accuracy=0.7490]
gamme: 0.3981 	train: [loss=0.50020, acc=0.7490]        	test: [loss=0.49919, accuracy=0.7490]
gamme: 0.1995 	train: [loss=0.50546, acc=0.7459]        	test: [loss=0.50460, accuracy=0.7459]
gamme: 0.1000 	train: [loss=0.51395, acc=0.7404]        	test: [loss=0.51333, accuracy=0.7404]


In [14]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
labels_te01 = 0.5 + labels_te / 2.

max_iter = 300
gamma = 0.4
initial_w = np.zeros((x_tr.shape[1]))

w, l_tr = logistic_regression(labels_tr, features_tr, initial_w, max_iter, gamma)
l_te = compute_loss_logistic(labels_te, features_te, w)

acc_tr = accuracy_score(labels_tr01, make_prediction(sigmoid(features_tr @ w), logistic=True, zero_one=True))
acc_te = accuracy_score(labels_te01, make_prediction(sigmoid(features_te @ w), logistic=True, zero_one=True))

print(f"train acc={acc_tr:.5f}, test acc={acc_te:.5f}")
# The results are not as good as in training, possibly caused by outliers in the test set.

train acc=0.70296, test acc=0.70134


### 3.6 Regularized Logisic Regression

In [None]:
from helpers import train_test_split, sigmoid, compute_loss_logistic, accuracy_score, make_prediction
from implementations import reg_logistic_regression

# convert labels from {-1,1} to {0,1}
labels_tr01 = 0.5 + labels_tr / 2.
x_tr, x_te, y_tr, y_te = train_test_split(labels_tr01, features_tr, ratio=0.8, seed=42)

max_iter = 300
lambda_ = 0.1
gammas = np.logspace(0.2, -1, 5)
# gammas = np.logspace(-0.8,-2, 5)
mse = np.zeros((len(gammas), 2))

initial_w = np.zeros((x_tr.shape[1]))


for i, gamma in enumerate(gammas):

    w, l_tr = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iter, gamma)
    l_te = compute_loss_logistic(y_te, x_te, w)

    mse[i,:] = [l_tr, l_te]
    acc_tr = accuracy_score(y_tr, make_prediction(sigmoid(x_tr @ w), logistic=True, zero_one=True))
    acc_te = accuracy_score(y_te, make_prediction(sigmoid(x_te @ w), logistic=True, zero_one=True))

    print(f"gamme: {gamma:.4f} \ttrain: [loss={l_tr:.5f}, acc={acc_te:.4f}]\
        \ttest: [loss={l_te:.5f}, accuracy={acc_te:.4f}]")


In [None]:
# TODO: we should not expect a ggod result with penalty term, the model is suffering from underfitting! Polynomial features could be considered.

## 4. Predict for Submission

In [5]:
# Load test data
_ , features_submit, ids_submit = load_csv_data(test_path, sub_sample=False)

In [6]:
# Choose a model and train it on the whole dataset
# TODO

# Make prediction
pred = None # TODO


# EXample with MSE GD
from implementations import mean_squared_error_gd
gamma = 3e-2
max_iter = 2000
initial_w = np.zeros((features.shape[1]))
w, l_tr = mean_squared_error_gd(labels, features, initial_w, max_iter, gamma)

from helpers import make_prediction
pred = make_prediction(features_submit @ w)

In [7]:
# Write to file 

from helpers import create_csv_submission

create_csv_submission(ids_submit, pred, sample_path)