## 1. Load data

In [2]:
import numpy as np
from helpers import load_data

# All input files live under a single data directory; deriving every path
# from DATA_DIR keeps the three locations consistent with each other.
DATA_DIR = "./data"

train_path = f"{DATA_DIR}/train.csv"
test_path = f"{DATA_DIR}/test.csv"
# Previously "./datasample-submission.csv": the "/" after "data" was missing,
# so the path pointed outside the directory used by the two files above
# (presumably a typo -- confirm the file really lives in ./data/).
sample_path = f"{DATA_DIR}/sample-submission.csv"

In [3]:
# Read the full training file (no sub-sampling). load_data returns three
# arrays, unpacked here as data_DER, data_PRI and labels.
data_DER, data_PRI, labels = load_data(train_path, sub_sample=False)
# Show the three shapes on one line as a quick sanity check.
print(*(arr.shape for arr in (data_DER, data_PRI, labels)))

(250000, 12) (250000, 17) (250000,)


In [4]:
# Join the two feature groups column-wise into one matrix
# (rows stay aligned; the column counts add up).
features = np.concatenate((data_DER, data_PRI), axis=1)
print(features.shape)

(250000, 29)


## 2. Preprocess data

In [26]:
# normalize data

# Standardise from the raw stacked arrays instead of from `features` itself.
# The original cell overwrote its own input, so running it a second time
# replaced features_mean / features_std with the statistics of the already
# normalised matrix (about 0 and 1), losing the values needed to apply the
# same transform to any other data set later.
features_raw = np.hstack([data_DER, data_PRI])

features_mean = features_raw.mean(axis=0)
features_std = features_raw.std(axis=0)
# A constant column has std == 0; dividing by 1 instead leaves that column at
# 0 after centring rather than filling it with NaN/inf.
features_std = np.where(features_std == 0, 1.0, features_std)
features = (features_raw - features_mean) / features_std

# Sanity check: per-column mean should be ~0 and per-column std should be 1.
print(features.mean(axis=0))
print(features.std(axis=0))

[-6.08338842e-14 -5.86481808e-14 -2.27427734e-14  3.06118394e-12
  1.92967673e-12  3.05593076e-12 -1.01884043e-11  5.11384814e-14
  6.10630737e-14  4.25036946e-12  6.77316055e-11  3.12946164e-12
  4.93599461e-15  4.28346237e-12  1.64100965e-12 -1.28360976e-13
  4.93923297e-13  1.56916105e-13  3.10176833e-14 -1.99874923e-12
 -6.55965828e-14  1.01017205e-10  4.41391325e-13  8.68616814e-13
  8.61995463e-13  2.86682454e-12  3.05812320e-12  3.08563190e-12
  2.65435187e-12]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1.]


## 3. Train model with all features

In [27]:
# split data
from helpers import train_test_split

# Hold out part of the data for evaluation: ratio 0.8, fixed seed so the
# split is the same on every run.
features_tr, features_te, labels_tr, labels_te = train_test_split(
    labels, features, 0.8, seed=42
)
# Print (features, labels) shapes for the training part, then the test part.
for x_part, y_part in ((features_tr, labels_tr), (features_te, labels_te)):
    print(x_part.shape, y_part.shape)

(200000, 29) (200000,)
(50000, 29) (50000,)


### 3.1 Least squares GD

In [38]:
from helpers import kfold_split, compute_loss
from implementations import mean_squared_error_gd

# Grid search over the gradient-descent step size (gamma) using k-fold
# cross-validation on the training split. For every gamma the final training
# loss and the validation loss of each fold are stored, then summarised as
# mean / std across folds.
k_fold = 5
max_iter = 2000
gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
mse_tr = np.zeros((len(gammas), k_fold))  # rows: gammas, columns: folds
mse_te = np.zeros((len(gammas), k_fold))

# Every run starts from the all-zero weight vector (one weight per feature).
initial_w = np.zeros((features.shape[1]))

for i, gamma in enumerate(gammas):
    # Same seed for every gamma, so all step sizes are compared on the same folds.
    folds = kfold_split(labels_tr, features_tr, k_fold, seed=42)
    for j, (x_tr, x_te, y_tr, y_te) in enumerate(folds):
        # Pass a copy so each run is guaranteed to start from zeros even if
        # the optimiser updates its argument in place (its implementation is
        # not visible from this notebook).
        w, l_tr = mean_squared_error_gd(y_tr, x_tr, initial_w.copy(), max_iter, gamma)
        l_te = compute_loss(y_te, x_te, w)

        mse_tr[i, j] = l_tr
        mse_te[i, j] = l_te

    # Adjacent f-strings replace the backslash continuation, which leaked the
    # source indentation into the printed line. ".6f" prints the same digits
    # the old ":5f" (a width, not a precision) produced.
    print(
        f"gamma: {gamma} \ttrain loss: mean={mse_tr[i].mean():.6f}, std={mse_tr[i].std():.6f} "
        f"\ttest loss: mean={mse_te[i].mean():.6f}, std={mse_te[i].std():.6f}"
    )



gamme: 0.1 	train loss: mean=0.143717, std=0.000298         	test loss: mean=0.143764, std=0.001190
gamme: 0.01 	train loss: mean=0.143827, std=0.000301         	test loss: mean=0.143867, std=0.001260
gamme: 0.001 	train loss: mean=0.148650, std=0.000361         	test loss: mean=0.148674, std=0.001455
gamme: 0.0001 	train loss: mean=0.159751, std=0.000369         	test loss: mean=0.159757, std=0.001492
gamme: 1e-05 	train loss: mean=0.168674, std=0.000375         	test loss: mean=0.168675, std=0.001504


In [None]:
# Compute accuracy

### 3.2 Least squares SGD

In [None]:
from implementations import least_squares_SGD

### 3.3 Least Squares with normal equation

In [None]:
from implementations import least_squares