In [1]:
# Useful starting lines
%matplotlib inline
import datetime
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [89]:
from proj1_helpers import *
from implementations import*

DATA_TRAIN_PATH = '../data/train.csv/train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [90]:
selector = np.all(tX != -999.0, axis=1)
tX_clean = tX[selector]
y_clean = y[selector]

print(tX.shape)
print(tX_clean.shape)
    

(250000, 30)
(68114, 30)


In [91]:
def standardize_NAN(tX):
    tX_nan = tX.copy()
    for i in range(tX.shape[0]):
        for j in range(tX.shape[1]):
            if (tX_nan[i,j] == -999.0):
                tX_nan[i,j] = np.nan
    return (standardize(tX_nan))


tX_nan, mean_x_nan, std_x_nan = standardize_NAN(tX)

# Tout les nans (correspondant a des valeurs non connues) sont remplac√©s par la moyenne de la colonnes
means_cols = np.nanmean(tX_nan,axis=1)
for row in range(0,tX_nan.shape[0]):
    for col in range(0,tX_nan.shape[1]):
        if np.isnan(tX_nan[row,col]):
            tX_nan[row,col]=means_cols[col]


92.56765498468643


# 1.Least squares gradient descent

In [95]:
#With tX and y_LS no corrupted
tX_LS, m_X,s = standardize(tX_clean)
tX_LS = tX_LS[0:68110]
y_LS = y_clean[0:68110].copy()

#With nan value replaced
tX_LS=tX_nan
y_LS= y

max_iters = 100
gammas = np.logspace(-4,0,20)
K=5

# Initialization
w_initial = np.zeros(tX_LS.shape[1])

list_tX_LS = np.split(tX_LS,K)
list_y_LS = np.split(y_LS,K)

gen_opt_w=[]
gen_mse =[]

#gamma selection
for ind, gamma in enumerate(gammas):
    weights=[]
    mse_errors = []
    #K-fold crossvalidation
    for ind, tX_bloc in enumerate(list_tX_LS):
        tX_test = tX_bloc
        y_test = list_y_LS[ind]
        tX_train= list_tX_LS[:ind] + list_tX_LS[ind+1:]
        tX_train= np.concatenate(tX_train)
        y_train= list_y_LS[:ind] + list_y_LS[ind+1:]
        y_train=np.concatenate(y_train)
        
        mse, opt_w = least_squares_GD(y_train, tX_train, w_initial, max_iters, gamma)
        mse_errors.append(compute_mse(y_test, tX_test,opt_w))
        weights.append(opt_w)
    gen_mse.append(np.mean(mse_errors))
    gen_opt_w.append(np.mean(weights, axis=0))

optimal_gamma_LS_GD = gammas[np.argmin(gen_mse)]
optimal_weights_LS_GD = gen_opt_w[np.argmin(gen_mse)]
print(" gamma={l:.3f},mse={mse:.3f}".format(mse = np.min(gen_mse), l = optimal_gamma_LS_GD))

#Training Accuracy
y_model = predict_labels(optimal_weights_LS_GD, tX_LS)
sum_ = 0
for i,v in enumerate(y_model):
    if(v == y_LS[i]):
        sum_ = sum_+1
print(sum_/len(y_model))

#With tX_CLEAN : accuracy = 0.6823
#With tX_NAN : accuracy= 0.705


134.10663740969926
 gamma=0.089,mse=0.737
[ 1.52629330e-01 -5.21742771e-01 -8.76666214e-02  2.07713589e-01
  4.53561528e-02  6.52049190e-02  2.56568950e-04  7.61931716e-02
 -2.41081693e-02  1.21985027e-02  6.35830904e-02  9.00370519e-02
  1.22605731e-02  2.79511696e-01  7.55293944e-02  7.49160617e-02
 -4.62982543e-02  7.56155755e-02  7.62740270e-02 -4.51089772e-02
  7.62550201e-02 -3.86426824e-02  7.41566184e-02 -5.67454513e-02
 -1.75254759e-02  2.72325373e-02 -3.92475999e-02 -3.09312665e-02
  2.56323668e-02 -6.94045947e-02]
0.705004


# Least square SDG

We can alter the gamma and the batch size

In [105]:
# Define the parameters of the algorithm.

#With nan value replaced
tX_LS=tX_nan
y_LS= y

max_iters = 50
max_batch_size = 10
gammas = [0.001,0.01]
batch_sizes = np.arange(max_batch_size)

# Initialization
w_initial = np.zeros(tX_LS.shape[1])
list_tX_LS = np.split(tX_LS,K)
list_y_LS = np.split(y_LS,K)

result_mse = np.array((batch_sizes.shape[0], len(gammas)))
result_opt_w =np.array((batch_sizes.shape[0], len(gammas)))

print(result_mse.shape)
print(result_opt_w)

for ind_batch,batch_size in enumerate(batch_sizes):  
    for ind_gamma,gamma in enumerate(gammas):
        mse_errors=[]
        weights=[]
        #K-fold crossvalidation
        for ind, tX_bloc in enumerate(list_tX_LS):
            tX_test = tX_bloc
            y_test = list_y_LS[ind]
            tX_train= list_tX_LS[:ind] + list_tX_LS[ind+1:]
            tX_train= np.concatenate(tX_train)
            y_train= list_y_LS[:ind] + list_y_LS[ind+1:]
            y_train=np.concatenate(y_train)
        
            sgd_mse, opt_w = least_squares_SGD(y_train, tX_train, w_initial, batch_size, max_iters, gamma)
            mse_errors.append(compute_mse(y_test, tX_test,opt_w))
            weights.append(opt_w)
        result_mse[ind_batch,ind_gamma]= np.mean(mse_errors)
        result_opt_w[ind_batch,ind_gamma]= np.mean(weights,axis=0)

print(result_mse)



(2,)


IndexError: too many indices for array

## Generate predictions and save ouput in csv format for submission:

In [93]:
DATA_TEST_PATH = '../data/test.csv/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [96]:
OUTPUT_PATH = './leastSquareGD' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(optimal_weights_LS_GD, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)