In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
from utils.implementations import *
from utils.helpers import *
from utils.preprocessing import *
from utils.crossvalidation import *
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Extract every member of the zipped training CSV into ../data
import zipfile
with zipfile.ZipFile('../data/train.csv.zip') as archive:
    archive.extractall(path=r"../data")

In [5]:
# Read the training CSV; load_csv_data returns three arrays kept as y, x, ids
DATA_TRAIN_PATH = '../data/train.csv'  # download train data and supply path here
y, x, ids = load_csv_data(DATA_TRAIN_PATH)

# Sanity check: show the shape of each returned array
shape_report = "Shape of the data:\n y={ys}, x={xs}, ids={idss}"
print(shape_report.format(ys=y.shape, xs=x.shape, idss=ids.shape))

Shape of the data:
 y=(250000,), x=(250000, 30), ids=(250000,)


## Comparing the Models

### Pre-processing of the data

In [6]:
# Standardize the raw features; the 2nd and 3rd return values are kept as mean/std
tx, mean, std = standardize(x)

# Prepend a column of ones (constant term) with one row per entry of y
ones_column = np.ones((y.shape[0], 1))
tx = np.c_[ones_column, tx]

### Test Models

#### Gradient Descent

In [8]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation: zero initial weights,
# iteration budget and gamma
initial_w = np.zeros(tx.shape[1])
gamma = 0.01
max_iters = 2000

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w, 'least_squares_GD', max_iters, gamma
    )
    accuracy.append(fold_acc)
    total_loss_test.append(fold_loss)

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 1.361959 / Test accuracy : 0.743240
1 - Test loss : 1.371413 / Test accuracy : 0.740400
2 - Test loss : 1.354045 / Test accuracy : 0.747760
3 - Test loss : 1.356264 / Test accuracy : 0.743900
4 - Test loss : 1.367306 / Test accuracy : 0.743340

Average test accuracy: 0.743728
Variance test accuracy: 0.000006
Min test accuracy: 0.740400
Max test accuracy: 0.747760


#### Stochastic Gradient Descent

In [9]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation for the SGD run
initial_w = np.zeros(tx.shape[1])
gamma = 0.01
max_iters = 2000

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w, 'least_squares_SGD', max_iters, gamma
    )
    accuracy.append(fold_acc)
    total_loss_test.append(fold_loss)

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 1.531001 / Test accuracy : 0.699740
1 - Test loss : 1.659796 / Test accuracy : 0.689540
2 - Test loss : 1.739932 / Test accuracy : 0.674780
3 - Test loss : 1.612047 / Test accuracy : 0.715700
4 - Test loss : 1.480298 / Test accuracy : 0.716520

Average test accuracy: 0.699256
Variance test accuracy: 0.000252
Min test accuracy: 0.674780
Max test accuracy: 0.716520


#### Least Squares

In [10]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation
# NOTE(review): lambda_ is passed although the model is 'least_squares' --
# confirm whether cross_validation needs or ignores it for this model.
initial_w = np.zeros(tx.shape[1])
lambda_ = 0.6

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

# NOTE(review): the output recorded for this cell is digit-for-digit identical
# to the one recorded for the ridge regression cell -- confirm that the two
# model names are really dispatched to different solvers.
for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w, 'least_squares', lambda_
    )
    accuracy.append(fold_acc)
    # The returned loss is reduced with np.mean before being stored
    total_loss_test.append(np.mean(fold_loss))

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 1.357794 / Test accuracy : 0.744520
1 - Test loss : 1.366549 / Test accuracy : 0.742460
2 - Test loss : 1.351311 / Test accuracy : 0.748380
3 - Test loss : 1.352930 / Test accuracy : 0.743900
4 - Test loss : 1.362464 / Test accuracy : 0.744680

Average test accuracy: 0.744788
Variance test accuracy: 0.000004
Min test accuracy: 0.742460
Max test accuracy: 0.748380


#### Ridge Regression

In [11]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation for the ridge run
initial_w = np.zeros(tx.shape[1])
lambda_ = 0.6

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w, 'ridge_regression', lambda_
    )
    # Both returned values are reduced with np.mean before being stored
    accuracy.append(np.mean(fold_acc))
    total_loss_test.append(np.mean(fold_loss))

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 1.357794 / Test accuracy : 0.744520
1 - Test loss : 1.366549 / Test accuracy : 0.742460
2 - Test loss : 1.351311 / Test accuracy : 0.748380
3 - Test loss : 1.352930 / Test accuracy : 0.743900
4 - Test loss : 1.362464 / Test accuracy : 0.744680

Average test accuracy: 0.744788
Variance test accuracy: 0.000004
Min test accuracy: 0.742460
Max test accuracy: 0.748380


#### Logistic Regression

In [7]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation for the logistic run
initial_w = np.zeros(tx.shape[1])
gamma = 1e-6
max_iters = 2000
# NOTE(review): lambda_ is assigned here but not passed to cross_validation
# in this cell; kept so the notebook's global state is unchanged.
lambda_ = 0.001

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w, 'logistic_regression', max_iters, gamma
    )
    accuracy.append(fold_acc)
    total_loss_test.append(fold_loss)

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 0.497959 / Test accuracy : 0.749440
1 - Test loss : 0.500810 / Test accuracy : 0.747360
2 - Test loss : 0.494319 / Test accuracy : 0.753360
3 - Test loss : 0.495561 / Test accuracy : 0.749860
4 - Test loss : 0.499373 / Test accuracy : 0.750560

Average test accuracy: 0.750116
Variance test accuracy: 0.000004
Min test accuracy: 0.747360
Max test accuracy: 0.753360


#### Regularized Logistic Regression

In [9]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation; lambda_ is passed last
initial_w = np.zeros(tx.shape[1])
gamma = 1e-6
max_iters = 2000
lambda_ = 0.001

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w,
        'reg_logistic_regression', max_iters, gamma, lambda_
    )
    accuracy.append(fold_acc)
    total_loss_test.append(fold_loss)

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))

0 - Test loss : 0.497990 / Test accuracy : 0.749260
1 - Test loss : 0.500828 / Test accuracy : 0.747460
2 - Test loss : 0.494341 / Test accuracy : 0.753140
3 - Test loss : 0.495605 / Test accuracy : 0.750100
4 - Test loss : 0.499399 / Test accuracy : 0.750400

Average test accuracy: 0.750072
Variance test accuracy: 0.000003
Min test accuracy: 0.747460
Max test accuracy: 0.753140


In [None]:
# Cross-validation setup: 5 folds, fold indices built with seed=1
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

# Model parameters forwarded to cross_validation; this run uses lambda_ = 0.6
initial_w = np.zeros(tx.shape[1])
gamma = 1e-6
max_iters = 2000
lambda_ = 0.6

# Per-fold metrics, appended in fold order
accuracy = []
total_loss_test = []

for k in range(k_fold):
    fold_acc, fold_loss = cross_validation(
        y, tx, k_indices, k, initial_w,
        'reg_logistic_regression', max_iters, gamma, lambda_
    )
    accuracy.append(fold_acc)
    total_loss_test.append(fold_loss)

# Print cross validation results: one line per fold
for fold, (fold_loss, fold_acc) in enumerate(zip(total_loss_test, accuracy)):
    print("%d - Test loss : %f / Test accuracy : %f" % (fold, fold_loss, fold_acc))

# Summary statistics of the accuracy across folds
for template, stat in (
    ("\nAverage test accuracy: %f", np.mean),
    ("Variance test accuracy: %f", np.var),
    ("Min test accuracy: %f", np.min),
    ("Max test accuracy: %f", np.max),
):
    print(template % stat(accuracy))