In [124]:
# Useful starting lines
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from implementations import *
from helpers import *
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [125]:
#unzip csv 
import zipfile
with zipfile.ZipFile('../data/train.csv.zip') as zip_ref:
    zip_ref.extractall(r"../data")

In [126]:
from proj1_helpers import *

#load data
DATA_TRAIN_PATH = '../data/train.csv' #download train data and supply path here 
y, x, ids = load_csv_data(DATA_TRAIN_PATH)

print(y.shape, x.shape, ids.shape)

(250000,) (250000, 30) (250000,)


## Comparing the Models

### Replacing Missing Values

In [113]:
'''tx = x.copy()
for i in range(x.shape[1]):
    idx = x[:,i] > -999
    mean = np.mean(x[idx,i])
    tx[idx==False,i] = mean'''

'tx = x.copy()\nfor i in range(x.shape[1]):\n    idx = x[:,i] > -999\n    mean = np.mean(x[idx,i])\n    tx[idx==False,i] = mean'

### Standardize the data

In [127]:
tx = standardize(x)
#add constant term
tx = np.c_[np.ones((y.shape[0],1)), tx]
tx.shape

(250000, 31)

### Split the data 

In [128]:
x_train, y_train, x_test, y_test = split_data(tx, y, 1, seed=1)

In [129]:
y_test

array([], dtype=float64)

### Test Models

#### Gradient Descent

In [35]:
gamma = 0.01
max_iters = 50
initial_w = np.zeros(x_train.shape[1])
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

#gradients = [least_squares_GD(y_train, x_train, initial_w, max_iters,gamma),
#             least_squares_SGD(y_train, x_train, initial_w, max_iters, gamma)]
#gradients_names = ["Gradient Descent","Stochastic Gradient Descent"]
#print('-----Without standardization-----')
#for i in range (len(gradients)):
''''    w,loss = gradients[i]
    print("{name}, w*={w}, loss={l}\n".format(name=gradients_names[i],w=w, l=loss))'''

acc_GD = []
total_loss_te_GD= []

for k in range(k_fold):
    acc, loss_te_GD = cross_validation(y_train, x_train, k_indices, k, initial_w, 'least_squares_GD', max_iters, gamma)
    acc_GD.append(np.mean(acc))
    total_loss_te_GD.append(np.mean(loss_te_GD))
    
print(acc_GD)
print(total_loss_te_GD)

[0.73704, 0.73708, 0.73374, 0.73668, 0.73508]
[1.3924515051428645531, 1.3966437172033032885, 1.4076181071802289726, 1.3997037523526678248, 1.3968991804508836006]
[0.47096, 0.61934, 0.4944, 0.52392, 0.595]
[48756665.88962448199, 5848951170.4316515788, 4575.1970285782716905, 56656.796372831631633, 876.2883441005645191]


In [27]:
print(acc_GD)
print(total_loss_te_GD)
print(acc_SGD)
print(total_loss_te_GD)

[0.73704, 0.73708, 0.73374, 0.73668, 0.73508]
[1.3924515051428645531, 48756665.88962448199, 1.3966437172033032885, 5848951170.4316515788, 1.4076181071802289726, 4575.1970285782716905, 1.3997037523526678248, 56656.796372831631633, 1.3968991804508836006, 876.2883441005645191]
[0.47096, 0.61934, 0.4944, 0.52392, 0.595]
[]


#### Stochastic Gradient Descent

In [None]:
gamma = 0.01
max_iters = 50
initial_w = np.zeros(x_train.shape[1])
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

acc_SGD = []
total_loss_te_SGD= []
for k in range(k_fold):    
    acc, loss_te_SGD = cross_validation(y_train, x_train, k_indices, k, initial_w, 'least_squares_SGD', max_iters, gamma)
    acc_SGD.append(np.mean(acc))
    total_loss_te_SGD.append(np.mean(loss_te_SGD))

print(acc_SGD)
print(total_loss_te_SGD)

#### Least Squares

In [36]:
initial_w = np.zeros(x_train.shape[1])
lambda_ = 0.6
'''
regression =[least_squares(y_train, x_train),
             ridge_regression(y_train, x_train,lambda_)]
                  
regression_names =  ["Least Squares", "Ridge Regression"]

print('-----Without standardization-----')
for i in range (len(regression)):
    w,loss = regression[i]
    print("{name}, w*={w}, loss={l}\n".format(name=regression_names[i],w=w, l=loss))
'''
    
acc_LS = []
total_loss_te_LS = []

for k in range(k_fold):
    
    acc, loss_te_RR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'ridge_regression', lambda_)
    acc_RR.append(acc)
    total_loss_te_RR.append(np.mean(loss_te_RR))

print(acc_LS)
print(total_loss_te_LS)

for i in range (len(acc_RR)):
    print("{K}, Loss={l}, Accuracy={a}\n".format(K=i,l=total_loss_te_RR[i], a=acc_RR[i]))


[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]
[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]


In [58]:
print(acc_LS)
print(total_loss_te_LS)
print(acc_RR)
print(total_loss_te_RR)  

[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]
[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]


#### Rigde Regression

In [86]:
initial_w = np.zeros(x_train.shape[1])
acc_RR = []
total_loss_te_RR = []
for k in range(k_fold):
    
    acc, loss_te_RR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'ridge_regression', lambda_)
    acc_RR.append(np.mean(acc))
    total_loss_te_RR.append(np.mean(loss_te_RR))

print(acc_RR)
print(total_loss_te_RR) 

[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]


#### Logistic Regression

In [135]:
initial_w = np.zeros(x_train.shape[1])
gamma = 1e-6
max_iters = 50
lambda_ = 0.001
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

'''
logistic = [logistic_regression(y_train, x_train, initial_w, max_iters, gamma),
            reg_logistic_regressions(y_train, x_train, lambda_, initial_w, max_iters, gamma)]
logistic_names = ['Logistic Ridge Regression','Reg Logistic Ridge Regression']

print('-----Without standardization-----')
for i in range (len(logistic)):
    w,loss = logistic[i]
    print("{name}, w*={w}, loss={l}\n".format(name=logistic_names[i],w=w, l=loss))
'''   
acc_LR = []
total_loss_te_LR = []

for k in range(k_fold):
    acc, loss_te_LR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'logistic_regression', max_iters, gamma, lambda_)
    acc_LR.append(acc)
    total_loss_te_LR.append(loss_te_LR)

for i in range(len(acc_LR)):
    print("%d - Test accuracy : %f" % (i, acc_LR[i]))

print("\nAverage test accuracy: %f" % np.mean(acc_LR))
print("Variance test accuracy: %f" % np.var(acc_LR))
print("Min test accuracy: %f" % np.min(acc_LR))
print("Max test accuracy: %f" % np.max(acc_LR))

0 - Test accuracy : 0.731640
1 - Test accuracy : 0.732560
2 - Test accuracy : 0.728440
3 - Test accuracy : 0.731620
4 - Test accuracy : 0.728980

Average test accuracy: 0.730648
Variance test accuracy: 0.000003
Min test accuracy: 0.728440
Max test accuracy: 0.732560


In [133]:
print(y)

[ 1. -1. -1. ...  1. -1. -1.]


#### Reg Logistic Regression

In [102]:
initial_w = np.zeros(x_train.shape[1])
gamma = 0.01
max_iters = 10
lambda_ = 0.04


acc_RLR = []
total_loss_te_RLR = []
for k in range(k_fold):
    
    acc, loss_te_RLR = cross_validation(y_tr, x_train, k_indices, k, initial_w, 'reg_logistic_regression',  max_iters, gamma, lambda_)
    acc_RLR.append(acc)
    total_loss_te_RLR.append(np.mean(loss_te_RLR))

for i in range(len(acc_LR)):
    print("%d - Mean loss : %f / Test accuracy : %f" % (i, total_loss_te_RLR[i], acc_LR[i]))

print("\nAverage test accuracy: %f" % np.mean(acc_LR))
print("Variance test accuracy: %f" % np.var(acc_LR))
print("Min test accuracy: %f" % np.min(acc_LR))
print("Max test accuracy: %f" % np.max(acc_LR))

  e_t =np.exp(-t)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [136]:
len(y)

250000

In [None]:
y.shape