In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from implementations import *
from helpers import *
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
#unzip csv 
import zipfile
with zipfile.ZipFile('../data/train.csv.zip') as zip_ref:
    zip_ref.extractall(r"../data")

In [3]:
from proj1_helpers import *

#load data
DATA_TRAIN_PATH = '../data/train.csv' #download train data and supply path here 
y, x, ids = load_csv_data(DATA_TRAIN_PATH)

print(y.shape, x.shape, ids.shape)

(250000,) (250000, 30) (250000,)


## Comparing the Models

### Replacing Missing Values

In [48]:
tx = x.copy()
for i in range(x.shape[1]):
    idx = x[:,i] > -999
    mean = np.mean(x[idx,i])
    tx[idx==False,i] = mean

In [50]:
tx.shape

(250000, 30)

### Standardize the data

In [51]:
tx = standardize(x)
#add constant term
tx = np.c_[np.ones((y.shape[0], 1)), tx]

### Split the data 

In [4]:
x_train, y_train, x_test, y_test = split_data(x, y, 1, seed=1)

In [5]:
y_test

array([], dtype=float64)

### Test Models

#### Gradient and Stochastic gradient

In [6]:
gamma = 0.1
max_iters = 50
initial_w = np.zeros(x_train.shape[1])
k_fold = 5
k_indices = build_k_indices(y, k_fold, seed=1)

#gradients = [least_squares_GD(y_train, x_train, initial_w, max_iters,gamma),
#             least_squares_SGD(y_train, x_train, initial_w, max_iters, gamma)]
#gradients_names = ["Gradient Descent","Stochastic Gradient Descent"]
#print('-----Without standardization-----')
#for i in range (len(gradients)):
''''    w,loss = gradients[i]
    print("{name}, w*={w}, loss={l}\n".format(name=gradients_names[i],w=w, l=loss))'''
    

acc_GD = []
total_loss_te_GD= []
acc_SGD = []
total_loss_te_SGD= []
for k in range(k_fold):
    acc, loss_te_GD = cross_validation(y_train, x_train, k_indices, k, initial_w, 'least_squares_GD', max_iters, gamma)
    acc_GD.append(np.mean(acc))
    total_loss_te_GD.append(np.mean(loss_te_GD))
    
    acc, loss_te_SGD = cross_validation(y_train, x_train, k_indices, k, initial_w, 'least_squares_SGD', max_iters, gamma)
    acc_SGD.append(np.mean(acc))
    total_loss_te_SGD.append(np.mean(loss_te_SGD))

print(acc_GD)
print(total_loss_te_GD)
print(acc_SGD)
print(total_loss_te_SGD)

NameError: name 'least_squares_GD' is not defined

In [27]:
print(acc_GD)
print(total_loss_te_GD)
print(acc_SGD)
print(total_loss_te_GD)

[0.73704, 0.73708, 0.73374, 0.73668, 0.73508]
[1.3924515051428645531, 48756665.88962448199, 1.3966437172033032885, 5848951170.4316515788, 1.4076181071802289726, 4575.1970285782716905, 1.3997037523526678248, 56656.796372831631633, 1.3968991804508836006, 876.2883441005645191]
[0.47096, 0.61934, 0.4944, 0.52392, 0.595]
[]


#### Least Squares and Rigde Regression

In [60]:
gamma = 0.01
initial_w = np.zeros(31)
lambda_ = 0.00001
'''
regression =[least_squares(y_train, x_train),
             ridge_regression(y_train, x_train,lambda_)]
                  
regression_names =  ["Least Squares", "Ridge Regression"]

print('-----Without standardization-----')
for i in range (len(regression)):
    w,loss = regression[i]
    print("{name}, w*={w}, loss={l}\n".format(name=regression_names[i],w=w, l=loss))
'''
    
acc_LS = []
total_loss_te_LS = []
acc_RR = []
total_loss_te_RR = []
for k in range(k_fold):
    acc, loss_te_LS = cross_validation(y_train, x_train, k_indices, k, initial_w, 'least_squares', lambda_)
    acc_LS.append(np.mean(acc))
    total_loss_te_LS.append(np.mean(loss_te_LS))
    
    acc, loss_te_RR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'ridge_regression', lambda_)
    acc_RR.append(np.mean(acc))
    total_loss_te_RR.append(np.mean(loss_te_RR))

print(acc_LS)
print(total_loss_te_LS)
print(acc_RR)
print(total_loss_te_RR)   
    


[0.7461, 0.74518, 0.7434, 0.74582, 0.74262]
[1.3555913246193446571, 1.3582676360824257117, 1.3706891248972015524, 1.363524075555184476, 1.362251088917323285]
[0.7461, 0.74518, 0.7434, 0.74582, 0.74262]
[1.3555913246193446571, 1.3582676360824257117, 1.3706891248972015524, 1.363524075555184476, 1.362251088917323285]


In [58]:
print(acc_LS)
print(total_loss_te_LS)
print(acc_RR)
print(total_loss_te_RR)  

[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]
[0.74654, 0.74512, 0.74376, 0.74442, 0.74374]
[1.3525886178173593112, 1.3545276600558762962, 1.3668025050377936305, 1.3620093289471031241, 1.355622976245170133]


#### Logistic Regression and Reg Logistic Regression

In [62]:
initial_w = np.zeros(31)
gamma = 1e-6
max_iters = 50
lambda_ = 1
'''
logistic = [logistic_regression(y_train, x_train, initial_w, max_iters, gamma),
            reg_logistic_regressions(y_train, x_train, lambda_, initial_w, max_iters, gamma)]
logistic_names = ['Logistic Ridge Regression','Reg Logistic Ridge Regression']

print('-----Without standardization-----')
for i in range (len(logistic)):
    w,loss = logistic[i]
    print("{name}, w*={w}, loss={l}\n".format(name=logistic_names[i],w=w, l=loss))
'''   
    
acc_LR = []
total_loss_te_LR = []
acc_RLR = []
total_loss_te_RLR = []
for k in range(k_fold):
    acc, loss_te_LR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'logistic_regression', max_iters, gamma, lambda_)
    acc_LR.append(np.mean(acc))
    total_loss_te_LR.append(np.mean(loss_te_LR))
    
    acc, loss_te_RLR = cross_validation(y_train, x_train, k_indices, k, initial_w, 'reg_logistic_regressions',  max_iters, gamma, lambda_)
    acc_RLR.append(np.mean(acc))
    total_loss_te_RLR.append(np.mean(loss_te_RLR))

print(acc_LR)
print(total_loss_te_LR)
print(acc_RLR)
print(total_loss_te_RLR) 

-----Without standardization-----
Logistic Ridge Regression, w*=[-1.10725936e+01  8.41469761e-02 -3.60034846e+00 -1.00180863e-01
  1.34780359e+00  1.34314996e+00  1.24489798e+00 -1.15490057e+00
  6.77071689e-01 -3.53528957e-01  8.75728625e-01 -1.96086787e+00
  2.54229826e+00  1.35989188e+00  2.02666432e+00 -1.06067284e-02
 -5.25302770e-02 -4.38613373e-01  1.31998910e-02  4.22363295e-02
 -2.51981090e-01  6.96128266e-02  7.43541349e-01  8.24386799e-01
  3.00275454e-01  4.22169233e-05  1.79620915e-04 -3.88428881e-01
  1.13832281e-02 -2.71927538e-02  6.69111205e-01], loss=nan

Reg Logistic Ridge Regression, w*=[-1.10731186e+01  8.35844548e-02 -3.60102072e+00 -1.00770113e-01
  1.34735234e+00  1.34253135e+00  1.24429954e+00 -1.15554503e+00
  6.76363933e-01 -3.54053977e-01  8.75313981e-01 -1.96147333e+00
  2.54175795e+00  1.35930844e+00  2.02610867e+00 -1.11926643e-02
 -5.31476897e-02 -4.39136504e-01  1.26137242e-02  4.16239858e-02
 -2.52471097e-01  6.90056168e-02  7.43110012e-01  8.23897864e

In [61]:
print(acc_LR)
print(total_loss_te_LR)
print(acc_RLR)
print(total_loss_te_RLR) 

[0.68926, 0.69136, 0.6854, 0.6904, 0.68868]
[nan, inf, inf, nan, nan]
[0.68932, 0.69134, 0.68534, 0.69042, 0.68872]
[nan, inf, inf, nan, nan]
