In [1]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time 

#to keep things in order, and to avoid to copy and paste everytime our functions if we want to use them in more than one folder,
#we can temporarily use this library. 
import sys

#in this way Python will search the implementations also in the path '../HelperFunctions'
sys.path.insert(0, '../HelperFunctions')
sys.path.insert(0, '../pre-processing/Clean_Data/')

from proj1_helpers import *
from common_functions import *
from counters import *
from remove import *
from replace import *
from regressors import *

# Loading data

In [2]:
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)
_, test_data, ids_test = load_csv_data("../data/test.csv", sub_sample=False)

#this will surely be deleted, in this way we are sure that original_data is the original version of the data and we don't have
#to load them again
from copy import deepcopy
originalData = deepcopy(input_data)
originalY = deepcopy(yb)
print(yb)

# change the label of -1 to 0 for simplicity. 
idx_wrong=np.where(yb==-1)
print(idx_wrong)
yb[idx_wrong]=0
print(yb)

[ 1. -1. -1. ...  1. -1. -1.]
(array([     1,      2,      3, ..., 249996, 249998, 249999]),)
[1. 0. 0. ... 1. 0. 0.]


# Pre-processing

### Removing the columns with bad data and standardization

In [None]:
input_data = deepcopy(originalData)
y = deepcopy(originalY)
print(input_data.shape)
print(y.shape)
# Stocking the indexes of columns to remove
idxCols = np.where(countInvalid(input_data,-999)>0)[0]
input_data=removeColumns(input_data,0)
print(input_data.shape)
input_data,_,_ = standardize(input_data)

### Same pre-processing applied on test dataset

In [None]:
print(test_data.shape)
# To remove the same data we removed from the train set.
test_data=np.delete(test_data,idxCols,axis=1)
print(test_data.shape)
test_data,_,_ = standardize(test_data)

# PCA

In [6]:
#Generating the principal components
sys.path.insert(0, '../pre-processing/PCA/')
from pca_functions import PCAWithCovariance

input_data,_,_ = standardize(input_data)
test_data,_,_ = standardize(test_data)

_,eV = PCAWithCovariance(input_data)

N = 7 #num p. components
components = np.empty(input_data.shape[0])
for i in range(N):
    components = np.c_[components, input_data.dot(eV[:,i])]
    
print(components.shape)
print(components[:,0])
input_data = np.c_[np.ones(input_data.shape[0]), components]
print(input_data.shape)
print(input_data[:,0])
print(input_data[:,1])

(250000, 8)
[0. 0. 0. ... 0. 0. 0.]
(250000, 9)
[1. 1. 1. ... 1. 1. 1.]
[0. 0. 0. ... 0. 0. 0.]


### PCA applied on test dataset

In [7]:
N = 7 #num p. components
components = np.empty(test_data.shape[0])
for i in range(N):
    components = np.c_[components, test_data.dot(eV[:,i])]
test_data = np.c_[np.ones(test_data.shape[0]), components]

# Regression

### - Gradient Descent

#### Creation of the model

In [8]:
# Using Gradient-Descent regressor

max_iters=100
#With gamma = 0.7 loss huuuuge
gamma=0.01

w_initial=np.array(np.zeros(input_data.shape[1]))

#Start
GD_loss,GD_ws= gradient_descent(yb,input_data,w_initial,max_iters,gamma)

#Print results
print("Gradient Descent final loss =",  GD_loss)
print("Weights =",GD_ws)

Gradient Descent(0/99): loss=0.171334
Gradient Descent(10/99): loss=0.15268659633285983
Gradient Descent(20/99): loss=0.14084488862166206
Gradient Descent(30/99): loss=0.13183935594859267
Gradient Descent(40/99): loss=0.12479872393103913
Gradient Descent(50/99): loss=0.11924027759198666
Gradient Descent(60/99): loss=0.11482185247526101
Gradient Descent(70/99): loss=0.11129022798612702
Gradient Descent(80/99): loss=0.10845474755734597
Gradient Descent(90/99): loss=0.10616992614771273
Gradient Descent final loss = 0.10449102874174189
Weights = [ 0.21724043  0.         -0.02499789  0.00726983 -0.07884486  0.02538187
  0.0470352  -0.00154474 -0.00043374]


#### Prediction

In [9]:
# Predicting the labels of test dataset.
GD_labels=predict_labels(GD_ws,test_data)
print(GD_labels)

create_csv_submission(ids_test,GD_labels,'predictions_GD_PCA.csv')
print('Prediction file created')

[-1.  1.  1. ...  1.  1.  1.]
Prediction file created


### - Stochastic Gradient Descent

#### Creation of the model

In [10]:
# Using Stochastic Gradient Descent regressor

max_iters=100
#With gamma = 0.7 loss huuuuge
gamma=0.01
batch_size=32

w_initial=np.array(np.zeros(input_data.shape[1]))

#Start
SGD_loss,SGD_ws= stochastic_gradient_descent(yb,input_data,w_initial,batch_size,max_iters,gamma)

#Print results
print("Stochastic Gradient Descent final loss =",  SGD_loss)
print("Weights =",SGD_ws)

Stochastic gradient Descent(0/99): loss=0.16903423424544467
Stochastic gradient Descent(10/99): loss=0.15104776864779385
Stochastic gradient Descent(20/99): loss=0.14081478451216126
Stochastic gradient Descent(30/99): loss=0.13392797686636673
Stochastic gradient Descent(40/99): loss=0.12569970255531177
Stochastic gradient Descent(50/99): loss=0.1195343598577545
Stochastic gradient Descent(60/99): loss=0.11530073981085984
Stochastic gradient Descent(70/99): loss=0.11243724549607986
Stochastic gradient Descent(80/99): loss=0.10940696642185664
Stochastic gradient Descent(90/99): loss=0.10750270804489886
Stochastic Gradient Descent final loss = 0.10567046452707281
Weights = [ 0.2146644   0.         -0.02026381  0.01583881 -0.07634598  0.02046416
  0.03428112  0.01635048 -0.01122279]


#### Prediction

In [11]:
# Predicting the labels of test dataset.
SGD_labels=predict_labels(SGD_ws,test_data)
print(SGD_labels)

create_csv_submission(ids_test,SGD_labels,'predictions_SGD_PCA.csv')
print('Prediction file created')

[-1.  1.  1. ...  1.  1.  1.]
Prediction file created


In [None]:
sys.path.insert(0, '../')
from implementations import *

# define parameter
lambdas = np.logspace(-5, 0, 15)
degree=3
    
# form tx
tx_tr = build_poly(input_data, degree)
print(tx_tr.shape)

# ridge regression with different lambda
rmse_tr = []
for ind, lambda_ in enumerate(lambdas):
    # ridge regression
    weight = ridge_regression(yb, tx_tr, lambda_)
    rmse_tr.append(np.sqrt(2 * compute_loss_MSE(yb, tx_tr, weight)))

    print("proportion={p}, degree={d}, lambda={l:.3f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}".format(
          p=ratio, d=degree, l=lambda_, tr=rmse_tr[ind], te=rmse_te[ind]))

plot_train_test(rmse_tr, rmse_te, lambdas, degree)
    
