In [1]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time

In [2]:
# Shared helper modules live in sibling folders rather than being copy-pasted
# into every notebook, so those folders are added to the module search path.
# This is meant as a temporary arrangement.
import sys

# Both folders are placed at the very front of sys.path:
# '../pre-processing/Clean_Data/' first, then '../HelperFunctions'.
sys.path[:0] = ['../pre-processing/Clean_Data/', '../HelperFunctions']

In [3]:
from proj1_helpers import *
from common_functions import *
from counters import *
from replace import *
from regressors import batch_iter

In [4]:
# Load the full training CSV (sub_sample=False) through the imported helper.
# presumably yb are the labels, input_data the features and ids the row ids -- confirm in the helper module
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)

In [5]:
# Keep untouched copies of the loaded features (originalData) and labels
# (originalY) so that later cells can restart from them with deepcopy(...)
# instead of reloading the CSV. The author marked this cell as temporary.
from copy import deepcopy
originalData = deepcopy(input_data)
originalY = deepcopy(yb)

# Functions from lab 5

In [6]:
# def sig(t):
#     """apply sigmoid function on t."""
#     if t > 0:
#         return 1 / (1 + np.exp(-t))
#     else:
#         return np.exp(t) / (1 + np.exp(t))

# sigmoid = np.vectorize(sig)

def sigmoid(t):
    """Logistic function of t, evaluated through the hyperbolic tangent.

    Relies on the identity sigmoid(t) = (1 + tanh(t / 2)) / 2, so no explicit
    exponential of t is computed. np.tanh is applied element-wise, so t may be
    a scalar or an array.
    """
    half_angle_tanh = np.tanh(.5 * t)
    return .5 * (1 + half_angle_tanh)

def calculate_loss(y, tx, w):
    """Compute the mean negative log-likelihood of a logistic-regression model.

    With logits z = tx.dot(w), the per-sample loss
        -[y * log(sigmoid(z)) + (1 - y) * log(1 - sigmoid(z))]
    is algebraically equal to
        log(1 + exp(z)) - y * z,
    which is the form evaluated here. Working on the logits avoids log(0)
    without adding a small constant inside the logarithms, so the value is
    neither biased nor capped when predictions saturate.

    Args:
        y: labels, one per row of tx.
            NOTE(review): the formula is a likelihood only for labels in
            {0, 1}; confirm the labels are not encoded as -1/+1.
        tx: feature matrix whose rows are samples.
        w: weight vector, one entry per column of tx.

    Returns:
        The summed per-sample loss divided by y.shape[0].
    """
    logits = tx.dot(w)
    # np.logaddexp(0, z) is log(exp(0) + exp(z)) = log(1 + exp(z)), computed
    # without overflowing for large |z|.
    per_sample = np.logaddexp(0, logits) - y * logits
    return np.sum(per_sample) / y.shape[0]


def calculate_gradient(y, tx, w):
    """Gradient of the logistic loss with respect to w, averaged over samples.

    Evaluates tx.T.dot(sigmoid(tx.dot(w)) - y) and divides by y.shape[0].
    """
    residual = sigmoid(tx.dot(w)) - y
    n_samples = y.shape[0]
    return tx.T.dot(residual) / n_samples

def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Perform one gradient-descent step for logistic regression.

    The loss is evaluated at the incoming w (before the update); the returned
    weights are w - gamma * gradient. Returns the tuple (loss, updated w).
    """
    # The whole (y, tx) passed in is used for the step; a mini-batch variant
    # based on batch_iter was left disabled by the author.
    loss_before_step = calculate_loss(y, tx, w)
    update = gamma * calculate_gradient(y, tx, w)
    return loss_before_step, w - update

In [None]:
#Creation of tx

# Start from a fresh copy of the loaded features so this cell can be re-run
# without reloading the CSV.
input_data = deepcopy(originalData)
print(input_data.shape)
#Clean the dataset
# -999 is treated as the invalid-value marker. countInvalid presumably returns
# one count per column (its result is used to select column indices below)
# -- confirm in the helper module.
numInvalidValues = countInvalid(input_data,-999)
idxCols = np.where(numInvalidValues>0)[0]
# presumably replaces -999 with 0 in the selected columns -- confirm in the helper module
input_data = replaceWithZero(input_data,-999,idxCols)

#standardize
# Only the first returned value is kept; the two others (presumably the
# statistics used for the scaling -- TODO confirm) are discarded here.
tx,_,_ = standardize(input_data)

#add ones
# Prepend a column of ones so the linear model has an intercept term.
tx = np.c_[np.ones((yb.shape[0], 1)), tx]
# y is just another name for yb; no copy is made.
y = yb

In [None]:
#first tries with gradient descent

# sample_data comes from the imported helpers.
# NOTE(review): the meaning of the arguments 10 and 500 is not visible in this
# notebook (possibly a seed and a sample size) -- confirm in the helper module.
Y, TX = sample_data(y, tx, 10, 500)
max_iter = 10000
# One weight per column of the matrix that is actually used for training.
w = np.zeros(TX.shape[1])
gamma = 0.0005
loss0 = 0  # not read by any cell visible in this notebook
print(TX.shape)

# The loop variable is called `step` rather than `iter`, so the builtin iter()
# is not shadowed for the rest of the kernel session.
for step in range(max_iter):
    wold = w
    loss, w = learning_by_gradient_descent(Y, TX, w, gamma)

    # Report the loss and the size of the last update every 100 steps.
    if step%100==0:
        wnew = w
        print(f'Iter = {step}, Loss = {loss}, |wold - wnew| = {np.linalg.norm(wold-wnew)}')




In [None]:
# Display the current design matrix as the cell output.
tx

Let's try to generate a submission

In [None]:
# Keep the current weights under a second name; the submission cells use WWW.
WWW = w 

In [None]:
# Load the full test CSV with the same helper used for train.csv.
# Note: `ids` is rebound here, so from this cell on it holds the value returned
# for test.csv, not the one returned for train.csv.
xxx, test_data, ids = load_csv_data("../data/test.csv", sub_sample=False)

In [None]:
#we need to preprocess the test_data as we preprocessed the train data

# NOTE(review): every step below is derived from test_data itself (the columns
# containing -999 and whatever statistics standardize uses), not reused from
# the training cell, where the extra values returned by standardize were
# discarded. If standardize returns the statistics it applied -- TODO confirm --
# the training values should probably be applied to the test set instead.
numInvalidValues = countInvalid(test_data,-999)
idxCols = np.where(numInvalidValues>0)[0]
test_data = replaceWithZero(test_data,-999,idxCols)
txTest,_,_ = standardize(test_data)
# Prepend the column of ones, with one row per entry of xxx.
txTest = np.c_[np.ones((xxx.shape[0], 1)), txTest]

In [None]:
# Predict labels for the preprocessed test matrix with the saved weights
# (predict_labels is one of the imported helpers).
y_pred = predict_labels(WWW, txTest)

In [None]:
# Pass the ids loaded from test.csv and the predictions to the submission
# helper; 'submission.csv' is given as a relative path.
create_csv_submission(ids, y_pred, 'submission.csv')

# Same procedure with PCA

In [26]:
# Put the PCA helper folder at the front of the module search path so that
# pca_functions can be imported.
sys.path.insert(0, '../pre-processing/PCA/')

In [27]:
from pca_functions import PCAWithCovariance

# Restart from the untouched copies of features and labels, so this section
# does not depend on what earlier cells did to input_data / y.
input_data = deepcopy(originalData)
y = deepcopy(originalY)
print(input_data.shape)
print(y.shape)
# Same cleaning as in the non-PCA section: find the columns that contain the
# -999 marker and pass them to replaceWithZero (presumably sets those entries
# to 0 -- confirm in the helper module).
numInvalidValues = countInvalid(input_data,-999)
idxCols = np.where(numInvalidValues>0)[0]
input_data = replaceWithZero(input_data,-999,idxCols)
# Only the first value returned by standardize is kept.
input_data,_,_ = standardize(input_data)

(250000, 30)
(250000,)


In [35]:
#Generating the principal components

_,eV = PCAWithCovariance(input_data)

N = 4 #num p. components
components = np.empty(input_data.shape[0])
for i in range(N):
    components = np.c_[components, input_data.dot(eV[:,i])]
    
#tx = np.c_[np.ones(input_data.shape[0]), components]
tx = components

In [40]:
# Draw the training subset for the PCA model with the imported sample_data helper.
# NOTE(review): the meaning of the arguments 1 and 2000 is not visible here
# (the recorded output shows TX with 2000 rows) -- confirm in the helper module.
Y, TX = sample_data(y, tx, 1, 2000)
TX.shape

(2000, 5)

In [41]:
max_iter = 25000
# One weight per column of the sampled training matrix.
w = np.zeros(TX.shape[1])
gamma = 0.005
loss0 = 0  # not read by any cell visible in this notebook
# The loop variable is called `step` rather than `iter`, so the builtin iter()
# is not shadowed for the rest of the kernel session.
for step in range(max_iter):
    wold = w
    loss, w = learning_by_gradient_descent(Y, TX, w, gamma)

    # Report the loss and the size of the last update every 100 steps.
    if step%100==0:
        wnew = w
        print(f'Iter = {step}, Loss = {loss}, |wold - wnew| = {np.linalg.norm(wold-wnew)}')

Iter = 0, Loss = 0.6931469805599656, |wold - wnew| = 0.0034411324058047743
Iter = 100, Loss = 0.5487411800655818, |wold - wnew| = 0.002110866374208986
Iter = 200, Loss = 0.4818058566295827, |wold - wnew| = 0.001596908331352765
Iter = 300, Loss = 0.4397917103253635, |wold - wnew| = 0.001318663569680761
Iter = 400, Loss = 0.40984355808032197, |wold - wnew| = 0.0011365644752609491
Iter = 500, Loss = 0.38699895760991754, |wold - wnew| = 0.0010053654063913243
Iter = 600, Loss = 0.3687974523666116, |wold - wnew| = 0.0009053794088694249
Iter = 700, Loss = 0.35383642377823926, |wold - wnew| = 0.0008262998475352955
Iter = 800, Loss = 0.34124304200179606, |wold - wnew| = 0.0007620532945561591
Iter = 900, Loss = 0.33044035059980137, |wold - wnew| = 0.0007087616304030384
Iter = 1000, Loss = 0.3210295768029005, |wold - wnew| = 0.0006638044859590974
Iter = 1100, Loss = 0.3127253201200707, |wold - wnew| = 0.0006253369951454438
Iter = 1200, Loss = 0.30531739667893093, |wold - wnew| = 0.000592019246656

Iter = 10700, Loss = 0.16144986219220528, |wold - wnew| = 0.0001338060987894953
Iter = 10800, Loss = 0.1609055854136311, |wold - wnew| = 0.00013284223675871884
Iter = 10900, Loss = 0.16036409769043308, |wold - wnew| = 0.00013189286476177872
Iter = 11000, Loss = 0.15982529424548067, |wold - wnew| = 0.00013095764363140603
Iter = 11100, Loss = 0.15928907266462794, |wold - wnew| = 0.0001300362449486015
Iter = 11200, Loss = 0.15875533306041997, |wold - wnew| = 0.00012912835061261926
Iter = 11300, Loss = 0.15822397822416806, |wold - wnew| = 0.00012823365243398095
Iter = 11400, Loss = 0.15769491376581868, |wold - wnew| = 0.00012735185174605847
Iter = 11500, Loss = 0.15716804823293642, |wold - wnew| = 0.00012648265903277526
Iter = 11600, Loss = 0.15664329321054493, |wold - wnew| = 0.00012562579357770776
Iter = 11700, Loss = 0.15612056340212008, |wold - wnew| = 0.0001247809831264152
Iter = 11800, Loss = 0.15559977667971656, |wold - wnew| = 0.000123947963566688
Iter = 11900, Loss = 0.15508085411

Iter = 21300, Loss = 0.11018672947398127, |wold - wnew| = 7.624808526302421e-05
Iter = 21400, Loss = 0.10973675391388701, |wold - wnew| = 7.594119981483931e-05
Iter = 21500, Loss = 0.10928729592892938, |wold - wnew| = 7.563673753614164e-05
Iter = 21600, Loss = 0.10883835409882593, |wold - wnew| = 7.53346686340358e-05
Iter = 21700, Loss = 0.1083899270031838, |wold - wnew| = 7.503496381218519e-05
Iter = 21800, Loss = 0.10794201320224397, |wold - wnew| = 7.473759426122294e-05
Iter = 21900, Loss = 0.10749461128194014, |wold - wnew| = 7.444253164806265e-05
Iter = 22000, Loss = 0.1070477198230094, |wold - wnew| = 7.414974810808127e-05
Iter = 22100, Loss = 0.10660133740628078, |wold - wnew| = 7.385921623184542e-05
Iter = 22200, Loss = 0.10615546263875127, |wold - wnew| = 7.357090905924439e-05
Iter = 22300, Loss = 0.10571009411989447, |wold - wnew| = 7.328480006651603e-05
Iter = 22400, Loss = 0.10526523049481198, |wold - wnew| = 7.300086316177145e-05
Iter = 22500, Loss = 0.1048208703998018, |w