In [1]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time 

#To keep things organized, and to avoid copying and pasting our functions every time we need them in more than one folder,
#we temporarily extend Python's module search path through the sys module.
import sys

#in this way Python will search the implementations also in the path '../HelperFunctions'
sys.path.insert(0, '../HelperFunctions')
sys.path.insert(0, '../pre-processing/Clean_Data/')

from proj1_helpers import *
from common_functions import *
from counters import *
from remove import *
from replace import *
from regressors import *
from CrossValidationFunctions import *

# Regression functions

In [140]:
from CrossValidationFunctions import *
def grid_search_hyperparam_with_CV(y, tx, lambdas, degrees, k_fold=4, seed=1):
    """Grid-search lambda and polynomial degree for ridge regression with cross-validation.

    Every (lambda, degree) pair is scored with ``cross_validation_with_ridge`` on
    the features expanded by ``build_poly``. Two winners are reported: the pair
    with the lowest test loss and the pair with the highest accuracy. For each
    winner the weights are refit on the whole data set with ``ridge_regression``.

    Parameters
    ----------
    y : labels, forwarded to the cross-validation and ridge helpers.
    tx : raw feature matrix, expanded with ``build_poly`` for each degree.
    lambdas : sequence of regularisation strengths to try.
    degrees : sequence of polynomial degrees to try.
    k_fold : second argument forwarded to ``build_k_indices``
        (presumably the number of folds). Defaults to 4.
    seed : third argument forwarded to ``build_k_indices``
        (presumably the shuffling seed). Defaults to 1.

    Returns
    -------
    best_lambda_loss, best_degree_loss : arrays holding the lambda(s) and
        degree(s) that reach the minimum test loss (several entries on ties).
    best_w_loss : weights refit on all data with the first loss-optimal pair.
    best_lambda_acc, best_degree_acc : arrays holding the lambda(s) and
        degree(s) that reach the maximum accuracy (several entries on ties).
    best_w_acc : weights refit on all data with the first accuracy-optimal pair.
    loss_tr, loss_te, accuracy : arrays of shape (len(lambdas), len(degrees)).

    Raises
    ------
    ValueError : if ``lambdas`` or ``degrees`` is empty.
    """
    # Index arrays are used below, so plain lists must be converted first.
    lambdas = np.asarray(lambdas)
    degrees = np.asarray(degrees)

    grid_shape = (len(lambdas), len(degrees))
    if 0 in grid_shape:
        raise ValueError("lambdas and degrees must both be non-empty")

    loss_tr = np.zeros(grid_shape)
    loss_te = np.zeros(grid_shape)
    accuracy = np.zeros(grid_shape)

    # Build the folds once so every (lambda, degree) pair is scored on the same split.
    k_indices = build_k_indices(y, k_fold, seed)

    # The polynomial expansion depends only on the degree, so it is computed
    # once per degree instead of once per (lambda, degree) pair.
    for idx_degree, degree in enumerate(degrees):
        x_augmented = build_poly(tx, degree)

        for idx_lambda, lambda_ in enumerate(lambdas):
            acc, loss1, loss2 = cross_validation_with_ridge(y, x_augmented, k_indices, lambda_)

            loss_tr[idx_lambda, idx_degree] = loss1
            loss_te[idx_lambda, idx_degree] = loss2
            accuracy[idx_lambda, idx_degree] = acc

    # Find the best pair(s) using the test loss; ties yield several entries.
    rows_loss, cols_loss = np.where(loss_te == np.min(loss_te))
    best_lambda_loss = lambdas[rows_loss]
    best_degree_loss = degrees[cols_loss]

    # Refit with the FIRST optimal pair. Converting the whole array with int()
    # fails as soon as two grid cells tie, and is deprecated for 1-D arrays.
    x_augmented = build_poly(tx, int(best_degree_loss[0]))
    best_w_loss = ridge_regression(y, x_augmented, best_lambda_loss[0])

    # Find the best pair(s) using the accuracy; ties yield several entries.
    rows_acc, cols_acc = np.where(accuracy == np.max(accuracy))
    best_lambda_acc = lambdas[rows_acc]
    best_degree_acc = degrees[cols_acc]

    # Refit with the first accuracy-optimal pair.
    x_augmented = build_poly(tx, int(best_degree_acc[0]))
    best_w_acc = ridge_regression(y, x_augmented, best_lambda_acc[0])

    return best_lambda_loss, best_degree_loss, best_w_loss, best_lambda_acc, best_degree_acc, best_w_acc, loss_tr, loss_te, accuracy


def grid_search_hyperparam_RIDGE(y, tx, lambdas, degrees, ratio=0.7, seed=1):
    """Grid-search lambda and polynomial degree for ridge regression on one split.

    For every degree the features are expanded with ``build_poly`` and divided
    by ``split_data``. For every lambda a ridge model is fit on the first part
    and scored on both parts with ``sqrt(2 * compute_loss_MSE(...))``. The pair
    with the lowest score on the second part wins, and its weights are refit
    on the whole data set.

    Parameters
    ----------
    y : labels, forwarded to ``split_data`` and ``ridge_regression``.
    tx : raw feature matrix, expanded with ``build_poly`` for each degree.
    lambdas : sequence of regularisation strengths to try.
    degrees : sequence of polynomial degrees to try.
    ratio : third argument forwarded to ``split_data``
        (presumably the training fraction). Defaults to 0.7.
    seed : ``seed`` keyword forwarded to ``split_data``. Defaults to 1.

    Returns
    -------
    best_lambda, best_degree : arrays holding the lambda(s) and degree(s) that
        reach the minimum held-out loss (several entries on ties).
    best_w : weights refit on all data with the first optimal pair.
    loss_tr, loss_te : arrays of shape (len(lambdas), len(degrees)).

    Raises
    ------
    ValueError : if ``lambdas`` or ``degrees`` is empty.
    """
    # Index arrays are used below, so plain lists must be converted first.
    lambdas = np.asarray(lambdas)
    degrees = np.asarray(degrees)

    grid_shape = (len(lambdas), len(degrees))
    if 0 in grid_shape:
        raise ValueError("lambdas and degrees must both be non-empty")

    loss_tr = np.zeros(grid_shape)
    loss_te = np.zeros(grid_shape)

    # Expansion and split depend only on the degree, so they are computed once
    # per degree instead of once per (lambda, degree) pair.
    for idx_degree, degree in enumerate(degrees):
        x_augmented = build_poly(tx, degree)
        x_tr, x_te, y_tr, y_te = split_data(x_augmented, y, ratio, seed=seed)

        for idx_lambda, lambda_ in enumerate(lambdas):
            weights = ridge_regression(y_tr, x_tr, lambda_)

            loss_tr[idx_lambda, idx_degree] = np.sqrt(2 * compute_loss_MSE(y_tr, x_tr, weights))
            loss_te[idx_lambda, idx_degree] = np.sqrt(2 * compute_loss_MSE(y_te, x_te, weights))

    # Every grid cell attaining the minimum held-out loss; ties yield several entries.
    best_rows, best_cols = np.where(loss_te == np.min(loss_te))
    best_lambda = lambdas[best_rows]
    best_degree = degrees[best_cols]

    # Refit with the FIRST optimal pair. Converting the whole array with int()
    # fails as soon as two grid cells tie, and is deprecated for 1-D arrays.
    x_augmented = build_poly(tx, int(best_degree[0]))
    best_w = ridge_regression(y, x_augmented, best_lambda[0])

    return best_lambda, best_degree, best_w, loss_tr, loss_te

# Load Data And Basic Preprocessing

In [94]:
# Load the training and the test CSV files; the first value returned for the
# test file is discarded.
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)
_, test_data, ids_test = load_csv_data("../data/test.csv", sub_sample=False)

# Keep untouched copies of the loaded arrays so that later cells can restart
# from the original data without reading the CSV files again.
# (Author's note: this block is temporary and is expected to be removed.)
from copy import deepcopy
originalData = deepcopy(input_data)
originalY = deepcopy(yb)
originalTest = deepcopy(test_data)
print(yb)

[ 1. -1. -1. ...  1. -1. -1.]


In [148]:
# Basic step: restart from the pristine training features, count the -999
# entries with countInvalid, keep the indices whose count is positive and hand
# them to replaceWithZero.
input_data = deepcopy(originalData)
numInvalidValues = countInvalid(input_data, -999)
idxCols = np.nonzero(numInvalidValues > 0)[0]
input_data = replaceWithZero(input_data, -999, idxCols)

# Jet division

In [149]:
# Split the training rows into three subsets according to the value stored in
# column 22 (presumably the jet count, given the section title): 0, 1, and 2 or more.
idx0 = np.where(input_data[:,22]==0)
idx1 = np.where(input_data[:,22]==1)
idx2 = np.where(input_data[:,22]>=2)

# Standardize each subset on its own and pick the matching labels.
# NOTE(review): the two extra values returned by standardize are discarded, so
# they cannot be reused later on the test set -- confirm this is intended.
x0,_,_ = standardize ( input_data[idx0] )
x1,_,_ = standardize ( input_data[idx1] )
x2,_,_ = standardize ( input_data[idx2] )
y0 = yb[idx0]
y1 = yb[idx1]
y2 = yb[idx2]

Substitute 0 with 0.0001
Substitute 0 with 0.0001


# Regression with CV

In [152]:
# Hyper-parameter grid explored for each jet-specific model.
lambdas = np.logspace(-6,0,5)
degrees = np.arange(14)

# The search routine defined in this notebook is grid_search_hyperparam_with_CV
# (it returns exactly the nine values unpacked here). The previous call used the
# name grid_search_hyperparam, which is not defined in the notebook and would
# fail on a fresh kernel.
best_lambda_loss0, best_degree_loss0, best_w_loss0, best_lambda_acc0, best_degree_acc0, best_w_acc0, loss_tr0, loss_te0, accuracy0 = \
grid_search_hyperparam_with_CV(y0, x0 ,lambdas,degrees)

best_lambda_loss1, best_degree_loss1, best_w_loss1, best_lambda_acc1, best_degree_acc1, best_w_acc1, loss_tr1, loss_te1, accuracy1 = \
grid_search_hyperparam_with_CV(y1, x1 ,lambdas,degrees)

best_lambda_loss2, best_degree_loss2, best_w_loss2, best_lambda_acc2, best_degree_acc2, best_w_acc2, loss_tr2, loss_te2, accuracy2 = \
grid_search_hyperparam_with_CV(y2, x2 ,lambdas,degrees)


# Report, for each jet subset, the hyper-parameters that reached the lowest test loss.
print(f'Model with 0 jets: lambda = {best_lambda_loss0}, degree = {best_degree_loss0}, loss = {np.min(loss_te0)}')
print(f'Model with 1 jets: lambda = {best_lambda_loss1}, degree = {best_degree_loss1}, loss = {np.min(loss_te1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_loss2}, degree = {best_degree_loss2}, loss = {np.min(loss_te2)}')

Model with 0 jets: lambda = [0.001], degree = [1], loss = 0.7378892476990272
Model with 1 jets: lambda = [0.001], degree = [5], loss = 0.8050446178537408
Model with more than 1 jets: lambda = [0.001], degree = [3], loss = 0.7963036673060457


In [153]:
# Report, for each jet subset, the hyper-parameters that reached the highest
# accuracy in the grid search (several values are listed when grid cells tie).
print(f'Model with 0 jets: lambda = {best_lambda_acc0}, degree = {best_degree_acc0}, acc = {np.max(accuracy0)}')
print(f'Model with 1 jets: lambda = {best_lambda_acc1}, degree = {best_degree_acc1}, acc = {np.max(accuracy1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_acc2}, degree = {best_degree_acc2}, acc = {np.max(accuracy2)}')

Model with 0 jets: lambda = [0.001], degree = [12], acc = 0.8427416126191047
Model with 1 jets: lambda = [0.001], degree = [12], acc = 0.8065098524708554
Model with more than 1 jets: lambda = [0.001 0.001], degree = [12 13], acc = 0.832078853046595


# With ridge

In [104]:
# Hyper-parameter grid for the ridge search.
lambdas = np.logspace(-6,0,10)
degrees = np.arange(10)

# One search per jet subset.
# NOTE(review): loss_tr0..2 / loss_te0..2 are the same names the cross-validation
# cell assigns, so their content depends on which of the two cells ran last.
best_lambda0, best_degree0, best_w0, loss_tr0, loss_te0 = grid_search_hyperparam_RIDGE(y0, x0 ,lambdas,degrees)
best_lambda1, best_degree1, best_w1, loss_tr1, loss_te1 = grid_search_hyperparam_RIDGE(y1, x1 ,lambdas,degrees)
best_lambda2, best_degree2, best_w2, loss_tr2, loss_te2 = grid_search_hyperparam_RIDGE(y2, x2 ,lambdas,degrees)


# Report, for each jet subset, the hyper-parameters that reached the lowest test loss.
print(f'Model with 0 jets: lambda = {best_lambda0}, degree = {best_degree0}, loss = {np.min(loss_te0)}')
print(f'Model with 1 jets: lambda = {best_lambda1}, degree = {best_degree1}, loss = {np.min(loss_te1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda2}, degree = {best_degree2}, loss = {np.min(loss_te2)}')

Model with 0 jets: lambda = [0.00215443], degree = [1], loss = 0.7371577566715111
Model with 1 jets: lambda = [0.00215443], degree = [5], loss = 0.8072869429323377
Model with more than 1 jets: lambda = [0.01], degree = [7], loss = 0.7668980459553653


# Submission

In [154]:
# Start from a fresh copy of the raw test features so this cell can be re-run.
test_data = deepcopy(originalTest)
num_tests = test_data.shape[0]

# Apply the same -999 -> 0 replacement that the training features receive in
# the basic preprocessing step. Previously the test set skipped it, so the
# models were applied to differently prepared data.
test_data = replaceWithZero(test_data, -999, np.where(countInvalid(test_data, -999) > 0)[0])

# Split the test rows by the value in column 22, exactly as done for training.
# NOTE(review): idx0..2 and x0..2 overwrite the training variables of the same
# name; re-run the jet-division cell before repeating any grid search.
idx0 = np.where(test_data[:,22]==0)
idx1 = np.where(test_data[:,22]==1)
idx2 = np.where(test_data[:,22]>=2)

# NOTE(review): each test subset is standardized on its own rather than with
# values obtained from the training subsets -- confirm this is intended.
x0,_,_ = standardize ( test_data[idx0] )
x1,_,_ = standardize ( test_data[idx1] )
x2,_,_ = standardize ( test_data[idx2] )

# best_degreeN is an array (several entries when grid cells tie); take its
# first entry explicitly instead of converting the whole array with int().
x0 = build_poly(x0, int(np.ravel(best_degree0)[0]))
x1 = build_poly(x1, int(np.ravel(best_degree1)[0]))
x2 = build_poly(x2, int(np.ravel(best_degree2)[0]))

y_pred0 = predict_labels(best_w0,x0)
y_pred1 = predict_labels(best_w1,x1)
y_pred2 = predict_labels(best_w2,x2)

# Scatter the per-subset predictions back into the original row order.
y_pred = np.ones(num_tests)
y_pred[idx0] = y_pred0
y_pred[idx1] = y_pred1
y_pred[idx2] = y_pred2

create_csv_submission(ids_test, y_pred, '999to0andDivisionPerJet20DegreeAccuracy.csv')

Substitute 0 with 0.0001
Substitute 0 with 0.0001
