In [1]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time 

#To keep things in order, and to avoid copying and pasting our functions every time we want to use them in more than one folder,
#we temporarily extend Python's module search path through the sys library.
import sys

#This way Python also searches for the implementations in '../HelperFunctions' and '../pre-processing/Clean_Data/'
sys.path.insert(0, '../HelperFunctions')
sys.path.insert(0, '../pre-processing/Clean_Data/')

from proj1_helpers import *
from common_functions import *
from counters import *
from remove import *
from replace import *
from regressors import *
from CrossValidationFunctions import *

In [2]:
def removeConstantColumns(data):
    '''Remove columns which are constants from the data.

       A column is considered constant when its standard deviation, computed
       along axis 0, is exactly 0.

       Parameters
       ----------
       data : 2-D numpy array (one row per sample, one column per feature).

       Return data, idx_removed
           data        : the input without its constant columns (the input
                         array itself is never modified in place)
           idx_removed : 1-D integer array with the indices of the removed
                         columns, expressed in the numbering of the input;
                         empty when nothing was removed
    '''
    std = np.std(data, axis=0)
    idx_removed = np.where(std == 0)[0]

    # Test the NUMBER of constant columns. The previous `len(idx_removed > 0)`
    # measured the length of a boolean array and only worked by accident.
    if idx_removed.size > 0:
        data = np.delete(data, idx_removed, axis=1)

    return data, idx_removed

def removeHighCorrelatedColumns(data, threshold = 0.8, upper_bound = 0.98):
    '''Remove columns which are highly correlated.

       The correlation matrix of the columns is computed with np.ma.corrcoef.
       While at least one entry lies strictly between `threshold` and
       `upper_bound`, the column with the largest index among the rows of
       those entries is deleted and the matrix is recomputed on the reduced
       data.

       Parameters
       ----------
       data        : 2-D numpy array (one row per sample, one column per feature).
       threshold   : correlations strictly above this value count as "high".
       upper_bound : correlations at or above this value are ignored. It must
                     stay below 1 so that the diagonal of the correlation
                     matrix (self-correlation) never matches. As a side
                     effect, pairs correlated at `upper_bound` or more are
                     NOT removed. Only positive correlations are considered.

       WARNING: the returned list idx_removed MUST be used in a for loop on the test data, removing features one by one
       (each index refers to the column numbering of the already reduced data).

       Return data, idx_removed
    '''
    def _rows_of_correlated_pairs(matrix):
        # Row indices of every correlation-matrix entry inside the open interval.
        R = np.ma.corrcoef(matrix.T)
        return np.where((R > threshold) & (R < upper_bound))[0]

    #initialize idx_removed
    idx_removed = []

    #Get first elements of the highly correlated couples
    idx_HC = _rows_of_correlated_pairs(data)

    while idx_HC.shape[0] > 0:
        # Drop the right-most column involved in a highly correlated pair.
        idx_to_remove = int(idx_HC.max())

        data = np.delete(data, idx_to_remove, axis=1)
        idx_removed.append(idx_to_remove)

        #compute the correlation coefficients of the reduced dataset
        idx_HC = _rows_of_correlated_pairs(data)

    return data, idx_removed

# Load Data And Basic Preprocessing

In [3]:
# Load the training and test sets from CSV. Each call is unpacked into three values,
# presumably (labels, feature matrix, event ids) -- confirm against load_csv_data.
# The first value returned for the test file is discarded.
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)
_, test_data, ids_test = load_csv_data("../data/test.csv", sub_sample=False)

# Keep untouched deep copies of what was just loaded, so that later cells can restart
# from the original data without reading the CSV files again.
# NOTE(review): the author flagged these copies as temporary ("this will surely be deleted").
from copy import deepcopy
originalData = deepcopy(input_data)
originalY = deepcopy(yb)
originalTest = deepcopy(test_data)


# Step 0

In [22]:
#basic step
# Restart from the untouched copy of the training features so this cell can be re-run.
input_data = deepcopy(originalData)
# countInvalid is asked about the value -999 (presumably it counts occurrences per
# column -- confirm); the indices with at least one occurrence are then handed to
# replaceWithZero together with the same value.
numInvalidValues=countInvalid(input_data, -999)
idxCols = np.where(numInvalidValues>0)[0]
input_data = replaceWithZero(input_data,-999,idxCols)

# Jet division, removing constant columns, standardization

In [23]:
# x0, x1, x2
# Split the rows into three subsets according to the value stored in column 22
# (the jet count, going by the section title): ==0, ==1 and >=2.
idx0 = np.where(input_data[:,22]==0)
idx1 = np.where(input_data[:,22]==1)
idx2 = np.where(input_data[:,22]>=2)

x0 = input_data[idx0] 
x1 = input_data[idx1] 
x2 = input_data[idx2] 

# Labels split with the same row indices as the features.
y0 = yb[idx0]
y1 = yb[idx1]
y2 = yb[idx2]

# Drop zero-variance columns and remember which ones, so the same columns can be
# dropped from the test data later.
# NOTE(review): x2 is not passed through removeConstantColumns -- presumably it has
# no constant columns; confirm.
x0, idx_constants_removed0 = removeConstantColumns(x0)
x1, idx_constants_removed1 = removeConstantColumns(x1)

# standardize returns three values per subset; the 2nd and 3rd (named mean/std here)
# are kept so they can be passed back to standardize for the test data.
x0, mean_train0, std_train0 = standardize ( x0 )
x1, mean_train1, std_train1 = standardize ( x1 )
x2, mean_train2, std_train2 = standardize ( x2 )

# Remove HC columns

This step is performed only if `HC_flag = True`.

In [24]:
# Toggle for the optional removal of highly correlated columns.
HC_flag = False

if(HC_flag):
    threshold = 0.8

    # Each call returns the reduced subset and the list of removed column indices,
    # which must be replayed one by one on the matching test subset.
    x0, idx_HC_removed0 = removeHighCorrelatedColumns(x0, threshold)
    x1, idx_HC_removed1 = removeHighCorrelatedColumns(x1, threshold)
    x2, idx_HC_removed2 = removeHighCorrelatedColumns(x2, threshold)
else:
    # Nothing is removed: define one empty list PER SUBSET so that all three names
    # exist whatever the flag is (previously idx_HC_removed0 was assigned three
    # times and the lists for subsets 1 and 2 were never created).
    idx_HC_removed0 = []
    idx_HC_removed1 = []
    idx_HC_removed2 = []

# Regression....

In [25]:
from CrossValidationFunctions import *
def grid_search_hyperparam_with_CV(y, tx, lambdas, degrees):
    '''Grid search over (lambda, degree) for ridge regression with cross validation.

       For every degree the features are expanded with build_poly, and for every
       lambda the pair is scored with cross_validation_with_ridge using the
       indices returned by build_k_indices(y, 4, 1) (presumably 4 folds and
       seed 1 -- confirm against CrossValidationFunctions).

       Parameters
       ----------
       y       : targets.
       tx      : feature matrix (before polynomial expansion).
       lambdas : 1-D sequence of regularisation strengths to try.
       degrees : 1-D sequence of polynomial degrees to try.

       Returns
       -------
       best_lambda_loss, best_degree_loss : 1-D arrays with the lambda(s) and
           degree(s) reaching the minimum of loss_te (more than one entry when
           several grid points tie).
       best_w_loss : ridge_regression weights refitted on all of (y, tx) with
           the first loss-optimal pair.
       best_lambda_acc, best_degree_acc, best_w_acc : same as above, selecting
           on the maximum accuracy instead.
       loss_tr, loss_te, accuracy : arrays of shape (len(lambdas), len(degrees))
           holding the 2nd, 3rd and 1st value returned by
           cross_validation_with_ridge for every grid point.
    '''
    # Accept plain lists as well as arrays (fancy indexing below needs arrays).
    lambdas = np.asarray(lambdas)
    degrees = np.asarray(degrees)

    loss_tr = np.zeros((len(lambdas), len(degrees)))
    loss_te = np.zeros((len(lambdas), len(degrees)))
    accuracy = np.zeros((len(lambdas), len(degrees)))

    # The split depends only on y, so it is built once instead of once per grid point.
    k_indices = build_k_indices(y, 4, 1)

    # Degree is the outer loop so the polynomial expansion is built once per degree
    # rather than once per (lambda, degree) pair.
    # Assumes cross_validation_with_ridge does not modify its inputs -- TODO confirm.
    for idx_degree, degree in enumerate(degrees):
        x_augmented = build_poly(tx, degree)

        for idx_lambda, lambda_ in enumerate(lambdas):
            #regression with your favourite method
            acc, loss1, loss2 = cross_validation_with_ridge(y, x_augmented, k_indices, lambda_)

            loss_tr[idx_lambda, idx_degree] = loss1
            loss_te[idx_lambda, idx_degree] = loss2
            accuracy[idx_lambda, idx_degree] = acc

    #find the best using the loss
    min_loss_te = np.min(loss_te)
    rows_loss, cols_loss = np.where(loss_te == min_loss_te)
    best_lambda_loss = lambdas[rows_loss]
    best_degree_loss = degrees[cols_loss]

    #recompute best w
    # Use the FIRST optimum, like the accuracy branch below: int() on an array with
    # several tied entries raises, and ridge_regression should get a scalar lambda.
    x_augmented = build_poly(tx, int(best_degree_loss[0]))
    best_w_loss = ridge_regression(y, x_augmented, best_lambda_loss[0])

    #find the best using the accuracy
    max_acc = np.max(accuracy)
    rows_acc, cols_acc = np.where(accuracy == max_acc)
    best_lambda_acc = lambdas[rows_acc]
    best_degree_acc = degrees[cols_acc]

    #recompute best w
    x_augmented = build_poly(tx, int(best_degree_acc[0]))
    best_w_acc = ridge_regression(y, x_augmented, best_lambda_acc[0])

    return best_lambda_loss, best_degree_loss, best_w_loss, best_lambda_acc, best_degree_acc, best_w_acc, loss_tr, loss_te, accuracy


In [26]:
# Candidate regularisation strengths, one grid per subset (currently identical).
lambdas0 = np.linspace(1e-5,0.0005,10) 
lambdas1 = np.linspace(1e-5,0.0005,10) 
lambdas2 = np.linspace(1e-5,0.0005,10) 

# Candidate polynomial degrees: 10 .. 14 (np.arange excludes the upper end).
degrees = np.arange(10,15)


# Run the grid search independently on each of the three subsets. The nine values
# returned by each call are stored in names suffixed with the subset number.
best_lambda_loss0, best_degree_loss0, best_w_loss0, best_lambda_acc0, best_degree_acc0, best_w_acc0, loss_tr0, loss_te0, accuracy0 = \
grid_search_hyperparam_with_CV(y0, x0, lambdas0, degrees)

best_lambda_loss1, best_degree_loss1, best_w_loss1, best_lambda_acc1, best_degree_acc1, best_w_acc1, loss_tr1, loss_te1, accuracy1 = \
grid_search_hyperparam_with_CV(y1, x1, lambdas1, degrees)

best_lambda_loss2, best_degree_loss2, best_w_loss2, best_lambda_acc2, best_degree_acc2, best_w_acc2, loss_tr2, loss_te2, accuracy2 = \
grid_search_hyperparam_with_CV(y2, x2, lambdas2, degrees)

# Report the hyper-parameters selected on the minimum of loss_te ...
print('LOSS')
print(f'Model with 0 jets: lambda = {best_lambda_loss0}, degree = {best_degree_loss0}, loss = {np.min(loss_te0)}')
print(f'Model with 1 jets: lambda = {best_lambda_loss1}, degree = {best_degree_loss1}, loss = {np.min(loss_te1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_loss2}, degree = {best_degree_loss2}, loss = {np.min(loss_te2)}')

# ... and the ones selected on the maximum accuracy.
print('\n\nACCURACY')
print(f'Model with 0 jets: lambda = {best_lambda_acc0}, degree = {best_degree_acc0}, acc = {np.max(accuracy0)}')
print(f'Model with 1 jets: lambda = {best_lambda_acc1}, degree = {best_degree_acc1}, acc = {np.max(accuracy1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_acc2}, degree = {best_degree_acc2}, acc = {np.max(accuracy2)}')


# Number of rows in each subset, used as weights below.
N0 = x0.shape[0]
N1 = x1.shape[0]
N2 = x2.shape[0]

# Size-weighted average of the best accuracy found for each subset.
# NOTE(review): these accuracies come from the cross validation inside the grid
# search, not from the separate test file, despite the wording of the message.
TOTAccuracy = ( N0*np.max(accuracy0) + N1*np.max(accuracy1) + N2*np.max(accuracy2) ) / ( N0 + N1 + N2 )
print(f'\n\nOur test set reached an accuracy of: acc = {TOTAccuracy}')

LOSS
Model with 0 jets: lambda = [0.00033667], degree = [10], loss = 34425736421631.133
Model with 1 jets: lambda = [6.44444444e-05], degree = [10], loss = 41.33309562589343
Model with more than 1 jets: lambda = [1.e-05], degree = [10], loss = 105.3151619605033


ACCURACY
Model with 0 jets: lambda = [1.e-05], degree = [11], acc = 0.8431920089678918
Model with 1 jets: lambda = [0.00039111], degree = [12], acc = 0.8065614360878985
Model with more than 1 jets: lambda = [0.00017333], degree = [13], acc = 0.8325613454645713


Our test set reached an accuracy of: acc = 0.8287453635041816


In [27]:
#retrain only the first set
# Only the subset-0 results are recomputed here, on a grid of smaller lambdas and
# lower degrees. Every *1 and *2 value printed below is left over from the previous cell.

lambdas0 = np.linspace(1e-7,5*1e-5,10) 

# NOTE: this overwrites the `degrees` grid defined in the previous cell (now 9 .. 12).
degrees = np.arange(9,13)


best_lambda_loss0, best_degree_loss0, best_w_loss0, best_lambda_acc0, best_degree_acc0, best_w_acc0, loss_tr0, loss_te0, accuracy0 = \
grid_search_hyperparam_with_CV(y0, x0, lambdas0, degrees)

# Same report as in the previous cell, with refreshed values for subset 0 only.
print('LOSS')
print(f'Model with 0 jets: lambda = {best_lambda_loss0}, degree = {best_degree_loss0}, loss = {np.min(loss_te0)}')
print(f'Model with 1 jets: lambda = {best_lambda_loss1}, degree = {best_degree_loss1}, loss = {np.min(loss_te1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_loss2}, degree = {best_degree_loss2}, loss = {np.min(loss_te2)}')

print('\n\nACCURACY')
print(f'Model with 0 jets: lambda = {best_lambda_acc0}, degree = {best_degree_acc0}, acc = {np.max(accuracy0)}')
print(f'Model with 1 jets: lambda = {best_lambda_acc1}, degree = {best_degree_acc1}, acc = {np.max(accuracy1)}')
print(f'Model with more than 1 jets: lambda = {best_lambda_acc2}, degree = {best_degree_acc2}, acc = {np.max(accuracy2)}')


# Number of rows in each subset, used as weights below.
N0 = x0.shape[0]
N1 = x1.shape[0]
N2 = x2.shape[0]

# Size-weighted average of the best accuracy found for each subset.
TOTAccuracy = ( N0*np.max(accuracy0) + N1*np.max(accuracy1) + N2*np.max(accuracy2) ) / ( N0 + N1 + N2 )
print(f'\n\nOur test set reached an accuracy of: acc = {TOTAccuracy}')

LOSS
Model with 0 jets: lambda = [5.e-05], degree = [9], loss = 29265374737846.613
Model with 1 jets: lambda = [6.44444444e-05], degree = [10], loss = 41.33309562589343
Model with more than 1 jets: lambda = [1.e-05], degree = [10], loss = 105.3151619605033


ACCURACY
Model with 0 jets: lambda = [1.11888889e-05], degree = [11], acc = 0.8431719913523902
Model with 1 jets: lambda = [0.00039111], degree = [12], acc = 0.8065614360878985
Model with more than 1 jets: lambda = [0.00017333], degree = [13], acc = 0.8325613454645713


Our test set reached an accuracy of: acc = 0.8287373634241112


# Submission: import and basic steps

In [28]:
# Restart from the untouched copy of the test features so this cell can be re-run.
test_data = deepcopy(originalTest)
num_tests = test_data.shape[0]

# Same handling of the -999 placeholder as in Step 0 for the training data.
numInvalidValues=countInvalid(test_data, -999)
idxCols = np.where(numInvalidValues>0)[0]
# The cleaned matrix must be stored back in test_data, which is what the following
# cells read. It used to be assigned to input_data, which overwrote the training
# matrix and, unless replaceWithZero works in place, left the -999 values in the
# test data.
test_data = replaceWithZero(test_data,-999,idxCols)



# Jet division, removing constant/HC columns, standardization

In [29]:
# x0, x1, x2
# Split the test rows with the same rule used for the training data (column 22:
# ==0, ==1, >=2). The index tuples are reused later to put the predictions back
# in their original order.
idx0 = np.where(test_data[:,22]==0)
idx1 = np.where(test_data[:,22]==1)
idx2 = np.where(test_data[:,22]>=2)

x0 = test_data[idx0]
x1 = test_data[idx1]
x2 = test_data[idx2]

# Drop the columns that were found constant on the corresponding TRAINING subset.
x0 = np.delete(x0, idx_constants_removed0, axis=1)
x1 = np.delete(x1, idx_constants_removed1, axis=1)

# Standardize, passing the values obtained from the training subsets.
x0,_,_ = standardize ( x0, mean_train0, std_train0 )
x1,_,_ = standardize ( x1, mean_train1, std_train1 )
x2,_,_ = standardize ( x2, mean_train2, std_train2 )

if(HC_flag):
    # Replay the removals one index at a time: each index refers to the column
    # numbering left by the previous deletions.
    # Each subset is updated from itself (the loops for subsets 1 and 2 used to
    # assign their result to x0, clobbering it and leaving x1/x2 unreduced).
    for i in idx_HC_removed0:
        x0 = np.delete(x0,i,axis=1)
    for i in idx_HC_removed1:
        x1 = np.delete(x1,i,axis=1)
    for i in idx_HC_removed2:
        x2 = np.delete(x2,i,axis=1)

# Repeat regression stuff and predict

In [30]:
# Expand each test subset with the degree selected on accuracy for that subset
# (first entry of the corresponding best_degree_acc array).
x0 = build_poly(x0, int(best_degree_acc0[0]))
x1 = build_poly(x1, int(best_degree_acc1[0]))
x2 = build_poly(x2, int(best_degree_acc2[0]))

# Predict with the weights refitted for the accuracy-selected hyper-parameters.
y_pred0 = predict_labels(best_w_acc0,x0)
y_pred1 = predict_labels(best_w_acc1,x1)
y_pred2 = predict_labels(best_w_acc2,x2)

# Reassemble the per-subset predictions in the original row order of the test set,
# using the row indices computed when the test set was split.
y_pred = np.ones(num_tests)
y_pred[idx0] = y_pred0
y_pred[idx1] = y_pred1
y_pred[idx2] = y_pred2



In [31]:
# Hand the test ids and the assembled predictions to create_csv_submission
# (presumably writes them to the given CSV file -- confirm against proj1_helpers).
create_csv_submission(ids_test, y_pred, '04_RR_WithConstantRemoved.csv')