# Comment

This notebook is the prototype for our future regression pipeline.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time 

# To keep things tidy, and to avoid copy-pasting our functions every time we want to use them in more than one folder,
# we temporarily rely on the `sys` module to extend the import search path.
import sys

#in this way Python will search the implementations also in the path '../HelperFunctions'
sys.path.insert(0, '../HelperFunctions')
sys.path.insert(0, '../pre-processing/Clean_Data/')

from proj1_helpers import *
from common_functions import *
from counters import *
from remove import *
from replace import *
from regressors import *
from CrossValidationFunctions import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Data And Basic Preprocessing

In [10]:
from copy import deepcopy

# Load the training and test CSV files through load_csv_data with sub_sample=False.
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)
_, test_data, ids_test = load_csv_data("../data/test.csv", sub_sample=False)

# Temporary convenience: keep deep copies of the freshly loaded arrays so that
# later cells can restart from the original data without reading the CSVs again.
originalData, originalY, originalTest = (
    deepcopy(loaded) for loaded in (input_data, yb, test_data)
)


# Step 0

In [11]:
# Basic step: restart from the saved copy of the training matrix, count the
# -999 entries with countInvalid, and hand every column whose count is
# positive to replaceWithZero.
input_data = deepcopy(originalData)

numInvalidValues = countInvalid(input_data, -999)
idxCols = np.nonzero(numInvalidValues > 0)[0]  # indices where the count is > 0
input_data = replaceWithZero(input_data, -999, idxCols)

# Jet division, removing constant columns, standardization

In [12]:
# Split the training samples into three subsets according to the value stored
# in column 22: exactly 0, exactly 1, or 2 and above.
jet_num = input_data[:, 22]
idx0 = np.where(jet_num == 0)
idx1 = np.where(jet_num == 1)
idx2 = np.where(jet_num >= 2)

# Features and labels of each subset.
x0, y0 = input_data[idx0], yb[idx0]
x1, y1 = input_data[idx1], yb[idx1]
x2, y2 = input_data[idx2], yb[idx2]

# Constant columns are dropped for the first two subsets only; the removed
# indices are kept so the same columns can be dropped from the test data.
x0, idx_constants_removed0 = removeConstantColumns(x0)
x1, idx_constants_removed1 = removeConstantColumns(x1)

# Standardize each subset and keep the returned mean/std for reuse on the test data.
x0, mean_train0, std_train0 = standardize(x0)
x1, mean_train1, std_train1 = standardize(x1)
x2, mean_train2, std_train2 = standardize(x2)

# Remove HC columns

Highly correlated (HC) columns are removed only if `HC_flag = True`.

In [13]:
# Toggle for the removal of highly correlated (HC) columns.
HC_flag = False

if HC_flag:
    # Threshold passed to removeHighCorrelatedColumns for every subset.
    threshold = 0.8

    x0, idx_HC_removed0 = removeHighCorrelatedColumns(x0, threshold)
    x1, idx_HC_removed1 = removeHighCorrelatedColumns(x1, threshold)
    x2, idx_HC_removed2 = removeHighCorrelatedColumns(x2, threshold)
else:
    # Nothing removed: define an empty index list for each of the three
    # subsets (previously only idx_HC_removed0 was assigned, three times).
    idx_HC_removed0 = []
    idx_HC_removed1 = []
    idx_HC_removed2 = []

# Regression: hyperparameter grid search with cross-validation

In [14]:
# NOTE(review): this whole cell is commented out. Within this notebook, the
# best_degree_acc0/1/2 and best_w_acc0/1/2 values read by the prediction cell
# further down are only assigned by the grid_search_hyperparam_with_CV calls
# below, so on a fresh kernel that cell cannot run unless this search is
# re-enabled (or the values are provided some other way) -- TODO confirm.

# lambdas0 = np.linspace(0.0001,0.01,15) #for the first subset the lambda is around 0.001
# lambdas1 = np.linspace(0.00001,0.001,15) #for the second subset the lambda is around 0.0001
# lambdas2 = np.linspace(0.00001,0.001,15) #for the third subset the lambda is around 0.0001

# degrees = np.arange(7,17) #the best degree was high for all the models


# best_lambda_loss0, best_degree_loss0, best_w_loss0, best_lambda_acc0, best_degree_acc0, best_w_acc0, loss_tr0, loss_te0, accuracy0 = \
# grid_search_hyperparam_with_CV(y0, x0, lambdas0, degrees)

# best_lambda_loss1, best_degree_loss1, best_w_loss1, best_lambda_acc1, best_degree_acc1, best_w_acc1, loss_tr1, loss_te1, accuracy1 = \
# grid_search_hyperparam_with_CV(y1, x1, lambdas1, degrees)

# best_lambda_loss2, best_degree_loss2, best_w_loss2, best_lambda_acc2, best_degree_acc2, best_w_acc2, loss_tr2, loss_te2, accuracy2 = \
# grid_search_hyperparam_with_CV(y2, x2, lambdas2, degrees)

# print('LOSS')
# print(f'Model with 0 jets: lambda = {best_lambda_loss0}, degree = {best_degree_loss0}, loss = {np.min(loss_te0)}')
# print(f'Model with 1 jets: lambda = {best_lambda_loss1}, degree = {best_degree_loss1}, loss = {np.min(loss_te1)}')
# print(f'Model with more than 1 jets: lambda = {best_lambda_loss2}, degree = {best_degree_loss2}, loss = {np.min(loss_te2)}')

# print('\n\nACCURACY')
# print(f'Model with 0 jets: lambda = {best_lambda_acc0}, degree = {best_degree_acc0}, acc = {np.max(accuracy0)}')
# print(f'Model with 1 jets: lambda = {best_lambda_acc1}, degree = {best_degree_acc1}, acc = {np.max(accuracy1)}')
# print(f'Model with more than 1 jets: lambda = {best_lambda_acc2}, degree = {best_degree_acc2}, acc = {np.max(accuracy2)}')


# N0 = x0.shape[0]
# N1 = x1.shape[0]
# N2 = x2.shape[0]

# TOTAccuracy = ( N0*np.max(accuracy0) + N1*np.max(accuracy1) + N2*np.max(accuracy2) ) / ( N0 + N1 + N2 )
# print(f'\n\nOur test set reached an accuracy of: acc = {TOTAccuracy}')

# Submission: import and basic steps

In [15]:
# Restart from the saved copy of the test matrix and record how many test
# samples there are (used later to size the prediction vector).
test_data = deepcopy(originalTest)
num_tests = test_data.shape[0]

# Same basic step as for the training data: count the -999 entries and pass
# every column with a positive count to replaceWithZero.
numInvalidValues = countInvalid(test_data, -999)
idxCols = np.where(numInvalidValues > 0)[0]
# Assign the result back to test_data. It used to be assigned to input_data,
# which overwrote the training matrix and left test_data bound to the
# unreplaced object that the following cells slice.
test_data = replaceWithZero(test_data, -999, idxCols)



# Jet division, removing constant/HC columns, standardization

In [16]:
# Split the test samples with the same rule used for training: value of
# column 22 equal to 0, equal to 1, or 2 and above.
jet_num_test = test_data[:, 22]
idx0 = np.where(jet_num_test == 0)
idx1 = np.where(jet_num_test == 1)
idx2 = np.where(jet_num_test >= 2)

x0, x1, x2 = test_data[idx0], test_data[idx1], test_data[idx2]

# Drop the columns that were found constant on the training subsets.
x0 = np.delete(x0, idx_constants_removed0, axis=1)
x1 = np.delete(x1, idx_constants_removed1, axis=1)

# Standardize with the mean/std obtained from the matching training subset.
x0, _, _ = standardizeWithGivenParameters(x0, mean_train0, std_train0)
x1, _, _ = standardizeWithGivenParameters(x1, mean_train1, std_train1)
x2, _, _ = standardizeWithGivenParameters(x2, mean_train2, std_train2)

if HC_flag:
    # Columns are deleted one at a time, in the order stored in each index list.
    for col in idx_HC_removed0:
        x0 = np.delete(x0, col, axis=1)
    for col in idx_HC_removed1:
        x1 = np.delete(x1, col, axis=1)
    for col in idx_HC_removed2:
        x2 = np.delete(x2, col, axis=1)

# Polynomial expansion and prediction on the test set

In [None]:
# Polynomial degree chosen for each subset (first entry of best_degree_acc*).
degree0 = int(best_degree_acc0[0])
degree1 = int(best_degree_acc1[0])
degree2 = int(best_degree_acc2[0])

# Expand each test subset with build_poly using its own degree.
x0 = build_poly(x0, degree0)
x1 = build_poly(x1, degree1)
x2 = build_poly(x2, degree2)

# Predict each subset with the weights selected for it.
y_pred0 = predict_labels(best_w_acc0, x0)
y_pred1 = predict_labels(best_w_acc1, x1)
y_pred2 = predict_labels(best_w_acc2, x2)

# Put the three partial predictions back at their original row positions.
y_pred = np.ones(num_tests)
y_pred[idx0] = y_pred0
y_pred[idx1] = y_pred1
y_pred[idx2] = y_pred2



In [None]:
# Hand the test ids and the assembled predictions to create_csv_submission.
# NOTE(review): 'dummy_name.csv' looks like a placeholder output name -- confirm
# the intended file name before producing a real submission.
create_csv_submission(ids_test, y_pred, 'dummy_name.csv')