# Comment

This is the prototype of our future regressions

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline
import time 

#to keep things in order, and to avoid to copy and paste everytime our functions if we want to use them in more than one folder,
#we can temporarily use this library. 
import sys

#in this way Python will search the implementations also in the path '../HelperFunctions'
sys.path.insert(0, '../HelperFunctions')
sys.path.insert(0, '../pre-processing/Clean_Data/')
sys.path.insert(0, '../Logit')

from proj1_helpers import *
from common_functions import *
from counters import *
from remove import *
from replace import *
from regressors import *
from CrossValidationFunctions import *
from functions_logistic import * 

In [2]:

def predict_logistic_labels(weights, data):
    """Generates class predictions given weights, and a test data matrix"""
    y_pred = np.dot(data, weights)
    y_pred=sigmoid(y_pred)
    y_pred[np.where(y_pred <= 0.5)] = 0
    y_pred[np.where(y_pred > 0.5)] = 1
    
    return y_pred
            
def convert_0_to_minus1(data):
    data[data == 0]= -1
    return data

def convert_minus1_to_0(data):
    data[data == -1]= 0
    return data



In [3]:
def penalized_logistic_regression(y, tx, w, lambda_):
    """return the loss and gradient."""
    #num_samples = y.shape[0]
    loss = calculate_logistic_loss(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    gradient = calculate_logistic_gradient(y, tx, w) + 2 * lambda_ * w
    return loss, gradient

def learning_by_penalized_gradient(y, tx, w, gamma, lambda_):
    """
    Do one step of gradient descent, using the penalized logistic regression.
    Return the loss and updated w.
    """
    loss, gradient = penalized_logistic_regression(y, tx, w, lambda_)
    w -= gamma * gradient
    return loss, w

In [28]:
def logistic_hyperparam_with_CV(y, tx, lambdas, gamma, degrees, max_iter):

    accuracy = np.zeros((len(lambdas), len(degrees)))
    
    for idx_lambda, lambda_ in enumerate(lambdas):
        for idx_degree, degree in enumerate(degrees):
                        
            x_augmented = build_poly(tx, degree)
            initial_w = np.ones((x_augmented.shape[1]))
            
            #regression with logistic method
            k_indices = build_k_indices(y, 4, 1)
            acc = cross_validation_with_logistic(y, x_augmented, k_indices, initial_w, gamma, lambda_, max_iter)        
            accuracy[idx_lambda, idx_degree] = acc
    
    #find the best using the accuracy
    max_acc = np.max(accuracy)
    print('max acc = ',max_acc)
    coordinates_best_parameter = np.where( accuracy == max_acc )
    best_lambda_acc = lambdas[ coordinates_best_parameter[0][0] ]
    best_degree_acc = degrees[ coordinates_best_parameter[0][1] ]

    return best_lambda_acc, best_degree_acc, max_acc

In [5]:
def cross_validation_with_logistic(y, x, k_indices,initial_w, gamma, lambda_,max_iter):
    """CV regression according to the splitting in train/test given by k_indices.
    
    The returned quantities are the average of the quantities computed in the single folds
    
    return the accuracy"""
    
    folds = k_indices.shape[0]
    accuracy = np.zeros(folds)
    w=initial_w
    
    for k in range(folds):
        
        #split the data in train/test
        idx = k_indices[k]
        yte = y[idx]
        if len( x.shape ) == 1:
            xte = x[idx]
        else:
            xte = x[idx,:]
            
        ytr = np.delete(y,idx,0)
        xtr = np.delete(x,idx,0)

        #learning by penalized graient descent (with regularized logistic)
        for iter_ in range(max_iter):
            _, w = learning_by_penalized_gradient(ytr, xtr, w, gamma, lambda_)
            
        #accuracy
        y_pred = predict_logistic_labels(w, xte)
        accuracy[k] = np.sum(y_pred == yte) / len(yte)  
   
    return np.mean(accuracy)



In [6]:
def removeConstantColumns(data):
    '''Remove columns which are constants from the data.
       
       Return data, idx_removed
    '''
    std = np.std(data, axis = 0)
    idx_removed = np.where(std==0)[0]
    if len(idx_removed >0 ):
        data = np.delete(data,idx_removed,axis=1)
    
    return data, idx_removed



# Load Data And Basic Preprocessing

In [7]:
yb, input_data, ids = load_csv_data("../data/train.csv", sub_sample=False)
_, test_data, ids_test = load_csv_data("../data/test.csv", sub_sample=False)

#this will surely be deleted, in this way we are sure that original_data is the original version of the data and we don't have
#to load them again
from copy import deepcopy
originalData = deepcopy(input_data)
originalY = deepcopy(yb)
originalTest = deepcopy(test_data)


# Step 0

In [8]:
#basic step
input_data = deepcopy(originalData)
numInvalidValues=countInvalid(input_data, -999)
idxCols = np.where(numInvalidValues>0)[0]
input_data = replaceWithZero(input_data,-999,idxCols)

# Jet division, removing constant columns, standardization

In [9]:
# x0, x1, x2
idx0 = np.where(input_data[:,22]==0)
idx1 = np.where(input_data[:,22]==1)
idx2 = np.where(input_data[:,22]>=2)

x0 = input_data[idx0] 
x1 = input_data[idx1] 
x2 = input_data[idx2] 

y0 = yb[idx0]
y1 = yb[idx1]
y2 = yb[idx2]

x0, idx_constants_removed0 = removeConstantColumns(x0)
x1, idx_constants_removed1 = removeConstantColumns(x1)

x0, mean_train0, std_train0 = standardize ( x0 )
x1, mean_train1, std_train1 = standardize ( x1 )
x2, mean_train2, std_train2 = standardize ( x2 )

In [10]:
y0 = convert_minus1_to_0(y0)
y1 = convert_minus1_to_0(y1)
y2 = convert_minus1_to_0(y2)

# Remove HC columns

Only if HC_flag = True

In [11]:
HC_flag = False

if(HC_flag):
    threshold = 0.8

    x0, idx_HC_removed0 = removeHighCorrelatedColumns(x0, threshold)
    x1, idx_HC_removed1 = removeHighCorrelatedColumns(x1, threshold)
    x2, idx_HC_removed2 = removeHighCorrelatedColumns(x2, threshold)
else:
    idx_HC_removed0 = []
    idx_HC_removed0 = []
    idx_HC_removed0 = []

# Regression....

In [30]:
degrees=np.linspace(1,15,15)
degrees=np.asarray(degrees)
degrees=[1,2]
lambdas=np.logspace(-5,-2,2)
max_iter=10
gamma=1e-5

best_lambda_acc_0, best_degree_acc_0,max_acc_0 = logistic_hyperparam_with_CV(y0, x0, lambdas, gamma, degrees, max_iter)
best_lambda_acc_1, best_degree_acc_1,max_acc_1 = logistic_hyperparam_with_CV(y1, x1, lambdas, gamma, degrees, max_iter)
best_lambda_acc_2, best_degree_acc_2,max_acc_2 = logistic_hyperparam_with_CV(y2, x2, lambdas, gamma, degrees, max_iter)
                     
print('Accuracy jet 0 =', max_acc_0)
print('Accuracy jet 1 =', max_acc_1)
print('Accuracy jet >1 =', max_acc_2)

[0.25284301 0.48845637 0.99826244 ... 0.05036729 0.14712255 0.7379369 ]
logistic loss:  206144.0091834263
[0.17870741 0.54313832 0.99489064 ... 0.06757909 0.24649983 0.72179532]
logistic loss:  179962.67376416817
[0.12759058 0.58942464 0.98575404 ... 0.09051163 0.3706107  0.70565963]
logistic loss:  155165.29019331522
[0.09361042 0.62632905 0.96293375 ... 0.12032586 0.50006842 0.68932222]
logistic loss:  132044.1101737867
[0.07155176 0.65350667 0.91227951 ... 0.15763076 0.61433239 0.67246207]
logistic loss:  110958.07197658595
[0.05761697 0.67091502 0.81782346 ... 0.20174849 0.70233184 0.65461882]
logistic loss:  92309.8779457699
[0.04932898 0.67851069 0.67921184 ... 0.24990438 0.76317086 0.63511304]
logistic loss:  76511.45860714294
[0.04518304 0.67614856 0.52527021 ... 0.2969044  0.80101514 0.61303771]
logistic loss:  63902.65321680614
[0.04430203 0.66376458 0.39317211 ... 0.33601776 0.82081003 0.58748155]
logistic loss:  54547.59357035903
[0.04596679 0.64269883 0.29780071 ... 0.3616

[0.28172236 0.36831955 0.02552052 ... 0.95801366 0.16022419 0.03131687]
logistic loss:  29653.950167511648
[0.28210091 0.37005596 0.0252953  ... 0.95654969 0.15934216 0.03189251]
logistic loss:  29565.617299211466
[0.28317487 0.37218173 0.02519842 ... 0.95547213 0.15882768 0.03247863]
logistic loss:  29486.632187582953
[0.28359028 0.37371964 0.02507555 ... 0.95427381 0.15813669 0.03296086]
logistic loss:  29414.029764332514
[0.25284301 0.48845637 0.99826244 ... 0.05036729 0.14712255 0.7379369 ]
logistic loss:  206144.0091834263
[0.17870744 0.54313832 0.99489063 ... 0.06757913 0.24649989 0.72179527]
logistic loss:  179962.63692648825
[0.12759064 0.58942463 0.985754   ... 0.09051172 0.37061083 0.70565955]
logistic loss:  155165.2221761832
[0.0936105  0.62632902 0.96293364 ... 0.12032602 0.50006857 0.68932209]
logistic loss:  132044.0188158928
[0.07155186 0.6535066  0.91227922 ... 0.157631   0.61433251 0.6724619 ]
logistic loss:  110957.96471019152
[0.05761708 0.6709149  0.81782288 ... 0.

[0.27882116 0.36031722 0.02717005 ... 0.96404947 0.1661019  0.02819272]
logistic loss:  30103.986105248572
[0.27913234 0.36194494 0.02654688 ... 0.96246677 0.16394317 0.0290567 ]
logistic loss:  29974.799283528395
[0.28051383 0.36438328 0.02615663 ... 0.96114476 0.16252954 0.0299388 ]
logistic loss:  29858.89382353335
[0.28038004 0.36582704 0.02571343 ... 0.95928231 0.16094682 0.03060238]
logistic loss:  29750.236316940798
[0.28172306 0.36831953 0.02552087 ... 0.95801319 0.16022417 0.03131731]
logistic loss:  29653.92719963035
[0.28210156 0.37005586 0.02529564 ... 0.9565492  0.15934212 0.03189295]
logistic loss:  29565.595450610264
[0.28317556 0.37218163 0.02519877 ... 0.95547165 0.15882765 0.03247908]
logistic loss:  29486.61145478112
[0.28359094 0.37371948 0.0250759  ... 0.95427331 0.15813665 0.03296131]
logistic loss:  29414.01002555431
max acc =  0.8159180078469053
[0.9000567  0.00468867 0.99997016 ... 0.99941083 0.66834536 0.03470827]
logistic loss:  148274.98834617162
[0.90017067

[0.03671667 0.50791503 0.99885446 ... 0.25950361 0.72132452 0.59066407]
logistic loss:  38763.069462354455
[0.28491938 0.00336545 0.28019353 ... 0.00873049 0.73809802 0.22332773]
logistic loss:  32403.26225486893
[0.42508003 0.03958578 0.55647533 ... 0.04540578 0.81877048 0.31512743]
logistic loss:  38064.13973801965
[0.27106383 0.00315554 0.31486007 ... 0.00795384 0.72546372 0.22766045]
logistic loss:  31843.2471389267
[0.41298762 0.03927964 0.59282054 ... 0.04101681 0.81023581 0.32162583]
logistic loss:  37205.806246434586
[0.26498338 0.00343832 0.35070598 ... 0.00747544 0.71634473 0.23427373]
logistic loss:  31650.196250782697
[0.41047408 0.04650291 0.64067998 ... 0.0422768  0.80593282 0.33280026]
logistic loss:  37561.607068818616
[0.25338657 0.00328105 0.38059352 ... 0.00707208 0.7024182  0.23712524]
logistic loss:  31155.502716051255
[0.39778264 0.0439902  0.65715499 ... 0.03707397 0.79471869 0.3352779 ]
logistic loss:  36187.21578635444
[0.25360519 0.00390983 0.41225022 ... 0.00

[0.00237172 0.20349071 0.82776675 ... 0.14661313 0.36613702 0.49944095]
logistic loss:  34192.775482804085
[0.02908367 0.45032293 0.99888689 ... 0.2558937  0.67793908 0.60910026]
logistic loss:  39291.68398267064
[0.0029024  0.24226595 0.81953664 ... 0.14607431 0.41210187 0.47640134]
logistic loss:  33439.41773674664
[0.03671748 0.50791817 0.99885446 ... 0.2595051  0.72132589 0.59066333]
logistic loss:  38763.05498540648
[0.28491853 0.00336547 0.28019488 ... 0.0087306  0.7380958  0.22332864]
logistic loss:  32403.221137328022
[0.42508012 0.03958661 0.55647823 ... 0.04540669 0.81876921 0.31512921]
logistic loss:  38064.132672686115
[0.27106303 0.00315556 0.31486129 ... 0.00795395 0.72546131 0.22766138]
logistic loss:  31843.21188360111
[0.41298778 0.03928044 0.59282314 ... 0.04101769 0.81023442 0.32162766]
logistic loss:  37205.802758813246
[0.26498263 0.00343834 0.35070702 ... 0.00747555 0.71634217 0.2342747 ]
logistic loss:  31650.168482274115
[0.41047435 0.04650386 0.64068242 ... 0.0

[6.43503413e-01 9.75064666e-01 6.83337729e-01 ... 1.43150387e-01
 1.41673424e-83 6.79205097e-03]
logistic loss:  35656.61961499447
[6.27000379e-01 9.73889162e-01 6.75327520e-01 ... 1.45663406e-01
 3.28868035e-84 7.60695630e-03]
logistic loss:  34599.51700264745
[6.05288157e-01 9.71863746e-01 6.62677505e-01 ... 1.44033396e-01
 4.02902560e-85 6.98459337e-03]
logistic loss:  33656.44658827994
[5.94637080e-01 9.71056460e-01 6.58819110e-01 ... 1.49267197e-01
 1.32399877e-85 7.80995478e-03]
logistic loss:  32822.35761296524
[5.74971313e-01 9.68787999e-01 6.47339991e-01 ... 1.48084727e-01
 2.17721375e-86 7.03648567e-03]
logistic loss:  32074.10145263654
[5.67845110e-01 9.68047346e-01 6.45659933e-01 ... 1.54718239e-01
 1.10063451e-86 8.10384866e-03]
logistic loss:  31412.68775797238
[5.47777166e-01 9.65123880e-01 6.33088543e-01 ... 1.52153569e-01
 2.04100285e-87 6.95530714e-03]
logistic loss:  30814.55940289969
[5.45873718e-01 9.64784940e-01 6.34813143e-01 ... 1.61313108e-01
 1.75730154e-87 8.

[6.79331668e-01 7.58062139e-01 7.23193756e-01 ... 3.79946013e-79
 5.30040829e-03 8.14503193e-01]
logistic loss:  45285.95303635285
[6.72679060e-01 7.37543594e-01 7.16933132e-01 ... 3.32258036e-80
 5.40466834e-03 8.22022660e-01]
logistic loss:  43243.48955292275
[6.65618127e-01 7.18055720e-01 7.10951732e-01 ... 4.43559896e-81
 5.68486583e-03 8.29026892e-01]
logistic loss:  41402.550393581216
[6.56414377e-01 6.98214374e-01 7.03948492e-01 ... 7.20446097e-82
 5.89900851e-03 8.34538429e-01]
logistic loss:  39741.636622262406
[6.47138424e-01 6.79516684e-01 6.97320489e-01 ... 1.60616063e-82
 6.23001612e-03 8.39572690e-01]
logistic loss:  38249.32232152608
[6.36443995e-01 6.60839152e-01 6.90065544e-01 ... 4.18353676e-83
 6.47248347e-03 8.43431071e-01]
logistic loss:  36910.210603299856
[6.43502154e-01 9.75064277e-01 6.83336727e-01 ... 1.43151200e-01
 1.41766639e-83 6.79217075e-03]
logistic loss:  35656.50594497529
[6.26999108e-01 9.73888736e-01 6.75326468e-01 ... 1.45664260e-01
 3.29097256e-84

In [None]:
# lambdas0 = np.linspace(0.0001,0.01,15) #for the first subset the lambda is around 0.001
# lambdas1 = np.linspace(0.00001,0.001,15) #for the second subset the lambda is around 0.0001
# lambdas2 = np.linspace(0.00001,0.001,15) #for the third subset the lambda is around 0.0001

# degrees = np.arange(7,17) #the best degree was high for all the models


# best_lambda_loss0, best_degree_loss0, best_w_loss0, best_lambda_acc0, best_degree_acc0, best_w_acc0, loss_tr0, loss_te0, accuracy0 = \
# grid_search_hyperparam_with_CV(y0, x0, lambdas0, degrees)

# best_lambda_loss1, best_degree_loss1, best_w_loss1, best_lambda_acc1, best_degree_acc1, best_w_acc1, loss_tr1, loss_te1, accuracy1 = \
# grid_search_hyperparam_with_CV(y1, x1, lambdas1, degrees)

# best_lambda_loss2, best_degree_loss2, best_w_loss2, best_lambda_acc2, best_degree_acc2, best_w_acc2, loss_tr2, loss_te2, accuracy2 = \
# grid_search_hyperparam_with_CV(y2, x2, lambdas2, degrees)

# print('LOSS')
# print(f'Model with 0 jets: lambda = {best_lambda_loss0}, degree = {best_degree_loss0}, loss = {np.min(loss_te0)}')
# print(f'Model with 1 jets: lambda = {best_lambda_loss1}, degree = {best_degree_loss1}, loss = {np.min(loss_te1)}')
# print(f'Model with more than 1 jets: lambda = {best_lambda_loss2}, degree = {best_degree_loss2}, loss = {np.min(loss_te2)}')

# print('\n\nACCURACY')
# print(f'Model with 0 jets: lambda = {best_lambda_acc0}, degree = {best_degree_acc0}, acc = {np.max(accuracy0)}')
# print(f'Model with 1 jets: lambda = {best_lambda_acc1}, degree = {best_degree_acc1}, acc = {np.max(accuracy1)}')
# print(f'Model with more than 1 jets: lambda = {best_lambda_acc2}, degree = {best_degree_acc2}, acc = {np.max(accuracy2)}')


# N0 = x0.shape[0]
# N1 = x1.shape[0]
# N2 = x2.shape[0]

# TOTAccuracy = ( N0*np.max(accuracy0) + N1*np.max(accuracy1) + N2*np.max(accuracy2) ) / ( N0 + N1 + N2 )
# print(f'\n\nOur test set reached an accuracy of: acc = {TOTAccuracy}')

# Submission: import and basic steps

In [None]:
test_data = deepcopy(originalTest)
num_tests = test_data.shape[0]

numInvalidValues=countInvalid(test_data, -999)
idxCols = np.where(numInvalidValues>0)[0]
input_data = replaceWithZero(test_data,-999,idxCols)



# Jet division, removing constant/HC columns, standardization

In [None]:
# x0, x1, x2
idx0 = np.where(test_data[:,22]==0)
idx1 = np.where(test_data[:,22]==1)
idx2 = np.where(test_data[:,22]>=2)

x0 = test_data[idx0] 
x1 = test_data[idx1] 
x2 = test_data[idx2] 

x0 = np.delete(x0, idx_constants_removed0, axis=1)
x1 = np.delete(x1, idx_constants_removed1, axis=1)

x0,_,_ = standardizeWithGivenParameters ( x0, mean_train0, std_train0 )
x1,_,_ = standardizeWithGivenParameters ( x1, mean_train1, std_train1 )
x2,_,_ = standardizeWithGivenParameters ( x2, mean_train2, std_train2 )

if(HC_flag):
    for i in idx_HC_removed0:
        x0 = np.delete(x0,i,axis=1)
    for i in idx_HC_removed1:
        x1 = np.delete(x1,i,axis=1)
    for i in idx_HC_removed2:
        x2 = np.delete(x2,i,axis=1)

# Repeat regression stuff and predict

In [None]:
x0 = build_poly(x0, int(best_degree_acc0[0]))
x1 = build_poly(x1, int(best_degree_acc1[0]))
x2 = build_poly(x2, int(best_degree_acc2[0]))

y_pred0 = predict_labels(best_w_acc0,x0)
y_pred1 = predict_labels(best_w_acc1,x1)
y_pred2 = predict_labels(best_w_acc2,x2)

y_pred = np.ones(num_tests)
y_pred[idx0] = y_pred0
y_pred[idx1] = y_pred1
y_pred[idx2] = y_pred2



In [None]:
create_csv_submission(ids_test, y_pred, 'dummy_name.csv')