# Project 1 - Functions tests

In [None]:
from preprocessing_functions import *
%matplotlib inline 
import numpy as np   # generic stuff
import matplotlib.pyplot as plt

#### REMOVE THIS LINE BEFORE SUBMISSION
import pandas as pd
#######################################################################

from lib.helpers import * #the helper provided for the project
# choose which implementations you would like
from lib.implementations import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
DATA_FOLDER = 'data/'

# Load the labelled file once and keep it untouched, so the split below is
# derived from the same source arrays every time the cell is run.
y_all, tx_all, ids_all = load_csv_data(DATA_FOLDER+'train.csv',sub_sample=False)

# Number of leading rows used for fitting: three quarters of the labelled rows.
size_train = int(3*len(y_all)/4)

# Training part: the first `size_train` rows.
# (The previous version assigned these rows to the *_test variables and trained
# on the remaining quarter, i.e. the split was inverted relative to `size_train`.)
y_train = y_all[:size_train]
tx_train = tx_all[:size_train,:]
ids_train = ids_all[:size_train]

# Held-out part: every row after the training rows; labels are known, so the
# accuracy cells below can score predictions against `y_test`.
y_test = y_all[size_train:]
tx_test = tx_all[size_train:,:]
ids_test = ids_all[size_train:]

# To predict on the real test file instead of the held-out part, load it here:
#y_test, tx_test, ids_test = load_csv_data(DATA_FOLDER+'test.csv',sub_sample=False)

In [None]:
def build_k_indices(y, k_fold, seed):
    """Split the row indices of `y` into `k_fold` equally sized random folds.

    The global NumPy random state is reseeded with `seed`, the indices
    0..len(y)-1 are shuffled, and the first `k_fold * fold_size` of them are
    laid out as a 2-D array. Rows left over when len(y) is not a multiple of
    `k_fold` do not appear in any fold.

    Returns:
        Integer array of shape (k_fold, fold_size); row k holds the indices
        of fold k.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    # Reseeding the global generator makes the shuffle reproducible per seed.
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    # Consecutive chunks of the shuffled indices become the rows of the result.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

In [None]:
def cross_validation_one_fold_LS(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                 degrees, len_kept_data, stdize=False):
    """Score least squares on one train/test fold for every degree in `degrees`.

    The feature matrices are extended incrementally: for each degree only the
    powers not yet added (previous degree + 1 .. current degree) of the first
    `len_kept_data` columns are passed to `add_powers`. `degrees` is therefore
    expected to be increasing and to start above 1.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the two parts of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices.
        degrees: polynomial degrees to evaluate, in the order given.
        len_kept_data: number of leading columns whose powers are added.
        stdize: when True, the columns added in the current step are replaced
            by the first output of `standardize`, separately for train and test.

    Returns:
        (train_accuracies, test_accuracies), each of shape (len(degrees),).
    """
    def matching_fraction(predicted, expected):
        # Share of entries where the predicted label equals the true label.
        return np.sum(predicted == expected)/len(expected)

    n_degrees = len(degrees)
    train_scores = np.zeros(n_degrees)
    test_scores = np.zeros(n_degrees)
    base_columns = range(len_kept_data)

    reached_deg = 1
    for position, target_deg in enumerate(degrees):
        print('++ Degree', target_deg, '++')

        # Column count before extension; the slice below relies on add_powers
        # placing the new columns after the existing ones (presumably appended).
        first_new_col = tx_cross_val_train.shape[1]
        missing_powers = range(reached_deg + 1, target_deg + 1)
        tx_cross_val_train = add_powers(tx_cross_val_train, missing_powers, base_columns)
        tx_cross_val_test = add_powers(tx_cross_val_test, missing_powers, base_columns)
        if stdize:
            tx_cross_val_train[:, first_new_col:] = standardize(tx_cross_val_train[:, first_new_col:])[0]
            tx_cross_val_test[:, first_new_col:] = standardize(tx_cross_val_test[:, first_new_col:])[0]

        # Fit on the training part only; the returned loss is not used.
        weights, _ = least_squares(y_cross_val_train, tx_cross_val_train, 'mse')

        fitted_labels = predict_labels(weights, tx_cross_val_train)
        held_out_labels = predict_labels(weights, tx_cross_val_test)

        train_scores[position] = matching_fraction(fitted_labels, y_cross_val_train)
        test_scores[position] = matching_fraction(held_out_labels, y_cross_val_test)

        # The matrices now contain every power up to target_deg.
        reached_deg = target_deg

    return train_scores, test_scores

In [None]:
def cross_validation_least_squares(y_single_jet_train, tx_single_jet_train, degrees, k_fold, seed):
    """Choose the polynomial degree for least squares by k-fold cross-validation.

    The features are preprocessed once with mode 'none', then every fold
    produced by `build_k_indices` is held out in turn and scored for each
    degree with `cross_validation_one_fold_LS` (without standardization).
    The degree with the highest mean held-out accuracy wins.

    Returns:
        (best_deg, best_accuracy_test, corresponding_accuracy_train) where the
        two accuracies are means over the folds for the winning degree.
    """
    fold_rows = build_k_indices(y_single_jet_train, k_fold, seed)
    n_samples = len(y_single_jet_train)

    # One column of scores per fold, one row per degree.
    train_scores = np.zeros([len(degrees), k_fold])
    test_scores = np.zeros([len(degrees), k_fold])

    tx_ready, len_kept_data, _ = preprocess_data(tx_single_jet_train, [], 'none')

    for fold in range(k_fold):
        print('--- Fold', fold, '---')
        held_out = fold_rows[fold]

        # Boolean selector that is True for every row outside the held-out fold.
        in_training = np.ones(n_samples, dtype=bool)
        in_training[held_out] = False

        fold_scores = cross_validation_one_fold_LS(
            y_single_jet_train[in_training], y_single_jet_train[held_out],
            tx_ready[in_training, :], tx_ready[held_out, :],
            degrees, len_kept_data, False)
        train_scores[:, fold], test_scores[:, fold] = fold_scores

    # Average each degree's scores over the folds.
    mean_train = train_scores.mean(axis=1)
    mean_test = test_scores.mean(axis=1)

    # Position of the first maximum of the mean held-out accuracy.
    winner = mean_test.argmax()
    best_deg = degrees[winner]
    best_accuracy_test = mean_test[winner]
    corresponding_accuracy_train = mean_train[winner]

    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_accuracy_test, corresponding_accuracy_train

In [None]:
# Search settings for the least-squares run below.
degrees = range(6,11)  # candidate polynomial degrees 6..10
k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold shuffle

In [None]:
# One row selector per jet group for each data part; the loop below indexes
# them by jet_id (presumably boolean or index masks - see split_jets_mask).
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)

# Output buffers, filled jet by jet in the loop below.
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
best_degrees = np.zeros(len_mask)

In [None]:
# Least squares, one independent model per jet group: select the degree by
# cross-validation, refit on all training rows of the group, then predict.
for jet_id in range(len_mask):
    print('***** Jet ', jet_id, '*****')
    # Keep only the rows selected by this jet's masks
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # Cross-validate on this jet's training rows to pick the degree
    best_deg, best_accuracy, corresponding_accuracy_train = cross_validation_least_squares(y_single_jet_train, \
                                                                                           tx_single_jet_train, \
                                                                                           degrees, k_fold, seed)
    
    # Remember the selected degree for the final report
    best_degrees[jet_id] = best_deg
    
    # Preprocess train, then test with the `unique_cols` returned for train
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'none')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'none')[0]
    
    # Add powers 2..best_deg of the first `len_kept_data` columns
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    
    # Fit on all training rows of this jet
    weights, loss = least_squares(y_single_jet_train, tx_single_jet_train_preprocessed, 'mse')
    
    # Predict labels for both parts with the fitted weights
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # Write this jet's predictions back into the full-size buffers
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # Fraction of this jet's training rows predicted correctly
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # Report it
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# Optional: write the test predictions to a submission file (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# Percentage of training rows, over all jets, whose predicted label matches y_train
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees)

In [None]:
# Percentage of test rows whose predicted label matches y_test
# (only meaningful while y_test holds known labels, as with the split above).
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees)

## Gradient descent 

In [None]:
def cross_validation_one_fold_GD(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                 degrees, gammas, len_kept_data, max_iters, stdize=False):
    """Score gradient descent on one fold for every (degree, gamma) pair.

    For each degree the feature matrices are extended with the powers not yet
    present (previous degree + 1 .. current degree) of the first
    `len_kept_data` columns, so `degrees` is expected to be increasing and to
    start above 1. For each gamma a model is then trained from zero weights
    with `least_squares_GD` for `max_iters` iterations.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the two parts of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices.
        degrees: polynomial degrees to evaluate, in the order given.
        gammas: step sizes to evaluate.
        len_kept_data: number of leading columns whose powers are added.
        max_iters: iteration count forwarded to `least_squares_GD`.
        stdize: when True, the columns added in the current step are replaced
            by the first output of `standardize`, separately for train and test.

    Returns:
        (train_accuracies, test_accuracies), each of shape
        (len(degrees), len(gammas)).
    """
    grid_shape = [len(degrees), len(gammas)]
    train_scores = np.zeros(grid_shape)
    test_scores = np.zeros(grid_shape)

    n_train = len(y_cross_val_train)
    n_test = len(y_cross_val_test)

    reached_deg = 1
    for row, target_deg in enumerate(degrees):
        print('++ Degree', target_deg, '++')

        # Column count before extension; the slice below relies on add_powers
        # placing the new columns after the existing ones (presumably appended).
        first_new_col = tx_cross_val_train.shape[1]
        missing_powers = range(reached_deg + 1, target_deg + 1)
        tx_cross_val_train = add_powers(tx_cross_val_train, missing_powers, range(len_kept_data))
        tx_cross_val_test = add_powers(tx_cross_val_test, missing_powers, range(len_kept_data))
        if stdize:
            tx_cross_val_train[:, first_new_col:] = standardize(tx_cross_val_train[:, first_new_col:])[0]
            tx_cross_val_test[:, first_new_col:] = standardize(tx_cross_val_test[:, first_new_col:])[0]

        for col, step_size in enumerate(gammas):
            print('>> Gamma', step_size, '<<')

            # Fresh zero start for every run so runs do not share a weight vector.
            start_w = np.zeros(tx_cross_val_train.shape[1])
            weights, _ = least_squares_GD(y_cross_val_train, tx_cross_val_train, start_w, max_iters, \
                                          step_size, fct='mse')

            fitted_labels = predict_labels(weights, tx_cross_val_train)
            held_out_labels = predict_labels(weights, tx_cross_val_test)

            # Fraction of labels predicted correctly on each part.
            train_scores[row, col] = np.sum(fitted_labels == y_cross_val_train)/n_train
            test_scores[row, col] = np.sum(held_out_labels == y_cross_val_test)/n_test

        # The matrices now contain every power up to target_deg.
        reached_deg = target_deg

    return train_scores, test_scores

In [None]:
def cross_validation_GD(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters):
    """Choose degree and step size for gradient descent by k-fold cross-validation.

    The features are preprocessed once with mode 'after', then every fold
    produced by `build_k_indices` is held out in turn and scored for each
    (degree, gamma) pair with `cross_validation_one_fold_GD`, with
    standardization of the added power columns enabled. The pair with the
    highest mean held-out accuracy wins.

    Returns:
        (best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train)
        where the two accuracies are means over the folds for the winning pair.
    """
    fold_rows = build_k_indices(y_single_jet_train, k_fold, seed)
    n_samples = len(y_single_jet_train)

    # Scores indexed by [degree, gamma, fold].
    train_scores = np.zeros([len(degrees), len(gammas), k_fold])
    test_scores = np.zeros([len(degrees), len(gammas), k_fold])

    tx_ready, len_kept_data, _ = preprocess_data(tx_single_jet_train, [], 'after')

    for fold in range(k_fold):
        print('--- Fold', fold, '---')
        held_out = fold_rows[fold]

        # Boolean selector that is True for every row outside the held-out fold.
        in_training = np.ones(n_samples, dtype=bool)
        in_training[held_out] = False

        fold_scores = cross_validation_one_fold_GD(
            y_single_jet_train[in_training], y_single_jet_train[held_out],
            tx_ready[in_training, :], tx_ready[held_out, :],
            degrees, gammas, len_kept_data, max_iters, True)
        train_scores[:, :, fold], test_scores[:, :, fold] = fold_scores

    # Average over the fold axis.
    mean_train = train_scores.mean(axis=2)
    mean_test = test_scores.mean(axis=2)

    # argmax works on the flattened (row-major) grid; split the flat position
    # back into its (degree, gamma) coordinates.
    deg_pos, gamma_pos = divmod(mean_test.argmax(), mean_test.shape[1])

    best_deg = degrees[deg_pos]
    best_gamma = gammas[gamma_pos]
    best_accuracy_test = mean_test[deg_pos, gamma_pos]
    corresponding_accuracy_train = mean_train[deg_pos, gamma_pos]

    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg, 'and gamma =', best_gamma)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train

In [None]:
# Search settings for the gradient-descent run below.
degrees = range(6,11)  # candidate polynomial degrees 6..10
gammas = np.logspace(-5,-1,5)  # candidate step sizes: 1e-5, 1e-4, ..., 1e-1
k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold shuffle
max_iters = 300  # iteration count passed to least_squares_GD

In [None]:
# One row selector per jet group for each data part; the loop below indexes
# them by jet_id (presumably boolean or index masks - see split_jets_mask).
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)

# Output buffers, reset here and filled jet by jet in the loop below.
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
best_degrees = np.zeros(len_mask)
best_gammas = np.zeros(len_mask)

In [None]:
# Gradient descent, one independent model per jet group: select degree and
# step size by cross-validation, refit on all training rows, then predict.
for jet_id in range(len_mask):
    print('***** Jet ', jet_id, '*****')
    # Keep only the rows selected by this jet's masks
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # Cross-validate on this jet's training rows to pick degree and gamma
    best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train = \
        cross_validation_GD(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters)
    
    # Remember the selected hyperparameters for the final report
    best_degrees[jet_id] = best_deg
    best_gammas[jet_id] = best_gamma
    
    # Preprocess train, then test with the `unique_cols` returned for train
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'after')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'after')[0]
    
    # Add powers 2..best_deg of the first `len_kept_data` columns, then
    # standardize only the columns from index `len_data` on (the added ones,
    # assuming add_powers appends them).
    # NOTE(review): train and test go through separate standardize calls, so the
    # test columns are not scaled with the training call's outputs - confirm intended.
    len_data = tx_single_jet_train_preprocessed.shape[1]
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    tx_single_jet_train_preprocessed[:,len_data:] = standardize(tx_single_jet_train_preprocessed[:,len_data:])[0]
    tx_single_jet_test_preprocessed[:,len_data:] = standardize(tx_single_jet_test_preprocessed[:,len_data:])[0]
    
    # Fit on all training rows of this jet, starting from zero weights
    initial_w = np.zeros(tx_single_jet_train_preprocessed.shape[1])
    weights, loss = least_squares_GD(y_single_jet_train, tx_single_jet_train_preprocessed, initial_w, max_iters, \
                                             best_gamma, fct='mse');
    
    # Predict labels for both parts with the fitted weights
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # Write this jet's predictions back into the full-size buffers
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # Fraction of this jet's training rows predicted correctly
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # Report it
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# Optional: write the test predictions to a submission file (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# Percentage of training rows, over all jets, whose predicted label matches y_train
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees, 'and gammas =', best_gammas)

In [None]:
# Percentage of test rows whose predicted label matches y_test
# (only meaningful while y_test holds known labels, as with the split above).
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
# Report both selected hyperparameters, as the train report of this section does.
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees, 'and gammas =', best_gammas)

In [None]:
# Sanity check: number of test rows predicted as -1
np.sum(y_predicted_test==-1)

In [None]:
# Sanity check: number of test rows predicted as 1
np.sum(y_predicted_test==1)

In [None]:
# Eyeball the first 200 test predictions
y_predicted_test[:200]

## Ridge regression   

In [None]:
def cross_validation_one_fold_ridge(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                    degrees, lambdas, len_kept_data, stdize=False):
    """Score ridge regression on one fold for every (degree, lambda) pair.

    For each degree the feature matrices are extended with the powers not yet
    present (previous degree + 1 .. current degree) of the first
    `len_kept_data` columns, so `degrees` is expected to be increasing and to
    start above 1. For each lambda, `ridge_regression` is then fitted on the
    training part and evaluated on both parts.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the two parts of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices.
        degrees: polynomial degrees to evaluate, in the order given.
        lambdas: regularization strengths to evaluate.
        len_kept_data: number of leading columns whose powers are added.
        stdize: when True, the columns added in the current step are replaced
            by the first output of `standardize`, separately for train and test.

    Returns:
        (train_accuracies, test_accuracies), each of shape
        (len(degrees), len(lambdas)).
    """
    def matching_fraction(predicted, expected):
        # Share of entries where the predicted label equals the true label.
        return np.sum(predicted == expected)/len(expected)

    train_scores = np.zeros([len(degrees), len(lambdas)])
    test_scores = np.zeros([len(degrees), len(lambdas)])

    highest_power = 1
    for deg_pos, deg in enumerate(degrees):
        print('++ Degree', deg, '++')

        # Column count before extension; the slice below relies on add_powers
        # placing the new columns after the existing ones (presumably appended).
        width_before = tx_cross_val_train.shape[1]
        extra_powers = range(highest_power + 1, deg + 1)
        tx_cross_val_train = add_powers(tx_cross_val_train, extra_powers, range(len_kept_data))
        tx_cross_val_test = add_powers(tx_cross_val_test, extra_powers, range(len_kept_data))
        if stdize:
            tx_cross_val_train[:, width_before:] = standardize(tx_cross_val_train[:, width_before:])[0]
            tx_cross_val_test[:, width_before:] = standardize(tx_cross_val_test[:, width_before:])[0]

        for lam_pos, lam in enumerate(lambdas):
            print('>> Lambda', lam, '<<')

            # Fit on the training part only; the returned loss is not used.
            weights, _ = ridge_regression(y_cross_val_train, tx_cross_val_train, lam, 'mse')

            fitted_labels = predict_labels(weights, tx_cross_val_train)
            held_out_labels = predict_labels(weights, tx_cross_val_test)

            train_scores[deg_pos, lam_pos] = matching_fraction(fitted_labels, y_cross_val_train)
            test_scores[deg_pos, lam_pos] = matching_fraction(held_out_labels, y_cross_val_test)

        # The matrices now contain every power up to deg.
        highest_power = deg

    return train_scores, test_scores

In [None]:
def cross_validation_ridge(y_single_jet_train, tx_single_jet_train, degrees, lambdas, k_fold, seed):
    """Choose degree and lambda for ridge regression by k-fold cross-validation.

    The features are preprocessed once with mode 'before', then every fold
    produced by `build_k_indices` is held out in turn and scored for each
    (degree, lambda) pair with `cross_validation_one_fold_ridge`, without
    standardization of the added power columns. The pair with the highest
    mean held-out accuracy wins.

    Returns:
        (best_deg, best_lambda, best_accuracy_test, corresponding_accuracy_train)
        where the two accuracies are means over the folds for the winning pair.
    """
    fold_rows = build_k_indices(y_single_jet_train, k_fold, seed)
    n_samples = len(y_single_jet_train)

    # Scores indexed by [degree, lambda, fold].
    grid_shape = [len(degrees), len(lambdas), k_fold]
    train_scores = np.zeros(grid_shape)
    test_scores = np.zeros(grid_shape)

    tx_ready, len_kept_data, _ = preprocess_data(tx_single_jet_train, [], 'before')

    for fold in range(k_fold):
        print('--- Fold', fold, '---')
        held_out = fold_rows[fold]

        # Boolean selector that is True for every row outside the held-out fold.
        in_training = np.ones(n_samples, dtype=bool)
        in_training[held_out] = False

        y_fit, y_eval = y_single_jet_train[in_training], y_single_jet_train[held_out]
        tx_fit, tx_eval = tx_ready[in_training, :], tx_ready[held_out, :]

        train_scores[:, :, fold], test_scores[:, :, fold] = \
            cross_validation_one_fold_ridge(y_fit, y_eval, tx_fit, tx_eval, \
                                            degrees, lambdas, len_kept_data, False)

    # Average over the fold axis.
    mean_train = np.mean(train_scores, axis=2)
    mean_test = np.mean(test_scores, axis=2)

    # Grid coordinates of the first maximum of the mean held-out accuracy.
    best_cell = np.unravel_index(mean_test.argmax(), mean_test.shape)
    deg_pos, lambda_pos = best_cell

    best_deg = degrees[deg_pos]
    best_lambda = lambdas[lambda_pos]
    best_accuracy_test = mean_test[deg_pos, lambda_pos]
    corresponding_accuracy_train = mean_train[deg_pos, lambda_pos]

    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg, 'and lambda =', best_lambda)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_lambda, best_accuracy_test, corresponding_accuracy_train

In [None]:
# Search settings for the ridge-regression run below.
degrees = range(6,11)  # candidate polynomial degrees 6..10
lambdas = np.logspace(-9,-2,7)  # 7 log-spaced regularization strengths from 1e-9 to 1e-2
k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold shuffle

In [None]:
# One row selector per jet group for each data part; the loop below indexes
# them by jet_id (presumably boolean or index masks - see split_jets_mask).
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)

# Output buffers, reset here and filled jet by jet in the loop below.
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
best_degrees = np.zeros(len_mask)
best_lambdas = np.zeros(len_mask)

In [None]:
# Ridge regression, one independent model per jet group: select degree and
# lambda by cross-validation, refit on all training rows, then predict.
for jet_id in range(len_mask):
    print('***** Jet ', jet_id, '*****')
    # Keep only the rows selected by this jet's masks
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # Cross-validate on this jet's training rows to pick degree and lambda
    best_deg, best_lambda, best_accuracy_test, corresponding_accuracy_train = \
        cross_validation_ridge(y_single_jet_train, tx_single_jet_train, degrees, lambdas, k_fold, seed)
    
    # Remember the selected hyperparameters for the final report
    best_degrees[jet_id] = best_deg
    best_lambdas[jet_id] = best_lambda
    
    # Preprocess train, then test with the `unique_cols` returned for train
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'before')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'before')[0]
    
    # Add powers 2..best_deg of the first `len_kept_data` columns.
    # The added columns are deliberately NOT standardized here: cross_validation_ridge
    # scores every (degree, lambda) candidate with stdize=False, so the final fit
    # must use the same feature pipeline for the selected lambda to remain valid.
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    
    # Fit on all training rows of this jet (ridge takes no initial weights)
    weights, loss = ridge_regression(y_single_jet_train, tx_single_jet_train_preprocessed, \
                                     best_lambda, fct='mse')
    
    # Predict labels for both parts with the fitted weights
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # Write this jet's predictions back into the full-size buffers
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # Fraction of this jet's training rows predicted correctly
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # Report it
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# Optional: write the test predictions to a submission file (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# Percentage of training rows, over all jets, whose predicted label matches y_train
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees, 'and lambda =', best_lambdas)

In [None]:
# Percentage of test rows whose predicted label matches y_test
# (only meaningful while y_test holds known labels, as with the split above).
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees, 'and lambda =', best_lambdas)

In [None]:
# Sanity check: number of test rows predicted as -1
np.sum(y_predicted_test==-1)

In [None]:
# Sanity check: number of test rows predicted as 1
np.sum(y_predicted_test==1)

In [None]:
# Eyeball the first 200 test predictions
y_predicted_test[:200]

##  SGD

In [None]:
def cross_validation_one_fold_SGD(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                 degrees, gammas, len_kept_data, max_iters, batch_size, stdize=False):
    """Score stochastic gradient descent on one fold for every (degree, gamma) pair.

    For each degree the feature matrices are extended with the powers not yet
    present (previous degree + 1 .. current degree) of the first
    `len_kept_data` columns, so `degrees` is expected to be increasing and to
    start above 1. For each gamma a model is then trained from zero weights
    with `least_squares_SGD`.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the two parts of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices.
        degrees: polynomial degrees to evaluate, in the order given.
        gammas: step sizes to evaluate.
        len_kept_data: number of leading columns whose powers are added.
        max_iters: iteration count forwarded to `least_squares_SGD`.
        batch_size: batch size forwarded to `least_squares_SGD`.
        stdize: when True, the columns added in the current step are replaced
            by the first output of `standardize`, separately for train and test.

    Returns:
        (train_accuracies, test_accuracies), each of shape
        (len(degrees), len(gammas)).
    """
    train_scores = np.zeros([len(degrees), len(gammas)])
    test_scores = np.zeros([len(degrees), len(gammas)])

    last_deg = 1
    for deg_pos, deg in enumerate(degrees):
        print('++ Degree', deg, '++')

        # Column count before extension; the slice below relies on add_powers
        # placing the new columns after the existing ones (presumably appended).
        old_width = tx_cross_val_train.shape[1]
        new_powers = range(last_deg + 1, deg + 1)
        kept_columns = range(len_kept_data)
        tx_cross_val_train = add_powers(tx_cross_val_train, new_powers, kept_columns)
        tx_cross_val_test = add_powers(tx_cross_val_test, new_powers, kept_columns)
        if stdize:
            tx_cross_val_train[:, old_width:] = standardize(tx_cross_val_train[:, old_width:])[0]
            tx_cross_val_test[:, old_width:] = standardize(tx_cross_val_test[:, old_width:])[0]

        for gamma_pos, gamma in enumerate(gammas):
            print('>> Gamma', gamma, '<<')

            # Fresh zero start for every run so runs do not share a weight vector.
            zero_w = np.zeros(tx_cross_val_train.shape[1])
            weights, _ = least_squares_SGD(y_cross_val_train, tx_cross_val_train, zero_w, max_iters, \
                                           gamma, batch_size)

            train_hits = predict_labels(weights, tx_cross_val_train) == y_cross_val_train
            test_hits = predict_labels(weights, tx_cross_val_test) == y_cross_val_test

            # Fraction of labels predicted correctly on each part.
            train_scores[deg_pos, gamma_pos] = np.sum(train_hits)/len(y_cross_val_train)
            test_scores[deg_pos, gamma_pos] = np.sum(test_hits)/len(y_cross_val_test)

        # The matrices now contain every power up to deg.
        last_deg = deg

    return train_scores, test_scores

In [None]:
def cross_validation_SGD(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters, \
                        batch_size):
    """Select the polynomial degree and gamma for least-squares SGD by k-fold cross-validation.

    The features are preprocessed once (mode 'after'), then every fold is evaluated on the
    whole (degree, gamma) grid by cross_validation_one_fold_SGD with standardization enabled.
    The pair with the highest fold-averaged test accuracy is selected.

    Args:
        y_single_jet_train: labels of the samples to cross-validate on.
        tx_single_jet_train: raw features of the same samples.
        degrees: indexable sequence of candidate polynomial degrees.
        gammas: indexable sequence of candidate gamma values forwarded to the SGD routine.
        k_fold: number of folds.
        seed: seed forwarded to build_k_indices.
        max_iters: forwarded to cross_validation_one_fold_SGD.
        batch_size: forwarded to cross_validation_one_fold_SGD.

    Returns:
        Tuple (best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train) where both
        accuracies are means over the folds.
    """
    # Row f of `folds` holds the sample indices held out for testing in fold f
    folds = build_k_indices(y_single_jet_train, k_fold, seed)

    # One accuracy per (degree, gamma, fold)
    grid_shape = [len(degrees), len(gammas), k_fold]
    train_scores = np.zeros(grid_shape)
    test_scores = np.zeros(grid_shape)

    # Preprocess the whole set once; the folds are carved out of the preprocessed matrix
    features, len_kept_data, _ = preprocess_data(tx_single_jet_train, [], 'after')

    n_samples = len(y_single_jet_train)
    for fold, held_out in enumerate(folds):
        print('--- Fold', fold, '---')

        # Everything that is not held out is used for training
        in_train = np.ones(n_samples, dtype=bool)
        in_train[held_out] = False

        fold_train, fold_test = cross_validation_one_fold_SGD(
            y_single_jet_train[in_train], y_single_jet_train[held_out],
            features[in_train, :], features[held_out, :],
            degrees, gammas, len_kept_data, max_iters, batch_size, True)
        train_scores[:, :, fold] = fold_train
        test_scores[:, :, fold] = fold_test

    # Average over the fold axis
    mean_train = train_scores.mean(axis=2)
    mean_test = test_scores.mean(axis=2)

    # Grid position of the best mean test accuracy
    best_cell = np.unravel_index(mean_test.argmax(), mean_test.shape)
    deg_pos, gamma_pos = best_cell

    best_deg = degrees[deg_pos]
    best_gamma = gammas[gamma_pos]
    best_accuracy_test = mean_test[best_cell]
    corresponding_accuracy_train = mean_train[best_cell]

    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg, 'and gamma =', best_gamma)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train

In [None]:
# Hyper-parameter grid and cross-validation settings for the least-squares SGD run
degrees = range(3,10)  # candidate polynomial degrees: 3 to 9 inclusive
gammas = np.logspace(-6,-1,5)  # 5 log-spaced candidates between 1e-6 and 1e-1 (presumably the SGD step size)
k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold permutation
max_iters = 300  # forwarded to least_squares_SGD
batch_size = 1  # forwarded to least_squares_SGD

In [None]:
# Build the per-jet selectors and allocate the result buffers for the least-squares SGD run.
# split_jets_mask is defined outside this section; its result is indexed by jet_id below and each
# entry is used to index the rows of tx_train / tx_test (presumably one mask per jet group).
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)  # number of jet groups iterated over below

# Per-sample predictions, filled group by group
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
# Hyper-parameters selected by cross-validation, one entry per jet group
best_degrees = np.zeros(len_mask)
best_gammas = np.zeros(len_mask)

In [None]:
# For each jet group: select (degree, gamma) by cross-validation, refit least-squares SGD on the whole
# group with the selected values, and store the train/test predictions at the group's positions.
for jet_id in range(len_mask):
    print('***** Jet ', jet_id, '*****')
    # SEPARATE THE WHOLE DATA SET TO GET ONLY THE PART THAT HAVE THE RIGHT NUMBER OF JETS
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # CALL CROSS VALIDATION FOR A SINGLE JET ON TRAIN PART, FIND BEST DEG AND GAMMA, BEST ACCURACY ON TESTING CROSS VAL
    best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train = \
        cross_validation_SGD(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters, \
                            batch_size)
    
    # KEEP IN MEMORY THE BEST DEGREE AND GAMMA FOR THIS JET
    best_degrees[jet_id] = best_deg
    best_gammas[jet_id] = best_gamma
    
    # PREPROCESS FULL TRAINING AND TESTING DATA
    # (the test call receives the unique_cols returned by the training call)
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'after')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'after')[0]
    
    # ADD POWERS TO THE CHOSEN COLUMNS
    # len_data is the column count before the powers are appended, so [:, len_data:] is exactly the new columns
    len_data = tx_single_jet_train_preprocessed.shape[1]
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    # STANDARDIZE ONLY THE NEWLY ADDED POWER COLUMNS (element [0] of standardize's return value is used)
    # NOTE(review): train and test are standardized by separate calls, each on its own data - confirm
    # that the test set should not reuse the training statistics
    tx_single_jet_train_preprocessed[:,len_data:] = standardize(tx_single_jet_train_preprocessed[:,len_data:])[0]
    tx_single_jet_test_preprocessed[:,len_data:] = standardize(tx_single_jet_test_preprocessed[:,len_data:])[0]
    
    # COMPUTE THE WEIGHTS ON THE FULL TRAINING SET WITH THE SELECTED GAMMA - ONE JET
    initial_w = np.zeros(tx_single_jet_train_preprocessed.shape[1])
    weights, loss = least_squares_SGD(y_single_jet_train, tx_single_jet_train_preprocessed, initial_w, max_iters, \
                                             best_gamma, batch_size);
    
    # COMPUTE THE PREDICTIONS ON THE FULL TRAINING AND TESTING SETS - SINGLE JET
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # ADD THE PREDICTIONS TO y_predicted_test AND y_predicted_train
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # COMPUTE THE ACCURACY train ON JET (FRACTION OF MATCHING LABELS, NOT A PERCENTAGE)
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # PRINT ACCURACY train ON JET
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# CREATE CSV SUBMISSION (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# COMPUTE ACCURACY ON FULL train, AS A PERCENTAGE OF MATCHING LABELS
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees, 'and gammas =', best_gammas)

In [None]:
# COMPUTE ACCURACY ON FULL test, AS A PERCENTAGE OF MATCHING LABELS
# The selected gammas are reported alongside the degrees, matching the train report of this section.
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees, 'and gammas =', best_gammas)

In [None]:
# Number of test predictions equal to -1
np.sum(y_predicted_test==-1)

In [None]:
# Number of test predictions equal to 1
np.sum(y_predicted_test==1)

In [None]:
# First 200 test predictions, for visual inspection
y_predicted_test[:200]

## LOGISTIC REGRESSION

In [None]:
def cross_validation_one_fold_logreg(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                    degrees, gammas, len_kept_data, max_iters, stdize=False):
    """Evaluate logistic regression on one fold for every (degree, gamma) combination.

    The feature matrices are grown incrementally: for each degree only the powers not yet present
    are appended (powers start at 2 for the first degree), so `degrees` is expected to be increasing.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the training / held-out part of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices (polynomial powers not yet added).
        degrees: candidate polynomial degrees.
        gammas: candidate gamma values forwarded to logistic_regression.
        len_kept_data: powers are added for the columns range(len_kept_data).
        max_iters: forwarded to logistic_regression.
        stdize: when true, the freshly appended power columns are passed through standardize.

    Returns:
        Tuple (train_accuracies, test_accuracies), each of shape (len(degrees), len(gammas)), holding
        the fraction of predicted labels equal to the true labels.
    """
    score_shape = [len(degrees), len(gammas)]
    scores_train = np.zeros(score_shape)
    scores_test = np.zeros(score_shape)

    feats_train, feats_test = tx_cross_val_train, tx_cross_val_test
    highest_power = 1  # highest power already present in the feature matrices
    for deg_pos, deg in enumerate(degrees):
        print('++ Degree', deg, '++')

        # Column count before appending: everything from this index on is new
        first_new_col = feats_train.shape[1]
        missing_powers = range(highest_power + 1, deg + 1)
        feats_train = add_powers(feats_train, missing_powers, range(len_kept_data))
        feats_test = add_powers(feats_test, missing_powers, range(len_kept_data))
        if stdize:
            feats_train[:, first_new_col:] = standardize(feats_train[:, first_new_col:])[0]
            feats_test[:, first_new_col:] = standardize(feats_test[:, first_new_col:])[0]

        for gamma_pos, single_gamma in enumerate(gammas):
            print('>> Gamma', single_gamma, '<<')

            # Fit logistic regression from a zero start on the training part of the fold
            start_w = np.zeros(feats_train.shape[1])
            weights, _ = logistic_regression(y_cross_val_train, feats_train, start_w, max_iters, single_gamma)

            # Element-wise agreement between predicted and true labels
            hits_train = predict_labels(weights, feats_train) == y_cross_val_train
            hits_test = predict_labels(weights, feats_test) == y_cross_val_test

            scores_train[deg_pos, gamma_pos] = np.sum(hits_train)/len(y_cross_val_train)
            scores_test[deg_pos, gamma_pos] = np.sum(hits_test)/len(y_cross_val_test)

        # The matrices now contain every power up to `deg`
        highest_power = deg

    return scores_train, scores_test

In [None]:
def cross_validation_logreg(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters):
    """Select the polynomial degree and gamma for logistic regression by k-fold cross-validation.

    The features are preprocessed once (mode 'before'), then every fold is evaluated on the whole
    (degree, gamma) grid by cross_validation_one_fold_logreg with standardization disabled.
    The pair with the highest fold-averaged test accuracy is selected.

    Args:
        y_single_jet_train: labels of the samples to cross-validate on.
        tx_single_jet_train: raw features of the same samples.
        degrees: indexable sequence of candidate polynomial degrees.
        gammas: indexable sequence of candidate gamma values.
        k_fold: number of folds.
        seed: seed forwarded to build_k_indices.
        max_iters: forwarded to cross_validation_one_fold_logreg.

    Returns:
        Tuple (best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train) where both
        accuracies are means over the folds.
    """
    # Row f of `folds` holds the sample indices held out for testing in fold f
    folds = build_k_indices(y_single_jet_train, k_fold, seed)

    # One accuracy per (degree, gamma, fold)
    grid_shape = [len(degrees), len(gammas), k_fold]
    train_scores = np.zeros(grid_shape)
    test_scores = np.zeros(grid_shape)

    # Preprocess the whole set once; the folds are carved out of the preprocessed matrix
    features, len_kept_data, _ = preprocess_data(tx_single_jet_train, [], 'before')

    n_samples = len(y_single_jet_train)
    for fold, held_out in enumerate(folds):
        print('--- Fold', fold, '---')

        # Everything that is not held out is used for training
        in_train = np.ones(n_samples, dtype=bool)
        in_train[held_out] = False

        fold_train, fold_test = cross_validation_one_fold_logreg(
            y_single_jet_train[in_train], y_single_jet_train[held_out],
            features[in_train, :], features[held_out, :],
            degrees, gammas, len_kept_data, max_iters, False)
        train_scores[:, :, fold] = fold_train
        test_scores[:, :, fold] = fold_test

    # Average over the fold axis
    mean_train = train_scores.mean(axis=2)
    mean_test = test_scores.mean(axis=2)

    # Grid position of the best mean test accuracy
    best_cell = np.unravel_index(mean_test.argmax(), mean_test.shape)
    deg_pos, gamma_pos = best_cell

    best_deg = degrees[deg_pos]
    best_gamma = gammas[gamma_pos]
    best_accuracy_test = mean_test[best_cell]
    corresponding_accuracy_train = mean_train[best_cell]

    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg, 'and gamma =', best_gamma)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train

In [None]:
# Hyper-parameter grid and cross-validation settings for the logistic regression run
degrees = range(6,10)  # candidate polynomial degrees: 6 to 9 inclusive
gammas = np.logspace(-9,-2,7)  # 7 log-spaced candidates between 1e-9 and 1e-2 (presumably the step size)
k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold permutation
max_iters = 300  # forwarded to logistic_regression

In [None]:
# Build the per-jet selectors and allocate the result buffers for the logistic regression run.
# split_jets_mask is defined outside this section; its result is indexed by jet_id in the loop below.
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)  # number of jet groups iterated over below

# Per-sample predictions, filled group by group
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
# Hyper-parameters selected by cross-validation, one entry per jet group.
# The logistic regression loop and its reports store/read best_gammas (this model has no lambda),
# so that array is initialised here instead of relying on one left over from an earlier section.
best_degrees = np.zeros(len_mask)
best_gammas = np.zeros(len_mask)

In [None]:
# For each jet group: select (degree, gamma) by cross-validation, refit logistic regression on the whole
# group with the selected values, and store the train/test predictions at the group's positions.
for jet_id in range(len_mask):
    print('***** Jet ', jet_id, '*****')
    # SEPARATE THE WHOLE DATA SET TO GET ONLY THE PART THAT HAVE THE RIGHT NUMBER OF JETS
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # CALL CROSS VALIDATION FOR A SINGLE JET ON TRAIN PART, FIND BEST DEG AND GAMMA, BEST ACCURACY ON TESTING CROSS VAL
    best_deg, best_gamma, best_accuracy_test, corresponding_accuracy_train = \
        cross_validation_logreg(y_single_jet_train, tx_single_jet_train, degrees, gammas, k_fold, seed, max_iters)
    
    # KEEP IN MEMORY THE BEST DEGREE AND GAMMA FOR THIS JET
    best_degrees[jet_id] = best_deg
    best_gammas[jet_id] = best_gamma
    
    # PREPROCESS FULL TRAINING AND TESTING DATA
    # (the test call receives the unique_cols returned by the training call)
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'before')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'before')[0]
    
    # ADD POWERS TO THE CHOSEN COLUMNS
    # len_data is the column count before the powers are appended, so [:, len_data:] is exactly the new columns
    len_data = tx_single_jet_train_preprocessed.shape[1]
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    # STANDARDIZE ONLY THE NEWLY ADDED POWER COLUMNS (element [0] of standardize's return value is used)
    # NOTE(review): cross_validation_logreg runs its folds with stdize=False, so the degree and gamma were
    # selected without this standardization step - confirm that the mismatch is intended
    tx_single_jet_train_preprocessed[:,len_data:] = standardize(tx_single_jet_train_preprocessed[:,len_data:])[0]
    tx_single_jet_test_preprocessed[:,len_data:] = standardize(tx_single_jet_test_preprocessed[:,len_data:])[0]
    
    # COMPUTE THE WEIGHTS ON THE FULL TRAINING SET WITH THE SELECTED GAMMA - ONE JET
    initial_w = np.zeros(tx_single_jet_train_preprocessed.shape[1])
    weights, loss = logistic_regression(y_single_jet_train, tx_single_jet_train_preprocessed, initial_w,\
                                        max_iters, best_gamma);
    
    # COMPUTE THE PREDICTIONS ON THE FULL TRAINING AND TESTING SETS - SINGLE JET
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # ADD THE PREDICTIONS TO y_predicted_test AND y_predicted_train
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # COMPUTE THE ACCURACY train ON JET (FRACTION OF MATCHING LABELS, NOT A PERCENTAGE)
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # PRINT ACCURACY train ON JET
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# CREATE CSV SUBMISSION (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# COMPUTE ACCURACY ON FULL train, AS A PERCENTAGE OF MATCHING LABELS
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees, 'and gamma =', best_gammas)

In [None]:
# COMPUTE ACCURACY ON FULL test, AS A PERCENTAGE OF MATCHING LABELS
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees, 'and gamma =', best_gammas)

In [None]:
# Number of test predictions equal to -1
np.sum(y_predicted_test==-1)

In [None]:
# Number of test predictions equal to 1
np.sum(y_predicted_test==1)

In [None]:
# First 200 test predictions, for visual inspection
y_predicted_test[:200]

## REGULARIZED LOGISTIC REGRESSION

In [None]:
def cross_validation_one_fold_logistic_regularized(y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test, \
                                 degrees, gammas, lambdas, len_kept_data, max_iters, stdize=False):
    """Evaluate regularized logistic regression on one fold for every (degree, gamma, lambda) combination.

    The feature matrices are grown incrementally: for each degree only the powers not yet present
    are appended (powers start at 2 for the first degree), so `degrees` is expected to be increasing.

    Args:
        y_cross_val_train, y_cross_val_test: labels of the training / held-out part of the fold.
        tx_cross_val_train, tx_cross_val_test: matching feature matrices (polynomial powers not yet added).
        degrees: candidate polynomial degrees.
        gammas: candidate gamma values forwarded to reg_logistic_regression.
        lambdas: candidate lambda values forwarded to reg_logistic_regression.
        len_kept_data: powers are added for the columns range(len_kept_data).
        max_iters: forwarded to reg_logistic_regression.
        stdize: when true, the freshly appended power columns are passed through standardize.

    Returns:
        Tuple (train_accuracies, test_accuracies), each of shape
        (len(degrees), len(gammas), len(lambdas)), holding the fraction of predicted labels equal
        to the true labels.
    """
    score_shape = [len(degrees), len(gammas), len(lambdas)]
    scores_train = np.zeros(score_shape)
    scores_test = np.zeros(score_shape)

    feats_train, feats_test = tx_cross_val_train, tx_cross_val_test
    highest_power = 1  # highest power already present in the feature matrices
    for deg_pos, deg in enumerate(degrees):
        print('++ Degree', deg, '++')

        # Column count before appending: everything from this index on is new
        first_new_col = feats_train.shape[1]
        missing_powers = range(highest_power + 1, deg + 1)
        feats_train = add_powers(feats_train, missing_powers, range(len_kept_data))
        feats_test = add_powers(feats_test, missing_powers, range(len_kept_data))
        if stdize:
            feats_train[:, first_new_col:] = standardize(feats_train[:, first_new_col:])[0]
            feats_test[:, first_new_col:] = standardize(feats_test[:, first_new_col:])[0]

        for gamma_pos, single_gamma in enumerate(gammas):
            print('>> Gamma', single_gamma, '<<')

            for lambda_pos, single_lambda in enumerate(lambdas):
                print('>> Lambda', single_lambda, '<<')

                # Fit regularized logistic regression from a zero start on the training part of the fold
                start_w = np.zeros(feats_train.shape[1])
                weights, _ = reg_logistic_regression(y_cross_val_train, feats_train, single_lambda, start_w,
                                                     max_iters, single_gamma)

                # Element-wise agreement between predicted and true labels
                hits_train = predict_labels(weights, feats_train) == y_cross_val_train
                hits_test = predict_labels(weights, feats_test) == y_cross_val_test

                cell = (deg_pos, gamma_pos, lambda_pos)
                scores_train[cell] = np.sum(hits_train)/len(y_cross_val_train)
                scores_test[cell] = np.sum(hits_test)/len(y_cross_val_test)

        # The matrices now contain every power up to `deg`
        highest_power = deg

    return scores_train, scores_test

In [None]:
def cross_validation_logistic_regularized(y_single_jet_train, tx_single_jet_train, degrees, gammas, lambdas, k_fold, seed, max_iters):
    """Select degree, gamma and lambda for regularized logistic regression by k-fold cross-validation.

    The features are preprocessed once (mode 'after'), then every fold is evaluated on the whole
    (degree, gamma, lambda) grid by cross_validation_one_fold_logistic_regularized with
    standardization enabled. The triple with the highest fold-averaged test accuracy is selected
    and all three selected values are printed.

    Args:
        y_single_jet_train: labels of the samples to cross-validate on.
        tx_single_jet_train: raw features of the same samples.
        degrees: indexable sequence of candidate polynomial degrees.
        gammas: indexable sequence of candidate gamma values.
        lambdas: indexable sequence of candidate lambda values.
        k_fold: number of folds.
        seed: seed forwarded to build_k_indices.
        max_iters: forwarded to cross_validation_one_fold_logistic_regularized.

    Returns:
        Tuple (best_deg, best_gamma, best_lambda, best_accuracy_test, corresponding_accuracy_train)
        where both accuracies are means over the folds.
    """
    # Get the indices so that we get the k'th subgroup in test, others in train, for each k
    k_indices = build_k_indices(y_single_jet_train, k_fold, seed)

    # Initialize the accuracies for each (degree, gamma, lambda) combination and each fold
    accuracies_train_by_fold = np.zeros([len(degrees), len(gammas), len(lambdas), k_fold])
    accuracies_test_by_fold = np.zeros([len(degrees), len(gammas), len(lambdas), k_fold])

    # Preprocess the training dataset once; the folds are carved out of the preprocessed matrix
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'after')

    for k in range(k_fold):
        print('--- Fold', k, '---')
        # Create the testing set for this fold number
        k_index = k_indices[k]  # indices of the testing set for fold k
        y_cross_val_test = y_single_jet_train[k_index]
        tx_cross_val_test = tx_single_jet_train_preprocessed[k_index, :]

        # Create the training set for this fold number: everything that is not held out
        mask = np.ones(len(y_single_jet_train), dtype=bool)
        mask[k_index] = False
        y_cross_val_train = y_single_jet_train[mask]
        tx_cross_val_train = tx_single_jet_train_preprocessed[mask, :]

        # Compute the accuracies of this fold over the whole grid
        accuracies_train_by_fold[:, :, :, k], accuracies_test_by_fold[:, :, :, k] = \
            cross_validation_one_fold_logistic_regularized(
                y_cross_val_train, y_cross_val_test, tx_cross_val_train, tx_cross_val_test,
                degrees, gammas, lambdas, len_kept_data, max_iters, True)

    # Compute the mean accuracies over the folds (last axis)
    mean_accuracies_train = np.mean(accuracies_train_by_fold, axis=3)
    mean_accuracies_test = np.mean(accuracies_test_by_fold, axis=3)

    # Get the grid position of the best mean accuracy on the testing sets
    max_id_deg_test, max_id_gamma_test, max_id_lambda = \
        np.unravel_index(mean_accuracies_test.argmax(), mean_accuracies_test.shape)

    # Find the optimal hyper-parameters and the corresponding accuracies
    best_deg = degrees[max_id_deg_test]
    best_gamma = gammas[max_id_gamma_test]
    best_lambda = lambdas[max_id_lambda]
    best_accuracy_test = mean_accuracies_test[max_id_deg_test, max_id_gamma_test, max_id_lambda]
    corresponding_accuracy_train = mean_accuracies_train[max_id_deg_test, max_id_gamma_test, max_id_lambda]

    # Report every selected hyper-parameter, not only the degree
    print('Best accuracy test =', best_accuracy_test, 'with degree =', best_deg, ', gamma =', best_gamma,
          'and lambda =', best_lambda)
    print('Corresponding accuracy train =', corresponding_accuracy_train)

    return best_deg, best_gamma, best_lambda, best_accuracy_test, corresponding_accuracy_train


In [None]:
# Hyper-parameter grid and cross-validation settings for the regularized logistic regression run
degrees = range(6,11)  # candidate polynomial degrees: 6 to 10 inclusive
gammas = np.logspace(-8,-2,6)  # 6 log-spaced candidates between 1e-8 and 1e-2 (presumably the step size)
lambdas = np.logspace(-8,-2,6)  # 6 log-spaced candidates between 1e-8 and 1e-2 (presumably the penalty weight)

k_fold = 5  # number of cross-validation folds
seed = 1  # seed forwarded to build_k_indices for the fold permutation
max_iters = 300  # forwarded to reg_logistic_regression

In [None]:
# Build the per-jet selectors and allocate the result buffers for the regularized logistic regression run.
# split_jets_mask is defined outside this section; its result is indexed by jet_id in the loop below.
mask_jets_train = split_jets_mask(tx_train)
mask_jets_test = split_jets_mask(tx_test)
len_mask = len(mask_jets_train)  # number of jet groups iterated over below

# Per-sample predictions, filled group by group
y_predicted_train = np.zeros(len(y_train))
y_predicted_test = np.zeros(tx_test.shape[0])
# Hyper-parameters selected by cross-validation, one entry per jet group
best_degrees = np.zeros(len_mask)
best_gammas = np.zeros(len_mask)
best_lambdas=np.zeros(len_mask)

In [None]:
# For each jet group: select (degree, gamma, lambda) by cross-validation, refit regularized logistic
# regression on the whole group with the selected values, and store the train/test predictions at the
# group's positions.
for jet_id in range(len_mask):
    print('** Jet ', jet_id, '**')
    # SEPARATE THE WHOLE DATA SET TO GET ONLY THE PART THAT HAVE THE RIGHT NUMBER OF JETS
    tx_single_jet_train = tx_train[mask_jets_train[jet_id]]
    tx_single_jet_test = tx_test[mask_jets_test[jet_id]]
    y_single_jet_train = y_train[mask_jets_train[jet_id]]
    
    # CALL CROSS VALIDATION FOR A SINGLE JET ON TRAIN PART, FIND BEST DEG, GAMMA AND LAMBDA, BEST ACCURACY ON TESTING CROSS VAL
    best_deg, best_gamma, best_lambda, best_accuracy_test, corresponding_accuracy_train = \
        cross_validation_logistic_regularized(y_single_jet_train, tx_single_jet_train, degrees, gammas, lambdas, k_fold, seed, max_iters)
    
    # KEEP IN MEMORY THE BEST DEGREE, GAMMA AND LAMBDA FOR THIS JET
    best_degrees[jet_id] = best_deg
    best_gammas[jet_id] = best_gamma
    best_lambdas[jet_id]=best_lambda
    # PREPROCESS FULL TRAINING AND TESTING DATA
    # (the test call receives the unique_cols returned by the training call)
    tx_single_jet_train_preprocessed, len_kept_data, unique_cols = \
        preprocess_data(tx_single_jet_train, [], 'after')
    tx_single_jet_test_preprocessed = preprocess_data(tx_single_jet_test, unique_cols, 'after')[0]
    
    # ADD POWERS TO THE CHOSEN COLUMNS
    # len_data is the column count before the powers are appended, so [:, len_data:] is exactly the new columns
    len_data = tx_single_jet_train_preprocessed.shape[1]
    tx_single_jet_train_preprocessed = add_powers(tx_single_jet_train_preprocessed, range(2,best_deg+1), \
                                                  range(len_kept_data))
    tx_single_jet_test_preprocessed = add_powers(tx_single_jet_test_preprocessed, range(2,best_deg+1), \
                                                 range(len_kept_data))
    # STANDARDIZE ONLY THE NEWLY ADDED POWER COLUMNS (element [0] of standardize's return value is used)
    # NOTE(review): train and test are standardized by separate calls, each on its own data - confirm
    # that the test set should not reuse the training statistics
    tx_single_jet_train_preprocessed[:,len_data:] = standardize(tx_single_jet_train_preprocessed[:,len_data:])[0]
    tx_single_jet_test_preprocessed[:,len_data:] = standardize(tx_single_jet_test_preprocessed[:,len_data:])[0]
    
    # COMPUTE THE WEIGHTS ON THE FULL TRAINING SET WITH THE SELECTED LAMBDA AND GAMMA - ONE JET
    initial_w = np.zeros(tx_single_jet_train_preprocessed.shape[1])
    weights, loss = reg_logistic_regression(y_single_jet_train, tx_single_jet_train_preprocessed , best_lambda, initial_w, max_iters, best_gamma)
    
    # COMPUTE THE PREDICTIONS ON THE FULL TRAINING AND TESTING SETS - SINGLE JET
    y_predicted_single_jet_train = predict_labels(weights, tx_single_jet_train_preprocessed)
    y_predicted_single_jet_test = predict_labels(weights, tx_single_jet_test_preprocessed)
    
    # ADD THE PREDICTIONS TO y_predicted_test AND y_predicted_train
    y_predicted_train[mask_jets_train[jet_id]] = y_predicted_single_jet_train
    y_predicted_test[mask_jets_test[jet_id]] = y_predicted_single_jet_test
    
    # COMPUTE THE ACCURACY train ON JET (FRACTION OF MATCHING LABELS, NOT A PERCENTAGE)
    accuracy_train_single_jet = np.sum(y_predicted_single_jet_train == y_single_jet_train)/len(y_single_jet_train)
    
    # PRINT ACCURACY train ON JET
    print('Accuracy full train on jet', jet_id, '=', accuracy_train_single_jet)

In [None]:
# CREATE CSV SUBMISSION (disabled)
#create_csv_submission(ids_test, y_predicted_test, 'output/trial.csv')

# COMPUTE ACCURACY ON FULL train, AS A PERCENTAGE OF MATCHING LABELS
total_accuracy_train = np.sum(y_predicted_train == y_train)/len(y_train)*100
print('Total accuracy train =', total_accuracy_train, 'with degrees =', best_degrees, \
      ', gammas =', best_gammas, 'and lambdas =', best_lambdas)

In [None]:
# COMPUTE ACCURACY ON FULL test, AS A PERCENTAGE OF MATCHING LABELS
# The selected gammas and lambdas are reported alongside the degrees, matching the train report of this section.
total_accuracy_test = np.sum(y_predicted_test == y_test)/len(y_test)*100
print('Total accuracy test =', total_accuracy_test, 'with degrees =', best_degrees, \
      ', gammas =', best_gammas, 'and lambdas =', best_lambdas)

In [None]:
# Number of test predictions equal to -1
np.sum(y_predicted_test==-1)

In [None]:
# Number of test predictions equal to 1
np.sum(y_predicted_test==1)

In [None]:
# First 200 test predictions, for visual inspection
y_predicted_test[:200]