In [98]:
# Useful starting lines
%matplotlib inline
import datetime
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from proj1_helpers import *
from implementations import*

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [99]:
DATA_TRAIN_PATH = '../data/train.csv/train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Manipulating Nans and subgroups:

There is one easy way to clean the dataset from its Nan values: deleting the rows where there is an outsider (value = -999.0):

In [100]:
selector = np.all(tX != -999.0, axis=1)
tX_clean = tX[selector]
y_clean = y[selector]

print(tX.shape)
print(tX_clean.shape)
print(1-tX_clean.shape[0]/tX.shape[0])

(250000, 30)
(68114, 30)
0.727544


But as we see its bad cause we are reducing our dataset by 72.75%, which is really huge! So this is not the correct way to proceed.

We thus looked at some other information from our data, especially the information we get from the labels:

In [101]:
count_1 = list(y).count(1)
count_2 = list(y).count(-1)
print("Nombre de label = 1: ", count_1 , "\nNombre de label = -1: " , count_2)
print("Pourcentage de label 1: " , count_1/len(y), "\nPourcentage de label -1: ", count_2/len(y))

Nombre de label = 1:  85667 
Nombre de label = -1:  164333
Pourcentage de label 1:  0.342668 
Pourcentage de label -1:  0.657332


The features names:

In [102]:
string_features = 'DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt'
features = string_features.split(",")
dict = {}
for ind, feat in enumerate(features):
    dict[feat] = ind

What we also saw while analysing our dataset is that one particular feature is a categorical one: PRI_jet_num, which has 4 different values. By dividing our dataset in 4 subgroups, depending on the value of PRI_jet_num, and by analysing the Nan values in each subgroup, we see that indeed the samples in each subgroups have some specific columns that have Nans.

This is why we decided to create 4 models instead of one.

We will standardize each subgroup, and thus each subgroup will have a particular mean and std:

In [103]:
tX_0=tX[tX[:,dict['PRI_jet_num']]==0]
tX_1=tX[tX[:,dict['PRI_jet_num']]==1]
tX_2=tX[tX[:,dict['PRI_jet_num']]==2]
tX_3=tX[tX[:,dict['PRI_jet_num']]==3]
y_0=y[tX[:,dict['PRI_jet_num']]==0]
y_1=y[tX[:,dict['PRI_jet_num']]==1]
y_2=y[tX[:,dict['PRI_jet_num']]==2]
y_3=y[tX[:,dict['PRI_jet_num']]==3]
ids_0=ids[tX[:,dict['PRI_jet_num']]==0]
ids_1=ids[tX[:,dict['PRI_jet_num']]==1]
ids_2=ids[tX[:,dict['PRI_jet_num']]==2]
ids_3=ids[tX[:,dict['PRI_jet_num']]==3]
tX_list = [tX_0]
tX_list.append(tX_1)
tX_list.append(tX_2)
tX_list.append(tX_3)
ids_list = [ids_0]
ids_list.append(ids_1)
ids_list.append(ids_2)
ids_list.append(ids_3)

We are now going to standardize each of our subgroups. Also, we will compute the indices of our tX dataset, where for the x_j_th datapoint, we have values not in the bounds of the percentile. Thus we store these j_th indexes, such we can remove them. 


In [104]:
def standardize_NAN(tX):
    tX_nan = tX.copy()
    for i in range(tX.shape[0]):
        for j in range(tX.shape[1]):
            if (tX_nan[i,j] == -999.0):
                tX_nan[i,j] = np.nan
    return (standardize(tX_nan))

# Tout les nans (correspondant a des valeurs non connues) sont remplacés par la moyenne de la colonnes
def replace_mean(tX_nan):
    means_cols = np.nanmean(tX_nan,axis=1)
    for row in range(0,tX_nan.shape[0]):
        for col in range(0,tX_nan.shape[1]):
            if np.isnan(tX_nan[row,col]):
                tX_nan[row,col]=means_cols[col]
    return (tX_nan)

def get_ind_percentiles(tX, tX_clean, i, percentile):
    arguments = []
    a = np.percentile(tX_clean[:,i],percentile)
    tX_perc = tX.copy()
    arguments = np.argwhere(tX_perc[tX[:,i] > round(a, 2)])
    return list(set(arguments[:,0]))

def remove_rows_by_percentiles(tX,tX_clean):
    args = []
    for i in range(tX.shape[1]):
        args= args+get_ind_percentiles(tX,tX_clean,i,99.97)
    flat_list = [item for item in args]
    mylist = list(set(flat_list))
    return mylist

In [130]:
mean = []
std = []
tX_nan_replaced = []
for i in range(4):
    x,m,s = standardize_NAN(tX_list[i])
    tX_nan_replaced.append(replace_mean(x))
    mean.append(m)
    std.append(s)

44.83469273209879
63.38124719688545
129.81575662840595
139.61904053684216


In the end, to send a correct file, we should group the dataset again as it was before divind it in subgroups:

In [131]:
def group(ls,ids):
    data_ord = np.insert(ls[0],0,ids[0], axis=1)
    for i in range(1,4):
        a = np.insert(ls[i],0,ids[i], axis=1)
        data_ord = np.concatenate((data_ord, a))
    return data_ord[data_ord[:,0].argsort()]

In [132]:
data_inorder = group(tX_nan_replaced, ids_list)

Below, we plot the boxplots of each features, but taking care that all Nan values per previously removes (tX_clean)

In [28]:
def plot_boxplot(num, tX_clean):
    fig1, ax1 = plt.subplots()
    data = []
    for i in range(0, num):
        data.append(tX_clean[:, i])
    ax1.boxplot(data);

The method below is computing the indices of our tX dataset, where for the x_j_th datapoint, we have values not in the bounds of the percentile. Thus we store these j_th indexes, such we can remove them. This method computes this fot the i_th feature.

In [29]:
def get_ind_percentiles(tX, tX_clean, i, percentile):
    arguments = []
    a = np.percentile(tX_clean[:,i],percentile)
    tX_perc = tX.copy()
    arguments = np.argwhere(tX_perc[tX_perc[:,i] > round(a, 2)])
    return list(set(arguments[:,0]))

The method below thus removes all rows where the row has a value not in the percentile range.

In [30]:
def remove_rows_by_percentiles(tX,tX_clean,y):
    args = []
    for i in range(30):
        args= args+get_ind_percentiles(tX,tX_clean,i,99.7)
    flat_list = [item for item in args]
    mylist = list(set(flat_list))
    tX_perc = tX.copy()
    y_perc = y.copy()
    tX_perc = np.delete(tX_perc, mylist, axis=0)
    y_perc = np.delete(y_perc,mylist,axis=0)
    return tX_perc,y_perc

Below we are plotting the correlation between two features.

In [None]:
def plot_of_2_features(y, tX, feat1, feat2):
    y_copy = y.copy()
    fig = plt.figure(figsize=(5,5))
    colormap = np.array(['r', 'g', 'b'])
    #We change values -1 of y to 0 
    y_copy[y_copy<0] = 0
    categories = np.array(y_copy.copy())
    categories = categories.astype(int)
    plt.scatter(tX[:,feat1], tX[:,feat2],  c=colormap[categories])
    plt.title(features[feat1] + ' and ' + features[feat2])
    
plot_of_2_features(y,tX,2,3)

Too many data lost in tX_clean, maybe we can make calculation without taking into account the -999.0 in the average
We can replace this value by NaN wich will be not taking into account during the standardization

Even better, we can use a regression model to predict the missing value per feature, whithout taking into acount the outliers of a particular feature.

In [9]:
def nan_par_feat(tX,y):
    tX_model = tX.copy()
    y_model = y.copy()
    dic = {}
    #finds all datapoints where at feature "ind" it has value -999.0 and stores it in a dictionnary
    for ind, tX_ in enumerate(tX_model.T):
        dic[ind] = list(np.where(tX_ == -999.0)[0])
nan_par_feat(tX,y)

In [10]:
def model_feature(tX,y):
    tX_model = tX.copy()
    y_model = y.copy()
    dic = {}
    #finds all datapoints where at feature "ind" it has value -999.0 and stores it in a dictionnary
    for ind, tX_ in enumerate(tX_model.T):
        dic[ind] = list(np.where(tX_ == -999.0)[0])
    #We are gonna try to group features depending on the indices where they have -999.0
    ls = {}
    for ind in range(len(dic)):
        for ind2 in range(ind+1,len(dic)):
            sum_ = len(set(dic[ind]) & set(dic[ind2])) 
            if sum_ in ls:
                if ind2 not in ls[sum_]:
                    ls[sum_].append(ind2)
            else:
                ls[sum_] = [ind,ind2]
    return ls
dic = model_feature(tX,y)

# 3. Least squares

We can alter the k value of the cross validation.

In [11]:
def return_factors(x):
    # This function takes a number and prints the factors
    a = []
    for i in range(2,10):
        if x % i == 0:
            a.append(i)
    return a

In [12]:
#tX_LS=tX_clean.copy()
#y_LS= y_clean.copy()
#accuracy=0.7239627683002026 mse=0.7401139564470155

#tX_LS=tX_nan.copy()
#y_LS= y.copy()
# accuracy=0.745088 mse=0.6786542286532609

def compute_least_squares(tX, y):
    tX_LS=tX.copy()
    y_LS= y.copy()

    K_values = return_factors(len(tX_LS))
    accuracy = []
    #K-fold crossvalidation
    for K in K_values:
        #Initialization
        list_tX_LS = np.split(tX_LS,K)
        list_y_LS = np.split(y_LS,K)
        weights=[]
        mse_errors = []
        opt_w = []
        for ind, tX_bloc in enumerate(list_tX_LS):
            tX_test = tX_bloc
            y_test = list_y_LS[ind]
            tX_train = np.concatenate(list_tX_LS[:ind] + list_tX_LS[ind+1:])
            y_train = np.concatenate(list_y_LS[:ind] + list_y_LS[ind+1:])
            mse_LS, optimal_weights_LS = least_squares(y_train,tX_train)
            mse_errors.append(compute_mse(y_test, tX_test, optimal_weights_LS))
            weights.append(optimal_weights_LS)

        opt_w = weights[np.argmin(mse_errors)]
        y_model = predict_labels(opt_w, tX_LS)

        #Computing accuracy
        accuracy.append((list(y_model == y_LS).count(True))/len(y_model))
        print("accuracy = {val} mse={mse}".format(mse = mse_LS, val=accuracy[-1]))

    #Plot of accuracies
    print("\nMaximum accuracy = {val}".format(val=np.max(accuracy)))
    #plt.plot(K_values, accuracy, '.-', markersize=15, label = "Accuracy");
    #plt.xlabel("K value")
    #plt.ylabel("Accuracy")
    #plt.title("Accuracies for Least Squares")
    #plt.legend()

In [14]:
tx_perc, y_perc = remove_rows_by_percentiles(tX_nan,tX_clean,y)
print(tX.shape[0]-tx_perc.shape[0], "rows were deleted based on the percentile concept")

0 rows were deleted based on the percentile concept


In [15]:
compute_least_squares(tX_nan, y)

accuracy = 0.745008 mse=0.6781761342388203
accuracy = 0.745088 mse=0.6786542286532609
accuracy = 0.744904 mse=0.6786166979058809
accuracy = 0.745044 mse=0.6785307962457391

Maximum accuracy = 0.745088


In [16]:
compute_least_squares(tx_perc,y_perc)

accuracy = 0.745008 mse=0.6781761342388203
accuracy = 0.745088 mse=0.6786542286532609
accuracy = 0.744904 mse=0.6786166979058809
accuracy = 0.745044 mse=0.6785307962457391

Maximum accuracy = 0.745088


## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = './leastSquarePOLY' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(opt_w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
y_pred.shape