In [53]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from proj1_helpers import *
from implementations import *
import codecs, json 
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Compute correlation matrix

Compute the correlation matrix considering all the inputs and all the features.

In [107]:
# LOAD TRAIN DATA
# load_csv_data returns three values here; the third one is not needed and is discarded
data_path = "../dataset/train.csv"
y_loaded, data_loaded, _ = load_csv_data(data_path)

In [108]:
# LOAD TEST DATA
# only the second returned value is kept for the test set
# NOTE: data_path is rebound here, so it no longer points to the train file
data_path = "../dataset/test.csv"
_, data_test_loaded, _ = load_csv_data(data_path)

In [4]:
data_loaded.shape, data_test_loaded.shape

((250000, 30), (568238, 30))

In [4]:
# stack the train rows on top of the test rows (concatenation along the first axis)
data_merged = np.concatenate([data_loaded, data_test_loaded])
data_merged.shape

(818238, 30)

In [None]:
# CLEAN DATA: YOU MAY WANT TO CHANGE THIS STEP
# Each helper's result is rebound to `data`, so only the latest stage is kept.
# NOTE(review): judging by its name the first helper drops columns with a high
# ratio of missing values, yet it runs before 0 / -999 are turned into NaN --
# confirm that it recognises the raw sentinel values itself.
data = drop_columns_with_70_nan_ratio(data_merged)
data = fill_with_nan_list(data, nan_values=[0, -999])
# substitutions = per-column mean computed with NaN entries ignored
data = sustitute_nans(data, substitutions=np.nanmean(data, axis=0))
data.shape

In [None]:
# COMPUTE AND STORE CORRELATION MATRIX
# np.corrcoef treats rows as variables, hence the transpose; only the magnitude
# of the correlation is kept.
corr_matrix = np.abs(np.corrcoef(data.T))
file_path = "corr_matrix23.json"
# the context manager closes (and therefore flushes) the file even if json.dump raises
with codecs.open(file_path, 'w', encoding='utf-8') as corr_file:
    json.dump(corr_matrix.tolist(), corr_file, separators=(',', ':'), sort_keys=True, indent=4)

In [None]:
# example: LOAD CORRELATION MATRIX
file_path = "corr_matrix23.json"
# read inside a context manager so the file handle is released right away
with codecs.open(file_path, 'r', encoding='utf-8') as corr_file:
    obj_text = corr_file.read()
b_new = json.loads(obj_text)
# the JSON payload is a list of lists; turn it back into an array
corr_matrix_loaded = np.array(b_new)

In [None]:
# example: GET CORRELATED COLUMNS
corr_threshold = 0.6
# the boolean mask gets its own name so the float matrix in corr_matrix survives this cell
corr_mask = np.abs(corr_matrix) > corr_threshold
# remove i from the list (i is surely correlated to itself);
# the size comes from the matrix instead of a hard-coded column count
for i in range(corr_mask.shape[0]):
    corr_mask[i, i] = False

# map column index -> indices of the other columns above the threshold
correlated_to = {}
for i, mask_row in enumerate(corr_mask):
    c = np.where(mask_row)[0]
    if len(c) > 0: # if it is not correlated to any other column then ignore it
        correlated_to[i] = c
correlated_to

## Plot distributions

In [None]:
# distributions of the training data exactly as loaded
plot_distributions(
    data_loaded,
    y_loaded,
    col_labels=column_labels(),
    title="distribution_rawData",
)

In [None]:
# distributions after cleaning, with np.nanmean passed as the substitution function
# (0.7 is forwarded to clean_x as its second positional argument)
x_cleaned, keptCols = clean_x(data_loaded, 0.7, subs_func=np.nanmean)

plot_distributions(
    x_cleaned,
    y_loaded,
    col_labels=keptCols,
    title="distribution_cleanedData_999=mean_0=mean",
)

In [None]:
# distributions after cleaning, this time with no substitution function (subs_func=None)
# (0.7 is forwarded to clean_x as its second positional argument)
x_cleaned, keptCols = clean_x(data_loaded, 0.7, subs_func=None)

plot_distributions(
    x_cleaned,
    y_loaded,
    col_labels=keptCols,
    title="distribution_cleanedData_999=999_0=0",
)

In [None]:
# better norm the hist to have a clearer idea of the proportions
# left column: normalised histogram, right column: raw counts
f, ax = plt.subplots(2, 2)
dataset1 = [0, 0, 0, 0, 1, 2, 4, 6]
dataset2 = [0, 0, 1, 6]

# `density` replaces the `normed` keyword, which current Matplotlib no longer accepts
for row, (values, color) in enumerate([(dataset1, "red"), (dataset2, "blue")]):
    ax[row][0].hist(values, histtype='step', color=color, density=True)
    ax[row][1].hist(values, histtype='step', color=color, density=False)


### Percentile

In [6]:
# NOTE(review): this rebinds data_merged. Going by the helper's name, every 0 and
# -999 is presumably NaN afterwards. The jet-number section further down filters on
# `all_data[:, 22] == 0` and counts values `!= -999` on a copy of data_merged; those
# cells only make sense on the raw array, so data_merged has to be rebuilt before
# running them -- confirm the intended execution order.
data_merged = fill_with_nan_list(data_merged, nan_values=[0, -999])
data_merged.shape

(818238, 30)

In [65]:
# the stop value 101 makes 100 inclusive: 21 percentages, 0, 5, ..., 100
list(range(0, 101, 5))

[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]

In [66]:
# percentages 0, 5, ..., 100 at which every column is sampled
scan_perc = list(range(0, 101, 5))
# One vectorised call replaces the per-column / per-percentile loop.
# NaN entries are ignored; the result has one row per percentage in scan_perc
# and one column per column of data_merged.
col_perc = np.nanpercentile(data_merged, scan_perc, axis=0)
col_perc.shape

(21, 30)

In [67]:
# store both the computed percentiles and the list of % to which they correspond
file_path = "percentiles.json"
map_ = {
    "scan_perc": scan_perc,
    "col_perc": col_perc.tolist()
}
json.dump(map_, codecs.open(file_path, 'w', encoding='utf-8'), separators=(',', ':'), sort_keys=True, indent=4)

## Other approach: split data depending on jet number

### Why divide data depending on jet numbers and which columns can we drop?

Observations: most of the -999 values are associated with jet number <= 1. The only feature that still contains some -999 values is the first one.

In [96]:
# the jet analysis below runs on its own copy of data_merged
all_data = data_merged.copy()

In [12]:
all_data.shape

(818238, 30)

Divide the data depending on the jet number (column 22) that is a categorical number in {0, 1, 2, 3}

In [97]:
# one sub-table per jet number, selected with a boolean row mask on column 22
jet_number = all_data[:, 22]
jets_0, jets_1, jets_2, jets_3 = (all_data[jet_number == jet, :] for jet in range(4))
tuple(subset.shape for subset in (jets_0, jets_1, jets_2, jets_3))

((327371, 30), (252882, 30), (165027, 30), (72958, 30))

Where are the -999 values?
- jet = 0: columns [4, 5, 6, 12, 23, 24, 25, 26, 27, 28] contain only -999 values, 26.1% of entries in the first column have -999
- jet = 1: columns [4, 5, 6, 12, 26, 27, 28] contain only -999 values, 7.6% of entries in the first column have -999
- jet = 2: 3% of entries in the first column have -999
- jet = 3: 1.4% of entries in the first column have -999

The two tables relative to jet 2 and jet 3 could be merged. The -999 values in the first column will be removed from that column and a boolean column will be created to indicate the position of the -999 values.

In [None]:
# for each jet sub-table, report per column how many entries differ from -999
jet_subsets = (jets_0, jets_1, jets_2, jets_3)
for jet_num, subset in enumerate(jet_subsets):
    print("Features in the dataset with jet=", jet_num, "contains this many values != -999")
    for feature in range(30):
        n_kept = np.sum(subset[:, feature] != -999)
        print(feature, n_kept)
    print()

Where are the 0 values?
- jet = 0: column 29 contains only 0s and, obviously, column 22 too since it stores the jet num.
- jet = 1: spread
- jet = 2: spread
- jet = 3: spread

In [None]:
# for each jet sub-table, report per column how many entries differ from 0
jet_subsets = (jets_0, jets_1, jets_2, jets_3)
for jet_num, subset in enumerate(jet_subsets):
    print("Features in the dataset with jet=", jet_num, "contains this many values != 0")
    for feature in range(30):
        n_nonzero = np.sum(subset[:, feature] != 0)
        print(feature, n_nonzero)
    print()

After this first step we surely want to drop the following columns since they do not contain any useful information:
- jet = 0: [4, 5, 6, 12, 22, 23, 24, 25, 26, 27, 28, 29]
- jet = 1: [4, 5, 6, 12, 22, 26, 27, 28] 
- jet = 2: [22]
- jet = 3: [22]

The column 22 is dropped in every obtained dataset since it just stores a constant representing the jet number. 

### Divide data and compute the correlation matrix 

We now divide our data, drop the above columns and verify if there are some highly correlated features. If so, it is worth trying to drop all but 1 column in a set of correlated features.

In [157]:
# split data
datasets = split_input_data(data_merged) # split and drop
datasets[0].shape, datasets[1].shape, datasets[2].shape, datasets[3].shape

((327371, 18), (252882, 22), (165027, 29), (72958, 29))

In [163]:
# compute correlation matrices
corr_matrices = [None]*4
for jet in range(4):
    # don't consider the first column since it contains nan values (we will simply keep that column)
    corr_matrices[jet] = np.corrcoef(datasets[jet][:, 1:].T) 
    
    # to keep the same indexing of the columns just add one row above and one column at the left
    corr_matrices[jet] = np.column_stack((np.zeros((corr_matrices[jet].shape[0], 1)), corr_matrices[jet]))   
    corr_matrices[jet] = np.row_stack((np.zeros((1, corr_matrices[jet].shape[1])), corr_matrices[jet]))

corr_matrices[0].shape, corr_matrices[1].shape, corr_matrices[2].shape, corr_matrices[3].shape

((18, 18), (22, 22), (29, 29), (29, 29))

In [155]:
# scratch check of the stacking order: the first argument becomes the top row
# (np.vstack instead of its deprecated alias np.row_stack)
np.vstack((np.array([1, 2, 3]), np.zeros((3, 3))))

array([[ 1.,  2.,  3.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [147]:
# compute the mapping of correlations > min_corr
min_corr = 0.7
corr_mappings = [{}]*4
for jet in range(4):
    corr_matrix_bool = np.abs(corr_matrices[jet]) > min_corr 
    nfeature = corr_matrix_bool.shape[0]
    # i is surely correlated to itself, drop that (useless) information
    for i in range(nfeature):
        corr_matrix_bool[i][i] = False

    # compute the mapping of correlations
    for i in range(nfeature):
        c = np.where(corr_matrix_bool[i])[0].tolist()
        if len(c) > 0: # if it is not correlated to any other column then ignore it
            corr_mappings[jet][i] = c
            
len(corr_mappings[0]), len(corr_mappings[1]), len(corr_mappings[2]), len(corr_mappings[3])

17
21
28
28


(18, 18, 18, 18)

In [141]:
# Greedy selection of the columns to drop, for the mapping at index 0 only:
# repeatedly pick the column with the most remaining correlated partners, mark
# those partners for deletion, and remove them from every other list.
# The copy is shallow, which is enough here: entries of `corr` are only ever
# rebound to new lists, the original lists are never modified in place.
corr = corr_mappings[0].copy()
tobe_deleted = []
# fetch all the columns that can be deleted and put them in tobe_deleted
# (at most one pick per key initially present, hence the loop bound)
for _ in range(len(corr)): 
    longer_key = -1 
    longer_length = 0

    # look for the longest list; with the strict `>` the first key met wins a tie
    for key in corr:
        curr_length = len(corr[key])
        if curr_length > longer_length:
            longer_length = curr_length
            longer_key = key

    if longer_length == 0: # every remaining list is empty: nothing left to drop
        break
        
    # collected as a list of lists; flattened once the loop is over
    tobe_deleted.append(corr[longer_key])
    # delete all the columns that are correlated to column longer_key
    # i.e. all the columns whose index is in corr[longer_key]
    for corr_colum in corr[longer_key]:
        corr[corr_colum] = []

    # since those columns have been dropped they must be removed from all the other lists
    # (the set difference does not preserve the order of the remaining indices)
    for key in corr: 
        if key != longer_key:
            #print(key, corr[key], "-", corr[longer_key], "=", list(set(corr[key]) - set(corr[longer_key])))
            corr[key] = list(set(corr[key]) - set(corr[longer_key]))
    # the picked column is kept; empty its list so it is not picked again
    corr[longer_key] = []

# flatten the list of lists and sort the column indices in ascending order
tobe_deleted = [val for sublist in tobe_deleted for val in sublist]
tobe_deleted.sort()

In [142]:
corr_mappings[0]

{0: [14],
 2: [18],
 3: [4, 5],
 4: [3],
 5: [3],
 6: [11],
 8: [20, 21, 24, 27],
 9: [15],
 11: [6],
 14: [0],
 15: [9],
 16: [2, 5, 17, 20],
 17: [2, 5, 16, 20],
 18: [2],
 20: [8, 21, 24, 27],
 21: [8, 20, 27],
 24: [8, 20, 27],
 27: [8, 20, 21, 24]}

In [143]:
tobe_deleted

[2, 3, 5, 6, 14, 15, 17, 20, 21, 24, 27]

### Compute mean and std

We must use the same standardisation process both for the training and for predicting. Since the -999 values will be dropped from the first column I remove them to compute the mean and the std.

In [102]:
data_merged.shape

(818238, 30)

In [103]:
# split the merged data again and show the shape of each of the four parts
datasets = split_input_data(data_merged)
tuple(datasets[jet].shape for jet in range(4))

((327371, 18), (252882, 22), (165027, 29), (72958, 29))

In [106]:
# correlation between the columns of the first dataset, leaving out its first column;
# rowvar=False makes np.corrcoef treat columns as the variables
jet0_features = datasets[0][:, 1:]
corr_jet0 = np.corrcoef(jet0_features, rowvar=False)
corr_jet0.shape

(17, 17)

In [66]:
# for every column of the first dataset, print whether all of its entries
# compare equal to its first entry
d = datasets[0]
for column in d.T:
    first_value = column[0]
    print(np.all(column == first_value))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
