In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import scripts.implementations as lib  # Add personal library
import scripts.proj1_helpers as helper  # Add personal library

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Folder holding the CSV data files; file paths used in this notebook are joined onto it.
DATA_FOLDER = 'data'

# 1. Data Exploration

We first load the data to see how the labels are distributed. Each sample is labeled either `s` (signal) or `b` (background). About 2/3 of the samples (65.73%) are labeled as background.

In [None]:
# Location of the training CSV inside the data folder.
DATA_TRAIN = os.path.join(DATA_FOLDER, 'train.csv')

# The helper's result is unpacked into the four objects used by the cells below:
# labels, feature matrix, sample ids and column names.
train_set = helper.load_csv_data(DATA_TRAIN)
yb, input_data, ids, header = train_set

In [None]:
# Share of each class among the training labels, expressed in percent.
signal_pct = 100 * np.mean(yb == 1)
background_pct = 100 * np.mean(yb == -1)
print('Repartition of {} labels, s: {:.2f}%, b: {:.2f}%'.format(len(yb), signal_pct, background_pct))

Unmeasured values are encoded as `-999` in the data, so let's replace them with NaN to make them easier to handle.

In [None]:
# -999 marks an unmeasured value; overwrite those entries in place with NaN.
missing_mask = input_data == -999
input_data[missing_mask] = np.nan

Let's now take a look at the distribution of the NaN values across the features. We can see that some features seem to have the same number of NaN values. The second graph shows that some features seem to have NaN values at exactly the same locations.

In [None]:
# Bar chart of the number of NaN entries found in each feature (column).
nan_count_per_feature = np.sum(np.isnan(input_data), axis=0)
feature_positions = np.arange(len(header))

plt.figure(figsize=(16, 4))
plt.bar(feature_positions, nan_count_per_feature)
plt.xticks(feature_positions, header, rotation='vertical')
# Upper limit is the total number of samples, so a bar's height reads as a share of the data set.
plt.ylim(0, len(yb))
plt.xlabel('Features')
# Fix: this was a second plt.xlabel call, which overwrote 'Features' and left the y axis unlabelled.
plt.ylabel('#Sample')
plt.title('NAN sum per feature')
plt.grid()
plt.show()

In [None]:
# Map of where the NaN entries sit for the first 100 samples.
# After the transpose, rows are features and columns are samples.
nan_location = np.isnan(input_data[:100, :]).T

# Fix: plt.matshow() opens its own figure, so the figure created with
# figsize=(14, 20) stayed empty. Drawing on an explicit Axes keeps everything
# in a single figure of the requested size.
fig, ax = plt.subplots(figsize=(14, 20))
ax.matshow(nan_location)
ax.set_yticks(np.arange(len(header)))
ax.set_yticklabels(header)
# Fix: the x label was set twice ('Features' then '#Sample') and the y axis had no label.
ax.set_xlabel('#Sample')
ax.set_ylabel('Features')
# Fix: the title was copied from the bar chart; this plot shows locations, not sums.
ax.set_title('NaN location per feature (first 100 samples)')
plt.show()

Does the presence of NaN values give us any information about the label (`s` or `b`)? We can see that if no NaN is present, we are more likely to find a signal `s`. If a NaN is present, we seem to be close to the initial distribution, with a 34%-66% ratio.

In [None]:
# Class shares (as fractions, not percentages) among the rows that contain at
# least one NaN and among the rows that contain none.
has_nan = np.isnan(input_data).any(axis=1)
labels_with_nan = yb[has_nan]
labels_without_nan = yb[~has_nan]

print('NaN is present, s: {:.2f}, b: {:.2f}'.format(
    np.mean(labels_with_nan == 1), np.mean(labels_with_nan == -1)))
print('NaN is not present, s: {:.2f}, b: {:.2f}'.format(
    np.mean(labels_without_nan == 1), np.mean(labels_without_nan == -1)))

In [None]:
# Boolean mask of the features (columns) that contain no NaN at all,
# then the feature matrix restricted to those columns.
keep_id = ~np.isnan(input_data).any(axis=0)
x = input_data[:, keep_id]

# 2. Model

In [None]:
from scripts.implementations import build_poly, least_squares

# Design matrix: a leading column of ones (intercept term) followed by the NaN-free features.
n_samples = np.shape(x)[0]
xt = np.hstack((np.ones((n_samples, 1)), x))

# Least-squares fit of the labels on the design matrix.
# NOTE(review): the unpacking assumes least_squares returns (loss, w) in that order - confirm against the helper.
loss, w = least_squares(yb, xt)

In [None]:
# Number of samples the linear model scores as positive, then as negative.
scores = xt.dot(w)
print(np.sum(scores > 0))
print(np.sum(scores < 0))

In [None]:
def accuracy(y_cgt, y_pred):
    """Print and return the fraction of predictions whose sign matches the label.

    Args:
        y_cgt: array-like of ground-truth labels, compared element-wise with
            the sign of `y_pred` (this notebook uses -1 / +1 labels).
        y_pred: array-like of real-valued scores, same shape as `y_cgt`;
            only their sign is used.

    Returns:
        The fraction of entries where ``np.sign(y_pred) == y_cgt``. The value
        is also printed, as the function did before it returned anything.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    y_cgt = np.asarray(y_cgt)
    y_pred_s = np.sign(np.asarray(y_pred))
    # Guard against silent broadcasting, e.g. (n,) labels against (n, 1) scores.
    if y_cgt.shape != y_pred_s.shape:
        raise ValueError('y_cgt and y_pred must have the same shape, got {} and {}'.format(
            y_cgt.shape, y_pred_s.shape))
    # A score of exactly 0 has sign 0 and therefore never matches a -1 / +1 label.
    # Fix: normalise by the labels passed in, not by the notebook-level `yb`,
    # so the result is right for any subset (e.g. a validation split).
    acc = np.mean(y_cgt == y_pred_s)
    print(acc)
    return acc
    
# Accuracy of the least-squares fit on the data it was trained on; the trailing ';' keeps the cell free of an Out[] echo.
accuracy(yb, np.dot(xt, w));