In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import scripts.implementations as lib  # Add personal library
import scripts.proj1_helpers as helper  # Add personal library

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to the
# local `scripts` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Folder that holds the CSV files read later in the notebook (train.csv, test.csv).
DATA_FOLDER = 'data'

# 1. Data Exploration

We first load the data to see how the labels are distributed. Each sample is labeled `s` for signal or `b` for background. Around 2/3 of the samples (65.73%) are labeled as background.

## 1.1 Data loading


In [None]:
# Path of the training set, then load it through the project helper.
# The helper's 4-tuple is unpacked as (labels, feature matrix, sample ids, column names).
DATA_TRAIN = os.path.join(DATA_FOLDER, 'train.csv')
train_columns = helper.load_csv_data(DATA_TRAIN)
yb, input_data, ids, header = train_columns

In [None]:
# Share of each class in the training labels (+1 is reported as `s`, -1 as `b`).
n_labels = len(yb)
pct_signal = np.mean(yb == 1) * 100
pct_background = np.mean(yb == -1) * 100

print('Repartition of {} labels, s: {:.2f}%, b: {:.2f}%'.format(
    n_labels, pct_signal, pct_background))

Unmeasured values are encoded as `-999` in the data. Let's replace them with NaN so they are easier to handle.

In [None]:
# Replace the -999 sentinel with NaN, in place, so numpy's NaN-aware tools can be used.
is_missing = input_data == -999
input_data[is_missing] = np.nan

Let's now take a look at the distribution of the NaN values across the features. We can see that some features seem to have the same number of NaN values. The second graph shows that some features seem to have NaN values at exactly the same locations.

In [None]:
# Bar chart of the number of NaN entries in each feature column.
nan_counts = np.sum(np.isnan(input_data), axis=0)
feature_pos = np.arange(len(header))

plt.figure(figsize=(16, 4))
plt.bar(feature_pos, nan_counts)
plt.xticks(feature_pos, header, rotation='vertical')
# Use the full sample count as the upper bound so bar heights read as a share of the data.
plt.ylim(0, len(yb))
plt.xlabel('Features')
# Fix: this was a second `plt.xlabel` call, which overwrote 'Features' and
# left the y-axis unlabeled.
plt.ylabel('#Sample')
plt.title('NAN sum per feature')
plt.grid()
plt.show()

In [None]:
# Show where the NaN values sit for the first 100 samples.
# The mask is transposed, so rows (y-axis) are features and columns (x-axis) are samples.
nan_mask_head = np.isnan(input_data)[:100, :].T

# Fix: `plt.matshow` opens a new figure by default, which left the explicitly
# sized figure empty. Drawing on an explicit Axes keeps a single figure.
fig, ax = plt.subplots(figsize=(14, 20))
ax.matshow(nan_mask_head)
ax.set_yticks(np.arange(len(header)))
ax.set_yticklabels(header)
# Fix: the original called `plt.xlabel` twice and reused the bar chart's title.
ax.set_xlabel('#Sample')
ax.set_ylabel('Features')
ax.set_title('NaN locations (first 100 samples)')
plt.show()

Does the presence of NaN values give us any information about the label (`s` or `b`)? We can see that if no NaN is present, we are more likely to find a signal `s`. If a NaN is present, we stay close to the initial distribution with a 34%-66% ratio.

In [None]:
# Split the labels by whether the sample has at least one NaN feature,
# then report the fraction of +1 (`s`) and -1 (`b`) labels in each group.
has_nan = np.any(np.isnan(input_data), axis=1)
labels_with_nan = yb[has_nan]
labels_without_nan = yb[~has_nan]

print('NaN is present, s: {:.2f}, b: {:.2f}'.format(
    np.mean(labels_with_nan == 1), np.mean(labels_with_nan == -1)))
print('NaN is not present, s: {:.2f}, b: {:.2f}'.format(
    np.mean(labels_without_nan == 1), np.mean(labels_without_nan == -1)))

In [None]:
# Keep only the feature columns that contain no NaN at all.
nan_per_feature = np.isnan(input_data).sum(axis=0)
keep_id = nan_per_feature == 0
# Integer-array indexing returns a copy, so later changes to `x` leave `input_data` untouched.
x = input_data[:, np.flatnonzero(keep_id)]

## 1.2 Feature normalization


In [None]:
# Standardize each column of `x` in place: subtract its mean, divide by its standard deviation.
for col in range(x.shape[1]):
    column = x[:, col]
    x[:, col] = (column - np.mean(column)) / np.std(column)

# Sanity check: every column should now have mean ~0 and std ~1.
print('Means:', np.mean(x, axis=0), '\nStd:', np.std(x, axis=0))

# 2. Model

## 2.1 Least squares

In [None]:
from scripts.implementations import build_poly, least_squares, least_squares_GD, accuracy

# Build the degree-3 polynomial expansion of the normalized features and fit it
# with the project's least-squares routine, whose result unpacks as (loss, weights).
xt = build_poly(x, 3)
ls_fit = least_squares(yb, xt)
loss, w = ls_fit
print(loss)
# Gradient-descent variant, left disabled:
#loss, w = least_squares_GD(yb, xt, max_iters=700, loss_name='mae')
#print(loss)

In [None]:
# Score the fitted weights on the training data itself (no held-out set here).
accuracy(yb, np.dot(xt, w))

In [None]:
from scripts.ml import cross_validation_ls

# One entry per polynomial degree, in the order the degrees are tried.
_acc, _loss_tr, _loss_te = [], [], []

# Cross-validate least squares for polynomial degrees 1 through 5.
for degree in range(1, 6):
    print('Least square, deg: {}'.format(degree))
    acc, loss_tr, loss_te = cross_validation_ls(yb, x, degree=degree)
    _acc.append(acc)
    _loss_te.append(loss_te)
    _loss_tr.append(loss_tr)

In [None]:
# Plot the cross-validation score against the polynomial degree that produced it.
# Fix: `plt.plot(_acc)` used list indices 0..n-1 on the x-axis, although the
# loop that filled `_acc` starts at degree 1, and the axes had no labels.
degrees = np.arange(1, len(_acc) + 1)
plt.plot(degrees, _acc, marker='o')
plt.xticks(degrees)
plt.xlabel('Polynomial degree')
plt.ylabel('Accuracy')  # assumes `acc` from cross_validation_ls is an accuracy - TODO confirm
plt.title('Least squares: cross-validation accuracy per degree')
plt.grid()
plt.show()

In [None]:
from scripts.ml import cross_validation_ridge

# Reset the accumulators; one entry per polynomial degree, in the order tried.
_acc, _loss_tr, _loss_te = [], [], []

# Cross-validate ridge regression for polynomial degrees 1 through 5.
for degree in range(1, 6):
    print('Ridge, deg: {}'.format(degree))
    acc, loss_tr, loss_te = cross_validation_ridge(yb, x, degree=degree)
    _acc.append(acc)
    _loss_te.append(loss_te)
    _loss_tr.append(loss_tr)

In [None]:
# Plot the cross-validation score against the polynomial degree that produced it.
# Fix: `plt.plot(_acc)` used list indices 0..n-1 on the x-axis, although the
# loop that filled `_acc` starts at degree 1, and the axes had no labels.
degrees = np.arange(1, len(_acc) + 1)
plt.plot(degrees, _acc, marker='o')
plt.xticks(degrees)
plt.xlabel('Polynomial degree')
plt.ylabel('Accuracy')  # assumes `acc` from cross_validation_ridge is an accuracy - TODO confirm
plt.title('Ridge: cross-validation accuracy per degree')
plt.grid()
plt.show()

# 3. Submission test

In [None]:
# Path of the test set, then load it through the same helper as the training set.
# NOTE: this rebinds `ids` and `header`, which until now held the training values.
DATA_TEST = os.path.join(DATA_FOLDER, 'test.csv')
test_columns = helper.load_csv_data(DATA_TEST)
yb_test, data_test, ids, header = test_columns

In [None]:
# Mark unmeasured values (-999 sentinel) as NaN, as was done for the training data.
data_test[data_test == -999] = np.nan

# Fix: select the columns that were kept for TRAINING (NaN-free in `input_data`)
# instead of recomputing the mask from the test set. The weights are fitted on
# the training columns, so the test matrix must use exactly the same ones.
keep_id = np.sum(np.isnan(input_data), axis=0) == 0
kept_columns = np.nonzero(keep_id)[0]
x_test = data_test[:, kept_columns]
if np.isnan(x_test).any():
    # Fail loudly rather than silently producing NaN predictions.
    raise ValueError('Test data contains NaN in features that are NaN-free in the training data')

# Fix: normalize with the TRAINING mean/std rather than test-set statistics, so
# test features go through the same transform the model was fitted on.
# `input_data` still holds the raw (un-normalized) training values.
train_raw = input_data[:, kept_columns]
x_test = (x_test - np.mean(train_raw, axis=0)) / np.std(train_raw, axis=0)

# With training statistics these are close to, but not exactly, 0 and 1.
print('Means:', np.mean(x_test, axis=0), '\nStd:', np.std(x_test, axis=0))

In [None]:
# Polynomial degree used for the submission model.
degree = 3

# Expand train and test features with the same polynomial basis (train first, then test).
_phi_train, _phi_test = (build_poly(features, degree) for features in (x, x_test))

# Fit least squares on the expanded training features and report the training loss.
final_fit = least_squares(yb, _phi_train)
loss_tr, weights = final_fit
print(loss_tr)

In [None]:
from scripts.proj1_helpers import predict_labels, create_csv_submission

# Predict labels for the test set and write them with the test-set ids to the submission file.
SUBMISSION_FILE = 'first.csv'
y_pred = predict_labels(weights, _phi_test)
create_csv_submission(ids, y_pred, SUBMISSION_FILE)

In [None]:
# Sanity check: show how many predictions were produced.
n_predictions = len(y_pred)
n_predictions