In [22]:
import os
import matplotlib.pyplot as plt
import numpy as np
import scripts.implementations as lib  # Add personal library
import scripts.proj1_helpers as helper  # Add personal library

%matplotlib inline
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)

# Paths of the CSV files, relative to the notebook's working directory.
DATA_FOLDER = 'data'
DATA_TRAIN, DATA_TEST = (
    os.path.join(DATA_FOLDER, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Read the training file.
# NOTE(review): judging by the names, this yields labels, feature matrix,
# sample ids and column names -- confirm against helper.load_csv_data.
y, x, ids, header = helper.load_csv_data(DATA_TRAIN)

# Split the loaded data into a training part and a validation part.
# NOTE(review): 0.8 is presumably the fraction kept for training -- confirm
# against lib.sep_valid_train_data.
(y_train, x_train,
 y_validation, x_validation) = lib.sep_valid_train_data(x, y, 0.8)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Entries equal to -999 are treated as missing: overwrite them with NaN,
# in place, in both the training and the validation features.
MISSING_VALUE = -999
for features in (x_train, x_validation):
    features[features == MISSING_VALUE] = np.nan

In [71]:
# Keep only the columns of x_train that contain no NaN at all
nan_free_cols = ~np.isnan(x_train).any(axis=0)
keep_id = np.flatnonzero(nan_free_cols)
x_naive = x_train[:, keep_id]
head_naive = header[keep_id]

# Standardize every remaining column: subtract its mean, divide by its std
naive_mean = x_naive.mean(axis=0)
naive_std = x_naive.std(axis=0)
x_naive = (x_naive - naive_mean) / naive_std

# Names of the features that survived the NaN filter
print(head_naive)

['DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'PRI_tau_pt' 'PRI_tau_eta' 'PRI_tau_phi'
 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi' 'PRI_met' 'PRI_met_phi'
 'PRI_met_sumet' 'PRI_jet_num' 'PRI_jet_all_pt']


In [63]:
from scripts.ml import cross_validation_ls

# Sweep polynomial degrees 1..6 and report the cross-validated accuracy of
# least squares on all NaN-free features.
degrees = np.arange(1, 7)
report = '{}/{} Least square deg {} with acc {:.4f}'
for step, degree in enumerate(degrees, start=1):
    acc, _, _ = cross_validation_ls(y_train, x_naive, degree=degree)
    print(report.format(step, len(degrees), degree, acc))

1/6 Least square deg 1 with acc 0.7332
2/6 Least square deg 2 with acc 0.7545
3/6 Least square deg 3 with acc 0.7610
4/6 Least square deg 4 with acc 0.7484
5/6 Least square deg 5 with acc 0.7592
6/6 Least square deg 6 with acc 0.7044


In [54]:
from scripts.ml import build_poly, least_squares

# Expand the NaN-free features to polynomial degree 3, then fit least squares
# on the expanded matrix to obtain the training loss and the weight vector.
POLY_DEGREE = 3
_phi_train = build_poly(x_naive, POLY_DEGREE)
loss_tr, weights = least_squares(y_train, _phi_train)

In [68]:
# Drop the first weight and arrange the rest in rows of x_naive.shape[1]
# entries, i.e. one column per feature.
# NOTE(review): assumes build_poly emits a single leading bias column followed
# by one block of per-feature columns for each degree -- confirm against
# scripts.ml.build_poly.
w = weights[1:].reshape((-1, x_naive.shape[1]))
# Count the features whose largest absolute weight across rows exceeds 0.1
print(np.sum(np.max(np.abs(w), axis=0) > 0.1))

10


In [69]:
# Boolean mask over the columns of x_naive: True where the largest absolute
# weight across the rows of w exceeds 0.1
max_abs_weight = np.abs(w).max(axis=0)
feat_keep = max_abs_weight > 0.1
# Display the names of the selected features
head_naive[feat_keep]

array(['DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h',
       'DER_deltar_tau_lep', 'DER_sum_pt', 'DER_pt_ratio_lep_tau',
       'PRI_tau_pt', 'PRI_lep_pt', 'PRI_jet_num', 'PRI_jet_all_pt'],
      dtype='<U27')

In [70]:
# Import the one helper this cell actually calls, so the cell does not depend
# on another cell having already placed it in the kernel namespace.
from scripts.ml import cross_validation_ls

# Repeat the degree sweep (1..6) using only the columns selected by feat_keep
degrees = np.linspace(1, 6, 6).astype(int)
for i, degree in enumerate(degrees):
    acc, _, _ = cross_validation_ls(y_train, x_naive[:, feat_keep], degree=degree)
    print('{}/{} Least square deg {} with acc {:.4f}'.format(i+1, len(degrees), degree, acc))

1/6 Least square deg 1 with acc 0.7355
2/6 Least square deg 2 with acc 0.7479
3/6 Least square deg 3 with acc 0.7544
4/6 Least square deg 4 with acc 0.7534
5/6 Least square deg 5 with acc 0.7553
6/6 Least square deg 6 with acc 0.7295


In [90]:
from scripts.ml import augmented_feat_angle

# Column indices (within x_naive) passed to the angle augmentation helper
id_angle_feat = np.array([10, 13, 15])
# NOTE(review): x_naive has already been standardized at this point, so these
# columns no longer hold raw angle values -- confirm that
# augmented_feat_angle is meant to receive standardized inputs.
x_aug = augmented_feat_angle(x_naive, id_angle_feat)

# Append the augmented columns to the feat_keep subset, then standardize the
# combined matrix column by column.
stacked = np.concatenate((x_naive[:, feat_keep], x_aug), axis=1)
aug_mean = stacked.mean(axis=0)
aug_std = stacked.std(axis=0)
x_naive_aug = (stacked - aug_mean) / aug_std

# Degree sweep (1..6) on the augmented feature set
degrees = np.arange(1, 7)
report = '{}/{} Least square deg {} with acc {:.4f}'
for step, degree in enumerate(degrees, start=1):
    acc, _, _ = cross_validation_ls(y_train, x_naive_aug, degree=degree)
    print(report.format(step, len(degrees), degree, acc))

1/6 Least square deg 1 with acc 0.7357
2/6 Least square deg 2 with acc 0.6631
3/6 Least square deg 3 with acc 0.6215
4/6 Least square deg 4 with acc 0.5946
5/6 Least square deg 5 with acc 0.6269
6/6 Least square deg 6 with acc 0.5415


In [77]:
# Show which feature names sit at columns 10, 13 and 15 of the NaN-free set
angle_cols = [10, 13, 15]
print(head_naive[angle_cols])

['PRI_tau_phi' 'PRI_lep_phi' 'PRI_met_phi']
