In [33]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Cleaning

In [50]:
import pandas as pd
import seaborn as sns

In [51]:
df = pd.read_csv('../train.csv')

In [52]:
df.shape

(250000, 32)

In [53]:
df[df.Prediction=='s'].shape

(85667, 32)

In [54]:
df[df.Prediction=='b'].shape

(164333, 32)

In [55]:
feat_names = [col for col in df.columns if col not in ['Id','Prediction']]

for colname in feat_names:
    df.loc[df[colname]==-999.000,[colname]]=df[colname][df[colname]!=-999.000].median()

In [56]:
df.describe()

Unnamed: 0,Id,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,224999.5,120.417434,49.239819,81.181982,57.895962,2.193104,268.220619,-0.411629,2.3731,18.917332,...,-0.010119,209.797178,0.979176,77.124366,-0.001966,-0.020629,50.739149,-0.010535,-0.001879,73.064591
std,72168.927986,52.859218,35.344886,40.828691,63.655682,0.948104,224.229668,1.94852,0.782911,22.273494,...,1.812223,126.499506,0.977426,47.939899,1.382703,1.405084,17.792135,1.094446,0.978743,98.015662
min,100000.0,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
25%,162499.75,95.665,19.241,59.38875,14.06875,2.107,225.885,-0.244,1.81,2.841,...,-1.575,123.0175,0.0,57.439,-0.433,-0.556,47.902,-0.01,-0.002,0.0
50%,224999.5,112.406,46.524,73.752,38.4675,2.107,225.885,-0.244,2.4915,12.3155,...,-0.024,179.739,1.0,65.561,0.0,-0.033,47.902,-0.01,-0.002,40.5125
75%,287499.25,130.60625,73.598,92.259,79.169,2.107,225.885,-0.244,2.961,27.591,...,1.561,263.37925,2.0,75.349,0.433,0.503,47.902,-0.01,-0.002,109.93375
max,349999.0,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


In [57]:
df.shape

(250000, 32)

In [58]:
sum(df["DER_mass_MMC"]==-999.00)

0

In [59]:
df.to_csv('../train_mod.csv', index=False)

In [59]:
phys_feats = ['Id','Prediction'] + DER_features

In [60]:
df_phys = df[phys_feats]

In [62]:
df_phys.to_csv('../train_phys.csv', index=False)

In [66]:
df[DER_features].as_matrix().shape

(250000, 13)

In [78]:
test = pd.read_csv('../test.csv')
test_phys = test[phys_feats]

In [82]:
test_phys.to_csv('../test_phys.csv',index=False)

## PCA train

In [11]:
M = df[feat_names].as_matrix()
U, s, Vt = np.linalg.svd(M, full_matrices=False)
V = Vt.T

# PCs are already sorted by descending order 
# of the singular values (i.e. by the
# proportion of total variance they explain)

# if we use all of the PCs we can reconstruct the noisy signal perfectly
S = np.diag(s)
Mhat = np.dot(U, np.dot(S, V.T))
print("Using all PCs, MSE = %.6G" %(np.mean((M - Mhat)**2)))

# if we use only the first 20 PCs the reconstruction is less accurate
Mhat2 = np.dot(U[:, :20], np.dot(S[:20, :20], V[:,:20].T))
print("Using first 20 PCs, MSE = %.6G" %(np.mean((M - Mhat2)**2)))

Using all PCs, MSE = 3.52782E-25
Using first 20 PCs, MSE = 0.150423


In [12]:
singulars = np.diag(S)**2
singulars.cumsum()/singulars.sum()

array([ 0.8231826 ,  0.93230183,  0.97205019,  0.97924848,  0.98508109,
        0.98984158,  0.99247884,  0.99494948,  0.99635745,  0.99760283,
        0.99863844,  0.99941667,  0.99992348,  0.99993718,  0.99994968,
        0.99995916,  0.99996818,  0.99997438,  0.99997951,  0.99998454,
        0.99998818,  0.99999129,  0.9999941 ,  0.99999642,  0.99999859,
        0.99999922,  0.99999964,  0.99999991,  1.        ,  1.        ])

In [13]:
up_to = 20
Mhat = np.dot(U[:, :up_to], np.sqrt(S[:up_to, :up_to]))
Mhat.shape

(250000, 20)

In [14]:
PCS_train = np.dot(np.sqrt(S[:up_to, :up_to]), V[:,:up_to].T).T
PCS_train.shape

(30, 20)

In [15]:
df.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2.0,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,2.107,225.885,-0.244,3.473,...,-1.916,164.546,1.0,46.226,0.725,1.158,47.902,-0.01,-0.002,46.226
2,100002,b,112.406,162.172,125.953,35.635,2.107,225.885,-0.244,3.148,...,-2.186,260.414,1.0,44.251,2.053,-2.028,47.902,-0.01,-0.002,44.251
3,100003,b,143.905,81.417,80.943,0.414,2.107,225.885,-0.244,3.31,...,0.06,86.062,0.0,65.561,0.0,-0.033,47.902,-0.01,-0.002,0.0
4,100004,b,175.864,16.915,134.805,16.405,2.107,225.885,-0.244,3.891,...,-0.871,53.131,0.0,65.561,0.0,-0.033,47.902,-0.01,-0.002,0.0


In [16]:
##Dummy names for PCA features (must match format)
df_pca = pd.DataFrame(Mhat)
df_pca['Id'] = df['Id']
df_pca['Prediction'] = df['Prediction']
df_pca.columns = feat_names[:up_to] + ['Id','Prediction']
df_pca = df_pca[['Id','Prediction']+feat_names[:up_to]]
df_pca.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met
0,100000,s,-0.769838,-0.54304,-0.254366,0.12266,-0.2922,-0.290423,-0.06644,0.045273,...,-0.031438,0.159607,0.044618,0.065721,0.017606,0.034698,0.110437,-0.005449,-0.002059,-0.083173
1,100001,b,-0.746935,-0.02149,-0.450921,-0.120646,0.00236,0.013581,-0.171292,-0.011218,...,0.014835,-0.079058,-0.145778,-0.056469,0.099484,0.038748,0.06641,0.007465,-0.009017,0.000586
2,100002,b,-0.904274,-0.252481,-0.490765,0.280186,0.386489,-0.602813,-0.082314,-0.715058,...,0.13095,-0.075938,0.228257,-0.069395,0.045872,-0.033533,0.0016,-0.082832,-0.09266,0.018978
3,100003,b,-0.594008,0.22327,-0.535427,-0.165784,0.16564,-0.199944,0.107131,0.127347,...,-0.139498,0.022658,0.014392,0.074639,0.015575,0.059463,-0.043838,-0.026064,-0.040767,0.008549
4,100004,b,-0.566397,0.275458,-0.636252,-0.407153,-0.286477,0.171639,0.156505,0.230334,...,0.095634,-0.009633,-0.222229,-0.079581,0.040057,0.000899,-0.027718,-0.015715,-0.011459,-0.000286


In [17]:
df_pca.shape

(250000, 22)

In [18]:
df_pca.to_csv('../train_pca.csv',index=False)

## PCA test

In [60]:
df_pca_test = pd.read_csv('../test.csv')
df_pca_test.shape

(568238, 32)

In [61]:
feat_names = [col for col in df_pca_test.columns if col not in ['Id','Prediction']]

In [21]:
pca_test = df_pca_test[feat_names].as_matrix()
pca_test.shape

(568238, 30)

In [22]:
pca_test = np.dot(pca_test,PCS_train)
pca_test.shape

(568238, 20)

In [23]:
pca_test = pd.DataFrame(pca_test)
pca_test['Id'] = df_pca_test['Id']
pca_test['Prediction'] = df_pca_test['Prediction']
pca_test.columns = feat_names[:up_to] + ['Id','Prediction']
pca_test = pca_test[['Id','Prediction'] + feat_names[:up_to]]
pca_test.shape

(568238, 22)

In [24]:
pca_test.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met
0,350000,?,-137351.948492,22192.003098,-18029.576894,984.706527,7691.652369,-2615.330753,432.169987,4059.423219,...,-2752.418848,292.968795,311.324087,19.88765,-36.920995,65.359929,-17.140139,-8.339968,-5.822458,1.617966
1,350001,?,-179867.751988,-2331.484286,-14238.875034,990.215239,700.027427,-1889.995886,102.242738,-3109.724679,...,62.932828,-853.152559,-1014.153567,22.453747,-26.012374,-103.568756,-40.421765,-2.865803,11.091086,-3.942387
2,350002,?,-153518.811579,14658.983079,-24360.761221,-1357.676677,290.089814,-2164.740222,4242.70242,-640.600716,...,71.616095,-258.511736,415.152368,62.245676,10.791008,-85.966516,-12.843907,-5.712181,12.58923,-2.301353
3,350003,?,-166742.950919,7399.379408,-25143.885374,3492.205755,-2158.616233,2161.549444,3386.814426,1235.485043,...,2.391499,-286.307508,956.057499,-80.504179,-26.207603,8.967708,1.61974,1.319876,14.007009,-3.670213
4,350004,?,-632437.284863,-141068.749431,78245.618974,18715.424491,-16951.449081,-32743.328433,3836.826536,7392.087744,...,-524.609267,-973.260609,-990.801357,-25.417641,-30.599906,-78.944951,-75.417246,-3.814281,15.366144,33.129219


In [25]:
pca_test.to_csv('../test_pca.csv',index=False)