# Import data, compute features, train-val split, feature selection, pre-process, & save
## Import
### Modules

In [1]:
from __future__ import division
%matplotlib inline
import sys
sys.path.append('../')
from Modules.Basics import *
from ML_Tools.General.Feature_Selection import *
from ML_Tools.Transformations.HEP_Proc import *

  from ._conv import register_converters as _register_converters
  from pandas.core import datetools
Using TensorFlow backend.


### Data

In [6]:
def importData(name):
    data = pandas.read_csv(dirLoc + name + '.csv')
    data.rename(index=str, columns={"Weight": "gen_weight", 'PRI_met':'PRI_met_pt'}, inplace=True)
    data['gen_target'] = 0
    data.loc[data.Label == 's', 'gen_target'] = 1
    data.drop(columns=['Label'], inplace=True)
    return data

In [7]:
trainingData = importData('training')
trainingData.drop(columns=['EventId'], inplace=True)

In [8]:
trainingData.head()

Unnamed: 0,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,gen_weight,gen_target
0,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,197.76,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,1
1,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,125.157,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,0
2,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,197.814,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,0
3,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,75.968,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,0
4,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,57.983,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,0


In [9]:
print [x for x in trainingData.columns], len(trainingData.columns)

['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met_pt', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt', 'gen_weight', 'gen_target'] 32


## Feature processing

In [10]:
rotate = True
cartesian = False

In [11]:
def rotateEvent(inData):
    '''Rotate even in phi such that hardest | leptoninc tau (ll | hh) is at phi == 0'''
    inData['PRI_tau_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_tau_phi'])
    inData['PRI_jet_leading_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_jet_leading_phi'])
    inData['PRI_jet_subleading_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_jet_subleading_phi'])
    inData['PRI_met_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_met_phi'])
    
def convertData(inData):
    '''Pass data through conversions and drop uneeded columns'''
    
    if rotate:
        rotateEvent(inData)
    
    if cartesian:
        moveToCartesian(inData, 'PRI_tau', drop=True)
        moveToCartesian(inData, 'PRI_lep', drop=True)
        moveToCartesian(inData, 'PRI_jet_leading', drop=True)
        moveToCartesian(inData, 'PRI_jet_subleading', drop=True)
        moveToCartesian(inData, 'PRI_met', z=False)
        
        inData.drop(columns=["PRI_met_phi"], inplace=True)
        
    if rotate and not cartesian:
        inData.drop(columns=["PRI_lep_phi"], inplace=True)
    elif rotate and cartesian:
        inData.drop(columns=["PRI_lep_py"], inplace=True)

In [12]:
convertData(trainingData)

In [13]:
trainingData.columns

Index([u'DER_mass_MMC', u'DER_mass_transverse_met_lep', u'DER_mass_vis',
       u'DER_pt_h', u'DER_deltaeta_jet_jet', u'DER_mass_jet_jet',
       u'DER_prodeta_jet_jet', u'DER_deltar_tau_lep', u'DER_pt_tot',
       u'DER_sum_pt', u'DER_pt_ratio_lep_tau', u'DER_met_phi_centrality',
       u'DER_lep_eta_centrality', u'PRI_tau_pt', u'PRI_tau_eta',
       u'PRI_tau_phi', u'PRI_lep_pt', u'PRI_lep_eta', u'PRI_met_pt',
       u'PRI_met_phi', u'PRI_met_sumet', u'PRI_jet_num', u'PRI_jet_leading_pt',
       u'PRI_jet_leading_eta', u'PRI_jet_leading_phi',
       u'PRI_jet_subleading_pt', u'PRI_jet_subleading_eta',
       u'PRI_jet_subleading_phi', u'PRI_jet_all_pt', u'gen_weight',
       u'gen_target'],
      dtype='object')

## Train-Validation Split

In [14]:
trainIndeces, valIndeces = splitDevVal(trainingData)
train = trainingData.loc[trainIndeces]
val = trainingData.loc[valIndeces]
print len(train), len(val)

200000 50000


## Feature Selection
### Get important features

In [15]:
trainFeatures = [x for x in train.columns if 'gen' not in x]
print len(trainFeatures)

29


In [13]:
%%time
importantFeatures = rankClassifierFeatures(train, trainFeatures)

Running fold 1 /10


KeyboardInterrupt: 

In [None]:
print importantFeatures[0]

In [16]:
importantFeatures = trainFeatures

## Pre-process data

### PCA test

In [17]:
pcaPipe, outputPipe = getPreProcPipes(normIn=True, pca=True, normPCA=True)
pcaFeatures = [x for x in importantFeatures if 'DER' in x]
pcaPipe.fit(train[pcaFeatures].values.astype('float32'))

Pipeline(memory=None,
     steps=[('normIn', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('normPCA', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [18]:
xgCompare([train[pcaFeatures],
           pcaPipe.transform(train[pcaFeatures].values.astype('float32'))],
          [train['gen_target'],
           train['gen_target']])

TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [19]:
classTrainFeatures = importantFeatures

In [20]:
print classTrainFeatures

['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_met_pt', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt']


In [21]:
inputPipe, outputPipe = getPreProcPipes(normIn=True)
inputPipe.fit(train[classTrainFeatures].values.astype('float32'))

Pipeline(memory=None,
     steps=[('normIn', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [22]:
with open(dirLoc + 'inputPipe.pkl', 'w') as fout:
    pickle.dump(inputPipe, fout)

In [23]:
def saveBatch(inData, n, inputPipe, outFile, normWeights):
    X = inputPipe.transform(inData[classTrainFeatures].values.astype('float32'))
    
    if normWeights:
        inData.loc[inData.gen_target == 0, 'gen_weight'] = inData.loc[inData.gen_target == 0, 'gen_weight']/np.sum(inData.loc[inData.gen_target == 0, 'gen_weight'])
        inData.loc[inData.gen_target == 1, 'gen_weight'] = inData.loc[inData.gen_target == 1, 'gen_weight']/np.sum(inData.loc[inData.gen_target == 1, 'gen_weight'])
    
    X_weights = inData['gen_weight'].values.astype('float32')
    y = inData['gen_target'].values.astype('int')
    
    grp = outFile.create_group('fold_' + str(n))
    
    inputs = grp.create_dataset("inputs", shape=X.shape, dtype='float32')
    inputs[...] = X
    
    targets = grp.create_dataset("targets", shape=y.shape, dtype='int')
    targets[...] = y
    
    weights = grp.create_dataset("weights", shape=y.shape, dtype='float32')
    weights[...] = X_weights

In [24]:
def prepareSample(inData, mode, inputPipe, normWeights, N=10):
    print "Running", mode
    os.system('rm ' + dirLoc + mode + '.hdf5')
    outFile = h5py.File(dirLoc + mode + '.hdf5', "w")
            
    kf = StratifiedKFold(n_splits=N, shuffle=True)
    folds = kf.split(inData, inData['gen_target'])

    for i, (train, test) in enumerate(folds):
        print "Saving fold:", i, "of", len(test), "events"
        saveBatch(inData.iloc[test], i, inputPipe, outFile, normWeights)

In [25]:
prepareSample(train, 'train', inputPipe, True)
prepareSample(val, 'val', inputPipe, False)

Running train
Saving fold: 0 of 20001 events


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Saving fold: 1 of 20001 events
Saving fold: 2 of 20001 events
Saving fold: 3 of 20000 events
Saving fold: 4 of 20000 events
Saving fold: 5 of 20000 events
Saving fold: 6 of 20000 events
Saving fold: 7 of 19999 events
Saving fold: 8 of 19999 events
Saving fold: 9 of 19999 events
Running val
Saving fold: 0 of 5000 events
Saving fold: 1 of 5000 events
Saving fold: 2 of 5000 events
Saving fold: 3 of 5000 events
Saving fold: 4 of 5000 events
Saving fold: 5 of 5000 events
Saving fold: 6 of 5000 events
Saving fold: 7 of 5000 events
Saving fold: 8 of 5000 events
Saving fold: 9 of 5000 events
