# Import data, compute features, train-val split, feature selection, pre-process, & save
## Import
### Modules

In [1]:
from __future__ import division
%matplotlib inline
import sys
sys.path.append('../')
from Modules.Basics import *
from ML_Tools.General.Feature_Selection import *
from ML_Tools.Transformations.HEP_Proc import *

  from ._conv import register_converters as _register_converters
  from pandas.core import datetools
Using TensorFlow backend.


### Data

In [2]:
def importData(name):
    data = pandas.read_csv(dirLoc + name + '.csv')
    data.rename(index=str, columns={"Weight": "gen_weight", 'PRI_met':'PRI_met_pt'}, inplace=True)
    data['gen_target'] = 0
    if name == 'training':
        data.loc[data.Label == 's', 'gen_target'] = 1
        data.drop(columns=['Label'], inplace=True)        
    return data

In [3]:
trainingData = importData('training')
testingData = importData('test')

In [4]:
trainingData.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,gen_weight,gen_target
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,1
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,0
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,0
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,0
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,0


In [5]:
print [x for x in trainingData.columns], len(trainingData.columns)

['EventId', 'DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met_pt', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt', 'gen_weight', 'gen_target'] 33


## Feature processing

In [6]:
rotate = True
cartesian = True

In [7]:
def rotateEvent(inData):
    '''Rotate event in phi such that lepton is at phi == 0'''
    inData['PRI_tau_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_tau_phi'])
    inData['PRI_jet_leading_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_jet_leading_phi'])
    inData['PRI_jet_subleading_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_jet_subleading_phi'])
    inData['PRI_met_phi'] = deltaphi(inData['PRI_lep_phi'], inData['PRI_met_phi'])
    
def convertData(inData):
    '''Pass data through conversions and drop uneeded columns'''
    
    inData.loc[inData['DER_mass_MMC'] == -999.0, 'DER_mass_MMC'] = -1
    
    if rotate:
        rotateEvent(inData)
    
    if cartesian:
        moveToCartesian(inData, 'PRI_tau', drop=True)
        moveToCartesian(inData, 'PRI_lep', drop=True)
        moveToCartesian(inData, 'PRI_jet_leading', drop=True)
        moveToCartesian(inData, 'PRI_jet_subleading', drop=True)
        moveToCartesian(inData, 'PRI_met', z=False)
        
        inData.drop(columns=["PRI_met_phi"], inplace=True)
        
    if rotate and not cartesian:
        inData.drop(columns=["PRI_lep_phi"], inplace=True)
    elif rotate and cartesian:
        inData.drop(columns=["PRI_lep_py"], inplace=True)
        
    inData.replace([np.inf, -np.inf], np.nan, inplace=True)
    inData.fillna(-999.0, inplace=True)

In [8]:
convertData(trainingData)
convertData(testingData)

  inData[particle + '_pz'] = pt*np.sinh(eta)


In [9]:
trainingData.columns

Index([u'EventId', u'DER_mass_MMC', u'DER_mass_transverse_met_lep',
       u'DER_mass_vis', u'DER_pt_h', u'DER_deltaeta_jet_jet',
       u'DER_mass_jet_jet', u'DER_prodeta_jet_jet', u'DER_deltar_tau_lep',
       u'DER_pt_tot', u'DER_sum_pt', u'DER_pt_ratio_lep_tau',
       u'DER_met_phi_centrality', u'DER_lep_eta_centrality', u'PRI_met_pt',
       u'PRI_met_sumet', u'PRI_jet_num', u'PRI_jet_all_pt', u'gen_weight',
       u'gen_target', u'PRI_tau_px', u'PRI_tau_py', u'PRI_tau_pz',
       u'PRI_lep_px', u'PRI_lep_pz', u'PRI_jet_leading_px',
       u'PRI_jet_leading_py', u'PRI_jet_leading_pz', u'PRI_jet_subleading_px',
       u'PRI_jet_subleading_py', u'PRI_jet_subleading_pz', u'PRI_met_px',
       u'PRI_met_py'],
      dtype='object')

## Split by jet

In [10]:
def splitByJet(inData):
    outData = {}
    
    outData[0] = inData[inData['PRI_jet_num'] == 0]
    outData[1] = inData[inData['PRI_jet_num'] == 1]    
    outData[2] = inData[inData['PRI_jet_num'] >= 2] 
    
    if not cartesian:
    
        outData[0].drop(columns={'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality',
                                 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
                                 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 
                                 'PRI_jet_all_pt'}, inplace=True)

        outData[1].drop(columns={'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality',
                                 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi',
                                 'PRI_jet_all_pt'}, inplace=True)

    elif cartesian:
        outData[0].drop(columns={'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality',
                                 'PRI_jet_num', 'PRI_jet_leading_px', 'PRI_jet_leading_py', 'PRI_jet_leading_pz',
                                 'PRI_jet_subleading_px', 'PRI_jet_subleading_py', 'PRI_jet_subleading_pz', 
                                 'PRI_jet_all_pt'}, inplace=True)

        outData[1].drop(columns={'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality',
                                 'PRI_jet_subleading_px', 'PRI_jet_subleading_py', 'PRI_jet_subleading_pz',
                                 'PRI_jet_all_pt'}, inplace=True)
    
    return outData

In [11]:
splitJetTraining = splitByJet(trainingData)
splitJetTesting = splitByJet(testingData)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
for i in splitJetTraining:
    print i, len([x for x in splitJetTraining[i].columns if 'gen' not in x and 'EventId' not in x]) 

0 18
1 22
2 30


## Pre-process data

In [12]:
def getPipe(inData, features):
    inputPipe, outputPipe = getPreProcPipes(normIn=True)
    inputPipe.fit(inData[features].values.astype('float32'))
    return inputPipe

In [13]:
def saveBatch(inData, n, inputPipe, outFile, normWeights, mode, features):
    grp = outFile.create_group('fold_' + str(n))
    
    X = inputPipe.transform(inData[features].values.astype('float32'))
    
    inputs = grp.create_dataset("inputs", shape=X.shape, dtype='float32')
    inputs[...] = X
    
    if 'testing' not in mode:
        if normWeights:
            inData.loc[inData.gen_target == 0, 'gen_weight'] = inData.loc[inData.gen_target == 0, 'gen_weight']/np.sum(inData.loc[inData.gen_target == 0, 'gen_weight'])
            inData.loc[inData.gen_target == 1, 'gen_weight'] = inData.loc[inData.gen_target == 1, 'gen_weight']/np.sum(inData.loc[inData.gen_target == 1, 'gen_weight'])

        X_weights = inData['gen_weight'].values.astype('float32')
        y = inData['gen_target'].values.astype('int')

        targets = grp.create_dataset("targets", shape=y.shape, dtype='int')
        targets[...] = y

        weights = grp.create_dataset("weights", shape=X_weights.shape, dtype='float32')
        weights[...] = X_weights
    
    else:
        X_EventId = inData['EventId'].values.astype('int')
        
        EventId = grp.create_dataset("EventId", shape=X_EventId.shape, dtype='int')
        EventId[...] = X_EventId

In [14]:
def prepareSample(inData, mode, inputPipe, normWeights, N=10):
    print "Running", mode
    os.system('rm ' + dirLoc + mode + '.hdf5')
    outFile = h5py.File(dirLoc + mode + '.hdf5', "w")
    
    features = [x for x in inData.columns if 'gen' not in x and 'EventId' not in x]
    
    if isinstance(inputPipe, types.NoneType):
        inputPipe = getPipe(inData, features)
            
    kf = StratifiedKFold(n_splits=N, shuffle=True)
    folds = kf.split(inData, inData['gen_target'])
    
    for i, (train, test) in enumerate(folds):
        print "Saving fold:", i, "of", len(test), "events"
        saveBatch(inData.iloc[test], i, inputPipe, outFile, normWeights, mode, features)
        
    return inputPipe

In [15]:
for i in splitJetTraining:
    print "Splitting to validation"
    trainIndeces, valIndeces = splitDevVal(splitJetTraining[i])
    train = splitJetTraining[i].loc[trainIndeces]
    val = splitJetTraining[i].loc[valIndeces]
    print len(train), "training", len(val), "validation"
        
    inputPipe = prepareSample(train, 'train_' + str(i), None, True)
    prepareSample(val, 'val_' + str(i), inputPipe, False)
    
    prepareSample(splitJetTesting[i], 'testing_'+ str(i), inputPipe, False)

Splitting to validation
79930 training 19983 validation
Running train_0
Saving fold: 0 of 7994 events


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Saving fold: 1 of 7994 events
Saving fold: 2 of 7994 events
Saving fold: 3 of 7993 events
Saving fold: 4 of 7993 events
Saving fold: 5 of 7993 events
Saving fold: 6 of 7993 events
Saving fold: 7 of 7992 events
Saving fold: 8 of 7992 events
Saving fold: 9 of 7992 events
Running val_0
Saving fold: 0 of 1999 events
Saving fold: 1 of 1999 events
Saving fold: 2 of 1999 events
Saving fold: 3 of 1999 events
Saving fold: 4 of 1999 events
Saving fold: 5 of 1998 events
Saving fold: 6 of 1998 events
Saving fold: 7 of 1998 events
Saving fold: 8 of 1997 events
Saving fold: 9 of 1997 events
Running testing_0
Saving fold: 0 of 22016 events
Saving fold: 1 of 22016 events
Saving fold: 2 of 22016 events
Saving fold: 3 of 22016 events
Saving fold: 4 of 22016 events
Saving fold: 5 of 22016 events
Saving fold: 6 of 22015 events
Saving fold: 7 of 22015 events
Saving fold: 8 of 22015 events
Saving fold: 9 of 22015 events
Splitting to validation
62035 training 15509 validation
Running train_1
Saving fold: 0 o