# Import data, compute features, train-val split, feature selection, pre-process, & save
## Import
### Modules

In [1]:
from __future__ import division
%matplotlib inline
import sys
import os
os.environ['MKL_THREADING_LAYER']='GNU'
sys.path.append('../')
from Modules.Basics import *
from ML_Tools.General.Feature_Selection import *
from ML_Tools.Transformations.HEP_Proc import *

  from pandas.core import datetools
Using Theano backend.


RuntimeError: To use MKL 2018 with Theano you MUST set "MKL_THREADING_LAYER=GNU" in your environement.

### Data

In [None]:
def importData(name):
    '''Read <dirLoc><name>.csv into a DataFrame and add a 'gen_target' column.

    Columns 'Weight' and 'PRI_met' are renamed to 'gen_weight' and 'PRI_met_pt',
    and the index labels are passed through str. 'gen_target' is 0 everywhere,
    except that for name == 'training' rows whose Label is 's' get 1 and the
    'Label' column is then removed.
    '''
    frame = pandas.read_csv(dirLoc + name + '.csv').rename(
        index=str, columns={"Weight": "gen_weight", 'PRI_met':'PRI_met_pt'})
    frame['gen_target'] = 0

    if name != 'training':
        return frame

    # Only the training file is treated as carrying a Label column
    signalRows = frame.Label == 's'
    frame.loc[signalRows, 'gen_target'] = 1
    frame.drop(columns=['Label'], inplace=True)
    return frame

In [None]:
# Load both samples via importData ('training' is the only name that gets labels applied)
trainingData, testingData = importData('training'), importData('test')

In [None]:
# Preview the first rows of the training frame returned by importData
trainingData.head()

In [None]:
# Show the column names followed by how many there are
print("%s %s" % (list(trainingData.columns), len(trainingData.columns)))

## Feature processing

In [None]:
# Feature-processing switches read by convertData (both) and splitByJet (cartesian):
#   rotate    - pass each event through rotateEvent
#   cartesian - convert particle vectors with moveToCartesian
rotate, cartesian = True, True

In [None]:
def rotateEvent(inData):
    '''Rotate event in phi such that lepton is at phi == 0.

    Works in place on inData. The phi columns of the tau, leading jet,
    subleading jet and missing-ET are replaced by
    deltaphi(PRI_lep_phi, <their phi>), and PRI_lep_phi itself is then set to 0.

    Arguments:
        inData: DataFrame holding the PRI_*_phi columns listed below
    '''
    for prefix in ('PRI_tau', 'PRI_jet_leading', 'PRI_jet_subleading', 'PRI_met'):
        phiColumn = prefix + '_phi'
        inData[phiColumn] = deltaphi(inData['PRI_lep_phi'], inData[phiColumn])

    # The lepton is the reference axis, so in the rotated frame its own phi is 0.
    # This must be set last (the original phi is needed above). Without it, any
    # later use of PRI_lep_phi (e.g. a Cartesian conversion of the lepton) would
    # still see the unrotated angle and disagree with the other objects.
    inData['PRI_lep_phi'] = 0.0
    
def convertData(inData):
    '''Pass data through conversions and drop unneeded columns (in place).

    Steps, in order:
      1. DER_mass_MMC values equal to -999.0 are set to -1
      2. if the global `rotate` is set, rotateEvent is applied
      3. if the global `cartesian` is set, tau, lepton and both jets are passed
         to moveToCartesian with drop=True, MET with z=False, and PRI_met_phi
         is dropped
      4. if `rotate` is set, the lepton's PRI_lep_py (Cartesian) or PRI_lep_phi
         (otherwise) is dropped
      5. +/-inf and NaN are replaced by -999.0
    '''
    mmcMissing = inData['DER_mass_MMC'] == -999.0
    inData.loc[mmcMissing, 'DER_mass_MMC'] = -1

    if rotate:
        rotateEvent(inData)

    if cartesian:
        for particle in ('PRI_tau', 'PRI_lep', 'PRI_jet_leading', 'PRI_jet_subleading'):
            moveToCartesian(inData, particle, drop=True)
        # MET is converted without a z component and without drop=True;
        # only its phi column is removed afterwards
        moveToCartesian(inData, 'PRI_met', z=False)
        inData.drop(columns=["PRI_met_phi"], inplace=True)

    if rotate:
        leptonAngleColumn = "PRI_lep_py" if cartesian else "PRI_lep_phi"
        inData.drop(columns=[leptonAngleColumn], inplace=True)

    # Infinities first become NaN, then every NaN gets the -999.0 sentinel
    inData.replace([np.inf, -np.inf], np.nan, inplace=True)
    inData.fillna(-999.0, inplace=True)

In [None]:
# convertData modifies each frame in place (it returns nothing), so running
# this cell a second time applies the conversions again to already-converted
# frames; reload the data first if it needs to be re-run.
convertData(trainingData)
convertData(testingData)

In [None]:
# Inspect which columns remain after convertData
trainingData.columns

## Split by jet

In [None]:
def splitByJet(inData, useCartesian=None):
    '''Split inData into three frames keyed by jet multiplicity.

    Arguments:
        inData: DataFrame with a 'PRI_jet_num' column plus the jet-related
            columns listed below; it is not modified
        useCartesian: True if jet vectors are stored as px/py/pz, False if
            stored as pt/eta/phi. None (default) reads the notebook-level
            `cartesian` switch.

    Returns:
        dict {0: frame, 1: frame, 2: frame} for PRI_jet_num == 0, == 1 and >= 2.
        Key 0 loses the di-jet columns, PRI_jet_num, PRI_jet_all_pt and both
        jets' components; key 1 loses the di-jet columns, PRI_jet_all_pt and
        the subleading-jet components; key 2 keeps every column.

    Raises:
        KeyError: if a column scheduled for removal is missing from inData
    '''
    if useCartesian is None:
        useCartesian = cartesian

    components = ('px', 'py', 'pz') if useCartesian else ('pt', 'eta', 'phi')
    leadingJet = ['PRI_jet_leading_' + c for c in components]
    subleadingJet = ['PRI_jet_subleading_' + c for c in components]
    diJet = ['DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet',
             'DER_lep_eta_centrality']

    # NOTE(review): PRI_jet_num is kept in the 1-jet sample although it is
    # constant there; left as in the original to keep the feature set unchanged.
    oneJetDrop = diJet + subleadingJet + ['PRI_jet_all_pt']
    zeroJetDrop = oneJetDrop + ['PRI_jet_num'] + leadingJet

    jetNum = inData['PRI_jet_num']
    # Non-inplace drop / explicit copy gives independent frames, avoiding the
    # SettingWithCopyWarning raised by in-place edits on a filtered slice.
    return {0: inData[jetNum == 0].drop(columns=zeroJetDrop),
            1: inData[jetNum == 1].drop(columns=oneJetDrop),
            2: inData[jetNum >= 2].copy()}

In [None]:
# Per-jet-multiplicity dictionaries (keys 0, 1, 2) for both samples
splitJetTraining, splitJetTesting = splitByJet(trainingData), splitByJet(testingData)

In [None]:
# Number of input features per jet category (columns that are neither 'gen*' nor EventId)
for i in splitJetTraining:
    nFeatures = sum(1 for col in splitJetTraining[i].columns
                    if not ('gen' in col or 'EventId' in col))
    print("%s %s" % (i, nFeatures))

## Pre-process data

In [None]:
def getPipe(inData, features):
    '''Create the input pipe from getPreProcPipes(normIn=True), fit it, and return it.

    The pipe is fitted on the `features` columns of inData cast to float32.
    The second object returned by getPreProcPipes is discarded.
    '''
    inputPipe, _ = getPreProcPipes(normIn=True)
    featureMatrix = inData[features].values.astype('float32')
    inputPipe.fit(featureMatrix)
    return inputPipe

In [None]:
def saveBatch(inData, n, inputPipe, outFile, normWeights, mode, features):
    '''Write one batch of events into outFile as group 'fold_<n>'.

    Arguments:
        inData: DataFrame with the events of this batch; it is not modified
        n: fold number, used for the group name
        inputPipe: object whose transform() is applied to the feature matrix
        outFile: object providing create_group (an open HDF5 file in this notebook)
        normWeights: if true, weights of gen_target == 0 events and of
            gen_target == 1 events are each divided by their sum within
            this batch
        mode: sample name; if it contains 'testing', EventId is stored
            instead of targets and weights
        features: list of columns used as inputs

    Datasets written: 'inputs' (float32) always; 'targets' (int) and
    'weights' (float32) for non-testing modes; 'EventId' (int) for testing modes.
    '''
    grp = outFile.create_group('fold_' + str(n))

    X = inputPipe.transform(inData[features].values.astype('float32'))
    inputs = grp.create_dataset("inputs", shape=X.shape, dtype='float32')
    inputs[...] = X

    if 'testing' in mode:
        X_EventId = inData['EventId'].values.astype('int')
        EventId = grp.create_dataset("EventId", shape=X_EventId.shape, dtype='int')
        EventId[...] = X_EventId
        return

    y = inData['gen_target'].values.astype('int')

    # astype() returns a new array, so normalising here never writes back into
    # the caller's frame (the original .loc assignment on an iloc slice did,
    # and raised SettingWithCopyWarning).
    X_weights = inData['gen_weight'].values.astype('float64')
    if normWeights:
        for label in (0, 1):
            selected = y == label
            X_weights[selected] = X_weights[selected] / np.sum(X_weights[selected])
    X_weights = X_weights.astype('float32')

    targets = grp.create_dataset("targets", shape=y.shape, dtype='int')
    targets[...] = y

    weights = grp.create_dataset("weights", shape=X_weights.shape, dtype='float32')
    weights[...] = X_weights

In [None]:
def prepareSample(inData, mode, inputPipe, normWeights, N=10):
    '''Pre-process a sample and save it as N folds in <dirLoc><mode>.hdf5.

    Arguments:
        inData: DataFrame to save; must contain 'gen_target'
        mode: sample name, used for the file name and passed to saveBatch
        inputPipe: pre-processing pipe to apply; if None, a new one is
            created and fitted on inData via getPipe
        normWeights: passed to saveBatch
        N: number of stratified, shuffled folds

    Returns:
        The input pipe that was used (the fitted one if None was passed in)

    Any existing output file of the same name is replaced. Input features are
    all columns whose name contains neither 'gen' nor 'EventId'.
    '''
    print("Running %s" % mode)

    outPath = dirLoc + mode + '.hdf5'
    # Remove a stale file without shelling out (portable, and no shell parsing of the path)
    if os.path.exists(outPath):
        os.remove(outPath)

    features = [x for x in inData.columns if 'gen' not in x and 'EventId' not in x]

    if inputPipe is None:
        inputPipe = getPipe(inData, features)

    kf = StratifiedKFold(n_splits=N, shuffle=True)

    # The context manager guarantees the file is flushed and closed, even if a
    # fold fails part-way; the original left the handle open.
    with h5py.File(outPath, "w") as outFile:
        # Only the held-out indices of each split are used: they partition inData into N batches
        for i, (_, test) in enumerate(kf.split(inData, inData['gen_target'])):
            print("Saving fold: %d of %d events" % (i, len(test)))
            saveBatch(inData.iloc[test], i, inputPipe, outFile, normWeights, mode, features)

    return inputPipe

In [None]:
# For every jet category: split off a validation set, fit the input pipe on the
# training part only, then reuse that pipe for the validation and testing samples.
for i in splitJetTraining:
    print("Splitting to validation")
    trainIndeces, valIndeces = splitDevVal(splitJetTraining[i])
    train, val = splitJetTraining[i].loc[trainIndeces], splitJetTraining[i].loc[valIndeces]
    print("%s training %s validation" % (len(train), len(val)))

    # Passing None makes prepareSample fit a fresh pipe; weights are normalised for training only
    inputPipe = prepareSample(train, 'train_{}'.format(i), None, True)
    prepareSample(val, 'val_{}'.format(i), inputPipe, False)

    prepareSample(splitJetTesting[i], 'testing_{}'.format(i), inputPipe, False)