# Import data, compute features, train-val split, feature selection, pre-process, & save
## Import
### Modules

In [10]:
from __future__ import division
%matplotlib inline
import sys
sys.path.append('../')
from Modules.Basics import *
from Modules.Class_Features import *
from ML_Tools.General.Feature_Selection import *
from ML_Tools.Transformations.HEP_Proc import *
import uproot

## Feature processing

In [2]:
rotate = False
cartesian = False
useTrackFeatures = True
useTowerFeatures = True
features = ['jetPt',
 'jetEta',
 'jetMass',
 'ntracks',
 'ntowers']

normIn = True
pca = True

In [3]:
def trackFeatures(inData):
    inData["min_track_pT"] = -1
    inData.loc[inData.ntracks > 0, "min_track_pT"] = inData.loc[inData.ntracks > 0, "trackPt"].apply(np.min)
    
    inData["mean_track_pT"] = -1
    inData.loc[inData.ntracks > 0, "mean_track_pT"] = inData.loc[inData.ntracks > 0, "trackPt"].apply(np.mean)
    
    inData["max_track_pT"] = -1
    inData.loc[inData.ntracks > 0, "max_track_pT"] = inData.loc[inData.ntracks > 0, "trackPt"].apply(np.max)
    
    inData["sum_track_pT"] = -1
    inData.loc[inData.ntracks > 0, "sum_track_pT"] = inData.loc[inData.ntracks > 0,"trackPt"].apply(np.sum)
    
    inData["abs_track_eta"] = -1
    inData.loc[inData.ntracks > 0, "abs_track_eta"] = inData.loc[inData.ntracks > 0, "trackEta"].apply(np.abs)
    
    inData["min_track_eta"] = -1
    inData.loc[inData.ntracks > 0, "min_track_eta"] = inData.loc[inData.ntracks > 0, "abs_track_eta"].apply(np.min)
    
    inData["mean_track_eta"] = -1
    inData.loc[inData.ntracks > 0, "mean_track_eta"] = inData.loc[inData.ntracks > 0, "trackEta"].apply(np.mean)
    
    inData["max_track_eta"] = -1
    inData.loc[inData.ntracks > 0, "max_track_eta"] = inData.loc[inData.ntracks > 0, "abs_track_eta"].apply(np.max)
    
    inData["mean_track_charge"] = -1
    inData.loc["mean_track_charge"] = inData.loc[inData.ntracks > 0, "trackCharge"].apply(np.mean)
    

In [4]:
def towerFeatures(inData):
    inData["min_tower_E"] = -1
    inData.loc[inData.ntowers > 0, "min_tower_E"] = inData.loc[inData.ntowers > 0, "towerE"].apply(np.min)
    
    inData["mean_tower_E"] = -1
    inData.loc[inData.ntowers > 0, "mean_tower_E"] = inData.loc[inData.ntowers > 0, "towerE"].apply(np.mean)
    
    inData["max_tower_E"] = -1
    inData.loc[inData.ntowers > 0, "max_tower_E"] = inData.loc[inData.ntowers > 0, "towerE"].apply(np.max)
    
    inData["sum_tower_E"] = -1
    inData.loc[inData.ntowers > 0, "sum_tower_E"] = inData.loc[inData.ntowers > 0, "towerE"].apply(np.sum)
    
    inData["min_tower_Eem"] = -1
    inData.loc[inData.ntowers > 0, "min_tower_Eem"] = inData.loc[inData.ntowers > 0, "towerEem"].apply(np.min)
    
    inData["mean_tower_Eem"] = -1
    inData.loc[inData.ntowers > 0, "mean_tower_Eem"] = inData.loc[inData.ntowers > 0, "towerEem"].apply(np.mean)
    
    inData["max_tower_Eem"] = -1
    inData.loc[inData.ntowers > 0, "max_tower_Eem"] = inData.loc[inData.ntowers > 0, "towerEem"].apply(np.max)
    
    inData["sum_tower_Eem"] = -1
    inData.loc[inData.ntowers > 0, "sum_tower_Eem"] = inData.loc[inData.ntowers > 0, "towerEem"].apply(np.sum)
    
    inData["min_tower_Ehad"] = -1
    inData.loc[inData.ntowers > 0, "min_tower_Ehad"] = inData.loc[inData.ntowers > 0, "towerEhad"].apply(np.min)
    
    inData["mean_tower_Ehad"] = -1
    inData.loc[inData.ntowers > 0,"mean_tower_Ehad"] = inData.loc[inData.ntowers > 0,"towerEhad"].apply(np.mean)
    
    inData["max_tower_Ehad"] = -1
    inData.loc[inData.ntowers > 0,"max_tower_Ehad"] = inData.loc[inData.ntowers > 0,"towerEhad"].apply(np.max)
    
    inData["sum_tower_Ehad"] = -1
    inData.loc[inData.ntowers > 0,"sum_tower_Ehad"] = inData.loc[inData.ntowers > 0,"towerEhad"].apply(np.sum)
    
    inData["tower_Eem_frac"] = -1
    inData.loc[inData.ntowers > 0,"tower_Eem_frac"] = inData.loc[inData.ntowers > 0, "sum_tower_Eem"]/inData.loc[inData.ntowers > 0, "sum_tower_E"]
    
    inData["tower_Ehad_frac"] = -1
    inData.loc[inData.ntowers > 0,"tower_Ehad_frac"] = inData.loc[inData.ntowers > 0, "sum_tower_Ehad"]/inData.loc[inData.ntowers > 0, "sum_tower_E"]
    
    inData["min_tower_eta"] = -1
    inData.loc[inData.ntowers > 0,"min_tower_eta"] = inData.loc[inData.ntowers > 0,"towerEta"].apply(np.min)
    
    inData["mean_tower_eta"] = -1
    inData.loc[inData.ntowers > 0,"mean_tower_eta"] = inData.loc[inData.ntowers > 0,"towerEta"].apply(np.mean)
    
    inData["max_tower_eta"] = -1
    inData.loc[inData.ntowers > 0,"max_tower_eta"] = inData.loc[inData.ntowers > 0,"towerEta"].apply(np.max)

In [5]:
def getBatch(sample, mode, batch, nBatches):
    f = uproot.open(dirLoc + sample + '_' + mode + '.root')['treeJets']
    totalSize = len(f)
    batchSize = math.floor(totalSize/nBatches)
    data = f.pandas.df(entrystart=batchSize*batch, entrystop=batchSize*(batch+1))
    if useTowerFeatures: towerFeatures(data)
    if useTrackFeatures: trackFeatures(data)
    data.drop(columns=['trackPt', 'trackEta', 'trackPhi', 'trackCharge', 'towerE','towerEem', 'towerEhad', 'towerEta', 'towerPhi', 'abs_track_eta'], inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    #print "NaNs", data.columns[data.isnull().any()].tolist()
    data.dropna(inplace=True) 
    return data

##  Feature selection

In [6]:
gluons = getBatch('gluons', 'standard', 75, 100)
quarks = getBatch('quarks', 'standard', 75, 100)
gluons['gen_target'] = getTarget('gluon')
quarks['gen_target'] = getTarget('quark')
batch = gluons.append(quarks, ignore_index=True)
batch = batch.sample(frac=1).reset_index(drop=True)

In [7]:
batch.head()

Unnamed: 0,jetPt,jetEta,jetPhi,jetMass,ntracks,ntowers,min_tower_E,mean_tower_E,max_tower_E,sum_tower_E,...,max_tower_eta,min_track_pT,mean_track_pT,max_track_pT,sum_track_pT,min_track_eta,mean_track_eta,max_track_eta,mean_track_charge,gen_target
0,100.197922,2.432279,0.993638,7.065969,6.0,7.0,2.023126,47.477852,187.9599,332.344971,...,2.603429,1.318284,6.979834,17.388475,41.879005,2.388552,2.454205,2.553548,-1.0,1
1,100.274712,-0.672284,0.246772,19.914286,13.0,8.0,0.563475,7.839329,24.328726,62.71463,...,-0.519335,1.024813,4.018611,14.103094,52.24194,0.44213,-0.656669,0.957972,-1.0,0
2,121.497414,1.995268,2.058515,19.822773,17.0,20.0,0.642488,6.343688,36.468842,126.873764,...,2.306697,0.559147,5.1953,19.214205,88.320099,1.697378,1.974619,2.189453,-1.0,1
3,103.889732,0.824112,0.87363,16.12298,8.0,4.0,1.327044,10.377432,28.552814,41.509727,...,1.080709,0.767519,9.234488,48.947105,73.8759,0.749554,0.961637,1.193328,-1.0,1
4,103.436089,1.116343,0.897115,4.901496,4.0,6.0,0.946722,20.377319,97.073204,122.263916,...,1.238522,2.033751,7.944368,12.234118,31.777473,1.060365,1.101976,1.130678,-1.0,0


In [8]:
%%time

importantFeatures = rankClassifierFeatures(batch, [x for x in batch.columns if 'gen' not in x])

Running fold 1 /10


TypeError: __init__() got an unexpected keyword argument 'n_jobs'

In [11]:
features = filtered_features_0# importantFeatures[0]
print features

['ntracks', 'ntowers', 'jetMass', 'jetPt', 'max_track_pT', 'max_track_eta', 'min_track_eta', 'sum_tower_Ehad', 'max_tower_E', 'sum_track_pT', 'mean_tower_Eem', 'mean_track_pT', 'tower_Eem_frac', 'min_track_pT', 'sum_tower_Eem', 'min_tower_E', 'jetPhi', 'sum_tower_E', 'min_tower_eta', 'tower_Ehad_frac', 'max_tower_Eem', 'mean_tower_E', 'max_tower_Ehad', 'max_tower_eta', 'mean_tower_eta', 'jetEta', 'mean_tower_Ehad', 'mean_track_eta', 'min_tower_Eem']


## Save data

In [12]:
def getPipe(inData):
    inputPipe, outputPipe = getPreProcPipes(normIn=normIn, pca=pca)
    inputPipe.fit(inData[features].values.astype('float32'))
    
    with open(dirLoc + 'inputPipe.pkl', 'w') as fout:    
        pickle.dump(inputPipe, fout)
        
    return inputPipe

In [13]:
def saveBatch(inData, n, inputPipe, outFile):
    grp = outFile.create_group('fold_' + str(n))
    
    X = inputPipe.transform(inData[features].values.astype('float32'))
    inputs = grp.create_dataset("inputs", shape=X.shape, dtype='float32')
    inputs[...] = X
    
    y = inData['gen_target'].values.astype('int')
    targets = grp.create_dataset("targets", shape=y.shape, dtype='int')
    targets[...] = y

In [14]:
def prepareSample(sample,  mode, inputPipe, nSplit=10, nSave=10):
    print "Running", mode
    os.system('rm ' + dirLoc + mode + '.hdf5')
    outFile = h5py.File(dirLoc + mode + '.hdf5', "w")

    for i in xrange(nSave):
        gluons = getBatch('gluons', sample, i, nSplit)
        quarks = getBatch('quarks', sample, i, nSplit)
        gluons['gen_target'] = getTarget('gluon')
        quarks['gen_target'] = getTarget('quark')
        batch = gluons.append(quarks, ignore_index=True)
        batch = batch.sample(frac=1).reset_index(drop=True) #Shuffle
        
        if isinstance(inputPipe, types.NoneType):
            print "Fitting inputPipe"
            inputPipe = getPipe(batch)
        
        print "Saving fold:", i, "of", nSave, "events"
        saveBatch(batch, i, inputPipe, outFile)
        
    return inputPipe

In [15]:
inputPipe = prepareSample('standard', 'train', None, nSplit=100, nSave=10)
prepareSample('modified', 'testing', inputPipe, nSplit=100, nSave=10)

Running train
Fitting inputPipe
Saving fold: 0 of 10 events
Saving fold: 1 of 10 events
Saving fold: 2 of 10 events
Saving fold: 3 of 10 events
Saving fold: 4 of 10 events
Saving fold: 5 of 10 events
Saving fold: 6 of 10 events
Saving fold: 7 of 10 events
Saving fold: 8 of 10 events
Saving fold: 9 of 10 events
Running testing
Saving fold: 0 of 10 events
Saving fold: 1 of 10 events
Saving fold: 2 of 10 events
Saving fold: 3 of 10 events
Saving fold: 4 of 10 events
Saving fold: 5 of 10 events
Saving fold: 6 of 10 events
Saving fold: 7 of 10 events
Saving fold: 8 of 10 events
Saving fold: 9 of 10 events


Pipeline(memory=None,
     steps=[('normIn', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False))])