# Import data, compute features, train-val split, feature selection, pre-process, & save
## Import
### Modules

In [1]:
from __future__ import division
%matplotlib inline
import sys
sys.path.append('../')
from Modules.Basics import *
from ML_Tools.General.Feature_Selection import *
from ML_Tools.Transformations.HEP_Proc import *
import uproot

  from ._conv import register_converters as _register_converters
  from pandas.core import datetools
Using TensorFlow backend.


## Feature processing

In [2]:
# --- Feature-processing configuration -------------------------------------
# NOTE(review): `rotate` and `cartesian` are not read by any cell visible in
# this notebook; presumably consumed by the star-imported helpers -- confirm.
rotate = False
cartesian = False

# Columns read from the ROOT trees and fed to the pre-processing pipeline.
features = ['jetPt', 'jetEta', 'jetMass', 'ntracks', 'ntowers']

# Options forwarded to getPreProcPipes() when the input pipeline is built.
normIn = True
pca = False

## Save data

In [3]:
def getPipe(inData):
    """Fit the input pre-processing pipeline on ``inData`` and pickle it to disk.

    Arguments:
        inData: table from which the columns listed in the module-level
            ``features`` are selected; their values are cast to float32
            before fitting.

    Returns:
        The fitted input pipeline obtained from ``getPreProcPipes``.

    Side effects:
        Writes the fitted pipeline to ``dirLoc + 'inputPipe.pkl'``,
        overwriting any existing file.
    """
    # Only the input pipeline is used here; the output pipeline is discarded.
    inputPipe, _ = getPreProcPipes(normIn=normIn, pca=pca)
    inputPipe.fit(inData[features].values.astype('float32'))

    # Pickle streams are binary data: open with 'wb' so the file is written
    # intact on every platform (text mode can mangle bytes on Windows and is
    # rejected outright by pickle under Python 3).
    with open(dirLoc + 'inputPipe.pkl', 'wb') as fout:
        pickle.dump(inputPipe, fout)

    return inputPipe

In [4]:
def saveBatch(inData, n, inputPipe, outFile):
    """Store one fold of transformed inputs and targets in ``outFile``.

    A group named ``fold_<n>`` is created containing two datasets:
    ``inputs`` (float32) holding the ``features`` columns of ``inData`` after
    ``inputPipe.transform``, and ``targets`` (int) holding the ``gen_target``
    column.

    Arguments:
        inData: table providing the ``features`` columns and ``gen_target``.
        n: fold index used in the group name.
        inputPipe: fitted pipeline exposing ``transform``.
        outFile: open, writable HDF5 file (or group) to create the fold in.
    """
    fold = outFile.create_group('fold_' + str(n))

    # Feature matrix: cast to float32, transform, then write in a single call.
    transformed = inputPipe.transform(inData[features].values.astype('float32'))
    fold.create_dataset("inputs", data=transformed, dtype='float32')

    # Class labels, stored with the platform's default integer type.
    labels = inData['gen_target'].values.astype('int')
    fold.create_dataset("targets", data=labels, dtype='int')

In [5]:
def getBatch(sample, mode, batch, nBatches):
    """Read one slice of entries from a sample's ``treeJets`` ROOT tree.

    The tree in ``dirLoc + sample + '_' + mode + '.root'`` is divided into
    ``nBatches`` consecutive slices of ``len(tree) // nBatches`` entries each,
    and slice number ``batch`` (0-based) is returned. Any remainder entries at
    the end of the tree (fewer than ``nBatches``) are not part of any slice.

    Arguments:
        sample: first part of the ROOT file name.
        mode: second part of the ROOT file name (after the underscore).
        batch: 0-based index of the slice to read.
        nBatches: number of slices the tree is divided into.

    Returns:
        The requested entries with the ``features`` branches, as produced by
        the tree's ``pandas.df`` reader.
    """
    tree = uproot.open(dirLoc + sample + '_' + mode + '.root')['treeJets']

    # Floor division keeps the slice bounds as integers; math.floor on a true
    # division returns a float under Python 2, which then leaks into the
    # entrystart/entrystop arguments.
    batchSize = len(tree) // nBatches
    start = batchSize * batch
    return tree.pandas.df(branches=features, entrystart=start, entrystop=start + batchSize)

In [6]:
def prepareSample(sample, mode, inputPipe, N=10, seed=None):
    """Convert gluon and quark ROOT samples into a folded HDF5 file.

    For each of ``N`` folds, one slice of the gluon tree and one slice of the
    quark tree are read, labelled via ``getTarget``, concatenated, shuffled,
    passed through the input pipeline and written to
    ``dirLoc + mode + '.hdf5'`` as group ``fold_<i>``.

    Arguments:
        sample: suffix selecting the ROOT files; it is forwarded to
            ``getBatch`` so that ``gluons_<sample>.root`` and
            ``quarks_<sample>.root`` are read.
        mode: name of the output HDF5 file (without extension); also echoed
            in the progress message.
        inputPipe: fitted pre-processing pipeline, or ``None`` to fit (and
            pickle, via ``getPipe``) a new one on the first fold.
        N: number of folds to produce.
        seed: optional integer; when given, fold ``i`` is shuffled with
            ``random_state=seed + i`` so that reruns are reproducible. The
            default ``None`` keeps the shuffle unseeded.

    Returns:
        The pipeline used for the transformation (the one passed in, or the
        newly fitted one).
    """
    # Single-argument print() produces the same text under Python 2 and 3.
    print("Running " + str(mode))

    # Start from a clean output file without shelling out to `rm`.
    outPath = dirLoc + mode + '.hdf5'
    if os.path.isfile(outPath):
        os.remove(outPath)

    # The context manager guarantees the HDF5 file is flushed and closed,
    # even if reading or saving a fold raises.
    with h5py.File(outPath, "w") as outFile:
        for i in range(N):
            gluons = getBatch('gluons', sample, i, N)
            quarks = getBatch('quarks', sample, i, N)
            gluons['gen_target'] = getTarget('gluon')
            quarks['gen_target'] = getTarget('quark')
            batch = gluons.append(quarks, ignore_index=True)

            # Shuffle so the two classes are interleaved within the fold.
            foldSeed = None if seed is None else seed + i
            batch = batch.sample(frac=1, random_state=foldSeed).reset_index(drop=True)

            if inputPipe is None:
                print("Fitting inputPipe")
                inputPipe = getPipe(batch)

            print("Saving fold: {} of {} events".format(i, N))
            saveBatch(batch, i, inputPipe, outFile)

    return inputPipe

In [7]:
# Build train.hdf5 from the 'standard' files. No pipeline is supplied, so one
# is fitted on the first fold and returned for reuse.
inputPipe = prepareSample(sample='standard', mode='train', inputPipe=None, N=100)

# Build testing.hdf5 from the 'modified' files using the pipeline fitted above.
prepareSample(sample='modified', mode='testing', inputPipe=inputPipe, N=100)

Running train
Fitting inputPipe
Saving fold: 0 of 100 events
Saving fold: 1 of 100 events
Saving fold: 2 of 100 events
Saving fold: 3 of 100 events
Saving fold: 4 of 100 events
Saving fold: 5 of 100 events
Saving fold: 6 of 100 events
Saving fold: 7 of 100 events
Saving fold: 8 of 100 events
Saving fold: 9 of 100 events
Saving fold: 10 of 100 events
Saving fold: 11 of 100 events
Saving fold: 12 of 100 events
Saving fold: 13 of 100 events
Saving fold: 14 of 100 events
Saving fold: 15 of 100 events
Saving fold: 16 of 100 events
Saving fold: 17 of 100 events
Saving fold: 18 of 100 events
Saving fold: 19 of 100 events
Saving fold: 20 of 100 events
Saving fold: 21 of 100 events
Saving fold: 22 of 100 events
Saving fold: 23 of 100 events
Saving fold: 24 of 100 events
Saving fold: 25 of 100 events
Saving fold: 26 of 100 events
Saving fold: 27 of 100 events
Saving fold: 28 of 100 events
Saving fold: 29 of 100 events
Saving fold: 30 of 100 events
Saving fold: 31 of 100 events
Saving fold: 32 of

Pipeline(memory=None,
     steps=[('normIn', StandardScaler(copy=True, with_mean=True, with_std=True))])