### Import the necessary modules

We start with importing the necessary modules required for the tutorial. Here we imported ROOT and TMVA(Toolkit for Multivariate Data Analysis). If you want to know more about TMVA, you can refer the documentation.

In [28]:
import ROOT
from ROOT import TMVA 
import os
from array import array

### Setting up TMVA

In [29]:
ROOT.TMVA.Tools.Instance()
TMVA.PyMethodBase.PyInitialize()

## For PYMVA methods
TMVA.PyMethodBase.PyInitialize();


outputFile = ROOT.TFile.Open("BatchdataOutput.root", "RECREATE")

### Define the input datafile

In [30]:
inputFileName = "images_data.root"
inputFile = ROOT.TFile.Open(inputFileName)

### Setting up the Signal and background Tree

In [31]:
signalTree     = inputFile.Get("sig_tree")
backgroundTree = inputFile.Get("bkg_tree")

signalTree.Print()

******************************************************************************
*Tree    :sig_tree  : signal_tree                                            *
*Entries :    10000 : Total =         2601382 bytes  File  Size =    2572423 *
*        :          : Tree compression factor =   1.00                       *
******************************************************************************
*Br    0 :var0      : var0/F                                                 *
*Entries :    10000 : Total  Size=      40631 bytes  File Size  =      40150 *
*Baskets :        2 : Basket Size=      32000 bytes  Compression=   1.00     *
*............................................................................*
*Br    1 :var1      : var1/F                                                 *
*Entries :    10000 : Total  Size=      40631 bytes  File Size  =      40150 *
*Baskets :        2 : Basket Size=      32000 bytes  Compression=   1.00     *
*...................................................

### Extracting the data into a RDataFrame 

In [32]:
df = ROOT.RDataFrame("sig_tree", "images_data.root")

### Use of RDataFrame.Range to select a chunk (batch of data)
Here we are considering a batch of 100 for each branch of signal Tree (Total there are 10000 entries in each branch)

In [33]:
df1 = df.Range(0,100)

### Use of AsNumpy to extract the data from the dataframe into Numpy dictionary

In [34]:
# Take all columns
npy1 = df1.AsNumpy()
print(npy1)

{'var0': ndarray([-1.33602500e+00, -4.09288311e+00,  1.96556091e+00,
          3.89667678e+00,  5.52924681e+00,  1.16362429e+00,
          2.64346790e+00,  2.47144938e+00, -4.85097289e-01,
          2.89741921e+00,  4.73280478e+00,  3.01877904e+00,
          3.12367535e+00, -4.48586035e+00,  3.51635313e+00,
          5.71682882e+00,  4.31257057e+00,  1.01603842e+01,
          2.73509383e+00,  6.47822976e-01,  2.66566372e+00,
          1.39952393e+01,  3.16755414e+00,  1.00602722e+01,
          4.35392761e+00,  9.60262775e+00,  5.18892527e-01,
          1.23200884e+01,  5.39695919e-02,  1.45863571e+01,
          6.90780067e+00,  1.40009069e+00, -2.25628781e+00,
          2.57242751e+00,  3.16384578e+00,  2.44943380e+00,
         -1.92530617e-01,  6.89263868e+00,  3.01004720e+00,
         -2.65800071e+00,  5.18959045e+00,  3.72071290e+00,
          2.51667857e+00,  6.89902258e+00, -2.52435970e+00,
         -1.84800971e+00,  2.33429265e+00,  6.61716080e+00,
         -1.71678603e+00,  7.44

Extracting the name of each branch in an array which are the keys of Dictionary npy1

In [35]:
print(type(npy1))
rows=list(npy1.keys())
print(rows)

<class 'dict'>
['var0', 'var1', 'var10', 'var11', 'var12', 'var13', 'var14', 'var15', 'var16', 'var17', 'var18', 'var19', 'var2', 'var20', 'var21', 'var22', 'var23', 'var24', 'var25', 'var26', 'var27', 'var28', 'var29', 'var3', 'var30', 'var31', 'var32', 'var33', 'var34', 'var35', 'var36', 'var37', 'var38', 'var39', 'var4', 'var40', 'var41', 'var42', 'var43', 'var44', 'var45', 'var46', 'var47', 'var48', 'var49', 'var5', 'var50', 'var51', 'var52', 'var53', 'var54', 'var55', 'var56', 'var57', 'var58', 'var59', 'var6', 'var60', 'var61', 'var62', 'var63', 'var7', 'var8', 'var9']


### Create a numpy array extracting all the data from the dataframe

In [36]:
import numpy as np
vec = df1.AsNumpy(columns=["var0"])
res1_array=np.array(list(vec.values()))

In [37]:
import numpy as np
for i in rows[1:32]:
    vec = df1.AsNumpy(columns=[i])
    res1_array = np.r_[res1_array,list(vec.values())]
print(res1_array)

[[-1.336025   -4.092883    1.9655609  ... -2.5696828   1.357026
   6.254604  ]
 [ 0.7632829   5.66362     1.4061441  ...  5.484785    0.95200175
   3.5430255 ]
 [ 7.2729025   5.52698    10.708342   ...  4.2346277   5.5558853
  12.274164  ]
 ...
 [19.657343    3.6865606  15.7146845  ... 13.756122   18.308765
  11.770398  ]
 [14.433575   18.267435    9.263986   ... 13.805253   17.320747
  20.793114  ]
 [10.888389   19.185621   10.288151   ...  4.6916385  22.37562
  17.388264  ]]


In [38]:
import pandas as pd

# convert array into dataframe
DF = pd.DataFrame(res1_array[:])

# save the dataframe as a csv file
DF.to_csv("data1.csv")

DF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-1.336025,-4.092883,1.965561,3.896677,5.529247,1.163624,2.643468,2.471449,-0.485097,2.897419,...,6.278726,3.972255,-1.039392,4.30566,5.746236,2.795549,1.561632,-2.569683,1.357026,6.254604
1,0.763283,5.66362,1.406144,-0.971143,7.293005,7.006859,2.172393,6.329552,12.701866,-2.058933,...,19.785761,4.204327,1.677374,6.501729,2.189398,6.799557,8.629866,5.484785,0.952002,3.543025
2,7.272902,5.52698,10.708342,3.70902,13.599938,8.297036,2.951663,1.785292,22.514666,9.435592,...,12.652964,14.756213,7.102365,14.396164,3.771356,5.983877,9.122152,4.234628,5.555885,12.274164
3,2.21868,11.85489,9.511338,7.482359,9.167512,7.954143,14.677694,4.751048,14.485039,13.261776,...,14.795762,12.287502,2.439374,14.008286,4.32867,9.699433,11.308884,10.453323,-1.160988,13.371491
4,4.300408,3.502065,18.303375,10.689672,12.923079,8.774014,11.06116,9.289441,4.465584,16.648767,...,8.612619,10.756506,10.025352,7.474237,3.651299,10.048705,7.685595,6.862747,7.853603,7.977712


In [39]:
vec = df1.AsNumpy(columns=[rows[32]])
res2_array=np.array(list(vec.values()))

In [40]:
import numpy as np
for i in rows[33:64]:
    vec = df1.AsNumpy(columns=[i])
    res2_array = np.r_[res2_array,list(vec.values())]
print(res2_array)

[[17.552488   14.8517     20.258608   ...  7.862945   19.474665
  13.752408  ]
 [ 2.4140606   7.3459044   9.887703   ... -4.709083    5.254788
  -0.80547893]
 [ 3.816624    4.2820153  16.960608   ...  8.691112    6.98026
  10.415255  ]
 ...
 [-3.8706129   1.7222944  11.104798   ... -2.9287522   0.8441311
   0.94262445]
 [ 3.5277545   5.1762023   6.3105745  ...  2.496738   -4.3461485
   1.6584157 ]
 [ 1.0955719   6.0525527   6.063516   ... -0.19227293  1.9838908
   1.111334  ]]


In [41]:
import pandas as pd

# convert array into dataframe
DF = pd.DataFrame(res2_array[:])

# save the dataframe as a csv file
DF.to_csv("data1.csv")

DF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,17.552488,14.8517,20.258608,16.023281,5.414101,1.079743,10.099811,10.457681,2.390532,8.605271,...,4.244608,13.882496,7.606391,1.598511,11.439191,8.202053,4.52944,7.862945,19.474665,13.752408
1,2.414061,7.345904,9.887703,21.604706,3.834147,4.448325,4.780985,9.219211,4.723229,-1.488317,...,5.234775,4.028111,4.222233,5.426785,4.505137,5.41512,1.866752,-4.709083,5.254788,-0.805479
2,3.816624,4.282015,16.960608,5.388222,8.992385,0.13874,11.342526,1.958259,5.763241,4.522415,...,8.702728,9.011817,-1.770969,6.02681,8.011896,10.832191,-0.312128,8.691112,6.98026,10.415255
3,-2.254239,-5.478763,-0.673511,2.807751,8.727755,9.313919,8.015204,-2.20289,3.171855,3.808939,...,6.344444,6.643681,3.915831,8.439391,7.805997,3.536092,15.459486,6.146331,3.789025,3.055388
4,17.0368,2.747641,1.845184,8.925235,8.612143,12.617948,9.465577,8.436515,6.059497,4.187626,...,10.573262,7.483535,14.596214,8.319124,7.631743,9.465684,18.026104,12.423092,1.75487,7.62342


### Create a Keras Model for Training 

In [42]:
from keras.models import Sequential
from keras.layers import Dense,Flatten
model = Sequential()
model.add(Dense(12, input_dim=100, activation="relu"))
model.add(Dense(12, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.add(Flatten())
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 12)                1212      
                                                                 
 dense_4 (Dense)             (None, 12)                156       
                                                                 
 dense_5 (Dense)             (None, 1)                 13        
                                                                 
 flatten_1 (Flatten)         (None, 1)                 0         
                                                                 
Total params: 1,381
Trainable params: 1,381
Non-trainable params: 0
_________________________________________________________________


### Training of Model without using a Generator

In [43]:
# tf_ds = pd.read_csv("data1.csv")
# tf_ds2=pd.read_csv("data.csv")

model.fit(res1_array,res2_array,batch_size=32, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9c577b8b20>

### Declaring a Generator for Training the Model in Batches

Here we consider the batch size as number of rows present in the numpy array

In [44]:
batch_size=32

In [45]:
def generator(array1,array2, batch_size):
  inputs = []
  targets = []
  batchcount = 0
  while True:
       for x in range(batch_size):
         inputs.append(array1[x])
         targets.append(array2[x])
         batchcount += 1
         if batchcount > batch_size:
                  X = np.array(inputs, dtype='float32')
                  y = np.array(targets, dtype='float32')
                  yield (X,y)
                  inputs = []
                  targets = []
                  batchcount = 0

### Training a Model using a generator

In [46]:

model.fit_generator(generator(res1_array,res2_array,batch_size=32),steps_per_epoch=100/batch_size ,epochs=20)

Epoch 1/20


  model.fit_generator(generator(res1_array,res2_array,batch_size=32),steps_per_epoch=100/batch_size ,epochs=20)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9cc44ca9a0>