In [1]:
import ROOT

import pandas as pd
import numpy as np
import uproot
import os
import logging
import tensorflow as tf

from script import utils
from script import cms
from script.models.layers import Clip, Divide, StandardScaler

Welcome to JupyROOT 6.24/06


In [2]:
# best-cuts
# Four hard-coded lists of 16 cut thresholds each.
# NOTE(review): presumably one threshold per entry of `masses`, in the same
# order, and "_PU_W" presumably means "with pile-up weights" -- confirm.

# bbA
CUT_CAT1_BBA_W = [
    0.52525253, 0.51515152, 0.48484848, 0.48484848,
    0.43434343, 0.42424242, 0.39393939, 0.41414141,
    0.4040404, 0.47474747, 0.45454545, 0.41414141,
    0.50505051, 0.48484848, 0.51515152, 0.53535354,
]  # computed on 100 bins

CUT_CAT1_BBA_PU_W = [
    0.57142857, 0.6122449, 0.57142857, 0.57142857,
    0.46938776, 0.46938776, 0.53061224, 0.48979592,
    0.46938776, 0.53061224, 0.53061224, 0.59183673,
    0.59183673, 0.57142857, 0.57142857, 0.6122449,
]  # computed on 50 bins

# bbH
CUT_CAT1_BBH_W = [
    0.52525253, 0.52525253, 0.46464646, 0.47474747,
    0.46464646, 0.41414141, 0.39393939, 0.41414141,
    0.4040404, 0.45454545, 0.49494949, 0.47474747,
    0.50505051, 0.51515152, 0.51515152, 0.53535354,
]  # computed on 100 bins

CUT_CAT1_BBH_PU_W = [
    0.6122449, 0.59183673, 0.55102041, 0.57142857,
    0.57142857, 0.51020408, 0.51020408, 0.48979592,
    0.46938776, 0.48979592, 0.55102041, 0.63265306,
    0.59183673, 0.55102041, 0.57142857, 0.6122449,
]  # computed on 50 bins



In [3]:
# Tree-name registries filled at the bottom of this cell:
#   hood      -> simulated samples (signal first, then backgrounds)
#   data_hood -> data-taking runs
hood = []
data_hood = []

# Mass points; INTERVALS below has one (low, up) window per mass point,
# listed in the same order.
masses = [130, 150, 170, 200, 250, 300, 350, 400,
          450, 500, 600, 700, 800, 1000, 1200, 1500]

# Input features of the category-1 model (events with bjet_n > 0).
FEATURES_CAT1 = [
    "dimuon_deltar", "dimuon_deltaphi", "dimuon_deltaeta", "met_pt",
    "deltar_bjet1_dimuon", "deltapt_bjet1_dimuon", "deltaeta_bjet1_dimuon",
    "bjet_1_pt", "bjet_1_eta", "deltaphi_bjet1_dimuon",
    "ljet_1_pt", "ljet_1_eta", "bjet_n", "ljet_n",
]

# Input features of the category-2 model (events with bjet_n == 0).
FEATURES_CAT2 = [
    "dimuon_deltar", "dimuon_deltaphi", "dimuon_deltaeta", "met_pt",
    "ljet_1_pt", "ljet_1_eta", "ljet_n",
]

# Dimuon-mass window applied to background events for each mass point.
INTERVALS = [
    (115, 180),   # 130
    (115, 200),   # 150
    (120, 220),   # 170
    (150, 250),   # 200
    (200, 300),   # 250
    (225, 375),   # 300
    (275, 425),   # 350
    (300, 500),   # 400
    (350, 550),   # 450
    (350, 650),   # 500
    (400, 800),   # 600
    (500, 900),   # 700
    (600, 1000),  # 800
    (700, 1800),  # 1000
    (700, 1800),  # 1200
    (700, 1800),  # 1500
]

tanbetas = [2, 5, 10, 15, 20, 25, 30, 40, 50, 60]
dibosons = ['WWTo2L2Nu', 'WZTo3LNu', 'ZZTo2L2Nu', 'ZZTo4L']
singletops = ['s-channel', 't-channel_antitop', 't-channel_top', 'tW_antitop', 'tW_top']
bkgttbar_treename = 'background_treeTTbar_UL2016'
ttbar_binned = ['Incl_700to1000', 'Incl_1000toInf']
bkgDY_treename = 'background_treeDY_UL2016'
DY_binned = ['ZMM_50to120', 'ZMM_120to200', 'ZMM_200to400',
             'ZMM_400to800', 'ZMM_800to1400', 'ZMM_1400to2300']
datas = ['B', 'C', 'D', 'E', 'F', 'G', 'H']

# Signal trees: for every (mA, tan beta) pair, the four production modes are
# appended in the fixed order bbA, bbH, ggA, ggH.
for mA in masses:
    for tanb in tanbetas:
        hood.extend(
            f'signal_treeMSSM_{process}_mA{mA}_tanb{tanb}_UL2016'
            for process in ('bbA', 'bbH', 'ggA', 'ggH')
        )

# Background trees, in this order: diboson, single top, inclusive ttbar,
# inclusive DY, mass-binned ttbar, mass-binned DY.
hood.extend(f'background_treediboson_{sample}_UL2016' for sample in dibosons)
hood.extend(f'background_treeST_{channel}_UL2016' for channel in singletops)
hood.extend([bkgttbar_treename, bkgDY_treename])
hood.extend(f'background_treeTTbar{mass_bin}_UL2016' for mass_bin in ttbar_binned)
hood.extend(f'background_tree{mass_bin}_UL2016' for mass_bin in DY_binned)

# Data trees: one per run letter.
data_hood.extend(f'background_treeULRun2016{run}' for run in datas)

In [4]:
# Category-1 statistics, restricted to the FEATURES_CAT1 columns:
# mean and std are retrieved (in that order) from cms.STATS_CAT1,
# the clipping bounds from cms.CLIP_CAT1.
mean_cat1, std_cat1 = (
    cms.retrieve_stat(cms.STATS_CAT1, which=stat_name, columns=FEATURES_CAT1)
    for stat_name in ('mean', 'std')
)
clip_cat1 = cms.retrieve_clip(cms.CLIP_CAT1, columns=FEATURES_CAT1)

# Category 2: no statistics are retrieved in this cell.

In [5]:
# Preprocessing layers handed to the category-1 model via `preprocess=`.
#   'x': Clip built from the two columns of clip_cat1 (column 0 as min_value,
#        column 1 as max_value), followed by a StandardScaler built from
#        mean_cat1 / std_cat1
#   'm': Divide with value 1000.0
# NOTE(review): the keys presumably match the model input names used in
# `model.predict({'x': ..., 'm': ...})` -- confirm in utils.get_compiled_pnn.
preproc_cat1 = {
    'x': [
        Clip(min_value=clip_cat1[:, 0], max_value=clip_cat1[:, 1]),
        StandardScaler(mean=mean_cat1, std=std_cat1),
    ],
    'm': [Divide(value=1000.0)],
}

2021-12-29 13:12:11.335233: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# def get_dummy_dataset(features):
#     from collections import namedtuple
    
#     a = namedtuple('A', ['train_features'])
#     a.train_features = namedtuple('A', ['shape'])
#     a.train_features.shape = (len(features),)
#     return a

In [7]:
def get_pNN_output_signal(df, mass, pth='./weigths/new/'):
    """Evaluate the parametric NN on the signal events of `df` at one mass point.

    Events are split by `bjet_n` (> 0: category 1, == 0: category 2). Each
    category has its own model, checkpoint and feature list. Only rows with
    `type == 1.0` and `mA == mass` are evaluated.

    Args:
        df: DataFrame holding at least the columns 'bjet_n', 'type', 'mA' and
            every name in FEATURES_CAT1 / FEATURES_CAT2.
        mass: mass hypothesis; selects the rows (`mA == mass`) and is fed to
            the model as its 'm' input.
        pth: unused; kept only so existing callers keep working.

    Returns:
        1-D float array with one entry per row of `df`, in row order. Rows
        that were evaluated hold the model output; all other rows hold NaN
        (the previous implementation left them as uninitialized memory).
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.get_logger().setLevel(logging.ERROR)

    # NaN marks "not evaluated", so stale memory is never written downstream.
    out = np.full(df.shape[0], np.nan)

    is_selected = ((df['type'] == 1.0) & (df['mA'] == mass)).values
    n_bjets = df['bjet_n'].values

    # (row mask, features, extra model kwargs, checkpoint) per category.
    categories = (
        (n_bjets > 0, FEATURES_CAT1,
         dict(dropout=0.25, preprocess=preproc_cat1),
         'tmp/pnn-cat_1-preproc-drop-l2'),
        (n_bjets == 0, FEATURES_CAT2,
         dict(),
         'new/pnn-balanced-cat_2-case_2'),
    )

    for in_category, features, model_kwargs, checkpoint in categories:
        # model building and loading
        model = utils.get_compiled_pnn(len(features), **model_kwargs)
        utils.load_from_checkpoint(model, path=checkpoint)

        # Positional row numbers, so the result lands on the right rows even
        # when `df` does not carry a default 0..n-1 index.
        rows = np.flatnonzero(is_selected & in_category)
        if rows.size == 0:
            # nothing to evaluate in this category
            continue

        # select on `mass`: features plus a constant mass column
        x = df[features].values[rows]
        m = mass * np.ones((rows.size, 1))

        # apply model
        y_s = model.predict({'x': x, 'm': m}, batch_size=1024)

        # store
        out[rows] = np.reshape(y_s, -1)

    return out

In [21]:
def get_pNN_output_background(df, pth='./weigths/new/'):
    """Evaluate the parametric NN on background events for every mass point.

    Events are split by `bjet_n` (> 0: category 1, == 0: category 2). Each
    category has its own model, checkpoint and feature list. For mass point
    number i (`masses[i]`), only rows with `type == 0.0` and
    `INTERVALS[i][0] < dimuon_mass < INTERVALS[i][1]` (both bounds exclusive)
    are evaluated, with `masses[i]` fed to the model as its 'm' input.

    Args:
        df: DataFrame holding at least the columns 'bjet_n', 'type',
            'dimuon_mass' and every name in FEATURES_CAT1 / FEATURES_CAT2.
        pth: unused; kept only so existing callers keep working.

    Returns:
        Float array of shape (len(df), len(INTERVALS)). Entry [r, i] is the
        model output for row r at mass point i, or NaN when row r was not
        evaluated for that mass point (the previous implementation left such
        entries as uninitialized memory).
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.get_logger().setLevel(logging.ERROR)

    # NaN marks "outside the mass window / not evaluated".
    out = np.full((df.shape[0], len(INTERVALS)), np.nan)

    is_background = (df['type'] == 0.0).values
    n_bjets = df['bjet_n'].values
    dimuon_mass = df['dimuon_mass'].values

    # (row mask, features, extra model kwargs, checkpoint) per category.
    categories = (
        (n_bjets > 0, FEATURES_CAT1,
         dict(dropout=0.25, preprocess=preproc_cat1),
         'tmp/pnn-cat_1-preproc-drop-l2'),
        (n_bjets == 0, FEATURES_CAT2,
         dict(),
         'new/pnn-balanced-cat_2-case_2'),
    )

    for in_category, features, model_kwargs, checkpoint in categories:
        # model building and loading
        model = utils.get_compiled_pnn(len(features), **model_kwargs)
        utils.load_from_checkpoint(model, path=checkpoint)

        candidates = is_background & in_category
        feature_matrix = df[features].values  # extracted once per category

        for i, (mass, (low, up)) in enumerate(zip(masses, INTERVALS)):
            # select on the mass window of this mass point; positional row
            # numbers keep the output aligned for any DataFrame index
            in_window = (dimuon_mass > low) & (dimuon_mass < up)
            rows = np.flatnonzero(candidates & in_window)
            if rows.size == 0:
                continue

            # features and mass
            x = feature_matrix[rows]
            m = mass * np.ones((rows.size, 1))

            # apply model
            y_b = model.predict({'x': x, 'm': m}, batch_size=1024)

            # store
            out[rows, i] = np.reshape(y_b, -1)

    return out

In [9]:
def rewrite_root_signal(input_file, tree, array_of_pNN):
    """Append a `pNN_output` branch (double) to `tree` inside `input_file`.

    The file is opened in 'update' mode, one value per existing tree entry is
    filled into the new branch, and the tree is written back with
    `kOverwrite`. The file is closed even if an error occurs part-way.

    Args:
        input_file: path of the ROOT file to modify in place.
        tree: name of the tree inside the file.
        array_of_pNN: sequence with one value per tree entry, in entry order.

    Raises:
        IOError: the file could not be opened for update.
        KeyError: the file contains no object named `tree`.
        ValueError: `array_of_pNN` does not have exactly one value per entry.
    """
    print("Save new branch in original ROOT file")

    myfile = ROOT.TFile(input_file, 'update')
    if not myfile or myfile.IsZombie():
        raise IOError("Cannot open ROOT file '{}' in update mode".format(input_file))

    try:
        mytree = myfile.Get(tree)
        if not mytree:
            raise KeyError("Tree '{}' not found in '{}'".format(tree, input_file))

        numOfEvents = mytree.GetEntries()
        if len(array_of_pNN) != numOfEvents:
            raise ValueError(
                "Got {} pNN values for {} entries of tree '{}'".format(
                    len(array_of_pNN), numOfEvents, tree))

        # One-element float64 buffer: the branch reads its value from this
        # address on every Fill().
        pNN_output = np.array([0.5])
        newBranch = mytree.Branch("pNN_output", pNN_output, "pNN_output/D")

        for n in range(numOfEvents):
            pNN_output[0] = array_of_pNN[n]
            mytree.GetEntry(n)
            newBranch.Fill()

        mytree.Write("", ROOT.TFile.kOverwrite)
    finally:
        myfile.Close()

In [10]:
def rewrite_root_background(input_file, tree, array_of_pNN, mass_points=None):
    """Append one `pNN_output_<mass>` branch (double) per mass point to `tree`.

    The file is opened in 'update' mode, column i of `array_of_pNN` is filled
    into the branch of mass point i for every existing tree entry, and the
    tree is written back with `kOverwrite`. The file is closed even if an
    error occurs part-way.

    Args:
        input_file: path of the ROOT file to modify in place.
        tree: name of the tree inside the file.
        array_of_pNN: 2-D array indexed as [entry, mass point].
        mass_points: values used to name the branches, in column order.
            Defaults to the notebook-level `masses` list, which yields the
            branch names pNN_output_130 ... pNN_output_1500.

    Raises:
        IOError: the file could not be opened for update.
        KeyError: the file contains no object named `tree`.
        ValueError: the shape of `array_of_pNN` is not
            (number of entries, number of mass points).
    """
    print("Save new branch in original ROOT file")

    if mass_points is None:
        mass_points = masses

    myfile = ROOT.TFile(input_file, 'update')
    if not myfile or myfile.IsZombie():
        raise IOError("Cannot open ROOT file '{}' in update mode".format(input_file))

    try:
        mytree = myfile.Get(tree)
        if not mytree:
            raise KeyError("Tree '{}' not found in '{}'".format(tree, input_file))

        numOfEvents = mytree.GetEntries()
        expected_shape = (numOfEvents, len(mass_points))
        if tuple(np.shape(array_of_pNN)) != expected_shape:
            raise ValueError(
                "Expected pNN array of shape {} for tree '{}', got {}".format(
                    expected_shape, tree, np.shape(array_of_pNN)))

        # One single-element float64 buffer per branch. The list keeps the
        # buffers alive for as long as the branches read from them.
        buffers = [np.array([0.5]) for _ in mass_points]
        listOfNewBranches = []
        for mass, buffer in zip(mass_points, buffers):
            branch_name = "pNN_output_{}".format(mass)
            listOfNewBranches.append(
                mytree.Branch(branch_name, buffer, branch_name + "/D"))

        for n in range(numOfEvents):
            for column, buffer in enumerate(buffers):
                buffer[0] = array_of_pNN[n, column]

            mytree.GetEntry(n)

            for newBranch in listOfNewBranches:
                newBranch.Fill()

        mytree.Write("", ROOT.TFile.kOverwrite)
    finally:
        myfile.Close()

In [11]:
# constants
# Directory with the ROOT files. Paths are built by plain string
# concatenation (`fname + file`), so the trailing slash is required.
fname = './root_file/'
# Same directory, under the name used when listing and rewriting files.
fileIn = fname
# Branches read from every tree into the DataFrame.
valid_header = [
    'mA', 'training',
    'dimuon_deltar', 'dimuon_deltaphi', 'dimuon_deltaeta', 'dimuon_mass', 'dimuon_pt',
    'met_pt', 'met_phi', 'met_eta',
    'bjet_n', 'bjet_1_pt', 'bjet_1_eta',
    'jetfwd_n',
    'ljet_n', 'ljet_1_pt', 'ljet_1_eta',
    'deltar_bjet1_dimuon', 'deltapt_bjet1_dimuon', 'deltaeta_bjet1_dimuon',
    'deltaphi_bjet1_dimuon',
    'PU_Weight',
]

In [12]:
def write_output_to_root_signal(tree, file, mass):
    """Score one signal tree with the pNN and store the result in its ROOT file.

    Reads the `valid_header` branches of `tree` from `fname + file`, tags
    every row as signal (`type = 1.0`), evaluates the pNN at `mass`, and
    writes the scores back as a new branch of the same tree.

    Args:
        tree: name of the tree inside the file.
        file: file name, relative to `fname` / `fileIn`.
        mass: mass point of the sample; anything `float()` accepts
            (the caller passes a string cut out of the file name).
    """
    print ("[signal] Reading trees {} in file {}".format(tree, file))

    # Close the read handle before ROOT reopens the same file in update mode.
    with uproot.open(fname + file) as root_file:
        out = root_file[tree].arrays(valid_header, library="pd")
    out["type"] = 1.0

    pNN_out_array = get_pNN_output_signal(out, float(mass))

    # update root file with output pNN
    rewrite_root_signal(fileIn + file, tree, pNN_out_array)

In [13]:
def write_output_to_root_background(tree, file):
    """Score one background tree with the pNN and store the result in its ROOT file.

    Reads the `valid_header` branches of `tree` from `fname + file`, tags
    every row as background (`type = 0.0`), evaluates the pNN for every mass
    point, and writes the scores back as new branches of the same tree.

    Args:
        tree: name of the tree inside the file.
        file: file name, relative to `fname` / `fileIn`.
    """
    print ("[background] Reading trees {} in file {}".format(tree, file))

    # Close the read handle before ROOT reopens the same file in update mode.
    with uproot.open(fname + file) as root_file:
        out = root_file[tree].arrays(valid_header, library="pd")
    out["type"] = 0.0

    pNN_out_array = get_pNN_output_background(out)

    # update root file with output pNN
    rewrite_root_background(fileIn + file, tree, pNN_out_array)

In [22]:
# Pair every simulated tree in `hood` with its ROOT file and add the pNN
# output to it. Matching is done purely on '_'-separated pieces of the tree
# name and of the file name.
#
# The directory is listed once. Only '.root' files whose name has at least two
# underscores are kept, so `f_splits[1]` and `f_splits[2]` always exist and a
# stray file can no longer abort the loop with an IndexError.
input_files = [name for name in os.listdir(fileIn)
               if name.endswith('.root') and name.count('_') >= 2]

for tree in hood:
    t_splits = tree.split('_')
    kind = t_splits[0]

    for file in input_files:
        f_splits = file.split('_')

        if kind == 'signal':
            # only the b-associated production modes are processed
            if f_splits[2] not in ('bbH', 'bbA'):
                continue

            # the model, process, mass and tan-beta pieces must all agree
            if '_'.join(f_splits[1:5]) != t_splits[1][4:8] + '_' + '_'.join(t_splits[2:5]):
                continue

            # f_splits[3] looks like 'mA<mass>': drop the 'mA' prefix
            write_output_to_root_signal(tree, file, mass=f_splits[3][2:])

        elif kind == 'background':
            bkg = f_splits[1]

            if bkg == 'diboson':
                matched = ''.join(f_splits[1:3]) == t_splits[1][4:11] + t_splits[2]
            elif bkg == 'ST':
                matched = f_splits[2:-1] == t_splits[2:-1]
            elif bkg == 'DY':
                matched = f_splits[1] == t_splits[1][4:6]
            elif bkg == 'TTbarIncl':
                matched = (f_splits[1] == t_splits[1][4:13]
                           and f_splits[2] == t_splits[2])
            elif bkg == 'TTbar':
                # the inclusive ttbar file must not pick up the mass-binned
                # 'TTbarIncl' trees, whose names start with the same prefix
                matched = (f_splits[1] == t_splits[1][4:9]
                           and t_splits[1][4:13] != "TTbarIncl")
            elif bkg == 'ZMM':
                matched = (f_splits[1] == t_splits[1][4:7]
                           and f_splits[2] == t_splits[2])
            else:
                matched = False

            if matched:
                write_output_to_root_background(tree, file)


[background] Reading trees background_treeST_s-channel_UL2016 in file Out_ST_s-channel_UL2016.root
Loaded from "weights/tmp/pnn-cat_1-preproc-drop-l2/weights-18-0.943"
130
<class 'numpy.ndarray'>
150
<class 'numpy.ndarray'>
170
<class 'numpy.ndarray'>
200
<class 'numpy.ndarray'>
250
<class 'numpy.ndarray'>
300
<class 'numpy.ndarray'>
350
<class 'numpy.ndarray'>
400
<class 'numpy.ndarray'>
450
<class 'numpy.ndarray'>
500
<class 'numpy.ndarray'>
600
<class 'numpy.ndarray'>
700
<class 'numpy.ndarray'>
800
<class 'numpy.ndarray'>
1000
<class 'numpy.ndarray'>
1200
<class 'numpy.ndarray'>
1500
<class 'numpy.ndarray'>
Loaded from "weights/new/pnn-balanced-cat_2-case_2/weights-73-0.976"
Save new branch in original ROOT file
[background] Reading trees background_treeST_t-channel_antitop_UL2016 in file Out_ST_t-channel_antitop_UL2016.root
Loaded from "weights/tmp/pnn-cat_1-preproc-drop-l2/weights-18-0.943"
130
<class 'numpy.ndarray'>
150
<class 'numpy.ndarray'>
170
<class 'numpy.ndarray'>
200
<c

In [None]:
# Loop for data: dump the `valid_header` branches of every data tree to CSV.

output_dir = "./output_csv/"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create the directory

data_files = os.listdir(fileIn)  # list the directory once, not once per tree

for tree in data_hood:
    t_splits = tree.split('_')
    if t_splits[0] != 'background':
        continue

    for file in data_files:
        f_splits = file.split('_')
        if len(f_splits) < 2:
            # no '_' in the name: cannot be a data file
            continue

        if f_splits[1][2:9] != 'Run2016':
            continue

        # the 8-character run tag in the file name must match the tree's
        if f_splits[1][2:10] != t_splits[1][6:14]:
            continue

        print ("Reading trees {} in file {}".format(tree, file))
        with uproot.open(fname + file) as root_file:
            out = root_file[tree].arrays(valid_header, library="pd")

        out.to_csv(output_dir + file.split(".")[0] + ".csv")
