# Reading Root Files
2020-04-08, John Rodriguez

In [1]:
import ROOT
from glob import glob #get files in directory with regex
import pandas as pd
from root_numpy import tree2array
from os import path

data_path = "../raw_data"
prodata_path = "../processed_data"
signal = "Xtohh"
energy = "3000"

Welcome to JupyROOT 6.20/04


In [2]:
def is_signal_or_back(file, signal, energy):
    """Returns True if file contains signal or background. If none, return False"""
    if signal in file:
        if energy in file:
            print("signal", file)
            return True
    elif "data.root" not in file:
        print("back", file)
        return True
    return False

In [3]:
def root_to_df(file):
    """Turns root file into a dataframe"""
    rfile = ROOT.TFile(file) #root file
    intree = rfile.Get("Nominal") #get tree from root file
    return pd.DataFrame(tree2array(intree)) #DataFrame from array from root file

In [4]:
#signal shouldnt have fakes uwu
def filter_fakes(df):
    """Return df without fake samples"""
    #display(df[(df["sample"] == "fakes") & (df["EventWeight"] > 0)])
    return df[df["sample"] != "fakes"]

In [5]:
def classify_signal(df, file, signal, energy, ff):
    """Adds a column to the df indicating if it is the desired signal or not"""
    if signal in file and energy in file:
        #signal
        if ff:
            #fakes had been filtered
            df["signal"] = 1
        else:
            #there are fake samples
            df["signal"] = (df["sample"] != "fakes").astype(int)
    else:
        df["signal"] = 0
    return df

In [6]:
def process_data(file, signal, energy, ff = True):
    """Process the data contained in a root file. ff = filter_fakes"""
    df = root_to_df(file)
    if ff:
        df = filter_fakes(df)
    return classify_signal(df, file, signal, energy, ff)

In [7]:
def get_signal_dataset(data_path, prodata_path, signal, energy, force_create = True, ff = False):
    """Returns the dataset with the desired signal and background"""
    dataset_file = f"{prodata_path}/{signal}{energy}_dataset.csv"
    
    if path.exists(dataset_file) and not force_create:
        print(f"Dataset found in {dataset_file}")
        return pd.read_csv(dataset_file)
    
    print("Creating dataset")
    
    root_files = glob(f"{data_path}/*.root")
    signal_dfs = [process_data(file, signal, energy, ff) for file in root_files if is_signal_or_back(file, signal, energy)]
    all_signals = pd.concat(signal_dfs, axis = 0).reset_index(drop = True)
    all_signals.to_csv(dataset_file, index = False)
    return all_signals

In [8]:
all_signals = get_signal_dataset(data_path, prodata_path, signal, energy)

Creating dataset
back ../raw_data/Wmunu_Pw.root
back ../raw_data/stoptchan.root
back ../raw_data/stopschan.root
back ../raw_data/WW_Pw.root
back ../raw_data/Wtaunu_221.root
signal ../raw_data/Xtohh3000.root
back ../raw_data/WZ_Pw.root
back ../raw_data/Zmumu_Pw.root
back ../raw_data/Zee_221.root
back ../raw_data/Wenu_Pw.root
back ../raw_data/ttbar.root
back ../raw_data/ZZ_Pw.root
back ../raw_data/Ztautau.root
back ../raw_data/stopWt.root
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2
ReadStreamerInfo, class:string, illegal uid=-2


In [9]:
all_signals

Unnamed: 0,sample,EventWeightNoXSec,EventWeight,EventNumber,m_region,m_FJNbtagJets,m_AntiBTag,m_AntiTauTag,m_FJpt,m_FJeta,...,m_dRFJwDT,m_dPhiDTwMET,m_MET,m_hhm,m_bbttpt,m_hhm_Coll,m_hhm_Eff,m_hhm_ConEff,m_hhm_ConVis,signal
0,fakes,-0.098973,-0.000892,3994248,QCDCR,1,3,2,614.656067,0.235415,...,2.887341,1.446572,431.106171,1296.251343,487.546082,-999.0,1852.812256,633.167480,1355.152954,0
1,fakes,-0.204399,-0.001842,4499931,SR,1,3,2,552.745605,0.379534,...,3.171939,-0.179417,146.013962,996.793335,138.957611,-999.0,1172.927368,823.800781,1061.258667,0
2,fakes,-0.131752,-0.001187,3470245,SR,0,3,2,506.226685,0.974346,...,3.197988,0.382356,196.942551,1062.961426,137.017685,-999.0,1283.133179,1168.350586,1376.213867,0
3,fakes,-0.131752,-0.001187,3470245,TwoDimMassWindow,0,3,2,506.226685,0.974346,...,3.197988,0.382356,196.942551,1062.961426,137.017685,-999.0,1283.133179,1168.350586,1376.213867,0
4,fakes,0.101267,0.000913,3325958,SR,1,2,2,496.985443,-0.908197,...,3.238636,-0.063105,226.396774,1006.510315,175.830734,-999.0,1256.632690,1098.993408,1151.838379,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93793,fakes,-0.332620,-0.002222,12376699,QCDCR,1,4,2,794.782288,0.315326,...,3.089225,-2.698368,19.014040,1572.301147,59.693996,-999.0,1591.837769,1123.655151,1894.155884,0
93794,fakes,-0.275385,-0.001839,12386882,SR,1,2,2,564.551331,0.851411,...,3.024492,-0.584560,61.609558,1050.203979,126.832039,-999.0,1133.736694,864.505737,961.399902,0
93795,fakes,-0.141334,-0.000944,19016456,SR,0,4,2,644.525940,-0.134551,...,2.904739,0.233702,36.224155,1079.656494,304.118408,-999.0,1125.484497,1163.578125,1341.280273,0
93796,stopWt,43.250130,0.288861,12540537,QCDCR,0,3,0,470.826141,-0.549138,...,2.923927,-1.375074,86.295708,1037.256836,145.850006,-999.0,1132.372559,744.588806,1157.786621,0


In [11]:
all_1000 = root_to_df("../processed_data/all_1000.root")

In [14]:
all_1000

Unnamed: 0,sample,EventWeight,EventNumber,m_region,m_FJNbtagJets,m_FJpt,m_FJeta,m_FJphi,m_FJm,m_DTpt,m_DTeta,m_DTphi,m_DTm,m_dPhiFTwDT,m_dRFJwDT,m_dPhiDTwMET,m_MET,m_hhm,m_bbttpt
0,fakes,0.004518,153694019,PreSel_0tag,0,482.772003,1.429120,2.421229,13794.311523,506.167145,0.207020,-0.655010,119845.726562,3.076240,3.310103,3.092851,36.086731,1197.072266,39.882824
1,fakes,0.004518,153694019,QCDCR_0tag,0,482.772003,1.429120,2.421229,13794.311523,506.167145,0.207020,-0.655010,119845.726562,3.076240,3.310103,3.092851,36.086731,1197.072266,39.882824
2,fakes,0.002319,45687125,PreSel_0tag,0,477.880615,1.934286,3.091218,12434.049805,404.434326,-0.968759,0.025319,119041.578125,3.065899,4.222251,0.043567,83.948036,1993.124146,80.629944
3,fakes,0.002319,45687125,SR_0tag,0,477.880615,1.934286,3.091218,12434.049805,404.434326,-0.968759,0.025319,119041.578125,3.065899,4.222251,0.043567,83.948036,1993.124146,80.629944
4,fakes,0.008115,70491915,PreSel_1tag,1,295.462860,-1.470354,-2.393916,62526.699219,522.142578,-1.833821,0.774659,124460.093750,3.114611,3.135747,3.121153,170.698441,820.318542,226.927307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255887,fakes,-0.000057,1694476,SR_0tag,0,536.534729,1.435661,-1.967037,57871.671875,374.088776,0.362937,1.112791,105870.218750,3.079828,3.261300,0.374776,55.910549,1051.035034,164.785110
255888,ZZPw,0.023068,1061480,PreSel_0tag,0,416.966187,-0.547422,-2.326550,92135.187500,453.424438,0.316010,0.773775,104357.882812,3.100325,3.218312,-2.358163,33.718464,971.921692,40.634193
255889,ZZPw,0.023068,1061480,QCDCR_0tag,0,416.966187,-0.547422,-2.326550,92135.187500,453.424438,0.316010,0.773775,104357.882812,3.100325,3.218312,-2.358163,33.718464,971.921692,40.634193
255890,ZZPw,0.017314,179311,PreSel_0tag,0,653.310364,0.371117,-2.305289,12055.783203,667.591919,0.848044,0.793432,155061.015625,3.098721,3.135208,-0.009933,57.572094,1373.808594,31.709263
