# Reading ROOT Files
2020-04-08, John Rodriguez

In [1]:
import ROOT
from glob import glob #get files in directory with regex
import pandas as pd
from root_numpy import tree2array
from os import path

# Directory that is searched for input "*.root" files.
data_path = "../raw_data"
# Directory where the combined dataset CSV is cached.
prodata_path = "../processed_data"
# Substrings used to pick out the signal file: a path containing `signal`
# is only kept if it also contains `energy`. Both also form the CSV name.
signal = "Xtohh"
energy = "3000"

Welcome to JupyROOT 6.20/04


In [2]:
def root_to_df(file):
    """Load the "Nominal" tree of a ROOT file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        Path of the ROOT file to open.

    Returns
    -------
    pandas.DataFrame
        DataFrame built from the array that ``tree2array`` produces for the
        "Nominal" tree.

    Raises
    ------
    OSError
        If ROOT could not open the file.
    KeyError
        If the file contains no object named "Nominal".
    """
    rfile = ROOT.TFile(file)
    # Depending on the ROOT version a failed open yields a null/zombie object
    # instead of raising, so check explicitly.
    if not rfile or rfile.IsZombie():
        raise OSError(f"Could not open ROOT file: {file}")
    try:
        intree = rfile.Get("Nominal")
        if not intree:  # Get() hands back a null pointer for a missing key
            raise KeyError(f'No "Nominal" tree found in {file}')
        # Build the DataFrame while the file is still open; the file is
        # closed in the `finally` clause only after the conversion is done.
        return pd.DataFrame(tree2array(intree))
    finally:
        # Always release the file handle, even if the conversion fails.
        rfile.Close()

In [3]:
def filter_fakes(df):
    """Return the rows of `df` whose "sample" value is not "fakes".

    The input frame is not modified; the surviving rows keep their
    original index labels.
    """
    is_fake = df["sample"] == "fakes"
    return df[~is_fake]

In [4]:
def is_signal_or_back(file, signal, energy):
    """Decide whether a ROOT file belongs in the signal + background dataset.

    Only the file name (not its directory) is inspected, so that directory
    names containing `signal`, `energy` or "data.root" cannot change the
    result.

    Parameters
    ----------
    file : str
        Path of the candidate file.
    signal : str
        Substring identifying signal files.
    energy : str
        Substring a signal file must also contain to be selected.

    Returns
    -------
    bool
        * file name contains `signal`: True only if it also contains `energy`
          (signal files for other energies are rejected);
        * otherwise: True unless the file name contains "data.root"
          (every other file is treated as background).
    """
    name = path.basename(file)
    if signal in name:
        return energy in name
    return "data.root" not in name

In [5]:
def get_signal_dataset(data_path, prodata_path, signal, energy, force_create = False):
    """Return the combined signal + background dataset, using a CSV cache.

    Parameters
    ----------
    data_path : str
        Directory searched for "*.root" input files.
    prodata_path : str
        Directory holding the cached "<signal><energy>_dataset.csv".
    signal, energy : str
        Forwarded to ``is_signal_or_back`` to select files; also used to
        build the cache file name.
    force_create : bool, default False
        If True, ignore an existing cache file and rebuild (and overwrite) it.

    Returns
    -------
    pandas.DataFrame
        The concatenated, fake-filtered frames with a 0..n-1 index. The same
        columns are returned whether the data come from the cache or are
        freshly built.

    Raises
    ------
    ValueError
        If no file in `data_path` is selected.
    """
    from os import makedirs  # local import: the file only imports os.path

    dataset_file = f"{prodata_path}/{signal}{energy}_dataset.csv"

    if path.exists(dataset_file) and not force_create:
        print(f"Dataset found in {dataset_file}")
        # The CSV is written with its index; read it back as the index so
        # the cached frame does not grow an extra "Unnamed: 0" column.
        return pd.read_csv(dataset_file, index_col=0)

    print("Creating dataset")
    # glob order is arbitrary; sort so the row order is reproducible.
    root_files = sorted(glob(f"{data_path}/*.root"))
    signal_dfs = [
        filter_fakes(root_to_df(file))
        for file in root_files
        if is_signal_or_back(file, signal, energy)
    ]
    if not signal_dfs:
        raise ValueError(
            f"No signal or background ROOT files found in {data_path} "
            f"for signal={signal!r}, energy={energy!r}"
        )
    all_signals = pd.concat(signal_dfs, axis = 0).reset_index(drop = True)

    # Make sure the output directory exists before writing the cache.
    makedirs(path.dirname(dataset_file), exist_ok=True)
    all_signals.to_csv(dataset_file)
    return all_signals

In [8]:
# force_create = True skips any cached CSV, rebuilds the dataset from the
# ROOT files and overwrites the cache; drop it to reuse an existing CSV.
all_signals = get_signal_dataset(data_path, prodata_path, signal, energy, force_create = True)

Creating dataset


In [9]:
# Show the combined frame as the cell output.
all_signals

Unnamed: 0,sample,EventWeightNoXSec,EventWeight,EventNumber,m_region,m_FJNbtagJets,m_AntiBTag,m_AntiTauTag,m_FJpt,m_FJeta,...,m_dPhiFTwDT,m_dRFJwDT,m_dPhiDTwMET,m_MET,m_hhm,m_bbttpt,m_hhm_Coll,m_hhm_Eff,m_hhm_ConEff,m_hhm_ConVis
0,stopt,34.877720,0.293185,13462051,SR,0,1,0,861.875366,-0.304526,...,3.126026,3.266931,0.013121,302.029083,1400.892700,409.237701,-999.0,1758.257324,1198.907959,1676.697510
1,WWPw,9.584293,0.023479,133790,SR,0,2,0,831.194458,-0.176349,...,3.104488,3.224856,0.073350,521.377930,1148.022095,508.322754,-999.0,1777.929321,1099.240601,1383.665894
2,WWPw,8.806340,0.021573,3704260,SR,0,3,0,637.725098,1.187717,...,3.001453,3.129304,0.457147,211.862717,1316.768921,121.648369,-999.0,1586.030884,1216.527832,1577.986938
3,WWPw,8.806340,0.021573,3704260,TwoDimMassWindow,0,3,0,637.725098,1.187717,...,3.001453,3.129304,0.457147,211.862717,1316.768921,121.648369,-999.0,1586.030884,1216.527832,1577.986938
4,WWPw,6.919804,0.016952,13791576,SR,0,3,0,660.002258,-0.874483,...,3.057571,3.246438,0.279711,81.513786,1344.046387,154.597977,-999.0,1512.565796,747.123535,2170.850830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56052,stopWt,40.829983,0.272697,14258965,QCDCR,1,2,0,363.587463,0.399854,...,3.052697,3.089946,3.031674,29.369617,888.826294,136.578751,-999.0,922.597839,639.257446,1143.713379
56053,stopWt,42.578201,0.284373,15053602,SR,1,4,0,481.960785,-0.231428,...,3.111821,3.114481,-0.269028,26.319555,1016.920776,30.398745,-999.0,1042.506348,1262.884888,1299.504272
56054,stopWt,42.578201,0.284373,15053602,TwoDimMassWindow,1,4,0,481.960785,-0.231428,...,3.111821,3.114481,-0.269028,26.319555,1016.920776,30.398745,-999.0,1042.506348,1262.884888,1299.504272
56055,stopWt,8.883183,0.059329,17552876,QCDCR,1,3,0,534.322510,0.131959,...,1.627407,1.816824,2.141683,82.603447,766.809082,616.363586,-999.0,931.473633,556.508606,987.984741
