# Photon ID Run 2 BDT training sample skimming

In [1]:
import uproot
import numpy as np
import pandas as pd
import pickle

In [2]:
# Display the version of the pandas installation used to run this notebook.
pd.__version__

'2.2.2'

In [3]:
# Input directory: the signal/background training pickles are read from here.
# The trailing '/' is required because file paths are built by plain string
# concatenation (datadir + file name).
datadir = "/lapp_data/atlas/perf-egamma/InclusivePhotons/fullRun2/FinalNtuples/"
# Output directory: the skimmed (and downsampled) pickles are written here,
# again via string concatenation, so the trailing '/' matters.
datasave = "/lapp_data/atlas/chardong/Venv/savedir/save_pkl/"

In [4]:
# Read the two input pickles from `datadir`: the 'yj' file becomes the signal
# frame (df_sig) and the 'jj' file the background frame (df_bkg).
df_sig, df_bkg = (
    pd.read_pickle(datadir + fname)
    for fname in (
        "Py8_yj_mc16ade_pd122_train_w.pkl",
        "Py8_jj_mc16ade_pd122_train_w.pkl",
    )
)

In [5]:
# Collect the column names of the signal frame as a plain Python list and
# display them as the cell output.
columns = df_sig.columns.tolist()
columns

['y_Reta',
 'y_Rphi',
 'y_weta2',
 'y_fracs1',
 'y_weta1',
 'y_emaxs1',
 'y_f1',
 'y_wtots1',
 'y_Rhad',
 'y_Rhad1',
 'y_Eratio',
 'y_e277',
 'y_deltae',
 'y_noFF_Reta',
 'y_noFF_Rphi',
 'y_noFF_weta2',
 'y_noFF_fracs1',
 'y_noFF_weta1',
 'y_noFF_emaxs1',
 'y_noFF_f1',
 'y_noFF_wtots1',
 'y_noFF_Rhad',
 'y_noFF_Rhad1',
 'y_noFF_Eratio',
 'y_noFF_e277',
 'y_noFF_deltae',
 'y_pt',
 'y_eta',
 'y_phi',
 'y_e',
 'y_ptcone20',
 'y_ptcone40',
 'y_topoetcone20',
 'y_topoetcone40',
 'y_IsLoose',
 'y_IsTight',
 'y_iso_FixedCutLoose',
 'y_iso_FixedCutTight',
 'y_convType',
 'y_convRadius',
 'y_jmin_dr',
 'y_truth_pt',
 'y_truth_eta',
 'y_truth_phi',
 'y_truth_e',
 'y_truth_type',
 'y_truth_origin',
 'y_truth_pdgId',
 'y_truth_mother_pdgId',
 'evt_mu',
 'evtWeight',
 'intLumi',
 'xs',
 'ge',
 'mcWeight',
 'lumiXsecWeight',
 'lumiXsecWeightOriginal',
 'xsecWeight',
 'mcTotWeight',
 'sumWeights']

In [6]:
# variables to keep

# Shower-shape columns: ten nominal names followed by the matching
# 'y_noFF_'-prefixed names, listed in the same order.
shower_shape_var = [
    'y_Reta', 'y_Rphi', 'y_weta2', 'y_fracs1', 'y_weta1',
    'y_wtots1', 'y_Rhad', 'y_Rhad1', 'y_Eratio', 'y_deltae',
] + [
    'y_noFF_Reta', 'y_noFF_Rphi', 'y_noFF_weta2', 'y_noFF_fracs1', 'y_noFF_weta1',
    'y_noFF_wtots1', 'y_noFF_Rhad', 'y_noFF_Rhad1', 'y_noFF_Eratio', 'y_noFF_deltae',
]

# Identification decision flags
isEM_var = ['y_IsTight', 'y_IsLoose']

# Conversion information
conv_var = ['y_convRadius', 'y_convType']

# Kinematic and event-level columns
kinem_var = ['y_pt', 'y_eta', 'y_phi', 'evt_mu', 'y_jmin_dr']

# Truth-level columns; 'y_truth_type', 'y_truth_pdgId' and
# 'y_truth_mother_pdgId' are deliberately left out of the list.
truth_var = ['y_truth_pt', 'y_truth_eta']

In [7]:
# Give both frames a numeric 'truth_label' column and a common 'weight'
# column. The frames are modified in place (new columns are added).

# signal: label 1, weight copied from 'mcTotWeight'
df_sig["truth_label"]=1.
df_sig["weight"]=df_sig['mcTotWeight']

# background: label 0, weight copied from 'totWeight'
# NOTE(review): the source column differs from the signal one ('mcTotWeight');
# confirm that 'totWeight' is the intended column of the background file.
df_bkg["truth_label"]=0.
df_bkg["weight"]=df_bkg["totWeight"]

# No truth-type selection is applied to the signal here: per the original note,
# the 'y_truth_type == 14' requirement was already applied in a previous step.

In [8]:
# Full list of columns to retain: the variable groups defined above plus the
# common 'weight' and 'truth_label' columns, in this order.
keep_var = [
    *shower_shape_var,
    *conv_var,
    *kinem_var,
    *isEM_var,
    *truth_var,
    'weight',
    "truth_label",
]

# Column-restricted versions of the signal and background frames
df_sig_skim = df_sig[keep_var]
df_bkg_skim = df_bkg[keep_var]

In [9]:
# Preview the first five rows of the skimmed signal frame.
df_sig_skim.head()

Unnamed: 0,y_Reta,y_Rphi,y_weta2,y_fracs1,y_weta1,y_wtots1,y_Rhad,y_Rhad1,y_Eratio,y_deltae,...,y_eta,y_phi,evt_mu,y_jmin_dr,y_IsTight,y_IsLoose,y_truth_pt,y_truth_eta,weight,truth_label
0,0.955283,0.721005,0.016857,0.248116,0.602672,6.163973,-0.007951,-0.011676,0.358235,1248.306763,...,0.718952,1.94443,26.49,3.007666,False,False,16.437344,0.719087,71524.424963,1.0
1,0.973579,0.976123,0.010848,0.250266,0.628516,1.934841,-0.003076,0.005439,0.988437,5.962154,...,1.286473,3.135528,34.5,8999999000.0,True,True,16.420219,1.269462,39869.825713,1.0
2,0.971119,0.937279,0.01063,0.315495,0.622121,2.084521,-0.011601,-0.003843,0.956232,4.081696,...,0.552718,-2.840997,25.5,8999999000.0,True,True,16.117561,0.544353,60688.599349,1.0
3,0.988873,0.911961,0.011643,0.285961,0.614036,2.518094,-0.00484,-0.004938,0.885297,57.892002,...,-1.206819,2.729421,32.5,2.393447,True,True,16.089218,-1.207192,60355.22542,1.0
4,0.955146,0.9243,0.011432,0.297172,0.60238,2.1501,-0.011881,-0.011621,0.94006,77.155472,...,0.726452,-1.049119,33.5,8999999000.0,True,True,23.344801,0.763746,44774.051784,1.0


In [10]:
# further reduce dataset size by downsampling (50% and 30% of the data)
#
# DataFrame.sample draws rows at random. A fixed seed is passed so that the
# subsets written to disk are the same every time the notebook is re-run
# from a fresh kernel; change SAMPLING_SEED to draw different subsets.
SAMPLING_SEED = 42

# 50% subsets of the skimmed signal and background frames
df_sig_skim_50 = df_sig_skim.sample(frac=0.5, random_state=SAMPLING_SEED)
df_bkg_skim_50 = df_bkg_skim.sample(frac=0.5, random_state=SAMPLING_SEED)

# 30% subsets, drawn from the full skimmed frames (not from the 50% subsets)
df_sig_skim_30 = df_sig_skim.sample(frac=0.3, random_state=SAMPLING_SEED)
df_bkg_skim_30 = df_bkg_skim.sample(frac=0.3, random_state=SAMPLING_SEED)

In [11]:
# merge signal and background samples
totald = pd.concat([df_sig_skim, df_bkg_skim], axis=0)
# save skimmed dataframe
totald.to_pickle(datasave+"Py8_yj_jj_mc16ade_pd122_train_w_skim_noFF.pkl")

In [12]:
# Same merge-and-save step for the 50% subsets; `totald` is rebound to the
# new combined frame.
totald = pd.concat((df_sig_skim_50, df_bkg_skim_50))
totald.to_pickle(datasave + "Py8_yj_jj_mc16ade_pd122_train_w_skim_50_noFF.pkl")

In [13]:
# Same merge-and-save step for the 30% subsets; `totald` is rebound to the
# new combined frame.
totald = pd.concat((df_sig_skim_30, df_bkg_skim_30))
totald.to_pickle(datasave + "Py8_yj_jj_mc16ade_pd122_train_w_skim_30_noFF.pkl")