# Photon ID Run 2 filter and reweighting

In [1]:
import time
import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Display the installed pandas version as the cell output.
pd.__version__

'2.2.2'

In [3]:
# Wall-clock start time; read again in the last cell to report the notebook run time.
t0 = time.time()

In [4]:
# Directory holding the input pickles; the reweighted output pickles are written here too.
# NOTE(review): hard-coded absolute path (with trailing slash, since file names are appended
# by string concatenation) — adjust when running on another machine.
datadir = "/Users/Marco/Data/PhotonID/Run2/"

In [5]:
# Load the two input samples: "yj" is filtered below for true photons (signal),
# "jj" for true jets reconstructed as photons (background).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_yj = pd.read_pickle(datadir+"Py8_yj_mc16ade_pd122.pkl")
df_jj = pd.read_pickle(datadir+"Py8_jj_mc16ade_pd122.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Marco/Data/PhotonID/Run2/Py8_yj_mc16ade_pd122.pkl'

### Filter datasets for true photons and true jets reconstructed as photons

In [None]:
def true_ph(data):
    """Return the rows of *data* that are truth-matched photons.

    A row is kept when ``y_truth_pdgId`` equals 22 and the absolute value of
    ``y_truth_mother_pdgId`` is below 100.
    """
    is_photon = data['y_truth_pdgId'].eq(22)
    mother_below_100 = data['y_truth_mother_pdgId'].abs().lt(100)
    return data.loc[is_photon & mother_below_100]

def true_bkg(data):
    """Return the rows of *data* treated as fake-photon background.

    Rows whose ``|y_truth_pdgId|`` is 11 are always rejected. Of the remaining rows,
    one is kept if ``y_truth_pdgId`` is not 22, or if it is 22 and the absolute
    value of ``y_truth_mother_pdgId`` exceeds 100.
    """
    pdg_id = data['y_truth_pdgId']
    not_electron = pdg_id.abs().ne(11)
    not_photon = pdg_id.ne(22)
    photon_mother_above_100 = pdg_id.eq(22) & data['y_truth_mother_pdgId'].abs().gt(100)
    return data.loc[not_electron & (not_photon | photon_mother_above_100)]

In [None]:
# Signal: true photons from the yj sample; background: non-prompt "photons" from the jj sample
sig, bkg = true_ph(df_yj), true_bkg(df_jj)

In [None]:
# Linear pT binning for the raw spectra: nbin equal-width bins between ptmin and ptmax
ptmin = 20
ptmax = 1000
nbin = 49
dpt = int((ptmax-ptmin)/nbin)

# nbin bins need nbin+1 edges; with only nbin edges the bins would be wider than
# the dpt quoted in the "Events / dpt GeV" axis label.
bins_pt = np.linspace(ptmin, ptmax, nbin+1)

# (sample, pT column, legend label), drawn in this order.
# Raw strings: '\g' is not a valid escape sequence in a normal string literal.
spectra = [
    (sig, 'y_pt',       r'True $\gamma$ (reco)'),
    (bkg, 'y_pt',       r'Fake $\gamma$ (reco)'),
    (sig, 'y_truth_pt', r'True $\gamma$ (true)'),
    (bkg, 'y_truth_pt', r'Fake $\gamma$ (true)'),
]
for sample, column, label in spectra:
    plt.hist(sample[column], weights = sample['mcTotWeight'],
             bins = bins_pt, histtype = 'stepfilled', alpha = 0.5, label = label, log = True)

plt.legend()
plt.xlabel('$p_T$ [GeV]')
plt.ylabel('Events / {} GeV'.format(dpt))
plt.show()

#### Verify true photon origin (direct vs brem photons)

In [None]:
# Truth-pT spectra of the selected photons, split into y_truth_type == 14 ("direct")
# and everything else ("brem"). The selections are computed once and reused for both
# the values and the weights. Labels are raw strings because '\g' is not a valid escape.
is_direct = sig['y_truth_type'] == 14
sig_dir = sig[is_direct]
sig_bre = sig[~is_direct]

h_all = plt.hist(sig['y_truth_pt'],
                 weights = sig['mcTotWeight'],
                 histtype = 'step', log = True, bins = bins_pt, label=r"all $\gamma$")

h_dir = plt.hist(sig_dir['y_truth_pt'],
                 weights = sig_dir['mcTotWeight'],
                 histtype = 'step', log = True, bins = bins_pt, label=r"direct $\gamma$")

h_bre = plt.hist(sig_bre['y_truth_pt'],
                 weights = sig_bre['mcTotWeight'],
                 histtype = 'step', log = True, bins = bins_pt, label=r"brem $\gamma$")

plt.xlabel('$p_{T, true}$ [GeV]')
plt.ylabel('Events / {} GeV'.format(dpt))
plt.legend()
plt.show()

In [None]:
# Weighted yields, summed from the bin contents returned by plt.hist (element [0]),
# i.e. restricted to the histogram range. ndarray.sum() replaces the builtin sum(),
# which would iterate over the array element by element in Python.
ntot = h_all[0].sum()
ndir = h_dir[0].sum()
nbre = h_bre[0].sum()

In [None]:
# Share of the total weighted yield carried by each photon category
frac_dir = ndir/ntot
frac_bre = nbre/ntot
print(f'Fraction of direct photons = {frac_dir}')
print(f'Fraction of brem photons   = {frac_bre}')

### Event preselections (as in legacy HH->yybb Run 2 analysis)

In [None]:
# Drop brem photons from the signal sample: keep only rows with y_truth_type == 14,
# i.e. hard-scattering events (similar to H->yy photons)
keep_direct = sig["y_truth_type"].eq(14)
sig = sig.loc[keep_direct]

In [None]:
# Cutflow bookkeeping: entry 0 is the number of rows before any preselection cut
cutflow_sig, cutflow_bkg = [len(sig)], [len(bkg)]

In [None]:
# 20 GeV <= pt <= 1 TeV
# Select with a boolean mask (as the f1 and e277 cuts do) instead of dropping by index
# label: a label-based drop would also remove every other row sharing a dropped label
# if the index contained duplicates. The mask is the negation of the rejection
# condition, so rows with a missing pt are kept, as before.
sig = sig[~((sig['y_pt'] < 20.) | (sig['y_pt'] > 1000.))]
bkg = bkg[~((bkg['y_pt'] < 20.) | (bkg['y_pt'] > 1000.))]
cutflow_sig.append(len(sig))
cutflow_bkg.append(len(bkg))

In [None]:
# f1 >= 0.005
sig_pass_f1 = sig['y_f1'].ge(0.005)
bkg_pass_f1 = bkg['y_f1'].ge(0.005)
sig = sig.loc[sig_pass_f1]
bkg = bkg.loc[bkg_pass_f1]
# record the surviving row counts for the cutflow table
cutflow_sig.append(len(sig))
cutflow_bkg.append(len(bkg))

In [None]:
# e277 > 0.1
sig_pass_e277 = sig['y_e277'].gt(0.1)
bkg_pass_e277 = bkg['y_e277'].gt(0.1)
sig = sig.loc[sig_pass_e277]
bkg = bkg.loc[bkg_pass_e277]
# record the surviving row counts for the cutflow table
cutflow_sig.append(len(sig))
cutflow_bkg.append(len(bkg))

In [None]:
# Loose preselection on eta to eliminate jet outliers: keep -2.5 < eta <= 2.5
# (photon candidates are defined up to |eta|=2.37, excluding 1.37<|eta|<1.51).
# Boolean mask instead of a label-based drop, so duplicate index labels cannot remove
# additional rows; rows with a missing eta are kept, as before.
sig = sig[~((sig['y_eta'] <= -2.5) | (sig['y_eta'] > 2.5))]
bkg = bkg[~((bkg['y_eta'] <= -2.5) | (bkg['y_eta'] > 2.5))]
cutflow_sig.append(len(sig))
cutflow_bkg.append(len(bkg))

In [None]:
# Cutflow table: one line per preselection step with the surviving row count (N),
# the rows removed by that step (dN) and dN as a percentage of the initial count.
presels = ["Initial", "20<pT<1000 GeV", "f1>0.005", "e277>0.1", "eta"]
print("Cutflow           N(sig)    dN(sig)  frac(sig)      N(bkg)  dN(bkg) frac(bkg)")
print("-"*77)
for step, (cut, nsig, nbkg) in enumerate(zip(presels, cutflow_sig, cutflow_bkg)):
    # the first entry has no previous step to compare with, so nothing was removed
    dnsig = cutflow_sig[step-1] - nsig if step > 0 else 0
    dnbkg = cutflow_bkg[step-1] - nbkg if step > 0 else 0
    print(f"{cut:15s} {nsig:8d} {dnsig:10d} {100*dnsig/cutflow_sig[0]:8.3f}%     {nbkg:8d} {dnbkg:8d} {100*dnbkg/cutflow_bkg[0]:8.3f}% ")

### Background sample reweighting

In [None]:
# Give both preselected frames a fresh 0..N-1 index. The weight Series built further
# down use a default 0..N-1 index and are assigned as columns, which aligns on index
# labels, so the frames must be numbered the same way.
# reset_index(drop=True) is the idiomatic form: it returns a new frame and discards
# the old index without building an explicit arange.
sig_presel = sig.reset_index(drop=True)
bkg_presel = bkg.reset_index(drop=True)

In [None]:
# Report how many rows of each sample survive the full preselection
n_sig_presel, n_bkg_presel = len(sig_presel), len(bkg_presel)
print('Number of preselected signal events     =', n_sig_presel)
print('Number of preselected background events =', n_bkg_presel)

In [None]:
# Display the first 10 preselected signal rows as the cell output.
sig_presel.head(10)

### $\eta$ reweighting

In [None]:
# Eta binning: nbins_eta equal-width bins between etamin and etamax (nbins_eta+1 edges)
etamin = -2.5
etamax =  2.5
nbins_eta = 50
eta_bins = np.linspace(etamin,etamax,nbins_eta+1)

# Weighted eta spectra of both samples; plt.hist returns (bin contents, bin edges, patches)
# and the bin contents are what the eta weights are derived from.
h_eta_sig = plt.hist(sig_presel['y_eta'], weights=sig_presel['mcTotWeight'],
                     histtype='step', log=True, bins=eta_bins, label = r'True $\gamma$ from $\gamma-j$ sample')

h_eta_bkg = plt.hist(bkg_presel['y_eta'], weights=bkg_presel['mcTotWeight'],
                     histtype='step', log=True, bins=eta_bins, label = r'True jets as $\gamma$, from $j-j$ sample')

# raw string: '\e' is not a valid escape sequence in a normal string literal
plt.xlabel(r'$\eta$')
plt.ylabel('Events')
plt.legend(loc ='lower left')
plt.show()

In [None]:
def weight(n_sig, n_bkg):
    """Return the per-bin reweighting factors ``n_sig / n_bkg``.

    Parameters
    ----------
    n_sig, n_bkg : array-like
        Bin contents of the target sample and of the sample to be reweighted.
        Both must have the same shape.

    Returns
    -------
    numpy.ndarray
        Element-wise ratio as floats. Bins where ``n_bkg`` is zero get a weight
        of 0 instead of ``inf``/``nan``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    n_sig = np.asarray(n_sig, dtype=float)
    n_bkg = np.asarray(n_bkg, dtype=float)
    # Fail loudly on mismatched inputs instead of printing a message and carrying on
    if n_sig.shape != n_bkg.shape:
        raise ValueError(
            f"Arrays have different shapes: {n_sig.shape} vs {n_bkg.shape}"
        )
    ratio = np.zeros_like(n_sig)
    # Divide only where the denominator is non-zero; the other bins keep the 0 default
    np.divide(n_sig, n_bkg, out=ratio, where=n_bkg != 0)
    return ratio

In [None]:
# Per-eta-bin signal/background ratio of the weighted histogram contents filled above;
# the bare name on the last line displays the array as the cell output.
eta_w = weight(h_eta_sig[0], h_eta_bkg[0])
eta_w

In [None]:
eta_bins = h_eta_sig[1]
eta_bins_centers = (eta_bins[:-1]+eta_bins[1:])/2

# Closure check: background bin contents times the eta weights, overlaid on the signal.
# The steps are drawn from the bin edges with where='post', so each one spans exactly
# its bin; plt.step at the bin centres with the default where='pre' shifts every bin by
# half a width. The last content is repeated so the final bin reaches the last edge.
eta_bkg_rw = h_eta_bkg[0]*eta_w
plt.step(eta_bins, np.append(h_eta_sig[0], h_eta_sig[0][-1]), where = 'post',
         linewidth = 1, label = r'True $\gamma$')
plt.step(eta_bins, np.append(eta_bkg_rw, eta_bkg_rw[-1]), where = 'post',
         linewidth = 1, label = r'True jets as $\gamma$, $\eta$ reweighted')

# raw string: '\e' is not a valid escape sequence in a normal string literal
plt.xlabel(r'$\eta$')
plt.ylabel('Events')

plt.yscale('log')
plt.legend(loc='lower left')
plt.show()

In [None]:
# compute the weight-bin index corresponding to each event in the dataset
# (1-based: index k means eta_bins[k-1] <= eta < eta_bins[k])
eta_values = bkg_presel['y_eta'].to_numpy()
bin_idx_bkg_eta = np.digitize(eta_values, eta_bins)
# The histograms count a value equal to the last edge in the last bin, whereas
# np.digitize returns len(eta_bins) for it. Move such events into the last bin so
# they receive a weight instead of NaN; anything else out of range is left untouched.
bin_idx_bkg_eta[eta_values == eta_bins[-1]] = len(eta_bins) - 1

In [None]:
# Translate each event's 1-based bin index into its eta weight (bin k -> eta_w[k-1])
# and store the result as a new column; indices without an entry become NaN.
eta_weight_of_bin = {k: w for k, w in enumerate(eta_w[:nbins_eta], start=1)}
idx_bkg_eta = pd.Series(bin_idx_bkg_eta)
dfw_bkg_eta = idx_bkg_eta.map(eta_weight_of_bin)
bkg_presel['etaWeight'] = dfw_bkg_eta

In [None]:
# Display the first background rows to inspect the newly added etaWeight column.
bkg_presel.head()

In [None]:
# True if any background event ended up without an eta weight
bkg_presel['etaWeight'].isna().any()

### $p_T$ reweighting

In [None]:
from math import log  # no longer needed for the binning below; kept in case other cells rely on it

ptmin = 20.
ptmax = 1000.
nbins_pt = 40

# Logarithmically spaced bin edges (nbins_pt bins -> nbins_pt+1 edges).
# np.geomspace pins the first and last edge to exactly ptmin and ptmax, whereas
# exp(log(x)) may differ from x in the last floating-point digit and leave events
# sitting exactly on a pT-cut boundary outside the binning.
bins_pt = np.geomspace(ptmin, ptmax, nbins_pt+1)
print(bins_pt)

In [None]:
# Weighted pT spectra. The background already carries the eta weight, so the pT weights
# derived from these bin contents are computed on top of the eta reweighting.
h_pt_sig = plt.hist(sig_presel['y_pt'], weights=sig_presel['mcTotWeight'],
                     histtype='step', log=True, bins=bins_pt, label = r'True $\gamma$ from $\gamma-j$ sample')

bkg_eta_weighted = bkg_presel['mcTotWeight']*bkg_presel['etaWeight']
h_pt_bkg = plt.hist(bkg_presel['y_pt'], weights=bkg_eta_weighted,
                     histtype='step', log=True, bins=bins_pt, label = r'True jets as $\gamma$, from $j-j$ sample')


# unit kept outside math mode, consistent with the other pT plots
plt.xlabel('$p_T$ [GeV]')
plt.ylabel('Events')
plt.title('Plot $p_T$')
plt.legend(loc ='lower left')
plt.show()

In [None]:
# Per-pT-bin ratio of the signal histogram to the eta-reweighted background histogram;
# the bare name on the last line displays the array as the cell output.
pt_w = weight(h_pt_sig[0], h_pt_bkg[0])
pt_w

In [None]:
pt_bins = h_pt_sig[1]
pt_bins_centers = (pt_bins[:-1]+pt_bins[1:])/2

# Closure check: background bin contents times the pT weights, overlaid on the signal.
# The steps are drawn from the bin edges with where='post', so each one spans exactly
# its bin; plt.step at the bin centres with the default where='pre' misplaces every
# bin. The last content is repeated so the final bin reaches the last edge.
pt_bkg_rw = h_pt_bkg[0]*pt_w
plt.step(pt_bins, np.append(h_pt_sig[0], h_pt_sig[0][-1]), where = 'post',
         linewidth = 1, label = r'True $\gamma$')
plt.step(pt_bins, np.append(pt_bkg_rw, pt_bkg_rw[-1]), where = 'post',
         linewidth = 1, label = r'True jets as $\gamma$, $p_T$ reweighted')

# this is the pT plot: the axis label previously read '$\eta$'
plt.xlabel('$p_T$ [GeV]')
plt.ylabel('Events')

plt.yscale('log')
plt.legend(loc='lower left')
plt.show()

In [None]:
# compute the weight-bin index corresponding to each event in the dataset
# (1-based: index k means pt_bins[k-1] <= pt < pt_bins[k])
pt_values = bkg_presel['y_pt'].to_numpy()
bin_idx_bkg_pt = np.digitize(pt_values, pt_bins)
# The histograms count a value equal to the last edge in the last bin, whereas
# np.digitize returns len(pt_bins) for it. Move such events into the last bin so
# they receive a weight instead of NaN; anything else out of range is left untouched.
bin_idx_bkg_pt[pt_values == pt_bins[-1]] = len(pt_bins) - 1

In [None]:
# Translate each event's 1-based bin index into its pT weight (bin k -> pt_w[k-1])
# and store the result as a new column; indices without an entry become NaN.
pt_weight_of_bin = {k: w for k, w in enumerate(pt_w[:nbins_pt], start=1)}
idx_bkg_pt = pd.Series(bin_idx_bkg_pt)
dfw_bkg_pt = idx_bkg_pt.map(pt_weight_of_bin)
bkg_presel['ptWeight'] = dfw_bkg_pt

In [None]:
# Display the first background rows to inspect the newly added ptWeight column.
bkg_presel.head()

In [None]:
# Final per-event background weight: MC weight times the eta weight times the pT weight
# (a missing factor propagates as NaN)
mc_times_eta = bkg_presel['mcTotWeight'].mul(bkg_presel['etaWeight'])
bkg_presel['totWeight'] = mc_times_eta.mul(bkg_presel['ptWeight'])

### Reweighted $\eta$ and $p_T$ plots

In [None]:
# Eta spectra after the full reweighting: signal with its MC weight, background with totWeight
plt.hist(sig_presel['y_eta'], weights = sig_presel['mcTotWeight'],
         histtype='step', log=True, bins=eta_bins, label = r'True $\gamma$')

plt.hist(bkg_presel['y_eta'], weights = bkg_presel['totWeight'],
         histtype='step', log=True, bins=eta_bins, label = r'True jets as $\gamma$, $\eta$  and $p_T$ reweighted')

# raw string: '\e' is not a valid escape sequence in a normal string literal
plt.xlabel(r'$\eta$')
plt.ylabel('Events')
plt.legend(loc ='lower left')
plt.show()

In [None]:
# pT spectra after the full reweighting: signal with its MC weight, background with totWeight
step_style = dict(histtype='step', log=True, bins=pt_bins)

plt.hist(sig_presel['y_pt'],
         weights = sig_presel['mcTotWeight'],
         label = r'True $\gamma$',
         **step_style)

plt.hist(bkg_presel['y_pt'],
         weights = bkg_presel['totWeight'],
         label = r'True jets as $\gamma$, $\eta$  and $p_T$ reweighted',
         **step_style)

plt.xlabel('$p_T$ [GeV]')
plt.ylabel('Events')
plt.legend(loc ='lower left')
plt.show()

#### Save preselected dataframes with weights

In [None]:
# Write the preselected samples (the background one now carrying the weight columns)
# into the input directory
outputs = (
    (sig_presel, "Py8_yj_mc16ade_pd122_train_w.pkl"),
    (bkg_presel, "Py8_jj_mc16ade_pd122_train_w.pkl"),
)
for frame, fname in outputs:
    frame.to_pickle(datadir+fname)

In [None]:
# Report the wall-clock time elapsed since t0 was recorded, formatted as minutes'seconds"
t = time.time()
dt = t-t0
minutes, seconds = divmod(dt, 60)
print(f"Notebook executed in {int(minutes):d}\'{int(seconds):d}\"")