In [None]:
import xgboost as xgb
from coffea import nanoevents
from coffea.nanoevents.methods.base import NanoEventsArray
import awkward as ak
import vector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.style.CMS)

def to_np_array(ak_array, max_n=2, pad=0):
    """Turn a jagged awkward array into a rectangular NumPy array.

    The innermost axis is padded with ``None`` or clipped so that it holds
    exactly ``max_n`` entries; the ``None`` placeholders are then replaced
    by ``pad`` before converting to NumPy.
    """
    fixed_width = ak.pad_none(ak_array, max_n, clip=True, axis=-1)
    dense = ak.fill_none(fixed_width, pad)
    return dense.to_numpy()


# Location of the trained XGBoost BDT (adjacent literals concatenate to one path).
model_fname = (
    "src/HH4b/boosted/bdt_trainings_run2/"
    "model_xgboost_training_weights_qcd_and_ttbar_Run2_bdt_enhanced_v8p2/"
    "trained_bdt.model"
)

# Create an empty classifier and restore the trained model from disk.
bdt_model = xgb.XGBClassifier()
bdt_model.load_model(fname=model_fname)


In [None]:
# Open the picoAOD file with the NanoAOD schema and build the events object.
events_factory = nanoevents.NanoEventsFactory.from_root(
    "src/HH4b/boosted/picoAOD.root",
    schemaclass=nanoevents.NanoAODSchema,
)
events = events_factory.events()

In [None]:
# Per-event ordering of the fatjets by decreasing ParticleNet-MD Xbb score,
# so that index 0 is the most bb-like jet of each event.
all_fatjets = events["FatJet"]
sorted_by_bbtag = ak.argsort(
    all_fatjets["particleNetMD_Xbb"],
    ascending=False,
    axis=-1,
)
fatjets_sorted = all_fatjets[sorted_by_bbtag]

In [None]:
def _leading_pair(branch):
    """Return the given FatJet branch for the two top-Xbb jets as an (n_events, 2) array, zero-padded."""
    return to_np_array(fatjets_sorted[branch], max_n=2, pad=0)


# ParticleNet tagger outputs and regressed mass
fatjet_xbb = _leading_pair("particleNetMD_Xbb")
fatjet_qcdb = _leading_pair("particleNetMD_QCDb")
fatjet_qcdbb = _leading_pair("particleNetMD_QCDbb")
fatjet_qcdothers = _leading_pair("particleNetMD_QCDothers")
fatjet_pnetmass = _leading_pair("particleNet_mass")

# Kinematics and substructure
fatjet_pt = _leading_pair("pt")
fatjet_eta = _leading_pair("eta")
fatjet_phi = _leading_pair("phi")
fatjet_msd = _leading_pair("msoftdrop")
fatjet_tau2 = _leading_pair("tau2")
fatjet_tau3 = _leading_pair("tau3")

# Event selection on the two Xbb-ranked fatjets.
# NOTE(review): jet 0 is cut on soft-drop mass while jet 1 is cut on the
# ParticleNet mass, and only jet 0 has an Xbb cut -- confirm this is intended.
mask = (
    (fatjet_pt[:, 0] > 300)
    & (fatjet_pt[:, 1] > 300)
    & (fatjet_msd[:, 0] > 40)
    & (fatjet_pnetmass[:, 1] > 50)
    & (fatjet_xbb[:, 0] > 0.8)
    & (np.abs(fatjet_eta[:, 0]) < 2.4)
    & (np.abs(fatjet_eta[:, 1]) < 2.4)
)

In [None]:
def _fatjet_p4(idx):
    """Build the four-vectors of the idx-th Xbb-ranked fatjet, using the soft-drop mass."""
    return vector.array(
        {
            "pt": fatjet_pt[:, idx],
            "phi": fatjet_phi[:, idx],
            "eta": fatjet_eta[:, idx],
            "M": fatjet_msd[:, idx],
        }
    )


# Higgs candidates (the two top-Xbb fatjets) and the di-Higgs system.
h1 = _fatjet_p4(0)
h2 = _fatjet_p4(1)
hh = h1 + h2

In [None]:
def _finite_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` with non-finite results set to NaN.

    Events with fewer than two fatjets are zero-padded upstream, so the raw
    ratios hit 0/0 or x/0. NaN is treated as a missing value by XGBoost,
    whereas +/-inf is rejected, so every non-finite value is mapped to NaN.
    The divide/invalid RuntimeWarnings are silenced because the case is handled.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = numerator / denominator
    return np.where(np.isfinite(ratio), ratio, np.nan)


# Per-event BDT input features; one row per event (no selection applied here).
df_events = pd.DataFrame(
    {
        # dihiggs system
        "HHPt": hh.pt,
        "HHeta": hh.eta,
        "HHmass": hh.mass,
        # met in the event
        "MET": events["PuppiMET"]["pt"].to_numpy(),
        # fatjet tau32
        "H1T32": _finite_ratio(fatjet_tau3[:, 0], fatjet_tau2[:, 0]),
        "H2T32": _finite_ratio(fatjet_tau3[:, 1], fatjet_tau2[:, 1]),
        # fatjet mass
        "H1Mass": fatjet_msd[:, 0],
        # fatjet kinematics
        "H1Pt": fatjet_pt[:, 0],
        "H1eta": fatjet_eta[:, 0],
        # xbb
        "H1Xbb": fatjet_xbb[:, 0],
        "H1QCDb": fatjet_qcdb[:, 0],
        "H1QCDbb": fatjet_qcdbb[:, 0],
        "H1QCDothers": fatjet_qcdothers[:, 0],
        "H2Pt": fatjet_pt[:, 1],
        # ratios
        "H1Pt_HHmass": _finite_ratio(fatjet_pt[:, 0], hh.mass),
        "H2Pt_HHmass": _finite_ratio(fatjet_pt[:, 1], hh.mass),
        "H2Pt/H1Pt": _finite_ratio(fatjet_pt[:, 1], fatjet_pt[:, 0]),
    }
)


In [None]:
# Show the column names of the assembled per-event DataFrame.
df_events.columns

In [None]:
# Score every event with the BDT (probability of class 1).
# Any existing "bdt_score" column is dropped first so that re-running this cell
# does not feed the previous score back into the model as an extra feature.
bdt_features = df_events.drop(columns="bdt_score", errors="ignore")
df_events["bdt_score"] = bdt_model.predict_proba(bdt_features)[:, 1]

In [None]:
# Histogram every column of df_events for the events passing `mask`,
# one panel per column, six panels per row, log-scaled shared y axis.
# The row count follows the number of columns (3 rows for 18 columns), so an
# extra column adds a row instead of raising an IndexError. No separate
# plt.figure() call is made: plt.subplots already creates the figure, and an
# extra call would only leave a stray empty figure behind.
panels_per_row = 6
n_panels = len(df_events.columns)
n_rows = -(-n_panels // panels_per_row)  # ceiling division
fig, axs = plt.subplots(
    n_rows,
    panels_per_row,
    figsize=(40, 20 * n_rows / 3),
    sharey=True,
    squeeze=False,  # always a 2D array of axes, even for a single row
)
for ax, col in zip(axs.flat, df_events.columns):
    ax.hist(df_events[col][mask], bins=50, histtype="step")
    ax.set_xlabel(col)
    ax.set_yscale("log")
# Label the y axis only on the first panel of each row.
for ax in axs[:, 0]:
    ax.set_ylabel("Events")
# Hide any unused panels in the last row.
for ax in axs.flat[n_panels:]:
    ax.set_visible(False)
plt.show()