In [None]:
import xgboost as xgb
from coffea import nanoevents
from coffea.nanoevents.methods.base import NanoEventsArray
import awkward as ak
import vector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplhep as hep
import uproot

plt.style.use(hep.style.CMS)


def to_np_array(ak_array, max_n=2, pad=0):
    """Convert a jagged awkward array into a rectangular NumPy array.

    The innermost axis is padded with ``None`` / clipped to exactly ``max_n``
    entries, the ``None`` placeholders are replaced by ``pad``, and the
    result is converted to NumPy.
    """
    fixed_length = ak.pad_none(ak_array, max_n, clip=True, axis=-1)
    without_missing = ak.fill_none(fixed_length, pad)
    return without_missing.to_numpy()


# Input ROOT file to evaluate the BDT on. Exactly one `file_name` assignment is
# active at a time; the commented-out lines are alternative inputs that can be
# swapped in by hand.
file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_preVFP/picoAOD.root"
# file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_postVFP/picoAOD.root"
# file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk0.root"
# file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk1.root"
# file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk0.root"
# file_name = "/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk1.root"
# Path to the serialized XGBoost model. It is a relative path, so it is resolved
# against the kernel's current working directory.
model_fname = "src/HH4b/boosted/bdt_trainings_run2/model_xgboost_training_weights_qcd_and_ttbar_Run2_bdt_enhanced_v8p2/trained_bdt.model"
# Create an empty classifier and populate it from the saved model file.
bdt_model = xgb.XGBClassifier()
bdt_model.load_model(fname=model_fname)

In [None]:
# Read the input file with the NanoAOD schema and materialize the event record.
events_factory = nanoevents.NanoEventsFactory.from_root(
    file_name, schemaclass=nanoevents.NanoAODSchema
)
events = events_factory.events()

In [None]:
# Order the fat jets of every event by decreasing Xbb / (Xbb + QCD).
fatjets = events["FatJet"]
bbtag_discriminant = fatjets["particleNetMD_Xbb"] / (
    fatjets["particleNetMD_Xbb"] + fatjets["particleNetMD_QCD"]
)
sorted_by_bbtag = ak.argsort(bbtag_discriminant, ascending=False, axis=-1)
fatjets_sorted = fatjets[sorted_by_bbtag]

In [None]:
def _leading_two(branch):
    """Return `branch` of the two first b-tag-sorted fat jets, zero-padded, as NumPy."""
    return to_np_array(fatjets_sorted[branch], max_n=2, pad=0)


# Tagger outputs. Padded (missing) jets have Xbb = QCD = 0, so the ratio below
# is 0/0 = NaN for them; NaN compares False in every cut further down.
fatjet_xbb = _leading_two("particleNetMD_Xbb")
fatjet_qcd = _leading_two("particleNetMD_QCD")
fatjet_txbb = fatjet_xbb / (fatjet_xbb + fatjet_qcd)
fatjet_qcdb = _leading_two("particleNetMD_QCDb")
fatjet_qcdbb = _leading_two("particleNetMD_QCDbb")
fatjet_qcdothers = _leading_two("particleNetMD_QCDothers")
fatjet_pnetmass = _leading_two("particleNet_mass")

# Kinematics and substructure.
fatjet_pt = _leading_two("pt")
fatjet_eta = _leading_two("eta")
fatjet_phi = _leading_two("phi")
fatjet_msd = _leading_two("msoftdrop")
fatjet_tau2 = _leading_two("tau2")
fatjet_tau3 = _leading_two("tau3")

# Preselection: an event passes only if every one of these cuts is satisfied.
mask = np.logical_and.reduce(
    [
        fatjet_pt[:, 0] > 300,
        fatjet_pt[:, 1] > 300,
        fatjet_msd[:, 0] > 40,
        fatjet_pnetmass[:, 1] > 50,
        fatjet_txbb[:, 0] > 0.8,
        np.abs(fatjet_eta[:, 0]) < 2.4,
        np.abs(fatjet_eta[:, 1]) < 2.4,
    ]
)

In [None]:
# Build one four-vector array per selected fat jet (mass taken from msoftdrop)
# and add them to obtain the two-jet system.
h1, h2 = [
    vector.array(
        {
            "pt": fatjet_pt[:, jet],
            "phi": fatjet_phi[:, jet],
            "eta": fatjet_eta[:, jet],
            "M": fatjet_msd[:, jet],
        }
    )
    for jet in (0, 1)
]
hh = h1 + h2

In [None]:
# Assemble the per-event table that is later passed to the BDT.
# Index 0 / 1 refer to the first / second fat jet after sorting by b-tag score.
# NOTE(review): the column names and their order presumably have to match the
# features the model was trained on -- confirm before renaming or reordering.
df_events = pd.DataFrame(
    {
        # dihiggs system
        "HHPt": hh.pt,
        "HHeta": hh.eta,
        "HHmass": hh.mass,
        # met in the event
        "MET": events["PuppiMET"]["pt"].to_numpy(),
        # fatjet tau32 (tau3 / tau2; NaN where tau2 was zero-padded)
        "H1T32": fatjet_tau3[:, 0] / fatjet_tau2[:, 0],
        "H2T32": fatjet_tau3[:, 1] / fatjet_tau2[:, 1],
        # fatjet mass
        "H1Mass": fatjet_msd[:, 0],
        # fatjet kinematics
        "H1Pt": fatjet_pt[:, 0],
        "H1eta": fatjet_eta[:, 0],
        # xbb
        "H1Xbb": fatjet_txbb[:, 0],
        "H1QCDb": fatjet_qcdb[:, 0],
        "H1QCDbb": fatjet_qcdbb[:, 0],
        "H1QCDothers": fatjet_qcdothers[:, 0],
        "H2Pt": fatjet_pt[:, 1],
        # ratios
        "H1Pt_HHmass": fatjet_pt[:, 0] / hh.mass,
        "H2Pt_HHmass": fatjet_pt[:, 1] / hh.mass,
        "H2Pt/H1Pt": fatjet_pt[:, 1] / fatjet_pt[:, 0],
    }
)

In [None]:
# Evaluate the BDT on the input features only. `bdt_score` and `bdt_bin` are
# derived columns added to `df_events` by this notebook; excluding them keeps
# this cell re-runnable (otherwise a second execution would hand the model
# extra, non-feature columns).
bdt_feature_columns = [
    col for col in df_events.columns if col not in ("bdt_score", "bdt_bin")
]
# Column 1 of predict_proba is the probability of the positive class.
df_events["bdt_score"] = bdt_model.predict_proba(df_events[bdt_feature_columns])[:, 1]

In [None]:
# Thresholds on the BDT score and on the second jet's TXbb used to define the
# analysis categories.
bdt_fail = 0.03
bdt_bin1 = 0.43
bdt_bin2 = 0.11
xbb_bin1 = 0.98
xbb_bin2 = 0.95

# Individual threshold decisions, named once and reused below.
xbb_tight = fatjet_txbb[:, 1] > xbb_bin1
xbb_loose = fatjet_txbb[:, 1] > xbb_bin2
bdt_tight = df_events["bdt_score"] > bdt_bin1
bdt_medium = df_events["bdt_score"] > bdt_bin2
bdt_above_fail = df_events["bdt_score"] > bdt_fail

# Bin 1: tight on both TXbb and BDT.
mask_bin1 = mask & xbb_tight & bdt_tight
# Bin 2: tight on one variable and relaxed on the other, excluding bin 1.
mask_bin2 = mask & ~mask_bin1 & ((xbb_tight & bdt_medium) | (xbb_loose & bdt_tight))
# Bin 3: loose TXbb and BDT above the fail threshold, excluding bins 1 and 2.
mask_bin3 = mask & ~mask_bin1 & ~mask_bin2 & xbb_loose & bdt_above_fail
# Fail: everything else that passes preselection with BDT above the fail threshold.
mask_fail = mask & ~mask_bin1 & ~mask_bin2 & ~mask_bin3 & bdt_above_fail

In [None]:
# Label every event with its category: 1/2/3 for the signal bins, 0 for the
# fail region (also the initial value), and -1 for events failing preselection.
df_events["bdt_bin"] = np.zeros(len(df_events))
for region_mask, region_label in (
    (mask_bin1, 1),
    (mask_bin2, 2),
    (mask_bin3, 3),
    (mask_fail, 0),
):
    df_events.loc[region_mask, "bdt_bin"] = region_label

# Events outside the preselection get sentinel values for both bin and score.
fails_preselection = ~mask
df_events.loc[fails_preselection, "bdt_bin"] = -1
df_events.loc[fails_preselection, "bdt_score"] = -1

In [None]:
# Histogram the first 18 columns of `df_events` (3 x 6 grid, shared log-scale
# y axis) for events passing the preselection `mask`.
# plt.subplots already creates the figure; a separate plt.figure() call would
# only leave an extra empty figure behind.
fig, axs = plt.subplots(3, 6, figsize=(40, 20), sharey=True)
# zip stops at the shorter sequence, so at most 18 columns are drawn and any
# unused axes are simply left empty.
for i, (ax, col) in enumerate(zip(axs.flat, df_events.columns)):
    ax.hist(df_events.loc[mask, col], bins=50, histtype="step")
    ax.set_xlabel(col)
    # Only the first panel of each row carries the y-axis label.
    if i % 6 == 0:
        ax.set_ylabel("Events")
    ax.set_yscale("log")
plt.show()

In [None]:
# Copy every branch of the input "Events" tree into a new file next to the
# input (".root" -> ".withBDT.root") and append the BDT score as a branch.
output_file_name = file_name.replace(".root", ".withBDT.root")
with uproot.open(file_name) as f:
    arrays = f["Events"].arrays()
    with uproot.recreate(output_file_name) as f_out:
        output_branches = {}
        for field in arrays.fields:
            output_branches[field] = arrays[field]
        output_branches["bdt_score"] = df_events["bdt_score"].to_numpy()
        f_out["Events"] = output_branches

In [None]:
# Scatter of BDT score against the second jet's TXbb, coloured by category.
# The axis limits zoom into the high-TXbb region; events failing preselection
# have bdt_score = -1 and therefore fall outside the x range.
fig, ax = plt.subplots()
points = ax.scatter(df_events["bdt_score"], fatjet_txbb[:, 1], c=df_events["bdt_bin"])
ax.set_xlim(0, 1)
ax.set_ylim(0.93, 1)
ax.set_xlabel("BDT score")
ax.set_ylabel("H2 TXbb")
# The colour bar makes the category encoding of the points readable.
fig.colorbar(points, ax=ax, label="bdt_bin")
plt.show()