In [None]:
import numpy as np
from HH4b import utils
from HH4b import postprocessing
import xgboost as xgb
import importlib
import hist
import os

import mplhep as hep
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from HH4b.postprocessing.PostProcess import add_bdt_scores
import HH4b

# Scalar tick formatter rendered with math text, using power limits (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Notebook-wide matplotlib defaults, applied in a single update.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell is executed, so edits to the imported project files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory of the installed HH4b package; used below to locate the trained BDT files.
package_path = os.path.dirname(HH4b.__file__)


def get_dataframe(events_dict, year, bdt_model_name, bdt_config, legacy_label=None):
    """Run BDT inference on every sample and return the preselected events.

    The XGBoost classifier stored under
    ``boosted/bdt_trainings_run3/<bdt_model_name>/trained_bdt.model`` inside the
    HH4b package is loaded once, and the module ``<bdt_config>`` from
    ``HH4b.boosted.bdt_trainings_run3`` is imported to build the BDT input
    frame (via its ``bdt_dataframe`` function) for each sample.

    Args:
        events_dict: mapping of sample key -> events table. Each table is
            indexed as ``events[branch][i]`` (presumably a DataFrame with
            per-jet sub-columns 0 and 1 -- TODO confirm against the loader).
        year: key into ``postprocessing.HLTs`` selecting the trigger list.
        bdt_model_name: name of the training directory holding the model file.
        bdt_config: name of the module that provides ``bdt_dataframe``.
        legacy_label: suffix appended to ``bbFatJetPNetTXbb`` to pick the TXbb
            branch. When ``None`` (default) the notebook-level ``legacy_label``
            global is used if it exists, otherwise ``"Legacy"``.

    Returns:
        dict mapping each sample key to a DataFrame restricted to the columns
        ``bdt_score, H1TXbb, H2TXbb, H1PNetMass, H2PNetMass, weight`` for the
        events passing the trigger, preselection and H2 mass-window masks.
    """
    if legacy_label is None:
        # Backward compatibility: this function used to read the notebook
        # global directly, which failed with NameError when the cell defining
        # it had not been run yet. Fall back to "Legacy" in that case.
        legacy_label = globals().get("legacy_label", "Legacy")
    txbb_branch = f"bbFatJetPNetTXbb{legacy_label}"

    bdt_model = xgb.XGBClassifier()
    bdt_model.load_model(
        fname=f"{package_path}/boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model"
    )
    make_bdt_dataframe = importlib.import_module(
        f".{bdt_config}", package="HH4b.boosted.bdt_trainings_run3"
    )

    bdt_events_dict = {}
    for key, events in events_dict.items():
        bdt_events = make_bdt_dataframe.bdt_dataframe(events)

        # inference; add_bdt_scores is expected to attach the score column(s)
        # (including "bdt_score", selected below) to bdt_events in place
        preds = bdt_model.predict_proba(bdt_events)
        add_bdt_scores(bdt_events, preds)

        # extra variables (index 0 / 1 -> H1 / H2 candidate jets)
        bdt_events["H1PNetMass"] = events["bbFatJetPNetMassLegacy"][0]
        bdt_events["H2PNetMass"] = events["bbFatJetPNetMassLegacy"][1]
        bdt_events["H1Msd"] = events["bbFatJetMsd"][0]
        bdt_events["H1TXbb"] = events[txbb_branch][0]
        bdt_events["H2TXbb"] = events[txbb_branch][1]
        bdt_events["weight"] = events["finalWeight"].to_numpy()

        # logical OR of all triggers configured for this year that are present
        # in the events table
        bdt_events["hlt"] = np.any(
            np.array(
                [events[trigger][0] for trigger in postprocessing.HLTs[year] if trigger in events]
            ),
            axis=0,
        )
        mask_hlt = bdt_events["hlt"] == 1

        # masks
        mask_presel = (
            (bdt_events["H1Msd"] > 40)
            & (bdt_events["H1Pt"] > 300)
            & (bdt_events["H2Pt"] > 250)
            & (bdt_events["H1TXbb"] > 0.8)
        )
        mask_mass = (bdt_events["H2PNetMass"] > 50) & (bdt_events["H2PNetMass"] < 250)
        bdt_events = bdt_events[(mask_mass) & (mask_hlt) & (mask_presel)]

        columns = ["bdt_score", "H1TXbb", "H2TXbb", "H1PNetMass", "H2PNetMass", "weight"]
        bdt_events_dict[key] = bdt_events[columns]
    return bdt_events_dict

In [None]:
# Location of the skimmer output to read.
data_dir = "24May24_v12_private_signal"
input_dir = f"/ceph/cms/store/user/cmantill/bbbb/skimmer/{data_dir}"

# Only the HH->4b signal sample is loaded; every data-taking period uses the
# same sample-name pattern.
samples_run3 = {
    era: {"hh4b": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV?"]}
    for era in ("2022EE", "2022", "2023", "2023BPix")
}

# BDT training to evaluate and the branch suffix selecting the tagger variant.
legacy_label = "Legacy"
mass_var = "H2PNetMass"
bdt_model_name = "24May31_lr_0p02_md_8_AK4Away"
bdt_config = "24May31_lr_0p02_md_8_AK4Away"

# Load each period and run the BDT on it: year -> {sample key -> DataFrame}.
bdt_events_dict_year = {}
for year in samples_run3:
    events = postprocessing.load_run3_samples(
        input_dir=input_dir,
        year=year,
        legacy=True,
        samples_run3=samples_run3,
        reorder_txbb=True,
        txbb=f"bbFatJetPNetTXbb{legacy_label}",
    )
    bdt_events_dict_year[year] = get_dataframe(events, year, bdt_model_name, bdt_config)

In [None]:
# Merge the per-year frames into one frame per process.
# NOTE(review): "ttbar" is passed as a background key although no ttbar sample
# is loaded in this notebook -- confirm combine_run3_samples tolerates a
# background key that is absent from the inputs.
events_combined, scaled_by = postprocessing.combine_run3_samples(
    bdt_events_dict_year,
    ["hh4b"],
    bg_keys=["ttbar"],
    scale_processes={},
    years_run3=bdt_events_dict_year.keys(),
)

In [None]:
# Display names for the processes, used as plot titles.
labels = {
    "hh4b": "HH (4b)",
}

# Bin edges are built with np.linspace: with a fractional step np.arange does
# not reliably include or exclude the end point, so the upper edge of the last
# bin would depend on floating-point rounding.
bdt_axis = hist.axis.Variable(list(np.linspace(0.99, 1, 101)), name="BDT score")
txbb1_axis = hist.axis.Variable(list(np.linspace(0.99, 1, 101)), name=r"Jet 1 $T_{Xbb}$")
txbb2_axis = hist.axis.Variable(list(np.linspace(0, 1, 10001)), name=r"Jet 2 $T_{Xbb}$")


for key, events in events_combined.items():

    h_xbb1_bdt = hist.Hist(txbb1_axis, bdt_axis, storage=hist.storage.Weight())

    # denominator: all events in the frame (already preselected upstream)
    h_xbb1 = hist.Hist(txbb1_axis, storage=hist.storage.Weight())
    # numerator: events that additionally pass the Category 1 cuts below
    h_xbb1_bin1 = hist.Hist(txbb1_axis, storage=hist.storage.Weight())

    # All fills are unweighted (events["weight"] is not applied); presumably
    # so the "efficiency" uncertainty below sees raw counts -- TODO confirm.
    h_xbb1_bdt.fill(
        events["H1TXbb"],
        events["bdt_score"],
    )
    h_xbb1.fill(events["H1TXbb"])

    # Category 1 selection: tight jet-2 TXbb and high BDT score
    mask_t2xbb = events["H2TXbb"] > 0.975
    mask_bin1 = mask_t2xbb & (events["bdt_score"] > 0.98)
    h_xbb1_bin1.fill(events["H1TXbb"][mask_bin1])

    # 2D map of jet-1 TXbb versus BDT score
    fig, ax = plt.subplots(1, 1, figsize=(6, 5))
    hep.hist2dplot(h_xbb1_bdt, ax=ax)
    ax.set_title(labels.get(key, key))

    # Selection efficiency versus jet-1 TXbb. plot_ratio draws on the current
    # figure; the histogram it is called on is the numerator and the argument
    # is the denominator, so the labels must follow that order.
    fig = plt.figure(figsize=(10, 8))
    main_ax_artists, sublot_ax_arists = h_xbb1_bin1.plot_ratio(
        h_xbb1,
        rp_ylabel=r"Efficiency",
        rp_num_label="ggF Category 1",
        rp_denom_label="Preselection",
        rp_uncert_draw_type="line",  # line or bar
        rp_uncertainty_type="efficiency",
    )

In [None]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.stats` on every SciPy version.
import scipy.stats

# Pick the signal sample explicitly instead of relying on whatever `events`
# was left bound by the plotting loop in an earlier cell.
signal_events = events_combined["hh4b"]

# Linear (Pearson) correlation between the jet-1 TXbb score and the BDT score.
result = scipy.stats.pearsonr(signal_events["H1TXbb"], signal_events["bdt_score"])

In [None]:
# Display the Pearson correlation result computed in the previous cell.
result