In [None]:
import pandas as pd
import numpy as np
import vector
import os
from xgboost import XGBClassifier

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
import HH4b.postprocessing as postprocessing
from HH4b.postprocessing import Region

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Shared tick formatter: exponents are rendered with mathtext, and
# set_powerlimits((-3, 3)) sets the exponent thresholds at which matplotlib
# switches tick labels to scientific notation.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
# Automatically reload imported modules when their source files are edited,
# so changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from HH4b.hh_vars import samples_run3

In [None]:
# Display the sample definitions imported from HH4b.hh_vars (indexed by year
# when the samples are loaded).
samples_run3

In [None]:
# ---------------------------------------------------------------------------
# Input location
# ---------------------------------------------------------------------------
MAIN_DIR = "../../../"
# Other skims that have been used with this notebook:
# dir_name = "24Apr19LegacyFixes_v12_private_signal"
# dir_name = "24Mar31_v12_private_signal"
# dir_name = "24Mar31_v12_signal"
dir_name = "24Mar2_v12_signal"
path_to_dir = f"{MAIN_DIR}/../data/skimmer/{dir_name}/"

# Maps each input directory to the sample definitions read from it.
dirs = {path_to_dir: samples_run3}

# ---------------------------------------------------------------------------
# Columns to read, as (name, count) pairs passed through utils.format_columns.
# NOTE(review): the count is presumably the number of sub-columns per event
# (the 2-column entries are indexed with [:, 0] / [:, 1] later) -- confirm.
# ---------------------------------------------------------------------------
load_columns = [
    # event-level quantities
    ("weight", 1),
    ("MET_pt", 1),
    ("nFatJets", 1),
    # AK8 fat jets
    ("ak8FatJetPt", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetMsd", 2),
    # bb-candidate fat jets
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMass", 2),
    # ("bbFatJetPNetTXbb", 2),
    ("bbFatJetPNetXbb", 2),
    ("bbFatJetTau3OverTau2", 2),
    ("bbFatJetPNetQCD0HF", 2),
    ("bbFatJetPNetQCD1HF", 2),
    ("bbFatJetPNetQCD2HF", 2),
    # Legacy tagger columns (disabled)
    # ("bbFatJetPNetXbbLegacy", 2),
    # ("bbFatJetPNetTXbbLegacy", 2),
    # ("bbFatJetPNetPXbbLegacy", 2),
    # ("bbFatJetPNetPQCDbLegacy", 2),
    # ("bbFatJetPNetPQCDbbLegacy", 2),
    # ("bbFatJetPNetPQCDothersLegacy", 2),
    # ("bbFatJetPNetMassLegacy", 2),
    # VBF jet columns (disabled)
    # ("VBFJetPt", 2),
    # ("VBFJetEta", 2),
    # ("VBFJetPhi", 2),
    # ("VBFJetMass", 2),
]

# ---------------------------------------------------------------------------
# Load every sample for every year: events_dict[year][sample_key] -> events
# ---------------------------------------------------------------------------
years = ["2022", "2022EE"]  # , "2023", "2023BPix"]
events_dict = {}
for year in years:
    events_dict[year] = year_events = {}
    for input_dir, samples in dirs.items():
        # Samples from later directories override same-named earlier ones.
        year_events.update(
            utils.load_samples(
                input_dir,
                samples[year],
                year,
                variations=False,
                reorder_legacy_txbb=False,
                columns=utils.format_columns(load_columns),
            )
        )

In [None]:
# Sanity check: the years for which samples were loaded.
events_dict.keys()

In [None]:
import importlib
import sys

# Make the BDT training configs importable as modules.  Guarded so that
# re-running the cell does not keep appending the same entry to sys.path.
bdt_trainings_dir = "../boosted/bdt_trainings_run3/"
if bdt_trainings_dir not in sys.path:
    sys.path.append(bdt_trainings_dir)

# Module that provides bdt_dataframe(), which builds the BDT input features.
model_name = "v1_msd30"
make_bdt_dataframe = importlib.import_module(f"{model_name}")

# Directory that holds the trained model.  The only assignment of
# `config_name` used to be commented out, so this cell raised a NameError on
# a fresh kernel.
# NOTE(review): assumes the training directory is named after the dataframe
# module.  The previously commented-out value was "24Apr21_legacy_vbf_vars",
# which presumably needs the Legacy/VBF columns that are currently not
# loaded -- confirm which model is intended.
config_name = model_name
bdt_model = XGBClassifier()
bdt_model.load_model(fname=f"../boosted/bdt_trainings_run3/{config_name}/trained_bdt.model")

# events_dict_bdt[year][sample_key] -> DataFrame of BDT features plus the
# extra columns used by the plotting / scanning / template cells below.
events_dict_bdt = {}
for year in years:
    events_dict_bdt[year] = {}
    for key, events in events_dict[year].items():
        # implement cuts from BDT training: 30 < mSD < 250 for both candidates
        msd = events["bbFatJetMsd"].to_numpy()
        h1msd, h2msd = msd[:, 0], msd[:, 1]
        presel = (h1msd > 30) & (h2msd > 30) & (h1msd < 250) & (h2msd < 250)
        events = events[presel]

        # perform inference; this must happen before any extra column is
        # attached so that the model only sees the feature columns
        df_events = make_bdt_dataframe.bdt_dataframe(events)
        bdt_score = bdt_model.predict_proba(df_events)[:, 1]

        # add variables (bdt_score and H2PNetMass are read by later cells)
        xbb = events["bbFatJetPNetXbb"].to_numpy()
        df_events["bdt_score"] = bdt_score
        df_events["finalWeight"] = events["finalWeight"].to_numpy()
        df_events["H1Msd"] = h1msd[presel]
        df_events["H2Msd"] = h2msd[presel]
        df_events["H2Xbb"] = xbb[:, 1]
        # df_events["H2Xbb"] = events["bbFatJetPNetTXbb"].to_numpy()[:, 1]
        df_events["H1Xbb"] = xbb[:, 0]
        df_events["H2PNetMass"] = events["bbFatJetPNetMass"].to_numpy()[:, 1]
        # Legacy-tagger variants (need the Legacy columns to be loaded):
        # df_events["H2PNetMassLegacy"] = events["bbFatJetPNetMassLegacy"].to_numpy()[:, 1]
        # df_events["H2XbbLegacy"] = events["bbFatJetPNetXbbLegacy"].to_numpy()[:, 1]
        # df_events["H2XbbLegacy"] = events["bbFatJetPNetTXbbLegacy"].to_numpy()[:, 1]
        # df_events["H1XbbLegacy"] = events["bbFatJetPNetXbbLegacy"].to_numpy()[:, 0]

        events_dict_bdt[year][key] = df_events

In [None]:
# Sanity check: sample keys available for 2022EE after the BDT preselection.
events_dict_bdt["2022EE"].keys()

Make control plots to check that the overall data/MC normalization is reasonable

In [None]:
# Variables shown in the control plots; each entry gives the column name, the
# axis label and the binning.
control_plot_vars = [
    ShapeVar(var="H1Msd", label=r"$m_{SD}^{1}$ (GeV)", bins=[30, 0, 300]),
    ShapeVar(var="H2Msd", label=r"$m_{SD}^{2}$ (GeV)", bins=[30, 0, 300]),
    ShapeVar(var="H1Xbb", label=r"Xbb$^{1}$", bins=[30, 0, 1]),
    ShapeVar(var="H2Xbb", label=r"Xbb$^{2}$", bins=[30, 0, 1]),
    # Disabled variables:
    # ShapeVar(var="H2PNetMass", label=r"$m_{reg}^{2}$ (GeV)", bins=[30, 0, 300]),
    # ShapeVar(var="H2XbbLegacy", label=r"Xbb$^{2}$ Legacy", bins=[30, 0, 1]),
    # ShapeVar(var="H1XbbLegacy", label=r"Xbb$^{2}$ Legacy", bins=[30, 0, 1]),
    # ShapeVar(var="H2PNetMassLegacy", label=r"$m_{reg}^{2}$ Legacy (GeV)", bins=[30, 0, 300]),
    ShapeVar(var="bdt_score", label=r"BDT score", bins=[30, 0, 1]),
]

# Value passed as `ylim` to the plotting routine, per year.
ylims = {"2022": 5e4, "2022EE": 4e5, "2023": 4e5}

# events_plot[year][sample_key]: the tables that get histogrammed.  They are
# currently passed through unchanged.
events_plot = {}
for year in years:
    per_sample = {}
    for key, sample_events in events_dict_bdt[year].items():
        # optional selection, currently disabled:
        # sample_events = sample_events[(sample_events["H2PNetMass"] > 60) & (sample_events["H2PNetMass"] < 250)]
        per_sample[key] = sample_events
    events_plot[year] = per_sample


for year in ["2022EE"]:
    # One histogram per variable, filled at most once per year.
    hists = {}
    for shape_var in control_plot_vars:
        print(shape_var)
        var_name = shape_var.var
        if var_name not in hists:
            hists[var_name] = utils.singleVarHist(
                events_plot[year],
                shape_var,
                weight_key="finalWeight",
            )

        bkgs = ["ttbar", "vhtobb", "vjets", "diboson", "novhhtobb", "qcd"]

        # No signal is overlaid for 2022 and for the 2023 periods.
        sigs = [] if ("2023" in year or year == "2022") else ["hh4b"]

        plotting.ratioHistPlot(
            hists[var_name],
            year,
            sigs,
            bkgs,
            name="test",
            show=True,
            log=True,
            plot_significance=False,
            significance_dir=shape_var.significance_dir,
            ratio_ylims=[0.2, 1.8],
            ylim=ylims[year],
        )

Look into BDT sculpting of the second Higgs candidate mass in the `qcd` and `data` samples

In [None]:
# Histogram of the second Higgs candidate mass with two growing category
# axes: the BDT threshold applied ("Cut") and the sample name ("Sample").
h2_mass_axis = hist.axis.Regular(40, 0, 300, name="mass", label=r"Higgs 2 mass [GeV]")
cut_axis = hist.axis.StrCategory([], name="Cut", growth=True)
cat_axis = hist.axis.StrCategory([], name="Sample", growth=True)
hist_h2 = hist.Hist(h2_mass_axis, cut_axis, cat_axis)

bdt_cuts = [0, 0.2, 0.3, 0.5, 0.7, 0.9]

# Fill one slice per (sample, BDT threshold).
# NOTE(review): the fills are unweighted, also for the simulated samples --
# confirm that this is intended for the shape comparison.
for key in ["hh4b", "qcd", "data"]:
    events = events_dict_bdt["2022EE"][key]
    # h2_mass = events["H2Msd"]
    h2_mass = events["H2PNetMass"]
    in_window = (h2_mass > 30) & (h2_mass < 250)

    for cut in bdt_cuts:
        mask = (events["bdt_score"] >= cut) & in_window
        hist_h2.fill(h2_mass[mask], str(cut), key)

# One figure per sample, overlaying the unit-normalised mass shape for every
# BDT threshold.
for key in ["qcd", "data"]:
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    for cut in bdt_cuts:
        hep.histplot(
            hist_h2[{"Sample": key, "Cut": str(cut)}],
            lw=2,
            label=f"BDT > {cut}",
            density=True,
            ax=ax,
        )
    ax.legend()
    ax.set(ylabel="Density", title=key)

Create sideband in data
- Estimate number of events that pass BDT & H2Xbb selection

In [None]:
# create combined datasets

# Integrated luminosity per data-taking period.
# NOTE(review): the numbers are the terms of the original hard-coded sum;
# only the 2022EE value is identified by the code (it was the denominator).
# The other year labels are inferred from the order in which the data frames
# were concatenated -- confirm.
lumi_by_year = {
    "2022": 7971.4,
    "2022EE": 26337.0,
    "2023": 17650.0,
    "2023BPix": 9451.0,
}

# Simulation is taken from 2022EE only and scaled up to the total luminosity
# of the periods whose data are combined.  The periods follow `years`, so the
# cell no longer fails with a KeyError when 2023 / 2023BPix are not loaded;
# with all four periods loaded the scale equals the previous hard-coded one.
mc_year = "2022EE"
lumi_weight_2022EEtoall = sum(lumi_by_year[year] for year in years) / lumi_by_year[mc_year]

events_dict_bdt_combined_2023 = {}
for key in ["data", "hh4b", "qcd", "ttbar", "vhtobb", "vjets", "diboson", "novhhtobb"]:
    if key == "data":
        combined = pd.concat([events_dict_bdt[year][key] for year in years])
        # Data get no luminosity scaling; mirror finalWeight into "weight" so
        # that weight_key="weight" can be used for every sample downstream.
        combined["weight"] = combined["finalWeight"]
    else:
        combined = events_dict_bdt[mc_year][key].copy()
        combined["weight"] = combined["finalWeight"] * lumi_weight_2022EEtoall
    events_dict_bdt_combined_2023[key] = combined

In [None]:
def get_nevents_data(events, xbb_cut, bdt_cut, hvar):
    """Count events in the two mass sidebands that pass the Xbb and BDT cuts.

    Parameters
    ----------
    events
        Table with the columns "H2Xbb", "bdt_score" and ``hvar``.
    xbb_cut, bdt_cut
        Exclusive lower thresholds on "H2Xbb" and "bdt_score".
    hvar
        Name of the mass column.  "H2Msd" uses the sidebands (75, 95) and
        (135, 155); any other name uses (80, 100) and (140, 160).

    Returns
    -------
    Number of events (unweighted) with 50 < mass < 220 that fall in either
    sideband and pass both thresholds.  All bounds are exclusive.
    """
    mass = events[hvar]
    if hvar == "H2Msd":
        sidebands = ((75, 95), (135, 155))
    else:
        sidebands = ((80, 100), (140, 160))

    # Requirements shared by both sidebands.
    passes = (
        (events["H2Xbb"] > xbb_cut)
        & (events["bdt_score"] > bdt_cut)
        & (mass > 50)
        & (mass < 220)
    )

    n_low, n_high = (np.sum((mass < hi) & (mass > lo) & passes) for lo, hi in sidebands)
    return n_low + n_high


def get_nevents_signal(events, xbb_cut, bdt_cut, hvar):
    """Sum the "weight" column over events in the signal mass window.

    Parameters
    ----------
    events
        Table with the columns "H2Xbb", "bdt_score", "weight" and ``hvar``.
    xbb_cut, bdt_cut
        Exclusive lower thresholds on "H2Xbb" and "bdt_score".
    hvar
        Name of the mass column.  "H2Msd" uses the window (95, 135); any
        other name uses (105, 140).  Bounds are exclusive.

    Returns
    -------
    Weighted yield of the events that pass both thresholds and lie inside
    the mass window.
    """
    lo, hi = (95, 135) if hvar == "H2Msd" else (105, 140)
    mass = events[hvar]

    selected = (
        (events["H2Xbb"] > xbb_cut)
        & (events["bdt_score"] > bdt_cut)
        & (mass > lo)
        & (mass < hi)
    )

    # The yield uses the "weight" column, i.e. whatever normalisation the
    # caller stored there.
    return np.sum(events["weight"][selected])


# Per-sample tables to scan.  The name used here before
# (events_dict_bdt_combined) is not defined anywhere in this notebook.
events = events_dict_bdt_combined_2023

# Mass variable that defines the sidebands and the signal window.
# hvar = "H2Msd"
hvar = "H2PNetMass"

# For every Xbb working point, scan the BDT threshold and report the one that
# minimises 2 * sqrt(N_sideband_data) / N_signal.
for xbb_cut in [0.8, 0.9, 0.94]:
    figure_of_merits = []
    cuts = []
    for bdt_cut in np.arange(0.01, 1, 0.01):
        nevents_data = get_nevents_data(events["data"], xbb_cut, bdt_cut, hvar=hvar)
        nevents_signal = get_nevents_signal(events["hh4b"], xbb_cut, bdt_cut, hvar=hvar)

        # Skip working points with a negligible signal yield.  Checking this
        # first also avoids dividing by zero when no signal survives.
        if not nevents_signal > 0.5:
            continue

        figure_of_merit = 2 * np.sqrt(nevents_data) / nevents_signal
        cuts.append(bdt_cut)
        figure_of_merits.append(figure_of_merit)
        print(
            f"Xbb_Cut: {xbb_cut}, BDT_Cut: {bdt_cut:.2f}, NBkg: {nevents_data}, NSig: {nevents_signal:.2f}, FigureOfMerit: {figure_of_merit:.2f}"
        )

    if len(cuts) > 0:
        cuts = np.array(cuts)
        figure_of_merits = np.array(figure_of_merits)
        smallest = np.argmin(figure_of_merits)

        # best working point for this Xbb cut: (xbb cut, BDT cut, figure of merit)
        print(xbb_cut, cuts[smallest], figure_of_merits[smallest])

In [None]:
# Working point used for the BDT-based templates.  Alternatives that have
# been tried:
# txbb_cut = 0.94
# bdt_cut = 0.96

txbb_cut = 0.9
# bdt_cut = 0.93
bdt_cut = 0.94

# txbb_cut = 0.94
# bdt_cut = 0.978

# Pass / fail regions, each given as {column: [low, high]} ranges.
selection_regions = {
    "pass": Region(
        cuts={
            "H2Xbb": [txbb_cut, 1],
            "bdt_score": [bdt_cut, 1],
        },
        label="Pass",
    ),
    "fail": Region(
        cuts={
            "H2Xbb": [0, txbb_cut],
            "bdt_score": [0.03, 1],
        },
        label="Fail",
    ),
}

# Variable the templates are binned in.
# NOTE(review): the list is presumably [number of bins, min, max] -- confirm.
fit_shape_var = ShapeVar(
    "H2PNetMass",
    r"$m^{2}_\mathrm{reg}$ (GeV)",
    # "H2Msd",
    # r"$m^{2}_\mathrm{SD}$ (GeV)",
    [16, 60, 220],
    reg=True,
    blind_window=[110, 140],
)

templ_dir = "./templates/15Mar24/"
year = "2022-2023"
# Create the output directories from Python instead of shelling out to
# `mkdir -p`: no shell is involved and a failure raises instead of being
# silently ignored.
os.makedirs(f"{templ_dir}/{year}", exist_ok=True)
os.makedirs(f"{templ_dir}/cutflows/{year}", exist_ok=True)

templates = postprocessing.get_templates(
    events_dict_bdt_combined_2023,
    bb_masks=None,
    year=year,
    sig_keys=["hh4b"],
    selection_regions=selection_regions,
    shape_vars=[fit_shape_var],
    systematics={},
    template_dir=f"{templ_dir}",
    bg_keys=["qcd", "ttbar", "vhtobb", "vjets", "diboson", "novhhtobb"],
    plot_dir=f"{templ_dir}/{year}",
    weight_key="weight",
    show=True,
    energy=13.6,
)

In [None]:
# Cut-based selection: no BDT requirement, both candidates are required to
# pass (or both to fail) the same Xbb threshold.
txbb_cut = 0.94

# Pass / fail regions, each given as {column: [low, high]} ranges.
selection_regions = {
    "pass": Region(
        cuts={
            "H1Xbb": [txbb_cut, 1],
            "H2Xbb": [txbb_cut, 1],
            # "H1Msd": [100, 150],
        },
        label="Pass",
    ),
    "fail": Region(
        cuts={
            "H1Xbb": [0, txbb_cut],
            "H2Xbb": [0, txbb_cut],
            # "H1Msd": [100, 150],
        },
        label="Fail",
    ),
}

# Variable the templates are binned in.
# NOTE(review): the list is presumably [number of bins, min, max] -- confirm.
fit_shape_var = ShapeVar(
    "H2Msd",
    r"$m^{2}_\mathrm{SD}$ (GeV)",
    [17, 50, 220],
    reg=True,
    blind_window=[110, 140],
)

templ_dir = "./templates/15Mar24-cutbased/"
year = "2022-2023"
# Create the output directories from Python instead of shelling out to
# `mkdir -p`: no shell is involved and a failure raises instead of being
# silently ignored.
os.makedirs(f"{templ_dir}/{year}", exist_ok=True)
os.makedirs(f"{templ_dir}/cutflows/{year}", exist_ok=True)

templates = postprocessing.get_templates(
    events_dict_bdt_combined_2023,
    bb_masks=None,
    year=year,
    sig_keys=["hh4b"],
    selection_regions=selection_regions,
    shape_vars=[fit_shape_var],
    systematics={},
    template_dir=f"{templ_dir}",
    bg_keys=["qcd", "ttbar", "vhtobb", "vjets", "diboson", "novhhtobb"],
    plot_dir=f"{templ_dir}/{year}",
    weight_key="weight",
    show=True,
    energy=13.6,
)