In [None]:
import numpy as np
from HH4b import utils
from HH4b.postprocessing import load_columns_legacy
import xgboost as xgb
import importlib
import hist

import mplhep as hep
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Tick formatter that renders the scientific-notation offset as math text; the
# power limits are set to (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Global matplotlib defaults applied to every figure created after this cell.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)

## Toys for optimization test

Load data events

In [None]:
# Data-taking period to process and the skim directory it is read from.
year = "2022EE"
data_dir = "24Apr23LegacyLowerThresholds_v12_private_signal"
input_dir = "/eos/uscms/store/user/cmantill/bbbb/skimmer/" + data_dir

# Sample-name patterns per period, keyed by the label under which the events are stored.
samples_run3 = {
    "2022EE": dict(
        data=["JetMET_Run"],
        hh4b=["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    ),
}

# Load the legacy column set plus one extra column: the trigger flag that a later
# cell uses for its HLT requirement.
events_dict = utils.load_samples(
    input_dir,
    samples_run3[year],
    year,
    filters=None,
    columns=utils.format_columns(
        [*load_columns_legacy, ("AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35", 1)]
    ),
    reorder_txbb=True,
    txbb="bbFatJetPNetTXbbLegacy",
    variations=False,
)

Get dataframe

In [None]:
# Load the trained XGBoost classifier and the module that builds its input features.
bdt_config = "24Apr20_legacy_fix"
bdt_model_name = "24Apr20_legacy_fix"
bdt_model = xgb.XGBClassifier()
bdt_model.load_model(fname=f"../boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model")
make_bdt_dataframe = importlib.import_module(
    f".{bdt_config}", package="HH4b.boosted.bdt_trainings_run3"
)

# Trigger paths whose logical OR defines the HLT requirement, per data-taking period.
HLTs = {"2022EE": ["AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35"]}

legacy_label = "Legacy"
bdt_events_dict = {}
for key, events in events_dict.items():
    bdt_events = make_bdt_dataframe.bdt_dataframe(events)

    # inference: must run before the extra columns below are attached to the frame.
    # Column 0 of predict_proba is stored as the score — presumably the signal
    # class for this training; TODO confirm.
    preds = bdt_model.predict_proba(bdt_events)
    bdt_events["bdt_score"] = preds[:, 0]

    # Per-jet quantities: index 0 is used for H1 and index 1 for H2.
    txbb = events[f"bbFatJetPNetTXbb{legacy_label}"]
    bdt_events["H2PNetMass"] = events["bbFatJetPNetMassLegacy"][1]
    bdt_events["H1Msd"] = events["bbFatJetMsd"][0]
    bdt_events["H1TXbb"] = txbb[0]
    bdt_events["H2TXbb"] = txbb[1]

    bdt_events["weight"] = events["finalWeight"].to_numpy()

    # An event passes the HLT requirement if any of the listed triggers present
    # in the sample fired.
    fired = [events[trigger][0] for trigger in HLTs[year] if trigger in events]
    bdt_events["hlt"] = np.any(np.array(fired), axis=0)
    mask_hlt = bdt_events["hlt"] == 1

    # Preselection; H1Pt / H2Pt are not set here and are expected to come from
    # bdt_dataframe().
    mask_presel = (
        (bdt_events["H1Msd"] > 30)
        & (bdt_events["H1Pt"] > 300)
        & (bdt_events["H2Pt"] > 300)
        & (bdt_events["H1TXbb"] > 0.8)
    )

    # Keep H2 masses strictly inside (50, 250), the range of the mass axis used later.
    mask_mass = (bdt_events["H2PNetMass"] > 50) & (bdt_events["H2PNetMass"] < 250)
    bdt_events = bdt_events[mask_mass & mask_hlt & mask_presel]

    # Only these columns are needed by the rest of the notebook.
    columns = ["bdt_score", "H2TXbb", "H2PNetMass", "weight"]
    bdt_events_dict[key] = bdt_events[columns]

In [None]:
# Histogram axes shared by the cells below.
mass_axis = hist.axis.Regular(bins=20, start=50, stop=250, name="mass")
bdt_axis = hist.axis.Regular(bins=60, start=0, stop=1, name="bdt")
diff_axis = hist.axis.Regular(bins=100, start=-2, stop=2, name="diff")
# Category axis that grows as new labels are filled.
cut_axis = hist.axis.StrCategory([], growth=True, name="cut")

In [None]:
# Keep only events whose H2TXbb exceeds the threshold, for data and signal alike.
xbb_cut = 0.95
bdt_events_data = bdt_events_dict["data"].loc[lambda frame: frame["H2TXbb"] > xbb_cut]
bdt_events_sig = bdt_events_dict["hh4b"].loc[lambda frame: frame["H2TXbb"] > xbb_cut]

Now look at signal

In [None]:
# Weighted H2PNetMass distribution of the selected signal events.
h_mass_sig = hist.Hist(mass_axis).fill(
    mass=bdt_events_sig["H2PNetMass"],
    weight=bdt_events_sig["weight"],
)
h_mass_sig

In [None]:
# Weighted BDT-score distribution of the selected signal events.
h_bdt_sig = hist.Hist(bdt_axis).fill(
    bdt=bdt_events_sig["bdt_score"],
    weight=bdt_events_sig["weight"],
)
h_bdt_sig

In [None]:
def get_nevents_data(events, cut, mass, mass_window):
    """Count selected events in the two sidebands flanking the mass window.

    Each sideband is half as wide as the mass window and sits directly next to
    it; all sideband edges are exclusive. The count is unweighted.

    Args:
        events: mapping/frame from which ``events[mass]`` gives the mass values.
        cut: boolean selection aligned with ``events``.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the mass window.

    Returns:
        Number of events that pass ``cut`` and fall in either sideband.
    """
    low_edge, high_edge = mass_window[0], mass_window[1]
    half_width = (high_edge - low_edge) / 2
    values = events[mass]

    # sideband just below the window
    below_window = (values > (low_edge - half_width)) & (values < low_edge)
    # sideband just above the window
    above_window = (values > high_edge) & (values < high_edge + half_width)

    in_sidebands = below_window | above_window
    return np.sum(in_sidebands & cut)


def get_nevents_signal(events, cut, mass, mass_window):
    """Sum the weights of selected events inside the mass window.

    Both window edges are inclusive.

    Args:
        events: mapping/frame providing ``events[mass]`` and ``events["weight"]``.
        cut: boolean selection aligned with ``events``.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the mass window.

    Returns:
        Weighted yield of events that pass ``cut`` and lie in the window.
    """
    values = events[mass]
    in_window = (values >= mass_window[0]) & (values <= mass_window[1])

    # get yield in Higgs mass window
    selected_weights = events["weight"][cut & in_window]
    return np.sum(selected_weights)

In [None]:
# BDT thresholds to scan: 100 values, 0.00 to 0.99 in steps of 0.01.
all_bdt_cuts = np.arange(100) * 0.01
all_bdt_cuts

For each BDT cut, find the smallest signal scale factor (k-factor) for which S/√(S+B) exceeds 3

In [None]:
# For every BDT threshold, find the smallest integer signal scale factor
# ("k-factor") in [10, 1000) for which S / sqrt(S + B) exceeds 3, where
#   B = unweighted data yield in the two mass sidebands, and
#   S = weighted signal yield inside the mass window, times the scale factor.
# Thresholds for which no scale factor in that range is sufficient get no entry
# in the two result dicts.
mass_window = [110, 140]
scale_signal_by_bdt_cut = {}
expected_soverb_by_bdt_cut = {}

for bdt_cut in all_bdt_cuts:
    nevents_data = get_nevents_data(
        bdt_events_data,
        (bdt_events_data["bdt_score"] >= bdt_cut),
        "H2PNetMass",
        mass_window,
    )
    nevents_signal = get_nevents_signal(
        bdt_events_sig,
        (bdt_events_sig["bdt_score"] >= bdt_cut),
        "H2PNetMass",
        mass_window,
    )
    if nevents_signal <= 0:
        # No signal survives this cut: scaling cannot help, and with an empty
        # sideband the ratio below would be 0/0.
        continue

    for scale_signal in range(10, 1000):
        nev_signal = nevents_signal * scale_signal
        # S / sqrt(S + B)
        figure_of_merit = nev_signal / np.sqrt(nev_signal + nevents_data)
        if figure_of_merit > 3:
            scale_signal_by_bdt_cut[bdt_cut] = scale_signal
            expected_soverb_by_bdt_cut[bdt_cut] = figure_of_merit
            break

for bdt_cut in all_bdt_cuts:
    if bdt_cut not in scale_signal_by_bdt_cut:
        # Previously this raised KeyError and hid the remaining results.
        print(f"BDT > {bdt_cut}: no scale factor in [10, 1000) reaches S/sqrt(S+B) > 3")
        continue
    print(
        f"BDT > {bdt_cut}, scale hh4b by {scale_signal_by_bdt_cut[bdt_cut]}: {expected_soverb_by_bdt_cut[bdt_cut]:.2f}"
    )

In [None]:
# Scatter-plot inputs: BDT thresholds (x) and their signal scale factors (y).
# Both come from the same dict, so they always have equal length even when some
# thresholds have no entry, and they are materialised as arrays so that later
# changes to the dict do not silently alter them.
x = np.array(list(scale_signal_by_bdt_cut.keys()))
y = np.array(list(scale_signal_by_bdt_cut.values()))

In [None]:
# Signal scale factor (k-factor) as a function of the BDT cut.
# The series is drawn once and labelled with the Xbb cut actually applied to the
# events it was computed from; drawing the same values twice under two different
# Xbb labels misrepresented the comparison.
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
ax.scatter(
    list(scale_signal_by_bdt_cut.keys()),
    list(scale_signal_by_bdt_cut.values()),
    label=f"Xbb > {xbb_cut}",
)
ax.set_xlabel("BDT cut")
ax.set_ylabel("k-factor")
ax.legend()

In [None]:
# Display the signal scale factor stored for each BDT cut.
scale_signal_by_bdt_cut

Let's choose 3

In [None]:
# H2PNetMass values of the selected signal events, kept under a short name and displayed.
signal_mass = bdt_events_sig["H2PNetMass"]
signal_mass

In [None]:
# Signal mass shape after multiplying by the scale factor found for the BDT cut at 0.9.
fig, ax = plt.subplots(figsize=(6, 5))
hep.histplot(h_mass_sig * scale_signal_by_bdt_cut[0.9], ax=ax)

Create a toy
- Construct a CDF (cumulative distribution function) from the data mass histogram
- Choose based on a uniform random number

In [None]:
# Build pseudo-data ("toy") masses by inverse-CDF sampling of the observed,
# unweighted data mass histogram, then overlay data, toy and toy + scaled signal.
TOY_SEED = 42  # fixed seed so the toy, and every cell derived from it, is reproducible
N_TOYS = 1
rng = np.random.default_rng(TOY_SEED)

h_mass = hist.Hist(mass_axis)
h_mass.fill(bdt_events_data["H2PNetMass"])

# https://stackoverflow.com/questions/17821458/random-number-from-histogram/17822210#17822210
h, bins = h_mass.to_numpy()
# Number of toy events to draw: the in-range data count.
integral = int(np.sum(h_mass.values()))

# Sampling ingredients do not change from toy to toy, so build them once.
bin_midpoints = bins[:-1] + np.diff(bins) / 2
cdf = np.cumsum(h)
cdf = cdf / cdf[-1]

fig, ax = plt.subplots(1, 1, figsize=(6, 5))
hep.histplot(h_mass, ax=ax, label="Data", histtype="errorbar", color="k", capsize=1, yerr=True)

for _ in range(N_TOYS):
    h_mass_toy = hist.Hist(mass_axis)
    values = rng.random(integral)
    # side="right" selects the first bin whose cumulative probability is strictly
    # above the draw, so an empty bin can never be picked (not even for a draw
    # of exactly 0 or one that lands exactly on a CDF step).
    value_bins = np.searchsorted(cdf, values, side="right")
    # Every toy event is placed at the centre of its sampled bin.
    random_from_cdf = bin_midpoints[value_bins]
    h_mass_toy.fill(random_from_cdf)

    hep.histplot(h_mass_toy, ax=ax, label="Toy")

    hep.histplot(
        h_mass_toy + h_mass_sig * scale_signal_by_bdt_cut[0.9],
        ax=ax,
        label=r"Toy+Signal$\times$" + f"{scale_signal_by_bdt_cut[0.9]}",
    )

ax.legend(title=f"Xbb > {xbb_cut}")

Cross check data sensitivity

In [None]:
# Cross-check at a single working point: derive the signal scale factor for
# BDT >= 0.9 from the real data, inject the scaled signal into the data, and
# recompute the figures of merit from the combined sample.
mass_window = [110, 140]
mw_size = mass_window[1] - mass_window[0]
# Defined up front: it used to be assigned only after its first use and worked
# solely because a loop variable of the same name had leaked.
bdt_cut = 0.9
scale_signal_by_bdt_cut = {}
expected_soverb_by_bdt_cut = {}

nevents_data = get_nevents_data(
    bdt_events_data,
    (bdt_events_data["bdt_score"] >= bdt_cut),
    "H2PNetMass",
    mass_window,
)
nevents_signal = get_nevents_signal(
    bdt_events_sig,
    (bdt_events_sig["bdt_score"] >= bdt_cut),
    "H2PNetMass",
    mass_window,
)
print("data ", nevents_data)
print("sig ", nevents_signal)
print("s/sqrt(s+b)", nevents_signal / np.sqrt(nevents_data + nevents_signal))

# Smallest integer scale factor in [10, 1000) giving S / sqrt(S + B) > 3.
for scale_signal in range(10, 1000):
    nev_signal = nevents_signal * scale_signal
    figure_of_merit = nev_signal / np.sqrt(nevents_data + nev_signal)
    if figure_of_merit > 3:
        scale_signal_by_bdt_cut[bdt_cut] = scale_signal
        expected_soverb_by_bdt_cut[bdt_cut] = figure_of_merit
        print(nev_signal)
        break
else:
    # Fail here with a clear message instead of a KeyError further down.
    raise RuntimeError(
        f"No scale factor in [10, 1000) reaches S/sqrt(S+B) > 3 for BDT >= {bdt_cut}"
    )

print(
    f"BDT > {bdt_cut}, scale hh4b by {scale_signal_by_bdt_cut[bdt_cut]}: {expected_soverb_by_bdt_cut[bdt_cut]:.2f}"
)

# Data plus injected (scaled) signal.
mass_toy = np.concatenate([bdt_events_data["H2PNetMass"], bdt_events_sig["H2PNetMass"]])
bdt_toy = np.concatenate([bdt_events_data["bdt_score"], bdt_events_sig["bdt_score"]])
weight_toy = np.concatenate(
    [bdt_events_data["weight"], bdt_events_sig["weight"] * scale_signal_by_bdt_cut[bdt_cut]]
)

# Background estimate: weighted yield of the combined sample in the two
# sidebands (each half the window width), truncated to an integer.
mask_bdt_cut = bdt_toy >= bdt_cut
cut_mass_0 = (mass_toy < mass_window[0]) & (mass_toy > (mass_window[0] - mw_size / 2))
cut_mass_1 = (mass_toy < mass_window[1] + mw_size / 2) & (mass_toy > mass_window[1])

nevents_data_bdt_cut = int(np.sum(weight_toy[(cut_mass_0 | cut_mass_1) & mask_bdt_cut]))
print(nevents_data_bdt_cut)

# Signal estimate: scaled signal yield inside the mass window.
cut_mass = (bdt_events_sig["H2PNetMass"] >= mass_window[0]) & (
    bdt_events_sig["H2PNetMass"] <= mass_window[1]
)

# Same >= convention as the scale-factor derivation above, so the two signal
# yields are computed on an identical selection.
mask_bdt_cut = bdt_events_sig["bdt_score"] >= bdt_cut
nevents_sig_bdt_cut = (
    np.sum(bdt_events_sig["weight"][(cut_mass) & mask_bdt_cut]) * scale_signal_by_bdt_cut[bdt_cut]
)
print(nevents_sig_bdt_cut)

s_over_b = nevents_sig_bdt_cut / np.sqrt(nevents_data_bdt_cut + nevents_sig_bdt_cut)
fom_limit = 2 * np.sqrt(nevents_data_bdt_cut) / nevents_sig_bdt_cut
# The label now matches the quantity printed (it used to read "S / sqrt(B)").
print("S / sqrt(S+B) ", s_over_b)
print("2sqrt(b)/s ", fom_limit)

In [None]:
# Display the current value of xbb_cut.
xbb_cut

Now let's add a bdt cut to the toy

In [None]:
# Compare the sampled toy masses with the observed data masses, and show the
# data BDT scores that the toy masses get paired with in the cells below.
print(random_from_cdf)
print(bdt_events_data["H2PNetMass"])
bdt_events_data["bdt_score"]

In [None]:
# Unweighted 2D distribution of the signal in (H2PNetMass, BDT score).
h_corr = hist.Hist(mass_axis, bdt_axis)
h_corr.fill(mass=bdt_events_sig["H2PNetMass"], bdt=bdt_events_sig["bdt_score"])
h_corr.plot2d()

In [None]:
# Same 2D view for the toy: sampled masses paired, entry by entry, with the BDT
# scores of the data events (the two arrays must therefore have equal length).
h_corr = hist.Hist(mass_axis, bdt_axis)
h_corr.fill(mass=random_from_cdf, bdt=bdt_events_data["bdt_score"])
h_corr.plot2d()

In [None]:
# Toy masses at the positions where the data BDT score exceeds 0.92.
random_from_cdf[bdt_events_data["bdt_score"] > 0.92]

In [None]:
# Observed data masses for events with a BDT score above 0.9.
bdt_events_data["H2PNetMass"][bdt_events_data["bdt_score"] > 0.9]

In [None]:
# Toy sample with unscaled signal: toy data masses followed by the signal masses,
# with the BDT scores and weights concatenated in the same order.
bdt_cut = 0.1
mass_toy = np.concatenate((random_from_cdf, bdt_events_sig["H2PNetMass"]))
bdt_toy = np.concatenate((bdt_events_data["bdt_score"], bdt_events_sig["bdt_score"]))
weight_toy = np.concatenate((bdt_events_data["weight"], bdt_events_sig["weight"]))

# Weighted mass histogram of the entries passing the BDT cut.
passes_bdt = bdt_toy > bdt_cut
h_mass_bdtcut = hist.Hist(mass_axis)
h_mass_bdtcut.fill(mass_toy[passes_bdt], weight=weight_toy[passes_bdt])

In [None]:
# Rebuild the toy arrays, this time with the signal weights scaled up.
# NOTE(review): `nev_signal` is not assigned in this cell; it is whatever value
# the most recently executed scale-factor loop left behind, where it is computed
# as `nevents_signal * scale_signal` (a scaled signal yield, not a scale factor).
# Confirm that this is the intended multiplier rather than an entry of
# `scale_signal_by_bdt_cut`.
mass_toy = np.concatenate([random_from_cdf, bdt_events_sig["H2PNetMass"]])
bdt_toy = np.concatenate([bdt_events_data["bdt_score"], bdt_events_sig["bdt_score"]])
weight_toy = np.concatenate([bdt_events_data["weight"], bdt_events_sig["weight"] * nev_signal])

# Weighted mass histogram of the toy entries passing the BDT cut.
bdt_cut = 0.1
mask_bdt_cut = bdt_toy > bdt_cut
h_mass_bdtcut = hist.Hist(mass_axis)
h_mass_bdtcut.fill(mass_toy[mask_bdt_cut], weight=weight_toy[mask_bdt_cut])

In [None]:
# Bin contents of the BDT-selected toy mass histogram.
h_mass_bdtcut.values()

In [None]:
# Bin contents and edges of the BDT-selected toy histogram; the edges are displayed.
# Note that this rebinds the names `h` and `bins`.
h, bins = h_mass_bdtcut.to_numpy()
bins

In [None]:
# Summed content of bins 6, 7 and 8; with the mass axis defined as 20 bins
# between 50 and 250 these cover 110-140.
np.sum(h_mass_bdtcut[6:9].values())

In [None]:
# Figures of merit on the toy sample at a tight BDT cut. Uses `mass_toy`,
# `bdt_toy` and `weight_toy` as left by an earlier cell.
mass_window = [110, 140]
mw_size = mass_window[1] - mass_window[0]

bdt_cut = 0.98
mask_bdt_cut = bdt_toy > bdt_cut
# Sidebands on either side of the mass window, each half the window width.
cut_mass_0 = (mass_toy < mass_window[0]) & (mass_toy > (mass_window[0] - mw_size / 2))
cut_mass_1 = (mass_toy < mass_window[1] + mw_size / 2) & (mass_toy > mass_window[1])

# Background estimate: weighted toy yield in the sidebands after the BDT cut.
nevents_data_bdt_cut = np.sum(weight_toy[(cut_mass_0 | cut_mass_1) & mask_bdt_cut])
print(nevents_data_bdt_cut)

# Signal selection: inside the mass window (edges inclusive).
cut_mass = (bdt_events_sig["H2PNetMass"] >= mass_window[0]) & (
    bdt_events_sig["H2PNetMass"] <= mass_window[1]
)

mask_bdt_cut = bdt_events_sig["bdt_score"] > bdt_cut
# NOTE(review): `nev_signal` is not assigned in this cell; it is left over from
# the most recently executed scale-factor loop, where it is a scaled signal
# yield (`nevents_signal * scale_signal`), not a scale factor — confirm intent.
nevents_sig_bdt_cut = np.sum(bdt_events_sig["weight"][(cut_mass) & mask_bdt_cut]) * nev_signal
print(nevents_sig_bdt_cut)

# Here the first figure of merit really is S / sqrt(B) (no S in the denominator).
s_over_b = nevents_sig_bdt_cut / np.sqrt(nevents_data_bdt_cut)
fom_limit = 2 * np.sqrt(nevents_data_bdt_cut) / nevents_sig_bdt_cut
print("S / sqrt(B) ", s_over_b)
print("2sqrt(b)/s ", fom_limit)

Scale up

In [None]:
# Toy study. For each Xbb cut:
#   1. derive, per BDT cut, the signal scale factor (k-factor) that gives
#      S / sqrt(S + B) just above 3 on the real data;
#   2. generate toys of the data mass spectrum by inverse-CDF sampling;
#   3. in every toy pick the BDT cut with the largest S / sqrt(S + B) and record
#      its difference from the value expected for that same cut.
mass_window = [110, 140]
mw_size = mass_window[1] - mass_window[0]
bdt_cuts = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
n_toys = 250
rng = np.random.default_rng(42)  # fixed seed so the pull distributions are reproducible

h_pull = hist.Hist(diff_axis, cut_axis)
for xbb_cut in [0.8, 0.85, 0.9, 0.95]:

    bdt_events_data = bdt_events_dict["data"][bdt_events_dict["data"]["H2TXbb"] > xbb_cut]
    bdt_events_sig = bdt_events_dict["hh4b"][bdt_events_dict["hh4b"]["H2TXbb"] > xbb_cut]

    # compute k-factor
    scale_signal_by_bdt_cut = {}
    expected_soverb_by_bdt_cut = {}

    for bdt_cut in bdt_cuts:
        nevents_data = get_nevents_data(
            bdt_events_data,
            (bdt_events_data["bdt_score"] >= bdt_cut),
            "H2PNetMass",
            mass_window,
        )
        nevents_signal = get_nevents_signal(
            bdt_events_sig,
            (bdt_events_sig["bdt_score"] >= bdt_cut),
            "H2PNetMass",
            mass_window,
        )
        for scale_signal in range(10, 1000):
            nev_signal = nevents_signal * scale_signal
            # S / sqrt(S + B)
            figure_of_merit = nev_signal / np.sqrt(nevents_data + nev_signal)
            if figure_of_merit > 3:
                scale_signal_by_bdt_cut[bdt_cut] = scale_signal
                expected_soverb_by_bdt_cut[bdt_cut] = figure_of_merit
                break

    # Inverse-CDF sampling ingredients: identical for every toy of this Xbb cut.
    h_mass = hist.Hist(mass_axis)
    h_mass.fill(bdt_events_data["H2PNetMass"])
    h, bins = h_mass.to_numpy()
    integral = int(np.sum(h_mass.values()))
    bin_midpoints = bins[:-1] + np.diff(bins) / 2
    cdf = np.cumsum(h)
    cdf = cdf / cdf[-1]

    # Quantities that do not depend on the toy.
    bdt_toy = np.concatenate([bdt_events_data["bdt_score"], bdt_events_sig["bdt_score"]])
    cut_mass = (bdt_events_sig["H2PNetMass"] >= mass_window[0]) & (
        bdt_events_sig["H2PNetMass"] <= mass_window[1]
    )

    for toy in range(n_toys):
        values = rng.random(integral)
        # side="right" never selects an empty bin, even at exact CDF edges.
        value_bins = np.searchsorted(cdf, values, side="right")
        random_from_cdf = bin_midpoints[value_bins]

        # Toy data masses followed by the signal masses (same order as bdt_toy).
        mass_toy = np.concatenate([random_from_cdf, bdt_events_sig["H2PNetMass"]])
        cut_mass_0 = (mass_toy < mass_window[0]) & (mass_toy > (mass_window[0] - mw_size / 2))
        cut_mass_1 = (mass_toy < mass_window[1] + mw_size / 2) & (mass_toy > mass_window[1])
        in_sidebands = cut_mass_0 | cut_mass_1

        best_cut = None
        best_fom = 0
        for bdt_cut in bdt_cuts:
            if bdt_cut not in scale_signal_by_bdt_cut:
                # No k-factor reached the target for this cut; nothing to inject.
                continue
            scale_signal = scale_signal_by_bdt_cut[bdt_cut]

            weight_toy = np.concatenate(
                [
                    bdt_events_data["weight"],
                    bdt_events_sig["weight"] * scale_signal,
                ]
            )

            # get data: weighted sideband yield of toy + injected signal
            mask_bdt_cut = bdt_toy > bdt_cut
            nevents_data_bdt_cut = np.sum(weight_toy[in_sidebands & mask_bdt_cut])

            # get signal: scaled signal yield inside the mass window
            mask_bdt_cut = bdt_events_sig["bdt_score"] > bdt_cut
            nevents_sig_bdt_cut = (
                np.sum(bdt_events_sig["weight"][(cut_mass) & mask_bdt_cut]) * scale_signal
            )

            # Require a minimum of signal and background before trusting the ratio.
            if not (nevents_sig_bdt_cut > 0.5 and nevents_data_bdt_cut >= 2):
                continue

            s_over_b = nevents_sig_bdt_cut / np.sqrt(nevents_data_bdt_cut + nevents_sig_bdt_cut)
            if s_over_b > best_fom:
                best_cut = bdt_cut
                best_fom = s_over_b

        if best_cut is not None:
            # Compare with the expectation for the cut that was actually selected.
            # The leaked loop variable `bdt_cut` used before always pointed at the
            # last cut in `bdt_cuts`, whichever cut had won.
            expected = expected_soverb_by_bdt_cut[best_cut]
            pull = best_fom - expected
            print(
                f"Xbb: {xbb_cut:.3f} BDT:{best_cut:.2f} S/sqrt(S+B): {best_fom:.2f} Pull:{pull:.2f} Expected: {expected:.2f}"
            )
            h_pull.fill(pull, cut=str(xbb_cut))

In [None]:
# Display the histogram of differences, one category per Xbb cut.
h_pull

In [None]:
# Overlay, for each Xbb cut, the distribution over toys of the difference between
# the best S/sqrt(S+B) found in the toy and the expected value.
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
for xbb_cut in [0.8, 0.85, 0.9, 0.95]:
    hep.histplot(
        # the category label is the string form of the cut
        h_pull[{"cut": f"{xbb_cut}"}],
        ax=ax,
        label=f"Xbb > {xbb_cut}",
    )
# The two label pieces used to be joined without a space ("expectedS/...").
ax.set_xlabel(r"Difference w.r.t. expected S/$\sqrt{S+B}$")
ax.set_title(r"Injected S, S/$\sqrt{S+B} \sim$ 3")
ax.legend()