# Misc Checks

In [1]:
from __future__ import annotations

import importlib
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

import HH4b.plotting as plotting
import HH4b.postprocessing as postprocessing
from HH4b.hh_vars import samples, samples_run3, years
from HH4b.postprocessing import PostProcess

formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [2]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

## Load samples

In [3]:
MAIN_DIR = Path("../../../")
plot_dir = MAIN_DIR / "../plots/PostProcess/24Apr24Legacy"
plot_dir.mkdir(parents=True, exist_ok=True)

data_dir = "/eos/uscms/store/user/rkansal/bbbb/skimmer/24Apr19LegacyFixes_v12_private_signal/"
dirs = {data_dir: samples}

bdt_model_name = "24Apr21_legacy_vbf_vars"
bdt_config = "24Apr21_legacy_vbf_vars"

In [4]:
def load_process_run3_samples(data_dir, year, samples_run3):
    events_dict = postprocessing.load_run3_samples(data_dir, year, True, samples_run3)
    legacy_label = "Legacy"

    # define BDT model
    bdt_model = XGBClassifier()
    bdt_model.load_model(fname=f"../boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model")
    # get function
    make_bdt_dataframe = importlib.import_module(
        f".{bdt_config}", package="HH4b.boosted.bdt_trainings_run3"
    )

    # inference and assign score
    events_dict_postprocess = {}
    for key in events_dict:
        bdt_events = make_bdt_dataframe.bdt_dataframe(events_dict[key])
        preds = bdt_model.predict_proba(bdt_events)
        PostProcess.add_bdt_scores(bdt_events, preds)
        bdt_events["weight"] = events_dict[key]["finalWeight"].to_numpy()
        bdt_events["H2TXbb"] = events_dict[key][f"bbFatJetPNetTXbb{legacy_label}"].to_numpy()[:, 1]
        bdt_events["H2PNetMass"] = events_dict[key][f"bbFatJetPNetMass{legacy_label}"].to_numpy()[
            :, 1
        ]
        events_dict[key] = bdt_events

    return events_dict

In [5]:
bg_keys = ["ttbar", "vhtobb", "gghtobb"]
processes = ["data"] + ["hh4b"] + bg_keys

from copy import deepcopy

samples_run3 = deepcopy(samples_run3)

for year in samples_run3:
    for key in list(samples_run3[year].keys()):
        if key not in processes:
            samples_run3[year].pop(key)

In [6]:
bdt_training_keys = PostProcess.get_bdt_training_keys("24Apr21_legacy_vbf_vars")

events_dict_postprocess = {}
cutflows = {}
for year in years:
    print(f"\n{year}")
    events_dict_postprocess[year] = load_process_run3_samples(data_dir, year, samples_run3)

print("Loaded all years")

Found BDT Training keys ['hh4b', 'qcd', 'ttbar', 'vbfhh4b-k2v0']

2022
Loaded JetMET_Run2022C                                   : 201356 entries
Loaded JetMET_Run2022C_single                            : 22682 entries
Loaded JetMET_Run2022D                                   : 131741 entries
Loaded TTto2L2Nu                                         : 17657 entries
Loaded TTto4Q                                            : 623203 entries
Loaded TTtoLNu2Q                                         : 214484 entries
Loaded GluGluHto2B_PT-200_M-125                          : 1307 entries
Loaded WminusH_Hto2B_Wto2Q_M-125                         : 6165 entries
Loaded WplusH_Hto2B_Wto2Q_M-125                          : 8882 entries
Loaded ZH_Hto2B_Zto2Q_M-125                              : 27150 entries
Loaded ggZH_Hto2B_Zto2Q_M-125                            : 27150 entries

2022EE
Loaded JetMET_Run2022E                                   : 232098 entries
Loaded JetMET_Run2022F                     

In [7]:
events_combined = PostProcess.combine_run3_samples(events_dict_postprocess, processes, bg_keys)

## S/B optimization using the ABCD method

In [8]:
def get_nevents_sidebands(events, cut, mass, mass_window):
    mw_size = mass_window[1] - mass_window[0]

    # get yield in left sideband
    cut_mass_0 = (events[mass] < mass_window[0]) & (events[mass] > (mass_window[0] - mw_size / 2))

    # get yield in right sideband
    cut_mass_1 = (events[mass] < mass_window[1] + mw_size / 2) & (events[mass] > mass_window[1])

    return np.sum(events["weight"][(cut_mass_0 | cut_mass_1) & cut])


def get_nevents_signal(events, cut, mass, mass_window):
    cut_mass = (events[mass] >= mass_window[0]) & (events[mass] <= mass_window[1])

    # get yield in Higgs mass window
    return np.sum(events["weight"][cut & cut_mass])


def get_nevents_nosignal(events, cut, mass, mass_window):
    cut_mass = (events[mass] >= mass_window[0]) & (events[mass] <= mass_window[1])

    # get yield in Higgs mass window
    return np.sum(events["weight"][cut & ~cut_mass])


def get_s_b(events_dict, cut_dict, mass, mass_window):
    s = get_nevents_signal(events_dict["hh4b"], cut_dict["hh4b"], mass, mass_window)
    bd = get_nevents_sidebands(events_dict["data"], cut_dict["data"], mass, mass_window)

    bgmcb = 0
    bgmcs = 0
    for key in bg_keys:
        bgmcb += get_nevents_sidebands(events_dict[key], cut_dict[key], mass, mass_window)
        bgmcs += get_nevents_signal(events_dict[key], cut_dict[key], mass, mass_window)

    b = bd - bgmcb + bgmcs
    return s, b

In [9]:
def abcd(events_dict, txbb_cut, bdt_cut, mass, mass_window):
    dicts = {"data": [], **{key: [] for key in bg_keys}}

    for key in ["hh4b", "data"] + bg_keys:
        events = events_dict[key]
        cut = (events["bdt_score"] > bdt_cut) & (events["H2TXbb"] > txbb_cut)

        if key == "hh4b":
            s = get_nevents_signal(events, cut, mass, mass_window)
            continue

        # region A
        if key == "data":
            dicts[key].append(0)
        else:
            dicts[key].append(get_nevents_signal(events, cut, mass, mass_window))

        # region B
        dicts[key].append(get_nevents_nosignal(events, cut, mass, mass_window))

        cut = (events["bdt_score"] < 0.6) & (events["H2TXbb"] < 0.8)
        # region C
        dicts[key].append(get_nevents_signal(events, cut, mass, mass_window))
        # region D
        dicts[key].append(get_nevents_nosignal(events, cut, mass, mass_window))

    bg_tots = np.sum([dicts[key] for key in bg_keys], axis=0)
    dmt = np.array(dicts["data"]) - bg_tots
    bqcd = dmt[2] * dmt[1] / dmt[3]
    # print(dicts)

    return s, bqcd + bg_tots[0], dicts

## Run the optimization:

In [None]:
# Input: abcd_dicts: {key1: [Region A nevents, B nevents, C nevents, D], key2: ...}
# Return: estimated nevents of bg in region B
def get_regionB_nevents(abcd_dicts):
    dicts = abcd_dicts
    nevents_regionB_per_key = [val[1] for key, val in dicts.items()]
    return np.sum(nevents_regionB_per_key)

In [11]:
mass = "H2PNetMass"
mass_window = [115, 135]

txbb_cut_list = []
bdt_cut_list = []
s_list = []
b_list = []
b_vs_s_list = []
nevents_regionB_list = []
for txbb_cut in np.arange(0.95, 1.0, 0.002):
    for bdt_cut in np.arange(0.9, 1.0, 0.005):
        s, b, d = abcd(events_combined, txbb_cut, bdt_cut, mass, mass_window)

        # derive values
        nevents_regionB = d["data"][1]
        nevents_regionB_list.append(nevents_regionB)

        txbb_cut_list.append(txbb_cut)
        bdt_cut_list.append(bdt_cut)
        s_list.append(s)
        b_list.append(b)
        b_vs_s_list.append(b / s)

s, tot, dicts = abcd(events_combined, 0.99, 0.955, mass, mass_window)

  b_vs_s_list.append(b/s)


In [12]:
b_array = np.array(b_list)
s_array = np.array(s_list)
nevents_regionB_array = np.array(nevents_regionB_list)

In [13]:
# 2√(B+sigma_B^2)/S
# sigma_B = B/sqrt(total data in the A + B regions)
# total data in A region = b
# total data in B region: nevents_regionB_array
sigma_B = b_array / np.sqrt(b_array + nevents_regionB_array)
double_sqrtBPlusVarB_vs_s = 2 * np.sqrt(b_array + np.square(sigma_B)) / s_array

  sigma_B = b_array/np.sqrt(b_array + nevents_regionB_array)
  sigma_B = b_array/np.sqrt(b_array + nevents_regionB_array)


In [14]:
# 2*sqrt(B)/S
double_sqrtB_vs_s = 2 * np.sqrt(b_array) / s_array

  double_sqrtB_vs_s = 2*np.sqrt(b_array)/s_array
  double_sqrtB_vs_s = 2*np.sqrt(b_array)/s_array


In [15]:
asimov = np.sqrt(2 * ((s_array + b_array) * np.log(1 + s_array / b_array) - s_array))

  asimov = np.sqrt(2*((s_array+b_array)*np.log(1+s_array/b_array)-s_array))
  asimov = np.sqrt(2*((s_array+b_array)*np.log(1+s_array/b_array)-s_array))


In [16]:
df_billy = pd.DataFrame(
    {
        "txbb_cut": txbb_cut_list,
        "bdt_cut": bdt_cut_list,
        "s": s_list,
        "b": b_list,
        "b_vs_s": b_vs_s_list,
        "2*sqrt(B)/S": double_sqrtB_vs_s,
        "2*sqrt(B+sigma_B^2)/S": double_sqrtBPlusVarB_vs_s,
        "asimov": asimov,
    }
)

df_billy

Unnamed: 0,txbb_cut,bdt_cut,s,b,b_vs_s,2*sqrt(B)/S,2*sqrt(B+sigma_B^2)/S,asimov
0,0.95,0.900,1.297166,67.004510,51.654554,12.620792,13.562174,0.157961
1,0.95,0.905,1.268163,63.297107,49.912457,12.547206,13.478154,0.158870
2,0.95,0.910,1.238501,59.069101,47.694022,12.411200,13.331489,0.160587
3,0.95,0.915,1.206988,54.461977,45.122206,12.228512,13.136869,0.162954
4,0.95,0.920,1.167367,50.693778,43.425737,12.198316,13.102605,0.163334
...,...,...,...,...,...,...,...,...
515,1.00,0.975,0.000000,0.000000,,,,
516,1.00,0.980,0.000000,0.000000,,,,
517,1.00,0.985,0.000000,0.000000,,,,
518,1.00,0.990,0.000000,0.000000,,,,


In [17]:
df_billy["s"].max()

1.2971655887003222

In [18]:
s_greater_0pt6 = df_billy["s"] > 0.6
s_greater_0pt7 = df_billy["s"] > 0.7
s_greater_1 = df_billy["s"] > 1

In [19]:
best_row_s_greater_1 = df_billy[s_greater_1]["2*sqrt(B+sigma_B^2)/S"].argmin()
df_billy[s_greater_1].iloc[best_row_s_greater_1]

txbb_cut                  0.970000
bdt_cut                   0.915000
s                         1.017023
b                        29.944751
b_vs_s                   29.443523
2*sqrt(B)/S              10.761168
2*sqrt(B+sigma_B^2)/S    11.550330
asimov                    0.184816
Name: 203, dtype: float64

In [20]:
best_row_s_greater_0pt7 = df_billy[s_greater_0pt7]["2*sqrt(B)/S"].argmin()
df_billy[s_greater_0pt7].iloc[best_row_s_greater_0pt7]

txbb_cut                  0.974000
bdt_cut                   0.945000
s                         0.753345
b                        14.481900
b_vs_s                   19.223473
2*sqrt(B)/S              10.102970
2*sqrt(B+sigma_B^2)/S    10.827443
asimov                    0.196281
Name: 249, dtype: float64

In [21]:
best_row_s_greater_0pt6 = df_billy[s_greater_0pt6]["2*sqrt(B)/S"].argmin()
df_billy[s_greater_0pt6].iloc[best_row_s_greater_0pt6]

txbb_cut                  0.978000
bdt_cut                   0.950000
s                         0.643572
b                         9.946387
b_vs_s                   15.454962
2*sqrt(B)/S               9.800884
2*sqrt(B+sigma_B^2)/S    10.493749
asimov                    0.201920
Name: 290, dtype: float64

In [22]:
best_row = df_billy["2*sqrt(B)/S"].argmin()
df_billy.iloc[best_row]

txbb_cut                 0.994000
bdt_cut                  0.990000
s                        0.054790
b                        0.002766
b_vs_s                   0.050478
2*sqrt(B)/S              1.919673
2*sqrt(B+sigma_B^2)/S    2.714827
asimov                   0.489733
Name: 458, dtype: float64

In [24]:
b_greater_1 = df_billy["b"] > 1
best_row_b_greater_1 = df_billy[b_greater_1]["2*sqrt(B)/S"].argmin()
df_billy[b_greater_1].iloc[best_row_b_greater_1]

txbb_cut                 0.990000
bdt_cut                  0.960000
s                        0.318916
b                        1.391891
b_vs_s                   4.364449
2*sqrt(B)/S              7.398725
2*sqrt(B+sigma_B^2)/S    7.988777
asimov                   0.260870
Name: 412, dtype: float64

In [25]:
b_greater_2 = df_billy["b"] > 2
best_row_b_greater_2 = df_billy[b_greater_2]["2*sqrt(B)/S"].argmin()
df_billy[b_greater_2].iloc[best_row_b_greater_2]

txbb_cut                 0.988000
bdt_cut                  0.960000
s                        0.369107
b                        2.195266
b_vs_s                   5.947512
2*sqrt(B)/S              8.028266
2*sqrt(B+sigma_B^2)/S    8.670407
asimov                   0.242587
Name: 392, dtype: float64

In [26]:
b_greater_2pt8 = df_billy["b"] > 2.8
best_row_b_greater_2pt8 = df_billy[b_greater_2pt8]["2*sqrt(B)/S"].argmin()
df_billy[b_greater_2pt8].iloc[best_row_b_greater_2pt8]

txbb_cut                 0.988000
bdt_cut                  0.950000
s                        0.428807
b                        3.122548
b_vs_s                   7.281937
2*sqrt(B)/S              8.241806
2*sqrt(B+sigma_B^2)/S    8.858222
asimov                   0.237407
Name: 390, dtype: float64

In [11]:
import pprint

print(s)
print(tot)
pprint.pprint(dicts)

0.3456375263338129
1.9341606302405638
{'data': [5.0, 10.0, 308849.0, 1932472.0],
 'gghtobb': [0.08909892492511226,
             0.06149747496262502,
             41.54549065304297,
             245.85426839043004],
 'ttbar': [0.8107525481447151,
           2.800376876561667,
           16303.09656374311,
           107041.71773143728],
 'vhtobb': [0.0911000119421361,
            1.2528814953534342,
            6.391645754069161,
            115.24300076740114]}


## Old stuff:

In [None]:
sig_samples = {"hh4b": samples[year]["hh4b"]}

In [None]:
events = pd.read_parquet(
    Path(data_dir) / "2022EE" / "GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV" / "parquet"
)

In [None]:
samples = ["qcd", "ttbar"]
mass = "bbFatJetMsd"
tagger = "bbFatJetPNetTXbbLegacy"
i = 1

for sample in samples:
    plt.figure(figsize=(10, 10))
    plt.title(sample)
    for cut in [0, 0.8, 0.9, 0.95]:
        cut_mask = events_dict[sample][tagger][i] >= cut
        plt.hist(
            events_dict[sample][mass][i][cut_mask],
            np.arange(60, 251, 10),
            weights=events_dict[sample]["finalWeight"][cut_mask],
            histtype="step",
            label=rf"$T_{{Xbb}} \geq {cut}$",
            density=True,
        )

    plt.xlabel(f"Jet {i+1} {mass} (GeV)")
    plt.legend()
    plt.savefig(plot_dir / f"{sample}_{mass}{i}_{tagger}_sculpting.pdf", bbox_inches="tight")
    plt.show()

## BDT ROC Curve

## tt ROC curve

In [None]:
jet = 1
tagger = "bbFatJetPNetTXbbLegacy"
sig_jets_score = events_dict["hh4b"][tagger][jet]
bg_jets_score = {
    "qcd": events_dict["qcd"][tagger][jet],
    "ttbar": events_dict["ttbar"][tagger][jet],
}

In [None]:
from sklearn.metrics import roc_curve

bg_skip = 1
sig_key = "hh4b"
weight_key = "finalWeight"
rocs = {}

for bg_key in ["qcd", "ttbar"]:
    print(bg_key)
    y_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros((len(bg_jets_score[bg_key]) - 1) // bg_skip + 1),
        ]
    )

    weights = np.concatenate(
        [
            events_dict[sig_key][weight_key].to_numpy(),
            events_dict[bg_key][weight_key].to_numpy()[::bg_skip],
        ]
    )

    scores = np.concatenate((sig_jets_score, bg_jets_score[bg_key][::bg_skip]))

    fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)

    rocs[bg_key] = {
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
        "label": plotting.label_by_sample[bg_key],
    }

In [None]:
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

In [None]:
plotting.multiROCCurveGrey(
    {"test": rocs},
    [0.2, 0.5],
    xlim=[0, 0.8],
    ylim=[1e-5, 1],
    plot_dir=plot_dir,
    name=f"{tagger}_ROCs",
    show=True,
)