# Misc Checks

In [None]:
import pandas as pd
import numpy as np
import vector
import os
from xgboost import XGBClassifier
from pathlib import Path

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
from HH4b.postprocessing import PostProcess, Region
import HH4b.postprocessing as postprocessing
from HH4b.hh_vars import samples, years, samples_run3

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker
import importlib

# Tick formatter using mathtext; the power limits (-3, 3) set the exponent range
# outside of which matplotlib switches to scientific notation.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

## Load samples

In [None]:
# Plot output directory (relative path); created here if it does not exist yet.
MAIN_DIR = Path("../../../")
plot_dir = MAIN_DIR / "../plots/PostProcess/24Apr24Legacy"
plot_dir.mkdir(parents=True, exist_ok=True)

# Input directory handed to the sample loader.
data_dir = "/eos/uscms/store/user/rkansal/bbbb/skimmer/24Apr19LegacyFixes_v12_private_signal/"
dirs = {data_dir: samples}

# bdt_model_name selects the directory holding trained_bdt.model;
# bdt_config selects the HH4b.boosted.bdt_trainings_run3 module that builds the BDT inputs.
bdt_model_name = "24Apr21_legacy_vbf_vars"
bdt_config = "24Apr21_legacy_vbf_vars"

In [None]:
def load_process_run3_samples(data_dir, year, samples_run3):
    """Load one year of samples and replace each with its BDT-level dataframe.

    Reads the notebook-level ``bdt_model_name`` and ``bdt_config`` to locate the
    trained model file and the module that builds the BDT input dataframe.

    Args:
        data_dir: directory passed through to ``postprocessing.load_run3_samples``.
        year: year passed through to ``postprocessing.load_run3_samples``.
        samples_run3: sample definitions passed through to the loader.

    Returns:
        The dict returned by the loader, with every value replaced by the BDT
        input dataframe extended with the columns ``weight``, ``H2TXbb`` and
        ``H2PNetMass`` (plus whatever ``PostProcess.add_bdt_scores`` adds --
        presumably ``bdt_score``, which later cells read; confirm in HH4b).
    """
    legacy_label = "Legacy"
    events_dict = postprocessing.load_run3_samples(data_dir, year, True, samples_run3)

    # trained BDT; the path is relative to the notebook's working directory
    bdt_model = XGBClassifier()
    bdt_model.load_model(fname=f"../boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model")
    # module exposing bdt_dataframe(), which builds the model inputs
    make_bdt_dataframe = importlib.import_module(
        f".{bdt_config}", package="HH4b.boosted.bdt_trainings_run3"
    )

    # inference and score assignment; only existing keys are reassigned, so
    # updating the dict while looping over it is safe
    for key in events_dict:
        events = events_dict[key]
        bdt_events = make_bdt_dataframe.bdt_dataframe(events)
        preds = bdt_model.predict_proba(bdt_events)
        PostProcess.add_bdt_scores(bdt_events, preds)
        bdt_events["weight"] = events["finalWeight"].to_numpy()
        # [:, 1] picks index 1 along the second axis (assumed to be the second fat jet, H2)
        bdt_events["H2TXbb"] = events[f"bbFatJetPNetTXbb{legacy_label}"].to_numpy()[:, 1]
        bdt_events["H2PNetMass"] = events[f"bbFatJetPNetMass{legacy_label}"].to_numpy()[:, 1]
        events_dict[key] = bdt_events

    return events_dict

In [None]:
from copy import deepcopy

# MC backgrounds that are subtracted from data in the ABCD estimate
bg_keys = ["ttbar", "vhtobb", "gghtobb"]
# every process kept for this study
processes = ["data", "hh4b", *bg_keys]

# prune a private copy so the dict imported from HH4b.hh_vars stays intact
samples_run3 = deepcopy(samples_run3)
for year_samples in samples_run3.values():
    unwanted = [name for name in year_samples if name not in processes]
    for name in unwanted:
        del year_samples[name]

In [None]:
# presumably the list of BDT input variables; not used again in this section
bdt_training_keys = PostProcess.get_bdt_training_keys("24Apr21_legacy_vbf_vars")

# year -> {sample key -> BDT-level dataframe}
events_dict_postprocess = {}
cutflows = {}
for year in years:
    print(f"\n{year}")
    events_dict_postprocess[year] = load_process_run3_samples(data_dir, year, samples_run3)

print("Loaded all years")

In [None]:
# Merge the per-year dictionaries into one; the result is indexed by process key in the cells below.
events_combined = PostProcess.combine_run3_samples(events_dict_postprocess, processes, bg_keys)

## S/B optimization using the ABCD method

In [None]:
def get_nevents_sidebands(events, cut, mass, mass_window):
    """Return the weighted yield in the two mass sidebands.

    Each sideband is half as wide as the mass window and sits directly below /
    above it; all four edges are exclusive.

    Args:
        events: table with a ``weight`` column and the column named by ``mass``.
        cut: boolean selection applied on top of the sideband requirement.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the central window.
    """
    low = mass_window[0]
    high = mass_window[1]
    half_width = (high - low) / 2
    values = events[mass]

    below = (values < low) & (values > (low - half_width))
    above = (values < high + half_width) & (values > high)

    selected = (below | above) & cut
    return np.sum(events["weight"][selected])


def get_nevents_signal(events, cut, mass, mass_window):
    """Return the weighted yield inside the mass window (both edges inclusive).

    Args:
        events: table with a ``weight`` column and the column named by ``mass``.
        cut: boolean selection applied on top of the window requirement.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the window.
    """
    values = events[mass]
    inside = (values >= mass_window[0]) & (values <= mass_window[1])
    weights = events["weight"]
    return np.sum(weights[cut & inside])


def get_nevents_nosignal(events, cut, mass, mass_window):
    """Return the weighted yield outside the mass window.

    The window itself is inclusive on both edges, so events exactly on an edge
    are not counted here.

    Args:
        events: table with a ``weight`` column and the column named by ``mass``.
        cut: boolean selection applied on top of the outside-window requirement.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the window.
    """
    values = events[mass]
    inside = (values >= mass_window[0]) & (values <= mass_window[1])
    weights = events["weight"]
    # complement of the window, restricted to the events passing `cut`
    return np.sum(weights[cut & ~inside])


def get_s_b(events_dict, cut_dict, mass, mass_window):
    """Return ``(s, b)`` from a sideband-based background estimate.

    ``s`` is the ``hh4b`` yield inside the mass window. ``b`` is the data
    sideband yield, minus the MC-background sideband yield, plus the
    MC-background yield inside the window. The MC backgrounds are the
    notebook-level ``bg_keys``.

    Args:
        events_dict: process key -> event table.
        cut_dict: process key -> boolean selection for that table.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the window.
    """

    def in_window(key):
        return get_nevents_signal(events_dict[key], cut_dict[key], mass, mass_window)

    def in_sidebands(key):
        return get_nevents_sidebands(events_dict[key], cut_dict[key], mass, mass_window)

    s = in_window("hh4b")
    data_sidebands = in_sidebands("data")

    mc_sidebands = sum(in_sidebands(key) for key in bg_keys)
    mc_window = sum(in_window(key) for key in bg_keys)

    return s, data_sidebands - mc_sidebands + mc_window

In [None]:
def abcd(events_dict, txbb_cut, bdt_cut, mass, mass_window, bdt_fail=0.6, txbb_fail=0.8):
    """Estimate signal and background in region A with the ABCD method.

    Regions, per process:
        A: passes the tagger cuts, inside the mass window
        B: passes the tagger cuts, outside the mass window
        C: fails the tagger cuts,  inside the mass window
        D: fails the tagger cuts,  outside the mass window

    "Pass" means ``bdt_score > bdt_cut`` and ``H2TXbb > txbb_cut``; "fail" means
    ``bdt_score < bdt_fail`` and ``H2TXbb < txbb_fail``.

    Args:
        events_dict: process key -> event table with ``bdt_score``, ``H2TXbb``,
            ``weight`` and the ``mass`` column. Must contain ``hh4b``, ``data``
            and every key in the notebook-level ``bg_keys``.
        txbb_cut: lower threshold on ``H2TXbb`` for the pass region.
        bdt_cut: lower threshold on ``bdt_score`` for the pass region.
        mass: name of the mass column.
        mass_window: ``[low, high]`` edges of the window.
        bdt_fail: upper threshold on ``bdt_score`` for the fail region.
        txbb_fail: upper threshold on ``H2TXbb`` for the fail region.

    Returns:
        ``(s, b, dicts)``: the ``hh4b`` yield in region A, the predicted
        background in region A (data-driven part ``C * B / D`` plus MC
        backgrounds in A), and ``dicts`` mapping ``data`` and each background
        key to its ``[A, B, C, D]`` yields.
    """
    signal_events = events_dict["hh4b"]
    signal_pass = (signal_events["bdt_score"] > bdt_cut) & (signal_events["H2TXbb"] > txbb_cut)
    s = get_nevents_signal(signal_events, signal_pass, mass, mass_window)

    dicts = {}
    for key in ["data"] + bg_keys:
        events = events_dict[key]
        pass_cut = (events["bdt_score"] > bdt_cut) & (events["H2TXbb"] > txbb_cut)
        fail_cut = (events["bdt_score"] < bdt_fail) & (events["H2TXbb"] < txbb_fail)

        dicts[key] = [
            # region A: the data yield is never read and is recorded as 0
            # (presumably to keep the signal region blinded)
            0 if key == "data" else get_nevents_signal(events, pass_cut, mass, mass_window),
            # region B
            get_nevents_nosignal(events, pass_cut, mass, mass_window),
            # region C
            get_nevents_signal(events, fail_cut, mass, mass_window),
            # region D
            get_nevents_nosignal(events, fail_cut, mass, mass_window),
        ]

    bg_tots = np.sum([dicts[key] for key in bg_keys], axis=0)
    # data minus MC backgrounds in each region; index 0 (A) is not used below
    dmt = np.array(dicts["data"]) - bg_tots
    bqcd = dmt[2] * dmt[1] / dmt[3]

    return s, bqcd + bg_tots[0], dicts

## Run the optimization:

In [None]:
mass = "H2PNetMass"
mass_window = [115, 135]

# Scan grid. Built from integer counts rather than np.arange(start, 1, step):
# with a float step, np.arange(0.95, 1, 0.002) yields 26 points, the last ~1.0,
# which selects no events and adds a meaningless 0/0 row to the scan.
txbb_cuts = 0.95 + 0.002 * np.arange(25)  # 0.950, 0.952, ..., 0.998
bdt_cuts = 0.90 + 0.005 * np.arange(20)  # 0.900, 0.905, ..., 0.995

# one entry per (txbb_cut, bdt_cut) scan point, in loop order
txbb_cut_list = []
bdt_cut_list = []
s_list = []
b_list = []
b_vs_s_list = []
nevents_regionB_list = []
d_list = []
for txbb_cut in txbb_cuts:
    for bdt_cut in bdt_cuts:
        s, b, d = abcd(events_combined, txbb_cut, bdt_cut, mass, mass_window)

        # data yield in region B (passes the cuts, outside the mass window)
        nevents_regionB = d["data"][1]
        nevents_regionB_list.append(nevents_regionB)

        txbb_cut_list.append(txbb_cut)
        bdt_cut_list.append(bdt_cut)
        s_list.append(s)
        b_list.append(b)
        b_vs_s_list.append(b / s)
        d_list.append(d)

# single reference working point
s, tot, dicts = abcd(events_combined, 0.99, 0.955, mass, mass_window)

### Collect bg_tots[0] and bg_tots[1] (summed MC background yields in regions A and B) to derive b_smooth

In [None]:
# Summed MC-background yields [A, B, C, D] for every scan point, in scan order.
bg_tots_list = [np.sum([d[key] for key in bg_keys], axis=0) for d in d_list]

In [None]:
# One row per scan point; columns follow the region order of each bg_tots entry.
bg_tots_arr = np.array(bg_tots_list)

In [None]:
# Array views of the scan results, for the vectorised figure-of-merit formulas below.
b_array = np.array(b_list)
s_array = np.array(s_list)
nevents_regionB_array = np.array(nevents_regionB_list)

In [None]:
# Figure of merit 2*sqrt(B + sigma_B^2)/S, with
#   sigma_B = B / sqrt(N_A + N_B)
# where N_A is taken to be the predicted region-A background (b_array)
# and N_B is the data yield in region B (nevents_regionB_array).
sigma_B = b_array / np.sqrt(b_array + nevents_regionB_array)
double_sqrtBPlusVarB_vs_s = 2 * np.sqrt(b_array + np.square(sigma_B)) / s_array

In [None]:
# Figure of merit 2*sqrt(B)/S (no uncertainty on B); smaller is better.
double_sqrtB_vs_s = 2 * np.sqrt(b_array) / s_array

In [None]:
# Asimov significance: sqrt(2 * ((S + B) * ln(1 + S/B) - S)); larger is better.
asimov = np.sqrt(2 * ((s_array + b_array) * np.log(1 + s_array / b_array) - s_array))

In [None]:
# Summary table with one row per (txbb_cut, bdt_cut) scan point.
df_billy = pd.DataFrame(
    {
        "txbb_cut": txbb_cut_list,
        "bdt_cut": bdt_cut_list,
        "s": s_list,
        "b": b_list,
        "b_vs_s": b_vs_s_list,
        "2*sqrt(B)/S": double_sqrtB_vs_s,
        "2*sqrt(B+sigma_B^2)/S": double_sqrtBPlusVarB_vs_s,
        # per-point region-B data yields; the bare `nevents_regionB` is only the
        # scalar left over from the last scan iteration and would be broadcast
        # to every row
        "nevents_regionB": nevents_regionB_array,
        "asimov": asimov,
        # summed MC-background yields in regions A (index 0) and B (index 1)
        "bg_tots_0": bg_tots_arr[:, 0],
        "bg_tots_1": bg_tots_arr[:, 1],
    }
)

df_billy

## Calculate smoothed nevents_regionB_pred

In [None]:
from scipy.special import gammainc, gamma

In [None]:
# Passed positionally to one_minus_cdf_2d as (a, b, scale1, scale2).
# presumably obtained from an earlier fit -- TODO record where these values come from
params = [1.11458993, 1.21115674, 0.68137886, 0.70183846]

In [None]:
# Scan coordinates as an (n_points, 2) array: column 0 = txbb_cut, column 1 = bdt_cut.
Txbb, BDT = (np.array(df_billy[column]) for column in ("txbb_cut", "bdt_cut"))
Txbb_BDT_rows = np.concatenate([Txbb[:, np.newaxis], BDT[:, np.newaxis]], axis=1)

In [None]:
# smooth model evaluated on the (txbb_cut, bdt_cut) grid
def one_minus_cdf_2d(x, a, b, scale1, scale2):
    """Product of two one-dimensional ``1 - cdf`` factors.

    For each column of ``x`` the value is divided by its scale, mapped to
    ``u**3 + u**2`` and inserted into ``gammainc(shape, .) / gamma(shape)``.

    NOTE(review): ``scipy.special.gammainc`` is already the regularized lower
    incomplete gamma function, so the extra division by ``gamma(shape)`` looks
    unintended. It is kept because ``params`` may have been fitted with exactly
    this form -- confirm before changing.

    Args:
        x: 2-D array; column 0 is paired with ``(a, scale1)``, column 1 with
            ``(b, scale2)``.
        a: shape parameter for column 0.
        b: shape parameter for column 1.
        scale1: scale dividing column 0.
        scale2: scale dividing column 1.

    Returns:
        1-D array with one value per row of ``x``.
    """

    def one_minus_cdf(column, shape, scale):
        scaled = column / scale
        return 1 - gammainc(shape, scaled**3 + scaled**2) / gamma(shape)

    return one_minus_cdf(x[:, 0], a, scale1) * one_minus_cdf(x[:, 1], b, scale2)

In [None]:
# Evaluate the smooth model at every (txbb_cut, bdt_cut) scan point.
# NOTE(review): this value is used directly as a region-B yield in the b_smooth formula --
# confirm it is normalised to event counts.
nevents_regionB_pred = one_minus_cdf_2d(Txbb_BDT_rows, *params)

In [None]:
# Store the model prediction next to the observed region-B yield.
df_billy["nevents_regionB_pred"] = nevents_regionB_pred

## Calculate b_smooth in region A using the smooth nevents_regionB_pred

In [None]:
# `pdf` is a second name for df_billy (not a copy); columns added to one appear in the other.
pdf = df_billy
# Scale the data-driven part of the region-A estimate (b minus MC in A) by the ratio of
# predicted to observed region-B yields, both with MC in B subtracted, then add MC in A back.
b_smooth = (pdf["b"] - pdf["bg_tots_0"]) * (pdf["nevents_regionB_pred"] - pdf["bg_tots_1"]) / (
    pdf["nevents_regionB"] - pdf["bg_tots_1"]
) + pdf["bg_tots_0"]

In [None]:
# Adds the column to df_billy as well, since pdf is the same object.
pdf["b_smooth"] = b_smooth

## Calculate FoM using b_smooth

In [None]:
# 2*sqrt(B)/S with the smoothed background; NaN wherever b_smooth is negative.
double_sqrtBSmooth_vs_s = 2 * np.sqrt(b_smooth) / s_array

In [None]:
# Column minimised by the working-point selection cells below.
pdf["double_sqrtBSmooth_vs_s"] = double_sqrtBSmooth_vs_s

In [None]:
# Boolean row masks requiring a minimum signal yield `s`.
s_greater_0pt6 = df_billy["s"] > 0.6
s_greater_0pt7 = df_billy["s"] > 0.7
s_greater_1 = df_billy["s"] > 1

In [None]:
# Best scan point with s > 1. argmin returns a position within the filtered frame,
# so the lookup uses .iloc on the same filtered frame.
best_row_s_greater_1 = df_billy[s_greater_1]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[s_greater_1].iloc[best_row_s_greater_1]

In [None]:
# Best scan point with s > 0.7 (position within the filtered frame, hence .iloc on it).
best_row_s_greater_0pt7 = df_billy[s_greater_0pt7]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[s_greater_0pt7].iloc[best_row_s_greater_0pt7]

In [None]:
# Best scan point with s > 0.6 (position within the filtered frame, hence .iloc on it).
best_row_s_greater_0pt6 = df_billy[s_greater_0pt6]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[s_greater_0pt6].iloc[best_row_s_greater_0pt6]

In [None]:
# Best scan point overall, with no requirement on s or b.
best_row = df_billy["double_sqrtBSmooth_vs_s"].argmin()
df_billy.iloc[best_row]

In [None]:
# Best scan point with unsmoothed background estimate b > 1
# (position within the filtered frame, hence .iloc on it).
b_greater_1 = df_billy["b"] > 1
best_row_b_greater_1 = df_billy[b_greater_1]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[b_greater_1].iloc[best_row_b_greater_1]

In [None]:
# Best scan point with unsmoothed background estimate b > 2
# (position within the filtered frame, hence .iloc on it).
b_greater_2 = df_billy["b"] > 2
best_row_b_greater_2 = df_billy[b_greater_2]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[b_greater_2].iloc[best_row_b_greater_2]

In [None]:
# Best scan point with unsmoothed background estimate b > 2.8
# (position within the filtered frame, hence .iloc on it).
b_greater_2pt8 = df_billy["b"] > 2.8
best_row_b_greater_2pt8 = df_billy[b_greater_2pt8]["double_sqrtBSmooth_vs_s"].argmin()
df_billy[b_greater_2pt8].iloc[best_row_b_greater_2pt8]