# Misc Checks

In [None]:
import pandas as pd
import numpy as np
import vector
import os
from xgboost import XGBClassifier
from pathlib import Path

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
import HH4b.postprocessing as postprocessing
from HH4b.postprocessing import Region
from HH4b.hh_vars import samples

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Scalar tick formatter that renders the exponent/offset in math text and
# switches to scientific notation for values below 1e-3 or above 1e3.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded via %aimport) before each cell is executed, so edits
# to imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Base directory, relative to the notebook's working directory
# (presumably the repository root — confirm).
MAIN_DIR = Path("../../../")

# Output directory for the plots made below; created on first run, including
# any missing parents, and left alone if it already exists.
plot_dir = MAIN_DIR.joinpath("../plots/PostProcess/24Apr20Legacy")
plot_dir.mkdir(parents=True, exist_ok=True)

# Map each input directory to the samples mapping that should be loaded from it.
data_dir = "/ceph/cms/store/user/rkansal/bbbb/skimmer/24Apr19LegacyFixes_v12_private_signal/"
dirs = {data_dir: samples}

In [None]:
# Preselection applied while loading: both jets' pT >= 300 and at least one
# jet with Legacy PNet TXbb >= 0.8.
# Each inner list is one group of (column, operator, value) conditions; the
# two groups differ only in which jet must pass the TXbb cut. Presumably the
# conditions within a group are ANDed and the groups are ORed (DNF-style
# filters) — confirm against utils.load_samples.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 300),
        ("('bbFatJetPt', '1')", ">=", 300),
        ("('bbFatJetPNetTXbbLegacy', '0')", ">=", 0.8),
    ],
    [
        ("('bbFatJetPt', '0')", ">=", 300),
        ("('bbFatJetPt', '1')", ">=", 300),
        ("('bbFatJetPNetTXbbLegacy', '1')", ">=", 0.8),
    ],
]

year = "2022EE"

# (name, count) pairs handed to utils.format_columns; presumably the count is
# the number of per-event entries of that variable (e.g. 2 jets) — confirm.
load_columns = [
    ("weight", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMass", 2),
    ("bbFatJetPNetTXbb", 2),
    ("bbFatJetPNetMassLegacy", 2),
    ("bbFatJetPNetTXbbLegacy", 2),
    # Optional columns, currently not loaded:
    # ("bbFatJetTau3OverTau2", 2),
    # ("bbFatJetPNetQCD0HF", 2),
    # ("bbFatJetPNetQCD1HF", 2),
    # ("bbFatJetPNetQCD2HF", 2),
]


# Load every input directory and merge the results into one dict; entries
# loaded later replace earlier ones with the same key.
# NOTE: the loop variable is deliberately not called `samples`, so the
# `samples` mapping imported from HH4b.hh_vars is not rebound by this loop.
events_dict = {}
for input_dir, dir_samples in dirs.items():
    events_dict.update(
        utils.load_samples(
            input_dir,
            dir_samples[year],
            year,
            filters=filters,
            columns=utils.format_columns(load_columns),
            variations=False,
        )
    )

# Cutflow table with one row per loaded sample; the first entry records the
# "finalWeight" yield after the preselection above.
cutflow = pd.DataFrame(index=list(events_dict))
utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
cutflow

In [None]:
# Unweighted fraction of hh4b events in which the index-0 jet has a lower
# Legacy TXbb score than the index-1 jet.
hh4b_txbb = events_dict["hh4b"]["bbFatJetPNetTXbbLegacy"]
np.mean(hh4b_txbb[0] < hh4b_txbb[1])

In [None]:
# Mass-sculpting check: for each background sample, overlay the normalised,
# weighted mass distribution of one jet after progressively tighter cuts on
# that jet's tagger score.
# NOTE: this list is named `plot_samples` rather than `samples` so that the
# `samples` mapping imported from HH4b.hh_vars (used to build `dirs` and to
# load the events) is not overwritten; otherwise re-running those earlier
# cells after this one would fail.
plot_samples = ["qcd", "ttbar"]
mass = "bbFatJetMsd"
tagger = "bbFatJetPNetTXbbLegacy"
i = 1  # 0-based jet index; shown as "Jet {i+1}" on the x-axis label

# Bin edges from 60 to 250 in steps of 10 (GeV, per the axis label);
# identical for every histogram, so built once.
mass_bins = np.arange(60, 251, 10)

for sample in plot_samples:
    plt.figure(figsize=(10, 10))
    plt.title(sample)
    for cut in [0, 0.8, 0.9, 0.95]:
        # Events whose selected jet passes the tagger threshold
        cut_mask = events_dict[sample][tagger][i] >= cut
        plt.hist(
            events_dict[sample][mass][i][cut_mask],
            mass_bins,
            weights=events_dict[sample]["finalWeight"][cut_mask],
            histtype="step",
            label=rf"$T_{{Xbb}} \geq {cut}$",
            # Unit-area normalisation so shapes can be compared across cuts
            density=True,
        )

    plt.xlabel(f"Jet {i+1} {mass} (GeV)")
    plt.legend()
    plt.savefig(plot_dir / f"{sample}_{mass}{i}_{tagger}_sculpting.pdf", bbox_inches="tight")
    plt.show()

## ROC curves: HH4b vs. QCD and ttbar

In [None]:
# Tagger scores of the selected jet (0-based index `jet`) for the signal
# sample and for each background sample, keyed by background name.
jet = 1
tagger = "bbFatJetPNetTXbbLegacy"
sig_jets_score = events_dict["hh4b"][tagger][jet]
bg_jets_score = {bg: events_dict[bg][tagger][jet] for bg in ("qcd", "ttbar")}

In [None]:
from sklearn.metrics import roc_curve

# Build one weighted ROC curve per background: signal jets are labelled 1,
# background jets 0, and the tagger score is the discriminant.
bg_skip = 1  # keep every `bg_skip`-th background entry (1 = keep all)
sig_key = "hh4b"
weight_key = "finalWeight"
rocs = {}

# NOTE: `sig_jets_score` comes from the previous cell, where it is built from
# the "hh4b" sample; `sig_key` must name that same sample so that the signal
# scores and weights line up.
for bg_key in ["qcd", "ttbar"]:
    print(bg_key)

    # Thin scores and weights with the same stride so they stay aligned.
    bg_scores = bg_jets_score[bg_key][::bg_skip]
    bg_weights = events_dict[bg_key][weight_key].to_numpy()[::bg_skip]

    y_true = np.concatenate([np.ones(len(sig_jets_score)), np.zeros(len(bg_scores))])
    weights = np.concatenate([events_dict[sig_key][weight_key].to_numpy(), bg_weights])
    scores = np.concatenate((sig_jets_score, bg_scores))

    fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)

    rocs[bg_key] = dict(
        fpr=fpr,
        tpr=tpr,
        thresholds=thresholds,
        label=plotting.label_by_sample[bg_key],
    )

In [None]:
def find_nearest(array, value):
    """Return the index of the entry of ``array`` closest to ``value``.

    ``array`` may be any array-like; it is converted with ``np.asarray``.
    Closeness is the absolute difference, and ties resolve to the first
    occurrence. For multi-dimensional input the index refers to the
    flattened array.
    """
    offsets = np.asarray(array) - value
    return np.argmin(np.abs(offsets))

In [None]:
# Draw the ROC curves stored in `rocs` (grouped under the single key "test")
# and save them under `plot_dir`. The second positional argument is passed
# through unchanged; presumably it lists the signal efficiencies to
# highlight — confirm against plotting.multiROCCurveGrey.
roc_plot_options = dict(
    xlim=[0, 0.8],
    ylim=[1e-5, 1],
    plot_dir=plot_dir,
    name=f"{tagger}_ROCs",
    show=True,
)
plotting.multiROCCurveGrey({"test": rocs}, [0.2, 0.5], **roc_plot_options)