# Misc Checks

In [None]:
import pandas as pd
import numpy as np
import vector
import os
from xgboost import XGBClassifier

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
import HH4b.postprocessing as postprocessing
from HH4b.postprocessing import Region
from HH4b.hh_vars import samples

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Shared matplotlib tick formatter that renders exponents with math text.
# NOTE(review): the (-3, 3) power limits presumably switch tick labels to
# scientific notation outside roughly 1e-3..1e3 -- confirm against the
# matplotlib ScalarFormatter documentation.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

## v12 PNet

In [None]:
# Location of the skimmed "24Mar31_v12_signal" inputs, relative to this notebook.
MAIN_DIR = "../../../"
dir_name = "24Mar31_v12_signal"
path_to_dir = f"{MAIN_DIR}/../data/skimmer/{dir_name}/"
# Maps each input directory to the sample configuration to load from it.
dirs = {path_to_dir: samples}


# Selection passed to the loader: bbFatJetPt >= 300 for jet indices 0 and 1.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 300),
        ("('bbFatJetPt', '1')", ">=", 300),
    ],
]

year = "2022EE"

# (column name, count) pairs handed to utils.format_columns.
# NOTE(review): the count presumably is the number of sub-columns per
# variable (2 for the per-jet quantities) -- confirm in HH4b.utils.
load_columns = [
    ("weight", 1),
    ("MET_pt", 1),
    ("nFatJets", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMass", 2),
    ("bbFatJetPNetXbb", 2),
    # ("bbFatJetTau3OverTau2", 2),
    # ("bbFatJetPNetQCD0HF", 2),
    # ("bbFatJetPNetQCD1HF", 2),
    # ("bbFatJetPNetQCD2HF", 2),
]

# The formatted column list does not depend on the directory; build it once.
formatted_columns = utils.format_columns(load_columns)

events_dict = {}
# `dir_samples` (rather than `samples`) keeps the loop from rebinding the
# `samples` mapping imported from HH4b.hh_vars.
for input_dir, dir_samples in dirs.items():
    # Later directories override earlier ones on key collisions, exactly as
    # the previous {**old, **new} merge did.
    events_dict.update(
        utils.load_samples(
            input_dir,
            dir_samples[year],
            year,
            filters=filters,
            columns=formatted_columns,
            variations=False,
        )
    )

In [None]:
# Empty cutflow table with one row per loaded sample key; add_to_cutflow is
# then called with the "Preselection" label and the "finalWeight" key.
cutflow = pd.DataFrame(index=[*events_dict])
utils.add_to_cutflow(
    events_dict,
    "Preselection",
    "finalWeight",
    cutflow,
)

In [None]:
# Display the cutflow table (rows are the keys of events_dict).
cutflow

In [None]:
# Mass-sculpting check: overlay the normalised jet-mass distribution of each
# background after progressively tighter cuts on the Xbb tagger score.
#
# The list is called `plot_samples`, not `samples`, so it does not overwrite
# the `samples` mapping imported from HH4b.hh_vars, which is indexed by year
# elsewhere in this notebook.
plot_samples = ["qcd", "ttbar"]
mass = "bbFatJetMsd"
tagger = "bbFatJetPNetXbb"
i = 1  # index of the jet column to plot (labelled "Jet {i+1}" on the axis)

# Bin edges 60, 70, ..., 250 shared by all histograms.
mass_bins = np.arange(60, 251, 10)

for sample in plot_samples:
    events = events_dict[sample]

    plt.figure(figsize=(10, 10))
    plt.title(sample)
    for cut in [0, 0.8, 0.9]:
        cut_mask = events[tagger][i] >= cut
        plt.hist(
            events[mass][i][cut_mask],
            mass_bins,
            weights=events["finalWeight"][cut_mask],
            histtype="step",
            label=rf"$T_{{Xbb}} \geq {cut}$",
            density=True,  # compare shapes only, not yields
        )

    plt.xlabel(f"Jet {i+1} {mass} (GeV)")
    plt.legend()
    plt.show()

## Legacy PNet

In [None]:
# Display the sample configuration for the selected year.
# NOTE(review): this needs `samples` to be the year-keyed mapping imported
# from HH4b.hh_vars; it fails if `samples` has been rebound to a plain list
# of sample names.
samples[year]

In [None]:
# Location of the skimmed "24Mar31_v12_private_signal" inputs (legacy PNet
# columns), relative to this notebook.
MAIN_DIR = "../../../"
dir_name = "24Mar31_v12_private_signal"
path_to_dir = f"{MAIN_DIR}/../data/skimmer/{dir_name}/"
# Maps each input directory to the sample configuration to load from it.
dirs = {path_to_dir: samples}


# Selection passed to the loader: bbFatJetPt >= 300 for jet indices 0 and 1.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 300),
        ("('bbFatJetPt', '1')", ">=", 300),
    ],
]

year = "2022EE"

# (column name, count) pairs handed to utils.format_columns.
# NOTE(review): the count presumably is the number of sub-columns per
# variable (2 for the per-jet quantities) -- confirm in HH4b.utils.
load_columns = [
    ("weight", 1),
    ("MET_pt", 1),
    ("nFatJets", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMassLegacy", 2),
    ("bbFatJetPNetXbbLegacy", 2),
    # ("bbFatJetTau3OverTau2", 2),
    # ("bbFatJetPNetQCD0HF", 2),
    # ("bbFatJetPNetQCD1HF", 2),
    # ("bbFatJetPNetQCD2HF", 2),
]

# The formatted column list does not depend on the directory; build it once.
formatted_columns = utils.format_columns(load_columns)

events_dict_legacy = {}
# `dir_samples` (rather than `samples`) keeps the loop from rebinding the
# `samples` mapping imported from HH4b.hh_vars.
for input_dir, dir_samples in dirs.items():
    # Later directories override earlier ones on key collisions, exactly as
    # the previous {**old, **new} merge did.
    events_dict_legacy.update(
        utils.load_samples(
            input_dir,
            dir_samples[year],
            year,
            filters=filters,
            columns=formatted_columns,
            variations=False,
        )
    )

In [None]:
# Empty cutflow table with one row per legacy sample key; add_to_cutflow is
# then called with the "Preselection" label and the "finalWeight" key, and
# the resulting table is displayed.
cutflow_legacy = pd.DataFrame(index=[*events_dict_legacy])
utils.add_to_cutflow(
    events_dict_legacy,
    "Preselection",
    "finalWeight",
    cutflow_legacy,
)
cutflow_legacy

In [None]:
# Mass-sculpting check for the legacy tagger: overlay the normalised jet-mass
# distribution of each background after progressively tighter Xbb cuts.
#
# The list is called `plot_samples`, not `samples`, so the `samples` mapping
# imported from HH4b.hh_vars is not replaced by this list.
plot_samples = ["qcd", "ttbar"]
mass = "bbFatJetMsd"
tagger = "bbFatJetPNetXbbLegacy"
i = 1  # index of the jet column to plot (labelled "Jet {i+1}" on the axis)

# Bin edges 60, 70, ..., 250 shared by all histograms.
mass_bins = np.arange(60, 251, 10)

for sample in plot_samples:
    events = events_dict_legacy[sample]

    plt.figure(figsize=(10, 10))
    plt.title(sample)
    for cut in [0, 0.8, 0.9]:
        cut_mask = events[tagger][i] >= cut
        plt.hist(
            events[mass][i][cut_mask],
            mass_bins,
            weights=events["finalWeight"][cut_mask],
            histtype="step",
            label=rf"$T_{{Xbb}} \geq {cut}$",
            density=True,  # compare shapes only, not yields
        )

    plt.xlabel(f"Jet {i+1} {mass} (GeV)")
    plt.legend()
    plt.show()

## ROC curves: HH4b vs. QCD and tt (legacy PNet Xbb)

In [None]:
# Legacy Xbb tagger scores of the jet column `jet`, for the signal ("hh4b")
# and for each background sample used in the ROC curves.
jet = 1
sig_jets_score = events_dict_legacy["hh4b"]["bbFatJetPNetXbbLegacy"][jet]
bg_jets_score = {
    bg_name: events_dict_legacy[bg_name]["bbFatJetPNetXbbLegacy"][jet]
    for bg_name in ("qcd", "ttbar")
}

In [None]:
# Display the signal tagger scores selected above as a quick sanity check.
sig_jets_score

In [None]:
# NOTE(review): `weights` is only assigned inside the ROC loop in a later
# cell, so running the notebook top-to-bottom raises NameError here; run the
# ROC cell first (or move this check below it).
weights.shape

In [None]:
from sklearn.metrics import roc_curve

# Build one weighted ROC curve per background: signal jets are labelled 1,
# background jets 0, and the tagger score is the discriminant.
bg_skip = 1  # keep every `bg_skip`-th background entry
sig_key = "hh4b"
weight_key = "finalWeight"
rocs = {}

for bg_key in ["qcd", "ttbar"]:
    print(bg_key)

    # Slice the background once so labels, weights and scores stay aligned.
    bg_scores = bg_jets_score[bg_key][::bg_skip]
    scores = np.concatenate((sig_jets_score, bg_scores))

    y_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bg_scores)),
        ]
    )

    # NOTE(review): confirm that the "finalWeight" selection is 1-D after
    # to_numpy(), so it lines up element-wise with `scores`.
    sig_weights = events_dict_legacy[sig_key][weight_key].to_numpy()
    bg_weights = events_dict_legacy[bg_key][weight_key].to_numpy()[::bg_skip]
    weights = np.concatenate([sig_weights, bg_weights])

    fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)

    rocs[bg_key] = dict(fpr=fpr, tpr=tpr, thresholds=thresholds)

In [None]:
# Attach the plotting label of each background sample to its ROC entry.
for bg_key in ["qcd", "ttbar"]:
    rocs[bg_key].update(label=plotting.label_by_sample[bg_key])

In [None]:
def find_nearest(array, value):
    """Return the index of the entry of ``array`` closest to ``value``.

    ``array`` is converted with ``np.asarray``. The result is the position of
    the smallest absolute difference ``|array - value|``; for
    multi-dimensional input it is an index into the flattened array.
    """
    distances = np.abs(np.asarray(array) - value)
    return np.argmin(distances)

In [None]:
# Draw the ROC curves collected in `rocs`, grouped under the key "test".
# NOTE(review): the meaning of the positional list [0.2, 0.5] is defined by
# plotting.multiROCCurveGrey (presumably working points to highlight) --
# confirm against its signature.
plotting.multiROCCurveGrey(
    {"test": rocs},
    [0.2, 0.5],
    xlim=[0, 0.8],
    ylim=[1e-5, 1],
    show=True,
)