# ROC Curves for ParticleNet TXbb

Author: Raghav Kansal

In [None]:
import os
from pathlib import Path
import pandas as pd
import uproot
import numpy as np
import pickle
import vector
from sklearn.metrics import roc_curve, auc
import scipy

from HH4b import utils, plotting

Import plotting libraries and configure the plot style

In [None]:
import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Apply the mplhep "CMS" and "firamath" styles to all figures in this notebook.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter rendered with math text; power limits of (-3, 3)
# control when the formatter switches to scientific notation.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Notebook-wide matplotlib overrides, applied after the mplhep styles.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)

In [None]:
# Load IPython's autoreload extension and enable mode 2 so that imported
# modules are automatically reloaded when their source files are edited,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Base directory, given relative to the current working directory
# (presumably the repository root — confirm against the notebook's location).
MAIN_DIR = Path("../../../")

# Output folder for the plots produced below; created (with any missing
# parents) if it does not exist yet.
plot_dir = MAIN_DIR.joinpath("plots/PNet/24Apr17")
plot_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Data-taking period passed to the sample loader.
year = "2022EE"

# Sample groups to load: group key -> list of dataset names in that group.
samples = {
    "hh4b": [
        "GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV",
    ],
    "qcd": [
        "QCD_HT-1000to1200",
        "QCD_HT-100to200",
        "QCD_HT-1200to1500",
        "QCD_HT-1500to2000",
        "QCD_HT-2000",
        "QCD_HT-200to400",
        "QCD_HT-400to600",
        "QCD_HT-600to800",
        "QCD_HT-800to1000",
    ],
    "ttbar": [
        "TTto2L2Nu",
        "TTto4Q",
        "TTtoLNu2Q",
    ],
}

# input directory -> sample groups to read from that directory
dirs = {MAIN_DIR / "../data/skimmer/24Mar31_v12_signal": samples}

# columns to load, as (column name, number of sub-columns) pairs
load_columns = [
    ("weight", 1),
    ("bbFatJetPNetXbb", 2),
]
# reformat into ("column name", "idx") format for reading multiindex columns
columns = [
    f"('{key}', '{i}')" for key, num_columns in load_columns for i in range(num_columns)
]

# Selection handed to the loader: both bbFatJetPt sub-columns must be >= 300.
# NOTE(review): the nested-list layout presumably follows the loader's
# filter convention (outer list = alternatives, inner list = all required)
# — confirm against utils.load_samples.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 300),
        ("('bbFatJetPt', '1')", ">=", 300),
    ],
]

events_dict = {}
# The loop variable is deliberately NOT called `samples`: rebinding that name
# here would make the cutflow index below depend on whichever directory
# happens to be iterated last.
for input_dir, dir_samples in dirs.items():
    print(dir_samples)
    # this function will load files (only the columns selected), apply filters
    # and compute a weight per event; later directories override earlier ones
    # for any repeated key
    events_dict.update(
        utils.load_samples(
            input_dir, dir_samples, year, filters=filters, columns=columns, variations=False
        )
    )

# Cutflow table with one row per sample group, filled after the preselection.
cutflow = pd.DataFrame(index=list(samples.keys()))
utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
cutflow

In [None]:
jet = 1  # second jet
sig_key = "hh4b"
bg_keys = ["qcd", "ttbar"]

# Event counts per class. The builtin `sum` is used because passing a
# generator to `np.sum` is deprecated, and the signal count is taken from
# `sig_key` so it cannot drift from the arrays built below.
num_sig = len(events_dict[sig_key])
num_bg = sum(len(events_dict[bg_key]) for bg_key in bg_keys)

# Truth labels: 1 for signal, 0 for background. The ordering (signal first,
# then backgrounds in `bg_keys` order) must match `weights` and `scores`.
y_true = np.concatenate([np.ones(num_sig), np.zeros(num_bg)])

# Per-event weights, in the same order as `y_true`.
weights = np.concatenate(
    [events_dict[sig_key]["finalWeight"]]
    + [events_dict[bg_key]["finalWeight"] for bg_key in bg_keys],
)

# Tagger score of the selected jet for every event, in the same order as
# `y_true`.
# NOTE(review): assumes the second-level column index of "bbFatJetPNetXbb"
# is the jet number — confirm against the loader's output.
scores = np.concatenate(
    [events_dict[sig_key]["bbFatJetPNetXbb"][jet]]
    + [events_dict[bg_key]["bbFatJetPNetXbb"][jet] for bg_key in bg_keys],
)

In [None]:
# Weighted ROC curve: false-positive rate, true-positive rate and the score
# thresholds at which they are evaluated.
fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=scores, sample_weight=weights)

# Bundle the three arrays under the keys passed on to the plotting helper.
roc = dict(fpr=fpr, tpr=tpr, thresholds=thresholds)

In [None]:
# Draw the ROC curve for the selected jet, display it, and pass the output
# directory and a name built from the 1-based jet number to the helper.
plotting.ROCCurve(
    roc,
    thresholds=[0.8, 0.9],
    xlim=[0, 0.8],
    ylim=[1e-5, 1],
    plot_dir=plot_dir,
    name=f"bbFatJet{jet+1}ROC",
    show=True,
)