In [None]:
import os
import pandas as pd
import uproot
import numpy as np
import pickle
import vector
from sklearn.metrics import roc_curve, auc
import scipy
from HH4b import plotting

Import libraries

In [None]:
import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

from HH4b import plotting

# Apply the mplhep "CMS" and "firamath" styles to all figures.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter with math-text exponents; it is not referenced again in the
# cells visible here.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Notebook-specific rcParams overrides.
plt.rcParams.update({"font.size": 12})
plt.rcParams["lines.linewidth"] = 2
plt.rcParams["grid.color"] = "#CCCCCC"
plt.rcParams["grid.linewidth"] = 0.5
plt.rcParams["figure.edgecolor"] = "none"
# NOTE(review): this second style call runs AFTER the rcParams overrides above, so any
# key that the CMS style sheet also defines is reset here — confirm the intended order.
plt.style.use(hep.style.CMS)

# Histogram axes used by the distribution plots below (40 regular bins each).
bdt_axis = hist.axis.Regular(40, 0, 1, name="bdt", label=r"BDT")
# NOTE: xbb_axis and xbb2_axis share the axis name "xbb" but cover different ranges
# (0.8-1 vs 0-1); fills for both use the keyword `xbb=`.
xbb_axis = hist.axis.Regular(40, 0.8, 1, name="xbb", label=r"Xbb")
xbb2_axis = hist.axis.Regular(40, 0.0, 1, name="xbb", label=r"Xbb 2")
msd_axis = hist.axis.Regular(40, 0, 250, name="msd", label=r"m$_{SD}$ GeV")
mreg_axis = hist.axis.Regular(40, 0, 250, name="mreg", label=r"m$_{reg}$ GeV")

# Growing string-category axis; each histogram is filled with the sample name as category.
cat_axis = hist.axis.StrCategory([], name="cat", growth=True)

# Jet pT bins, used below as half-open intervals [low, high).
# NOTE(review): no bin covers 500 <= pT < 600 — confirm whether this gap is intentional.
pt_bins = {
    "250-300": [250, 300],
    "300-400": [300, 400],
    "400-500": [400, 500],
    "600-Inf": [600, 2000],
}

# [min, max] jet-mass window: applied in the Run-3 selections further down and printed
# in several legend titles.
mass_cuts = [60, 250]
# mass_cuts = [100, 150]
# mass_cuts = [100, 150]

In [None]:
# Load IPython's autoreload extension in mode 2 so that edits to imported modules are
# picked up automatically, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Location of the Run-2 skims (relative to this notebook) and the year sub-directory.
MAIN_DIR = "../../../"
year = "2018"
path_to_dir_run2 = MAIN_DIR + "/../data/skimmer/20211209_regression/"
# path_to_dir_run2 = MAIN_DIR + "/../data/skimmer/20210712_regression/"

# File-name building blocks shared by the Run-2 ntuples.
_TESTING_SUFFIX = "_1pb_weighted_Testing_BDTs.root"
_FULL_SUFFIX = "_1pb_weighted_BDTs.root"
_HH_PREFIX = "GluGluToHHTo4B_node_cHHH"
_HH_TUNE = "_TuneCP5_PSWeights_13TeV-powheg-pythia8"
_QCD_HT_BINS = [
    "1000to1500",
    "1500to2000",
    "2000toInf",
    "200to300",
    "300to500",
    "500to700",
    "700to1000",
]
_TT_DECAYS = ["Hadronic", "SemiLeptonic"]

# Sample key -> list of ROOT files read for that sample.
samples_run2 = {
    "hh4b": [
        f"{_HH_PREFIX}1{_HH_TUNE}{_TESTING_SUFFIX}",
        # f"{_HH_PREFIX}1{_HH_TUNE}{_FULL_SUFFIX}",
    ],
    "hh4b-kl0": [f"{_HH_PREFIX}0{_HH_TUNE}{_FULL_SUFFIX}"],
    "hh4b-kl2p45": [f"{_HH_PREFIX}2p45{_HH_TUNE}{_FULL_SUFFIX}"],
    "qcd": [
        f"QCD_HT{ht_bin}_TuneCP5_13TeV-madgraphMLM-pythia8{_TESTING_SUFFIX}"
        for ht_bin in _QCD_HT_BINS
    ],
    "ttbar": [
        f"TTTo{decay}_TuneCP5_13TeV-powheg-pythia8-combined{_TESTING_SUFFIX}"
        for decay in _TT_DECAYS
    ],
}

# Branches read from the "Events" tree of every Run-2 ntuple.
columns = [
    "run",
    "luminosityBlock",
    "event",
    "fatJet1Pt",
    "fatJet1Eta",
    "fatJet1Phi",
    "fatJet1Mass",
    "fatJet1MassSD",
    "fatJet1MassRegressed",
    "fatJet1PNetXbb",
    "fatJet1PNetQCDb",
    "fatJet1PNetQCDbb",
    "fatJet1PNetQCDothers",
    "fatJet1Tau3OverTau2",
    "fatJet2Pt",
    "fatJet2Eta",
    "fatJet2Phi",
    "fatJet2Mass",
    "fatJet2MassSD",
    "fatJet2PNetXbb",
    "fatJet2PNetQCDb",
    "fatJet2PNetQCDbb",
    "fatJet2PNetQCDothers",
    "fatJet2Tau3OverTau2",
    "fatJet2MassRegressed",
    "fatJet1PtOverMHH",
    "fatJet2PtOverMHH",
    # "fatJet1MassSD_noJMS",
    "ptj2_over_ptj1",
    "hh_pt",
    "hh_eta",
    "hh_mass",
    "met",
    "genHiggs1Pt",
    "genHiggs1Eta",
    "genHiggs1Phi",
    "disc_qcd_and_ttbar_Run2_enhanced_v8p2",
    "xsecWeight",
    "weight",
    "genWeight",
    "triggerEffWeight",
    "puWeight",
]

# Sample key -> one DataFrame holding the rows of all files of that sample.
events_dict = {}
for key, datasets in samples_run2.items():
    dfs = [
        uproot.open(f"{path_to_dir_run2}/{year}/{dset}:Events").arrays(columns, library="pd")
        for dset in datasets
    ]
    # ignore_index=True gives every row a unique label. Without it each file contributes
    # its own index, so multi-file samples (qcd, ttbar) end up with duplicate row labels,
    # which makes any later index-aligned pandas operation ambiguous.
    events_dict[key] = pd.concat(dfs, ignore_index=True)

Check BDT performance

In [None]:
# One histogram per observable; the "cat" axis holds the sample name.
h_bdt = hist.Hist(bdt_axis, cat_axis)
h_xbb0 = hist.Hist(xbb_axis, cat_axis)
h_xbb1 = hist.Hist(xbb2_axis, cat_axis)
h_msd1 = hist.Hist(msd_axis, cat_axis)
h_mreg1 = hist.Hist(mreg_axis, cat_axis)

# Not applied to the weights below (luminosity scaling is currently disabled).
lumi = 136143

# Per-sample products reused by the ROC cells further down.
scores = {}  # BDT score (numpy array)
weights = {}  # event weight (pandas Series)
txbb1 = {}  # fatJet2 PNet Xbb score (pandas Series)
maskspt = {}  # {pt-bin key: boolean Series over the selected events}
for key in ["hh4b", "qcd", "ttbar"]:
    sample_events = events_dict[key]

    # Pre-selection. The [mass_cuts] window is intentionally NOT applied here.
    mask = (
        (sample_events.fatJet1Pt > 300)
        & (sample_events.fatJet2Pt > 300)
        & (sample_events.fatJet1MassSD >= 40)
        & (sample_events.fatJet1MassRegressed >= 50)
        & (sample_events.fatJet2MassRegressed >= 50)
    )
    # Filter the frame once and reuse it, instead of re-applying the mask for every column.
    selected = sample_events[mask]

    scores[key] = selected["disc_qcd_and_ttbar_Run2_enhanced_v8p2"].to_numpy()

    # Event weights as in the Run-2 analysis, except that the luminosity, trigger and
    # L1-prefiring factors are left out. Signal uses the "weight" branch, backgrounds
    # use "genWeight".
    gen_branch = "weight" if "hh4b" in key else "genWeight"
    weights[key] = selected["xsecWeight"] * selected[gen_branch] * selected["puWeight"]

    txbb1[key] = selected["fatJet2PNetXbb"]

    # weight = 1
    weight = weights[key]

    h_bdt.fill(bdt=scores[key], cat=key, weight=weight)
    h_xbb0.fill(xbb=selected["fatJet1PNetXbb"].to_numpy(), cat=key, weight=weight)
    h_msd1.fill(msd=selected["fatJet2MassSD"].to_numpy(), cat=key, weight=weight)
    h_mreg1.fill(mreg=selected["fatJet2MassRegressed"].to_numpy(), cat=key, weight=weight)
    h_xbb1.fill(xbb=selected["fatJet2PNetXbb"].to_numpy(), cat=key, weight=weight)

    # Boolean masks (over the selected events) for each fatJet2 pT bin, [low, high).
    pt1 = selected["fatJet2Pt"]
    maskspt[key] = {
        ptkey: (pt1 >= value[0]) & (pt1 < value[1]) for ptkey, value in pt_bins.items()
    }

In [None]:
colors = {"ttbar": "b", "hh4b": "k", "qcd": "r"}

hists = {
    "bdt": h_bdt,
    "xbb_0": h_xbb0,
    "xbb_1": h_xbb1,
    "msd_1": h_msd1,
    "mreg_1": h_mreg1,
}

# Legend title describing the selection that was actually applied when these histograms
# were filled (pT > 300, m_reg >= 50, m_SD(1) >= 40). The previous title printed the
# `mass_cuts` window, which the Run-2 selection does not use.
presel_legend_title = (
    r"FatJets $p_T^{(1,2)}>$300"
    + "\n"
    + r"m$_{reg}^{(1,2)}$ > 50 GeV"
    + "\n"
    + r"m$_{SD}^{(1)}$ > 40 GeV"
)

# One figure per observable, with the three samples overlaid as unit-normalised steps.
for hname, hhist in hists.items():
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    for key in ["hh4b", "qcd", "ttbar"]:
        hep.histplot(
            hhist[{"cat": key}],
            ax=ax,
            label=f"{key}",
            histtype="step",
            linewidth=1,
            color=colors[key],
            density=True,
        )

    ax.set_yscale("log")
    ax.legend(
        title=presel_legend_title,
        bbox_to_anchor=(1.03, 1),
        loc="upper left",
    )
    ax.set_ylabel("Density")
    ax.set_title("Pre-Selection")
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")

BDT ROC curve

In [None]:
# make BDT roc curve
tt_str = "alltt"
# tt_str = ""

# True: plot the ROC curves built with event weights; False: the unweighted ones.
apply_weights = True
# apply_weights = False

# BDT thresholds marked on the merged-background curve, and their marker colours.
plot_thresholds = [0.03, 0.11, 0.43]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3"]


def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx


# Weighted ROC curves (and their AUC), keyed by background.
fpr_dict = {}
auc_dict = {}
tpr_dict = {}
thresholds_dict = {}

# Unweighted ROC curves ("now" = no weights), keyed by background.
fpr_dict_now = {}
tpr_dict_now = {}
thresholds_dict_now = {}

for bkg in ["qcd", "ttbar", "merged"]:
    sig_jets_score = scores["hh4b"]
    if bkg == "merged":
        bkg_jets_score = np.concatenate((scores["qcd"], scores["ttbar"]))
        scores_weights = pd.concat([weights["hh4b"], weights["qcd"], weights["ttbar"]], axis=0)
    else:
        bkg_jets_score = scores[bkg]
        scores_weights = pd.concat([weights["hh4b"], weights[bkg]], axis=0)

    # Signal first, then background; labels follow the same order.
    scores_roc = np.concatenate((sig_jets_score, bkg_jets_score))
    scores_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bkg_jets_score)),
        ]
    )
    fpr_dict_now[bkg], tpr_dict_now[bkg], thresholds_dict_now[bkg] = roc_curve(
        scores_true, scores_roc
    )

    fpr, tpr, thresholds = roc_curve(scores_true, scores_roc, sample_weight=scores_weights)

    # The curve is re-sorted by FPR before integrating (presumably to guard against a
    # non-monotonic FPR caused by negative event weights — TODO confirm). A stable sort
    # keeps points with equal FPR in their original order, so the integral does not
    # depend on how ties happen to be arranged.
    sorted_index = np.argsort(fpr, kind="stable")
    fpr_sorted = np.array(fpr)[sorted_index]
    tpr_sorted = np.array(tpr)[sorted_index]
    # scipy.integrate.trapz has been removed from recent SciPy; trapezoid is its replacement.
    auc_dict[bkg] = scipy.integrate.trapezoid(y=tpr_sorted, x=fpr_sorted)
    fpr_dict[bkg] = fpr
    tpr_dict[bkg] = tpr
    thresholds_dict[bkg] = thresholds

bkg_colors = {**plotting.color_by_sample, "merged": "orange"}
legends = {**plotting.label_by_sample, "merged": "Total Background"}

# Use one consistent set of ROC arrays for both the curves and the threshold markers, so
# the markers always sit on the plotted curve (previously the markers were always taken
# from the weighted ROC, even when the unweighted curves were drawn).
if apply_weights:
    roc_fpr, roc_tpr, roc_thresholds = fpr_dict, tpr_dict, thresholds_dict
else:
    roc_fpr, roc_tpr, roc_thresholds = fpr_dict_now, tpr_dict_now, thresholds_dict_now

# Same figure twice: logarithmic and linear background-efficiency axis.
for log in [True, False]:
    fig, ax = plt.subplots(1, 1, figsize=(18, 12))

    for bkg in ["qcd", "ttbar", "merged"]:
        # NOTE: the AUC printed here is always the weighted one.
        print(bkg, auc_dict[bkg])

        ax.plot(
            roc_tpr[bkg],
            roc_fpr[bkg],
            linewidth=2,
            color=bkg_colors[bkg],
            label=legends[bkg],
        )

        if bkg == "merged":
            # threshold -> [[signal efficiency], [background efficiency]]
            pths = {th: [[], []] for th in plot_thresholds}

            for th in plot_thresholds:
                idx = find_nearest(roc_thresholds[bkg], th)
                print(idx, th, roc_tpr[bkg][idx], roc_fpr[bkg][idx])
                pths[th][0].append(roc_tpr[bkg][idx])
                pths[th][1].append(roc_fpr[bkg][idx])

            print(pths)

            for k, th in enumerate(plot_thresholds):
                ax.scatter(
                    *pths[th],
                    marker="o",
                    s=40,
                    label=rf"BDT > {th}",
                    color=th_colours[k],
                    zorder=100,
                )

                # Dashed guide lines from the marker down to each axis.
                ax.vlines(
                    x=pths[th][0],
                    ymin=0,
                    ymax=pths[th][1],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )

                ax.hlines(
                    y=pths[th][1],
                    xmin=0,
                    xmax=pths[th][0],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )

    ax.set_title("ggF HH4b BDT ROC Curve Run-2")
    legtitle = r"FatJet p$_T^{(1,2)}$ > 300 GeV" + "\n" + "Xbb$^{1}$>0.8"
    legtitle += "\n" + r"m$_{reg}^{(1,2)}$ > 50 GeV"
    legtitle += "\n" + r"m$_{SD}^{(1)}$ > 40 GeV"
    # legtitle += "\n" + r"m$_{reg}^{(0,1)}$:"+f"{mass_cuts} GeV"
    if apply_weights:
        legtitle += "\n" + "Weights applied"
    else:
        legtitle += "\n" + "NO Weights applied"
    ax.set_xlabel("Signal efficiency")
    ax.set_ylabel("Background efficiency")
    if log:
        ax.set_xlim([0.0, 0.6])
        ax.set_ylim([1e-5, 1e-1])
        ax.set_yscale("log")
    else:
        ax.set_xlim([0.0, 0.7])
        ax.set_ylim([0, 0.08])
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")
    ax.legend(
        title=legtitle,
        bbox_to_anchor=(1.03, 1),
        loc="upper left",
    )
    fig.tight_layout()
    weight_str = "weights" if apply_weights else "noweights"
    if log:
        fig.savefig(f"2018_bdt_roc_{weight_str}{tt_str}_log.png")
    else:
        fig.savefig(f"2018_bdt_roc_{weight_str}{tt_str}.png")

PNet ROC curve

In [None]:
# ROC curve for 2nd jet
# Relies on `scores`-style inputs from the histogram cell (txbb1, weights) and on
# `bkg_colors`, `legends` and `tt_str` defined in the BDT ROC cell.

# Thresholds marked on the merged-background curve, and their marker colours.
plot_thresholds = [0.03, 0.11, 0.43]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3"]


def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx


# Weighted ROC curves (and their AUC), keyed by background.
fpr_dict = {}
auc_dict = {}
tpr_dict = {}
thresholds_dict = {}

# Unweighted ROC curves, keyed by background.
fpr_dict_now = {}
tpr_dict_now = {}
thresholds_dict_now = {}

for bkg in ["qcd", "ttbar", "merged"]:
    sig_jets_score = txbb1["hh4b"]
    if bkg == "merged":
        bkg_jets_score = np.concatenate((txbb1["qcd"], txbb1["ttbar"]))
        scores_weights = pd.concat([weights["hh4b"], weights["qcd"], weights["ttbar"]], axis=0)
    else:
        bkg_jets_score = txbb1[bkg]
        scores_weights = pd.concat([weights["hh4b"], weights[bkg]], axis=0)

    # Signal first, then background; labels follow the same order.
    scores_roc = np.concatenate((sig_jets_score, bkg_jets_score))
    scores_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bkg_jets_score)),
        ]
    )
    fpr_dict_now[bkg], tpr_dict_now[bkg], thresholds_dict_now[bkg] = roc_curve(
        scores_true, scores_roc
    )

    fpr, tpr, thresholds = roc_curve(scores_true, scores_roc, sample_weight=scores_weights)

    # Re-sort by FPR before integrating; the stable sort keeps points with equal FPR in
    # their original order so the integral does not depend on tie arrangement.
    sorted_index = np.argsort(fpr, kind="stable")
    fpr_sorted = np.array(fpr)[sorted_index]
    tpr_sorted = np.array(tpr)[sorted_index]
    # scipy.integrate.trapz has been removed from recent SciPy; trapezoid is its replacement.
    auc_dict[bkg] = scipy.integrate.trapezoid(y=tpr_sorted, x=fpr_sorted)
    fpr_dict[bkg] = fpr
    tpr_dict[bkg] = tpr
    thresholds_dict[bkg] = thresholds

# Same figure twice: logarithmic and linear background-efficiency axis.
for log in [True, False]:

    fig, ax = plt.subplots(1, 1, figsize=(18, 12))
    for bkg in ["qcd", "ttbar", "merged"]:
        print(bkg, auc_dict[bkg])

        ax.plot(
            tpr_dict[bkg], fpr_dict[bkg], linewidth=2, color=bkg_colors[bkg], label=legends[bkg]
        )

        if bkg == "merged":
            # threshold -> [[signal efficiency], [background efficiency]]
            pths = {th: [[], []] for th in plot_thresholds}

            for th in plot_thresholds:
                idx = find_nearest(thresholds_dict[bkg], th)
                print(idx, th, tpr_dict[bkg][idx], fpr_dict[bkg][idx])
                pths[th][0].append(tpr_dict[bkg][idx])
                pths[th][1].append(fpr_dict[bkg][idx])

            print(pths)

            for k, th in enumerate(plot_thresholds):
                ax.scatter(
                    *pths[th],
                    marker="o",
                    s=40,
                    label=rf"TXbb > {th}",
                    color=th_colours[k],
                    zorder=100,
                )

                # Dashed guide lines from the marker down to each axis.
                ax.vlines(
                    x=pths[th][0],
                    ymin=0,
                    ymax=pths[th][1],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )

                ax.hlines(
                    y=pths[th][1],
                    xmin=0,
                    xmax=pths[th][0],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )
    ax.set_title("ggF HH4b PNet Xbb ROC Curve")
    # The legend lists the selection actually applied to txbb1/weights (m_reg >= 50 and
    # m_SD(1) >= 40), matching the BDT ROC figure; the `mass_cuts` window is not part of
    # the Run-2 selection.
    legtitle = r"FatJet p$_T^{(1,2)}$ > 300 GeV" + "\n" + "Xbb$^{1}$>0.8"
    legtitle += "\n" + r"m$_{reg}^{(1,2)}$ > 50 GeV"
    legtitle += "\n" + r"m$_{SD}^{(1)}$ > 40 GeV"
    ax.set_xlabel("Signal efficiency")
    ax.set_ylabel("Background efficiency")
    if log:
        ax.set_xlim([0.0, 0.6])
        ax.set_ylim([1e-5, 1e-1])
        ax.set_yscale("log")
    else:
        ax.set_xlim([0.0, 0.7])
        ax.set_ylim([0, 0.08])
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")
    ax.legend(
        title=legtitle,
        bbox_to_anchor=(1.03, 1),
        loc="upper left",
    )
    fig.tight_layout()
    if log:
        fig.savefig(f"2018_pnetxbb2_roc_weights{tt_str}_log.png")
    else:
        fig.savefig(f"2018_pnetxbb2_roc_weights{tt_str}.png")

In [None]:
# ROCs for pt bins
# j2

# {pt-bin key: {background: array}}; a bin without signal events keeps an empty dict.
fpr_pt_dict = {}
tpr_pt_dict = {}
thresholds_pt_dict = {}

for ptkey in pt_bins:
    fpr_pt_dict[ptkey] = {}
    tpr_pt_dict[ptkey] = {}
    thresholds_pt_dict[ptkey] = {}

    # Per-sample pT-bin masks do not depend on the background choice, so look them up once.
    hh4b_mask = maskspt["hh4b"][ptkey]
    qcd_mask = maskspt["qcd"][ptkey]
    tt_mask = maskspt["ttbar"][ptkey]

    # No signal in this bin: there is no ROC curve to build.
    if not np.any(hh4b_mask):
        continue

    sig_jets_score = txbb1["hh4b"][hh4b_mask]

    for bkg in ["qcd", "ttbar", "merged"]:
        if bkg == "merged":
            bkg_jets_score = np.concatenate((txbb1["qcd"][qcd_mask], txbb1["ttbar"][tt_mask]))
            scores_weights = pd.concat(
                [
                    weights["hh4b"][hh4b_mask],
                    weights["qcd"][qcd_mask],
                    weights["ttbar"][tt_mask],
                ],
                axis=0,
            )
        else:
            bkg_mask = qcd_mask if bkg == "qcd" else tt_mask
            bkg_jets_score = txbb1[bkg][bkg_mask]
            scores_weights = pd.concat(
                [weights["hh4b"][hh4b_mask], weights[bkg][bkg_mask]], axis=0
            )

        # Signal first, then background; labels follow the same order.
        scores_roc = np.concatenate((sig_jets_score, bkg_jets_score))
        scores_true = np.concatenate(
            [
                np.ones(len(sig_jets_score)),
                np.zeros(len(bkg_jets_score)),
            ]
        )
        fpr, tpr, thresholds = roc_curve(scores_true, scores_roc, sample_weight=scores_weights)

        fpr_pt_dict[ptkey][bkg] = fpr
        tpr_pt_dict[ptkey][bkg] = tpr
        thresholds_pt_dict[ptkey][bkg] = thresholds

In [None]:
# Display the per-pT-bin false-positive-rate arrays computed in the previous cell.
fpr_pt_dict

Compare with Run-3

In [None]:
from HH4b.utils import format_columns, load_samples

# Run-3 samples to load, keyed by year and then by sample label.
samples = {
    "2022EE": {
        "qcd": [
            f"QCD_HT-{ht_bin}"
            for ht_bin in [
                "400to600",
                "600to800",
                "800to1000",
                "1000to1200",
                "1200to1500",
                "1500to2000",
                "2000",
            ]
        ],
        "ttbar": ["TTto4Q", "TTtoLNu2Q"],
        "ttlep": ["TTtoLNu2Q"],
        "hh4b": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    }
}
year = "2022EE"
data_path = "../../../../data/skimmer/24Apr19LegacyFixes_v12_private_signal"
dirs = {data_path: samples}
mass_key = "bbFatJetPNetMassLegacy"
# (column name, number of sub-columns) pairs handed to format_columns.
load_columns = [
    ("weight", 1),
    ("event", 1),
    ("MET_pt", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetTau3OverTau2", 2),
    ("bbFatJetPNetTXbb", 2),
    ("bbFatJetPNetMass", 2),
    # "Legacy" tagger outputs
    ("bbFatJetPNetTXbbLegacy", 2),
    ("bbFatJetPNetPXbbLegacy", 2),
    ("bbFatJetPNetPQCDbLegacy", 2),
    ("bbFatJetPNetPQCDbbLegacy", 2),
    ("bbFatJetPNetPQCDothersLegacy", 2),
    ("bbFatJetPNetMassLegacy", 2),
]

# NOTE: this rebinds `events_dict`, replacing the Run-2 frames loaded earlier.
events_dict = {}
print(f"Loading samples from {year}")
for input_dir, dir_samples in dirs.items():
    events_dict.update(
        load_samples(
            input_dir,
            dir_samples[year],
            year,
            filters=None,
            variations=False,
            reorder_txbb=True,
            txbb="bbFatJetPNetTXbbLegacy",
            columns=format_columns(load_columns),
        )
    )

# Weighted ROC curves, keyed by background.
fprs_legacy = {}
tprs_legacy = {}
thresholds_legacy = {}

# Unweighted ROC curves, keyed by background.
fprs_legacy_now = {}
tprs_legacy_now = {}
thresholds_legacy_now = {}

# Selection: Xbb >= 0.8 on the first bb jet, pT > 300 on both jets, and both legacy
# PNet masses inside the [mass_cuts] window.
for key in events_dict:
    sample_events = events_dict[key]
    jet_pt = sample_events["bbFatJetPt"].to_numpy()
    jet_mass = sample_events["bbFatJetPNetMassLegacy"].to_numpy()

    # guarantee that Xbb>0.8 is applied to first bb jet
    msd_0 = sample_events["bbFatJetMsd"].to_numpy()[:, 0]  # only needed by the optional cut below
    xbb_0 = sample_events["bbFatJetPNetTXbbLegacy"].to_numpy()[:, 0]
    pt_0 = jet_pt[:, 0]
    pt_1 = jet_pt[:, 1]
    mass_0 = jet_mass[:, 0]
    mass_1 = jet_mass[:, 1]
    mask = np.logical_and.reduce(
        [
            xbb_0 >= 0.8,
            pt_0 > 300,
            pt_1 > 300,
            mass_0 >= mass_cuts[0],
            mass_1 >= mass_cuts[0],
            mass_0 <= mass_cuts[1],
            mass_1 <= mass_cuts[1],
        ]
    )
    # mask = mask & (msd_0 > 40)
    events_dict[key] = sample_events[mask]

# Per-sample weights, second-jet Xbb scores and second-jet pT-bin masks.
pnet_xbb_str = "bbFatJetPNetTXbbLegacy"
legacy_weights = {}
xbb_dict = {}
pt1_dict = {}
for key in events_dict:
    legacy_weights[key] = events_dict[key]["finalWeight"]
    xbb_dict[key] = events_dict[key][pnet_xbb_str].to_numpy()[:, 1]
    pt1 = events_dict[key]["bbFatJetPt"].to_numpy()[:, 1]
    pt1_dict[key] = {ptkey: (pt1 >= val[0]) & (pt1 < val[1]) for ptkey, val in pt_bins.items()}

# ROC curves of signal against each background and against their sum ("merged").
for bkg in ["qcd", "ttbar", "merged"]:
    bkg_keys = ["qcd", "ttbar"] if bkg == "merged" else [bkg]

    sig_jets_score = xbb_dict["hh4b"]
    bkg_jets_score = np.concatenate([xbb_dict[bkg_key] for bkg_key in bkg_keys])
    # Signal first, then background; labels and weights follow the same order.
    scores_roc = np.concatenate([sig_jets_score, bkg_jets_score])
    scores_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bkg_jets_score)),
        ]
    )
    scores_weights = np.concatenate(
        [legacy_weights["hh4b"]] + [legacy_weights[bkg_key] for bkg_key in bkg_keys]
    )

    fprs_legacy_now[bkg], tprs_legacy_now[bkg], thresholds_legacy_now[bkg] = roc_curve(
        scores_true, scores_roc
    )
    fprs_legacy[bkg], tprs_legacy[bkg], thresholds_legacy[bkg] = roc_curve(
        scores_true, scores_roc, sample_weight=scores_weights
    )

In [None]:
# Display the [min, max] mass window used by the Run-3 selection above.
mass_cuts

In [None]:
# Weighted ROC curves per second-jet pT bin: {pt-bin key: {background: array}}.
# A bin without signal events keeps an empty dict.
fpr_pt_dict_run3 = {}
tpr_pt_dict_run3 = {}
thresholds_pt_dict_run3 = {}

for ptkey in pt_bins:
    hh4b_mask = pt1_dict["hh4b"][ptkey]
    qcd_mask = pt1_dict["qcd"][ptkey]
    tt_mask = pt1_dict["ttbar"][ptkey]

    fpr_pt_dict_run3[ptkey] = {}
    tpr_pt_dict_run3[ptkey] = {}
    thresholds_pt_dict_run3[ptkey] = {}

    if not np.any(hh4b_mask):
        continue

    bkg_masks = {"qcd": qcd_mask, "ttbar": tt_mask}
    for bkg in ["qcd", "ttbar", "merged"]:
        # "merged" stacks both backgrounds; otherwise use the single named background.
        bkg_keys = ["qcd", "ttbar"] if bkg == "merged" else [bkg]

        sig_jets_score = xbb_dict["hh4b"][hh4b_mask]
        bkg_jets_score = np.concatenate(
            [xbb_dict[bkg_key][bkg_masks[bkg_key]] for bkg_key in bkg_keys]
        )
        # Signal first, then background; labels and weights follow the same order.
        scores_roc = np.concatenate([sig_jets_score, bkg_jets_score])
        scores_true = np.concatenate(
            [
                np.ones(len(sig_jets_score)),
                np.zeros(len(bkg_jets_score)),
            ]
        )
        scores_weights = np.concatenate(
            [legacy_weights["hh4b"][hh4b_mask]]
            + [legacy_weights[bkg_key][bkg_masks[bkg_key]] for bkg_key in bkg_keys]
        )
        roc_fpr, roc_tpr, roc_thr = roc_curve(
            scores_true, scores_roc, sample_weight=scores_weights
        )
        fpr_pt_dict_run3[ptkey][bkg] = roc_fpr
        tpr_pt_dict_run3[ptkey][bkg] = roc_tpr
        thresholds_pt_dict_run3[ptkey][bkg] = roc_thr

Plot the Xbb score distributions (2022EE)

In [None]:
# Xbb score histograms (first and second bb jet) for the samples held in events_dict.
h_xbb0 = hist.Hist(xbb_axis, cat_axis)
h_xbb1 = hist.Hist(xbb2_axis, cat_axis)

for key in ["hh4b", "qcd", "ttbar"]:
    # Only the Xbb scores and the event weight are needed here; the mass, pT and
    # soft-drop arrays that used to be extracted alongside them were never used.
    txbb = events_dict[key]["bbFatJetPNetTXbbLegacy"].to_numpy()
    xbb_0 = txbb[:, 0]
    xbb_1 = txbb[:, 1]
    weight = events_dict[key]["finalWeight"].to_numpy()

    h_xbb0.fill(xbb=xbb_0, cat=key, weight=weight)
    h_xbb1.fill(xbb=xbb_1, cat=key, weight=weight)

In [None]:
# Display which sample keys are currently loaded in events_dict.
events_dict.keys()

In [None]:
# Line colour per sample.
colors = dict(ttbar="b", hh4b="k", qcd="r")

# Histograms to draw, with the x-axis label and legend label used for each.
hists = dict(xbb_0=h_xbb0, xbb_1=h_xbb1)
xlabels = dict(xbb_0=r"Jet 1 Xbb", xbb_1=r"Jet 2 Xbb")
label_by_sample = dict(
    qcd="Multijet",
    hh4b=r"ggF HH4b",
    ttbar=r"$t\bar{t}$ + Jets",
)

# One figure per histogram: the three samples overlaid as unit-normalised steps, saved
# as 2022EE_<name>.png.
for hname, hhist in hists.items():
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    for key in ("hh4b", "qcd", "ttbar"):
        sample_hist = hhist[{"cat": key}]
        hep.histplot(
            sample_hist,
            ax=ax,
            label=label_by_sample[key],
            histtype="step",
            linewidth=1,
            color=colors[key],
            density=True,
        )

    ax.set_yscale("log")
    # Selection behind these histograms:
    # (xbb_0 >= 0.8) & (pt_0 > 300) & (pt_1 > 300) & (mass_0 >= mass_cuts[0]) & (mass_1 >= mass_cuts[0]) & (mass_0 <= mass_cuts[1]) & (mass_1 <= mass_cuts[1])
    legend_title = r"FatJets $p_T^{(1,2)}>$300" + "\n" + "m$_{reg}^{(1,2)}$:" + f"{mass_cuts} GeV"
    ax.legend(title=legend_title, bbox_to_anchor=(1.03, 1), loc="upper left")
    ax.set_ylabel("Density")
    ax.set_xlabel(xlabels[hname])
    ax.set_title("Pre-Selection")
    for axis in (ax.xaxis, ax.yaxis):
        axis.grid(True, which="major")
    fig.tight_layout()
    fig.savefig(f"2022EE_{hname}.png")

In [None]:
# Run-3 samples to load, keyed by year and then by sample label.
samples = {
    "2022EE": {
        "qcd": [
            f"QCD_HT-{ht_bin}"
            for ht_bin in [
                "400to600",
                "600to800",
                "800to1000",
                "1000to1200",
                "1200to1500",
                "1500to2000",
                "2000",
            ]
        ],
        "ttbar": ["TTto4Q", "TTtoLNu2Q"],
        "ttlep": ["TTtoLNu2Q"],
        "hh4b": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    }
}
year = "2022EE"
data_path = "../../../../data/skimmer/24Apr19LegacyFixes_v12_private_signal"
dirs = {data_path: samples}
mass_key = "bbFatJetPNetMassLegacy"
# (column name, number of sub-columns) pairs handed to format_columns.
load_columns = [
    ("weight", 1),
    ("event", 1),
    ("MET_pt", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetTau3OverTau2", 2),
    ("bbFatJetPNetTXbb", 2),
    ("bbFatJetPNetMass", 2),
]

# NOTE: this rebinds `events_dict`, replacing the frames of the legacy-tagger selection.
events_dict = {}
print(f"Loading samples from {year}")
for input_dir, dir_samples in dirs.items():
    loaded_events = load_samples(
        input_dir,
        dir_samples[year],
        year,
        filters=None,
        variations=False,
        reorder_txbb=True,
        txbb="bbFatJetPNetTXbb",
        columns=format_columns(load_columns),
    )

    # reorder: sort the two bb jets of each event by descending TXbb and apply the same
    # permutation to every bbFatJet* column.
    for sample, events in loaded_events.items():
        bbord = np.argsort(events["bbFatJetPNetTXbb"].to_numpy(), axis=1)[:, ::-1]
        bb_columns = [
            name
            for name in np.unique(events.columns.get_level_values(0))
            if name.startswith("bbFatJet")
        ]
        for key in bb_columns:
            events[key] = np.take_along_axis(events[key].to_numpy(), bbord, axis=1)
        events_dict[sample] = events

print(events_dict.keys())
# Weighted ROC curves, keyed by background.
fprs_v12 = {}
tprs_v12 = {}
thresholds_v12 = {}
# Unweighted ROC curves, keyed by background.
fprs_v12_now = {}
tprs_v12_now = {}
thresholds_v12_now = {}

# Second-jet pT-bin masks over the selected events, per sample.
pt1_dict_v12 = {}

for key in events_dict:
    sample_events = events_dict[key]
    jet_msd = sample_events["bbFatJetMsd"].to_numpy()
    jet_pt = sample_events["bbFatJetPt"].to_numpy()
    jet_mass = sample_events["bbFatJetPNetMass"].to_numpy()

    # guarantee that Xbb>0.8 is applied to first bb jet
    xbb_0 = sample_events["bbFatJetPNetTXbb"].to_numpy()[:, 0]
    msd_0 = jet_msd[:, 0]
    msd_1 = jet_msd[:, 1]
    pt_0 = jet_pt[:, 0]
    pt_1 = jet_pt[:, 1]
    mass_0 = jet_mass[:, 0]
    mass_1 = jet_mass[:, 1]
    mask = np.logical_and.reduce(
        [
            xbb_0 >= 0.8,
            pt_0 > 300,
            pt_1 > 300,
            mass_0 >= mass_cuts[0],
            mass_1 >= mass_cuts[0],
            mass_0 <= mass_cuts[1],
            mass_1 <= mass_cuts[1],
        ]
    )
    # apply msd cut
    mask = mask & (msd_0 > 40) & (msd_1 > 40)
    events_dict[key] = sample_events[mask]
    # ptbin
    pt1 = events_dict[key]["bbFatJetPt"].to_numpy()[:, 1]
    pt1_dict_v12[key] = {
        ptkey: (pt1 >= val[0]) & (pt1 < val[1]) for ptkey, val in pt_bins.items()
    }

# Per-sample weights and second-jet Xbb scores.
# NOTE: this rebinds `xbb_dict`, replacing the legacy-tagger scores.
pnet_xbb_str = "bbFatJetPNetTXbb"
v12_weights = {}
xbb_dict = {}
for key in events_dict:
    v12_weights[key] = events_dict[key]["finalWeight"]
    xbb_dict[key] = events_dict[key][pnet_xbb_str].to_numpy()[:, 1]

# ROC curves of signal against each background and against their sum ("merged").
for bkg in ["qcd", "ttbar", "merged"]:
    bkg_keys = ["qcd", "ttbar"] if bkg == "merged" else [bkg]

    sig_jets_score = xbb_dict["hh4b"]
    bkg_jets_score = np.concatenate([xbb_dict[bkg_key] for bkg_key in bkg_keys])
    # Signal first, then background; labels and weights follow the same order.
    scores_roc = np.concatenate([sig_jets_score, bkg_jets_score])
    scores_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bkg_jets_score)),
        ]
    )
    scores_weights = np.concatenate(
        [v12_weights["hh4b"]] + [v12_weights[bkg_key] for bkg_key in bkg_keys]
    )

    fprs_v12_now[bkg], tprs_v12_now[bkg], thresholds_v12_now[bkg] = roc_curve(
        scores_true, scores_roc
    )
    fprs_v12[bkg], tprs_v12[bkg], thresholds_v12[bkg] = roc_curve(
        scores_true, scores_roc, sample_weight=scores_weights
    )

In [None]:
# Weighted ROC curves of the v12 TXbb score, one set per pT bin and background.
fpr_pt_dict_v12 = {}
tpr_pt_dict_v12 = {}
thresholds_pt_dict_v12 = {}

for ptkey, ptsel in pt_bins.items():
    # boolean per-event masks selecting this pT bin in each sample
    hh4b_mask = pt1_dict_v12["hh4b"][ptkey]
    qcd_mask = pt1_dict_v12["qcd"][ptkey]
    tt_mask = pt1_dict_v12["ttbar"][ptkey]
    bin_masks = {"hh4b": hh4b_mask, "qcd": qcd_mask, "ttbar": tt_mask}

    fpr_pt_dict_v12[ptkey] = {}
    tpr_pt_dict_v12[ptkey] = {}
    thresholds_pt_dict_v12[ptkey] = {}

    # bins without any signal event keep their empty dictionaries
    if not np.any(hh4b_mask):
        continue

    for bkg in ["qcd", "ttbar", "merged"]:
        bkg_samples = ["qcd", "ttbar"] if bkg == "merged" else [bkg]

        sig_jets_score = xbb_dict["hh4b"][hh4b_mask]
        bkg_jets_score = np.concatenate(
            [xbb_dict[name][bin_masks[name]] for name in bkg_samples]
        )
        scores_roc = np.concatenate([sig_jets_score, bkg_jets_score])

        # truth labels: 1 for the leading signal entries, 0 for the background entries
        scores_true = np.zeros(len(scores_roc))
        scores_true[: len(sig_jets_score)] = 1

        scores_weights = np.concatenate(
            [v12_weights[name][bin_masks[name]] for name in ["hh4b", *bkg_samples]]
        )

        (
            fpr_pt_dict_v12[ptkey][bkg],
            tpr_pt_dict_v12[ptkey][bkg],
            thresholds_pt_dict_v12[ptkey][bkg],
        ) = roc_curve(scores_true, scores_roc, sample_weight=scores_weights)

In [None]:
# Inspect the per-pT-bin v12 true-positive rates (note: this echoes the full arrays).
tpr_pt_dict_v12

In [None]:
apply_weights = True
# apply_weights = False


def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()


# TXbb working points marked on the merged-background curve, with their colours
plot_thresholds = [0.9, 0.98]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3"]

# One figure with a logarithmic y-axis and one with a linear y-axis.
for log in [True, False]:

    fig, ax = plt.subplots(1, 1, figsize=(18, 12))
    for bkg in ["qcd", "ttbar", "merged"]:
        print(bkg, auc_dict[bkg])

        # (signal eff., background eff., extra line kwargs, legend label) in drawing order
        if apply_weights:
            curves = [
                (tpr_dict[bkg], fpr_dict[bkg], {}, legends[bkg] + " Run-2"),
                (tprs_legacy[bkg], fprs_legacy[bkg], {"linestyle": "dashed"}, legends[bkg]),
            ]
        else:
            curves = [
                (tpr_dict_now[bkg], fpr_dict_now[bkg], {}, legends[bkg] + " Run-2"),
                (
                    tprs_legacy_now[bkg],
                    fprs_legacy_now[bkg],
                    {"linestyle": "dashed"},
                    legends[bkg],
                ),
            ]

        for sig_eff, bkg_eff, line_style, curve_label in curves:
            ax.plot(
                sig_eff,
                bkg_eff,
                linewidth=2,
                color=bkg_colors[bkg],
                label=curve_label,
                **line_style,
            )

        # working-point markers are drawn for the merged background only
        if bkg != "merged":
            continue

        # NOTE: the markers are always read from the weighted Run-2 curve
        # (tpr_dict / fpr_dict / thresholds_dict), whatever apply_weights is.
        pths = {th: [[], []] for th in plot_thresholds}
        for th in plot_thresholds:
            idx = find_nearest(thresholds_dict[bkg], th)
            print(idx, th, tpr_dict[bkg][idx], fpr_dict[bkg][idx])
            pths[th][0].append(tpr_dict[bkg][idx])
            pths[th][1].append(fpr_dict[bkg][idx])

        print(pths)

        for k, th in enumerate(plot_thresholds):
            marker_tpr, marker_fpr = pths[th]
            ax.scatter(
                marker_tpr,
                marker_fpr,
                marker="o",
                s=40,
                label=rf"TXbb > {th}",
                color=th_colours[k],
                zorder=100,
            )
            # dashed guide lines from the marker down to both axes
            ax.vlines(
                x=marker_tpr,
                ymin=0,
                ymax=marker_fpr,
                color=th_colours[k],
                linestyles="dashed",
                alpha=0.5,
            )
            ax.hlines(
                y=marker_fpr,
                xmin=0,
                xmax=marker_tpr,
                color=th_colours[k],
                linestyles="dashed",
                alpha=0.5,
            )

    ax.set_title("ggF HH4b PNet Xbb Run-2 vs Run-3 Legacy")

    # the legend title summarises the selection behind the curves
    # (optional extra line: r"$m_{SD}^{0}$ > 40 GeV")
    legtitle = "\n".join(
        [
            r"FatJet p$_T^{(1,2)}$ > 300 GeV",
            "Xbb$^{1}$>0.8",
            r"m$_{reg}^{(1,2)}$:" + f"{mass_cuts} GeV",
            "Weights applied" if apply_weights else "NO Weights applied",
        ]
    )

    ax.set_xlabel("Signal efficiency")
    ax.set_ylabel("Background efficiency")
    if log:
        ax.set_xlim([0.0, 0.6])
        ax.set_ylim([1e-5, 1e-1])
        ax.set_yscale("log")
    else:
        ax.set_xlim([0.0, 0.7])
        ax.set_ylim([0, 0.08])
    for axis in (ax.xaxis, ax.yaxis):
        axis.grid(True, which="major")
    ax.legend(
        title=legtitle,
        bbox_to_anchor=(1.03, 1),
        loc="upper left",
    )
    fig.tight_layout()

    weight_str = "weights" if apply_weights else "noweights"
    out_name = (
        f"2018_pnetxbb2_roc_{weight_str}{tt_str}_log.png"
        if log
        else f"2018_pnetxbb2_roc_{weight_str}{tt_str}.png"
    )
    fig.savefig(out_name)

In [None]:
# Toggle between weighted (True) and unweighted (False) ROC curves.
# This used to be spelled ``apply_weight`` while the code below reads
# ``apply_weights``, so the switch had no effect and the cell only ran when an
# earlier cell had already defined ``apply_weights``.
apply_weights = True


def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()


# TXbb working points that can be marked on a merged-background curve
plot_thresholds = [0.9, 0.98]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3"]

for log in [True]:

    fig, ax = plt.subplots(1, 1, figsize=(18, 12))
    for bkg in ["qcd", "ttbar"]:
        print(bkg, auc_dict[bkg])

        # (signal eff., background eff., extra line kwargs, legend suffix) in drawing order
        if apply_weights:
            curves = [
                (tpr_dict, fpr_dict, {}, " BTV-22-001 PNet on Run-2"),
                (tprs_legacy, fprs_legacy, {"linestyle": "dashed"}, " BTV-22-001 PNet on Run-3"),
                # weighted v12 curve currently disabled:
                # (tprs_v12, fprs_v12, {"linestyle": "dashdot"}, " v12 PNet on Run-3"),
            ]
        else:
            curves = [
                (tpr_dict_now, fpr_dict_now, {}, " BTV-22-001 PNet on Run-2"),
                (
                    tprs_legacy_now,
                    fprs_legacy_now,
                    {"linestyle": "dashed"},
                    " BTV-22-001 PNet on Run-3",
                ),
                (tprs_v12_now, fprs_v12_now, {"linestyle": "dashdot"}, " v12 PNet on Run-3"),
            ]

        for sig_eff, bkg_eff, line_style, suffix in curves:
            ax.plot(
                sig_eff[bkg],
                bkg_eff[bkg],
                linewidth=2,
                color=bkg_colors[bkg],
                label=legends[bkg] + suffix,
                **line_style,
            )

        # Working-point markers: only reached if "merged" is added to the
        # background list above; they are read from the weighted Run-2 curve.
        if bkg == "merged":
            pths = {th: [[], []] for th in plot_thresholds}

            for th in plot_thresholds:
                idx = find_nearest(thresholds_dict[bkg], th)
                print(idx, th, tpr_dict[bkg][idx], fpr_dict[bkg][idx])
                pths[th][0].append(tpr_dict[bkg][idx])
                pths[th][1].append(fpr_dict[bkg][idx])

            print(pths)

            for k, th in enumerate(plot_thresholds):
                ax.scatter(
                    *pths[th],
                    marker="o",
                    s=40,
                    label=rf"TXbb > {th}",
                    color=th_colours[k],
                    zorder=100,
                )

                ax.vlines(
                    x=pths[th][0],
                    ymin=0,
                    ymax=pths[th][1],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )

                ax.hlines(
                    y=pths[th][1],
                    xmin=0,
                    xmax=pths[th][0],
                    color=th_colours[k],
                    linestyles="dashed",
                    alpha=0.5,
                )

    ax.set_title("ggF HH4b PNet Xbb Run-2 vs Run-3")

    # the legend title summarises the selection behind the curves
    legtitle = r"FatJet p$_T^{(1,2)}$ > 300 GeV" + "\n" + "Xbb$^{1}$>0.8"
    legtitle += "\n" + r"m$_{reg}^{(1,2)}$:" + f"{mass_cuts} GeV"
    # legtitle += "\n" + r"+ $m_{SD}^{(0)}>40$ GeV"
    # legtitle += "\n" + r"+ v12 $m_{SD}^{(0,1)}>40$ GeV"
    legtitle += "\n" + ("Weights applied" if apply_weights else "NO Weights applied")

    ax.set_xlabel("Signal efficiency")
    ax.set_ylabel("Background efficiency")
    if log:
        ax.set_xlim([0.0, 0.6])
        ax.set_ylim([1e-5, 1e-1])
        ax.set_yscale("log")
    else:
        ax.set_xlim([0.0, 0.7])
        ax.set_ylim([0, 0.08])
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")
    ax.legend(
        title=legtitle,
        fontsize=15,
        bbox_to_anchor=(1.03, 1),
        loc="upper left",
    )
    fig.tight_layout()
    weight_str = "weights" if apply_weights else "noweights"
    if log:
        fig.savefig(f"2018_vsRun3_pnetxbb2_roc_{weight_str}{tt_str}_log.png")
    else:
        fig.savefig(f"2018_vsRun3_pnetxbb2_roc_{weight_str}{tt_str}.png")

PT comparison

In [None]:
# One ROC figure per pT bin, comparing the Run-2, Run-3 legacy and v12 curves
# for the QCD and ttbar backgrounds.
for log in [True]:

    # running index of the figures actually produced (used in the file name)
    i = 0
    for ptkey, ptsel in pt_bins.items():

        # skip bins for which no Run-2 ROC curve was computed
        if tpr_pt_dict[ptkey] == {}:
            continue

        # (tpr per bin, fpr per bin, extra line kwargs, legend suffix) in drawing order
        curve_sets = [
            (tpr_pt_dict, fpr_pt_dict, {}, " Run-2"),
            (tpr_pt_dict_run3, fpr_pt_dict_run3, {"linestyle": "dashed"}, " Legacy"),
            (tpr_pt_dict_v12, fpr_pt_dict_v12, {"linestyle": "dashdot"}, " v12"),
        ]

        fig, ax = plt.subplots(1, 1, figsize=(18, 12))
        for bkg in ["qcd", "ttbar"]:
            for tprs, fprs, line_style, suffix in curve_sets:
                # A sample without signal events in this bin has an empty entry
                # (e.g. tpr_pt_dict_v12[ptkey] == {}); skip that curve instead of
                # failing with a KeyError.
                if bkg not in tprs[ptkey]:
                    continue
                ax.plot(
                    tprs[ptkey][bkg],
                    fprs[ptkey][bkg],
                    linewidth=2,
                    color=bkg_colors[bkg],
                    label=legends[bkg] + suffix,
                    **line_style,
                )

        ax.set_title(f"PNet Xbb Run-2 vs Run-3 {pt_bins[ptkey]}")

        # the legend title summarises the selection behind the curves
        legtitle = r"FatJet p$_T^{(1,2)}$" + f": {pt_bins[ptkey]} GeV"
        legtitle += "\n" + "Xbb$^{1}$>0.8"
        legtitle += "\n" + r"m$_{reg}^{(1,2)}$:" + f"{mass_cuts} GeV"
        # the v12 selection requires msd > 40 GeV on both fat jets
        legtitle += "\n" + r"+ v12 $m_{SD}^{(1,2)}>40$ GeV"
        legtitle += "\n" + "Weights applied"

        ax.set_xlabel("Signal efficiency")
        ax.set_ylabel("Background efficiency")
        if log:
            ax.set_xlim([0.0, 0.6])
            ax.set_ylim([1e-5, 1e-1])
            ax.set_yscale("log")
        else:
            ax.set_xlim([0.0, 0.7])
            ax.set_ylim([0, 0.08])
        ax.xaxis.grid(True, which="major")
        ax.yaxis.grid(True, which="major")
        ax.legend(
            title=legtitle,
            bbox_to_anchor=(1.03, 1),
            loc="upper left",
        )
        fig.tight_layout()
        weight_str = "weights"
        fig.savefig(f"2018_vsRun3_pnetxbb2_roc_{weight_str}{tt_str}_log_ptbin{i}.png")
        i += 1

In [None]:
# Inspect the per-pT-bin true-positive rates in tpr_pt_dict_run3 (filled outside this cell).
tpr_pt_dict_run3

Adding the Xbb cut to the efficiency

In [None]:
# ROC curves of the BDT score with a TXbb requirement folded in: events failing
# the TXbb cut get their score set to 0, so they can never pass a positive threshold.
# NOTE: ``scores``, ``weights`` and ``txbb1`` are not defined in this cell; in
# this notebook they are filled by the histogram cell further down, which has
# to be run first. ``auc_dict`` must already exist as well.
fpr_dict = {}
tpr_dict = {}
thresholds_dict = {}

# TXbb requirement applied on top of the BDT score
txbb_cut = 0.98


def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()


plt.figure(figsize=(6, 6))
for bkg in ["qcd", "ttbar", "merged"]:
    bkg_samples = ["qcd", "ttbar"] if bkg == "merged" else [bkg]
    roc_samples = ["hh4b", *bkg_samples]

    sig_jets_score = scores["hh4b"]
    bkg_jets_score = np.concatenate([scores[name] for name in bkg_samples])
    scores_weights = pd.concat([weights[name] for name in roc_samples], axis=0)
    scores_txbb = pd.concat([txbb1[name] for name in roc_samples], axis=0)

    scores_roc = np.concatenate((sig_jets_score, bkg_jets_score))
    # truth labels: 1 for signal entries, 0 for background entries
    scores_true = np.concatenate(
        [
            np.ones(len(sig_jets_score)),
            np.zeros(len(bkg_jets_score)),
        ]
    )

    # Convert the pandas mask to a plain boolean array before indexing the
    # NumPy score array, so the selection is purely positional.
    fails_txbb = (scores_txbb < txbb_cut).to_numpy()
    scores_roc_thresholded = scores_roc.copy()
    scores_roc_thresholded[fails_txbb] = 0

    fpr, tpr, thresholds = roc_curve(
        scores_true, scores_roc_thresholded, sample_weight=scores_weights
    )
    fpr_dict[bkg] = fpr
    tpr_dict[bkg] = tpr
    thresholds_dict[bkg] = thresholds

    # Area under the curve. The sort must be stable: points sharing the same FPR
    # have to keep their order, otherwise the trapezoid area is corrupted.
    sorted_index = np.argsort(fpr, kind="stable")
    fpr_sorted = np.asarray(fpr)[sorted_index]
    tpr_sorted = np.asarray(tpr)[sorted_index]
    # scipy.integrate.trapz was removed in SciPy 1.14; trapezoid is its replacement
    auc_dict[bkg] = scipy.integrate.trapezoid(y=tpr_sorted, x=fpr_sorted)

# BDT working points marked on the merged-background curve, with their colours
plot_thresholds = [0.03, 0.11, 0.43]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3"]

for bkg in ["qcd", "ttbar", "merged"]:
    print(bkg, auc_dict[bkg])

    pths = {th: [[], []] for th in plot_thresholds}

    plt.plot(tpr_dict[bkg], fpr_dict[bkg], linewidth=2, color=bkg_colors[bkg])

    # working-point markers are drawn for the merged background only
    if bkg != "merged":
        continue

    for th in plot_thresholds:
        idx = find_nearest(thresholds_dict[bkg], th)
        pths[th][0].append(tpr_dict[bkg][idx])
        pths[th][1].append(fpr_dict[bkg][idx])

    for k, th in enumerate(plot_thresholds):
        plt.scatter(
            *pths[th],
            marker="o",
            s=40,
            label=rf"BDT > {th}",
            color=th_colours[k],
            zorder=100,
        )

        # dashed guide lines from the marker down to both axes
        plt.vlines(
            x=pths[th][0],
            ymin=0,
            ymax=pths[th][1],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

        plt.hlines(
            y=pths[th][1],
            xmin=0,
            xmax=pths[th][0],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

# plt.title(r"FatJets pT > 300 GeV, Xbb>0.8, m$_{SD}$:[50-250] GeV, Xbb1>0.98")
# plt.title(r"FatJets pT > 250 GeV, Xbb>0.8, m$_{SD}$>50 GeV, Xbb1>0.98")
plt.title(r"FatJets pT > 300 GeV, Xbb>0.8, m$_{SD}$>50 GeV, Xbb1>0.98")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.xlim([0.0, 0.25])
plt.ylim([0, 0.002])
plt.legend()
# manual colour key for the three curves
plt.text(0.2, 0.00025, "QCD", color=bkg_colors["qcd"])
plt.text(0.2, 0.00015, "TT", color=bkg_colors["ttbar"])
plt.text(0.2, 0.00005, "QCD+TT", color=bkg_colors["merged"])

Obtain signal and background efficiency manually

In [None]:
# Weighted yields before any cut, after the pT + BDT > 0.43 selection, and after
# additionally requiring fatJet2PNetXbb > 0.98.
# NOTE(review): this cell reads columns such as fatJet1Pt from events_dict, while
# other cells index events_dict with "bbFatJetPt" -- make sure the matching
# samples are loaded before running it.
yields = {}
yields_bdt043 = {}
yields_bdt043_xbb098 = {}

# luminosity factor multiplying every event weight
# NOTE(review): confirm the value and its units
lumi = 13700

for key in ["hh4b", "qcd", "ttbar"]:
    key_events = events_dict[key]

    # pT requirement on both fat jets, then the BDT working point
    mask = (key_events.fatJet1Pt > 300) & (key_events.fatJet2Pt > 300)
    mask = mask & (key_events.disc_qcd_and_ttbar_Run2_enhanced_v8p2 > 0.43)

    # additional Xbb requirement on the second fat jet
    mask_xbb = mask & (key_events.fatJet2PNetXbb > 0.98)

    # signal samples are weighted with "genWeight", backgrounds with "weight"
    weight_column = "genWeight" if "hh4b" in key else "weight"
    event_yield = lumi * key_events[weight_column] * key_events["xsecWeight"]

    yields[key] = np.sum(event_yield)
    yields_bdt043[key] = np.sum(event_yield[mask])
    yields_bdt043_xbb098[key] = np.sum(event_yield[mask_xbb])

# efficiencies are computed relative to the uncut yields
bkg_total = yields["qcd"] + yields["ttbar"]
bkg_bdt043 = yields_bdt043["qcd"] + yields_bdt043["ttbar"]
bkg_bdt043_xbb098 = yields_bdt043_xbb098["qcd"] + yields_bdt043_xbb098["ttbar"]

sig_eff_bdt043 = yields_bdt043["hh4b"] / yields["hh4b"]
bkg_eff_bdt043 = bkg_bdt043 / bkg_total

sig_eff_bdt043_xbb098 = yields_bdt043_xbb098["hh4b"] / yields["hh4b"]
bkg_eff_bdt043_xbb098 = bkg_bdt043_xbb098 / bkg_total

# the last printed number of each summary line is 2 * S / sqrt(B)
print(yields)
print(
    yields_bdt043,
    sig_eff_bdt043,
    bkg_eff_bdt043,
    2 * yields_bdt043["hh4b"] / np.sqrt(bkg_bdt043),
)
print(
    yields_bdt043_xbb098,
    sig_eff_bdt043_xbb098,
    bkg_eff_bdt043_xbb098,
    2 * yields_bdt043_xbb098["hh4b"] / np.sqrt(bkg_bdt043_xbb098),
)

In [None]:
# Same yield table as the xsecWeight-based version, but weighting every event
# with puWeight * triggerEffWeight * lumi * weight.
# NOTE(review): this cell reads columns such as fatJet1Pt from events_dict, while
# other cells index events_dict with "bbFatJetPt" -- make sure the matching
# samples are loaded before running it.
yields = {}
yields_bdt043 = {}
yields_bdt043_xbb098 = {}

# luminosity factor multiplying every event weight
# NOTE(review): confirm the value and its units
lumi = 13700

for key in ["hh4b", "qcd", "ttbar"]:
    key_events = events_dict[key]

    # pT requirement on both fat jets (a 250 GeV threshold was used previously),
    # then the BDT working point
    mask = (key_events.fatJet1Pt > 300) & (key_events.fatJet2Pt > 300)
    mask = mask & (key_events.disc_qcd_and_ttbar_Run2_enhanced_v8p2 > 0.43)

    # additional Xbb requirement on the second fat jet
    mask_xbb = mask & (key_events.fatJet2PNetXbb > 0.98)

    # pile-up and trigger-efficiency corrections scaled by the luminosity factor
    weight = key_events["puWeight"] * key_events["triggerEffWeight"] * lumi
    weight_mask = weight[mask]
    weight_mask_xbb = weight[mask_xbb]

    # Signal and backgrounds all use the "weight" column here; a variant using
    # "genWeight" for the signal was tried before and is not active.
    yields[key] = np.sum(weight * key_events["weight"])
    yields_bdt043[key] = np.sum(weight_mask * key_events["weight"][mask])
    yields_bdt043_xbb098[key] = np.sum(weight_mask_xbb * key_events["weight"][mask_xbb])

# efficiencies are computed relative to the uncut yields
sig_eff_bdt043 = yields_bdt043["hh4b"] / yields["hh4b"]
bkg_eff_bdt043 = (yields_bdt043["qcd"] + yields_bdt043["ttbar"]) / (yields["qcd"] + yields["ttbar"])

sig_eff_bdt043_xbb098 = yields_bdt043_xbb098["hh4b"] / yields["hh4b"]
bkg_eff_bdt043_xbb098 = (yields_bdt043_xbb098["qcd"] + yields_bdt043_xbb098["ttbar"]) / (
    yields["qcd"] + yields["ttbar"]
)

# the last printed number of each summary line is 2 * S / sqrt(B)
print(yields)
print(
    yields_bdt043,
    sig_eff_bdt043,
    bkg_eff_bdt043,
    2 * yields_bdt043["hh4b"] / np.sqrt(yields_bdt043["qcd"] + yields_bdt043["ttbar"]),
)
# no trailing comma after this call: it would turn the statement into a
# one-element tuple that Jupyter echoes as ``(None,)``
print(
    yields_bdt043_xbb098,
    sig_eff_bdt043_xbb098,
    bkg_eff_bdt043_xbb098,
    2
    * yields_bdt043_xbb098["hh4b"]
    / np.sqrt(yields_bdt043_xbb098["qcd"] + yields_bdt043_xbb098["ttbar"]),
)

In [None]:
# Histograms of the events passing the pT, soft-drop-mass and BDT > 0.43
# selection, with one category per sample.
h_bdt = hist.Hist(bdt_axis, cat_axis)
h_xbb0 = hist.Hist(xbb_axis, cat_axis)
h_msd1 = hist.Hist(msd_axis, cat_axis)
h_mreg1 = hist.Hist(mreg_axis, cat_axis)

# Per-sample BDT scores, event weights and second-jet Xbb of the selected events.
scores = {}
weights = {}
txbb1 = {}

# name of the branch holding the BDT discriminant
bdt_branch = "disc_qcd_and_ttbar_Run2_enhanced_v8p2"

for key in ["hh4b", "qcd", "ttbar"]:
    # both fat jets: pT > 300 and 50 < mSD < 250; event: BDT > 0.43
    mask = (
        (events_dict[key].fatJet1Pt > 300)
        & (events_dict[key].fatJet2Pt > 300)
        & (events_dict[key].fatJet1MassSD > 50)
        & (events_dict[key].fatJet2MassSD > 50)
        & (events_dict[key].fatJet1MassSD < 250)
        & (events_dict[key].fatJet2MassSD < 250)
        & (events_dict[key][bdt_branch] > 0.43)
    )
    selected = events_dict[key][mask]

    h_bdt.fill(bdt=selected[bdt_branch].to_numpy(), cat=key)
    h_xbb0.fill(xbb=selected["fatJet1PNetXbb"].to_numpy(), cat=key)
    h_msd1.fill(msd=selected["fatJet2MassSD"].to_numpy(), cat=key)
    h_mreg1.fill(mreg=selected["fatJet2MassRegressed"].to_numpy(), cat=key)

    scores[key] = selected[bdt_branch].to_numpy()
    # signal samples are weighted with "genWeight", backgrounds with "weight"
    weight_column = "genWeight" if "hh4b" in key else "weight"
    weights[key] = selected[weight_column] * selected["xsecWeight"]
    txbb1[key] = selected["fatJet2PNetXbb"]

# line colour per sample
colors = {"ttbar": "b", "hh4b": "k", "qcd": "r"}

# histograms to draw, keyed by a short name
hists = {
    "bdt": h_bdt,
    "xbb_0": h_xbb0,
    "msd_1": h_msd1,
    "mreg_1": h_mreg1,
}

# Draw each histogram in its own figure, overlaying the three samples as
# unit-normalised step histograms (density=True) on a logarithmic y-axis.
for hname, hhist in hists.items():
    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    for key in ["hh4b", "qcd", "ttbar"]:
        # select the slice of the category axis belonging to this sample
        hep.histplot(
            hhist[{"cat": key}],
            ax=ax,
            label=f"{key}",
            histtype="step",
            linewidth=1,
            color=colors[key],
            density=True,
        )

    ax.set_yscale("log")
    # legend title and axes title summarise the selection applied when filling
    ax.legend(title=r"FatJets $p_T>$300, m$_{SD}$:[50-250] GeV")
    ax.set_ylabel("Density")
    ax.set_title("BDT > 0.43")
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")