In [None]:
import uproot
import os
import pickle
import coffea

from HH4b.utils import load_samples, get_feat, format_columns

from coffea import nanoevents
from coffea.lookup_tools.dense_lookup import dense_lookup

import numpy as np
import awkward as ak
import pandas as pd

In [None]:
import mplhep as hep
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import hist

# Activate the mplhep style list first so the overrides below win.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter with math-text exponents; power limits set to (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Notebook-wide matplotlib overrides applied on top of the style above.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)

v11 vs v12 after skimmer

In [None]:
MAIN_DIR = "../../../../"
year = "2022EE"

# Pre-selection expressed as (column, operator, value) triples.
# NOTE: defined for reference only -- it is NOT passed to load_samples in this cell.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 250),
        ("('bbFatJetPt', '1')", ">=", 250),
        ("('bbFatJetMsd', '0')", ">=", 60),
        ("('bbFatJetMsd', '1')", ">=", 60),
        ("('bbFatJetPNetXbb', '0')", ">=", 0.8),
    ],
]

# skim directory -> {label used in events_dict: [sample names to load]}
sample_dirs = {
    f"{MAIN_DIR}/data/skimmer/Feb10_v12_pre-sel": {
        "hh4b_v12": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    },
    f"{MAIN_DIR}/data/skimmer/Feb10_v11_private_pre-sel": {
        "hh4b_v11": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    },
}

# Single dictionary holding the events of every sample, keyed by label.
events_dict = {}
for input_dir, samples_dict in sample_dirs.items():
    # No filters / column selection are requested here, so load_samples is
    # called with its defaults for those arguments.
    events_dict.update(load_samples(input_dir, samples_dict, year))

In [None]:
# Display every column available in the v12 signal sample.
[*events_dict["hh4b_v12"].columns]

In [None]:
def _growable_category(name):
    """Return an empty string-category axis that grows when new labels are filled."""
    return hist.axis.StrCategory([], name=name, growth=True)


# Categorical axes used to separate histogram fills by version / sample / cut / mass type.
version_axis = _growable_category("version")
sample_axis = _growable_category("sample")
cut_axis = _growable_category("xbbcut")
massversion_axis = _growable_category("massversion")

# Regular (fixed-width) axes for the plotted quantities.
ghpt_axis = hist.axis.Regular(40, 0, 1000, name="pt", label="Gen Higgs pT")
xbb_axis = hist.axis.Regular(60, 0, 1, name="xbb", label="Xbb ordered FatJet - PNet Xbb")
pxbb_axis = hist.axis.Regular(60, 0, 1, name="xbb", label="pT ordered FatJet - PNet Xbb")
mass_axis = hist.axis.Regular(80, 0, 250, name="mass", label="Xbb ordered FatJet - PNet Mass")
msd_axis = hist.axis.Regular(80, 0, 250, name="msd", label="Xbb ordered FatJet - SD Mass")
mratio_axis = hist.axis.Regular(
    80, 0, 1.85, name="ratio", label=r"Xbb ordered FatJet - m / m$_{truth}$"
)

In [None]:
# plot key -> (axis to bin on, column prefix; the jet index is appended when reading)
hists = dict(
    xbb=(xbb_axis, "bbFatJetPNetXbb"),
    mass=(mass_axis, "bbFatJetPNetMass"),
    # disabled entries:
    #   msd=(msd_axis, "bbFatJetMsd"),
    #   ptord_xbb=(pxbb_axis, "ak8FatJetPNetXbb"),
)

Take a look at matched HH4b events

In [None]:
# For each configured variable, compare v11 and v12 for jets that are matched
# to a Higgs boson and to both of its b quarks. One figure per jet.
for key, (axs, var) in hists.items():
    # one histogram per Xbb-ordered jet (index 0 and 1)
    jet_hists = [hist.Hist(axs, version_axis) for _ in range(2)]

    for version in ["v11", "v12"]:
        events = events_dict[f"hh4b_{version}"]
        higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
        nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
        nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()

        for jet, jet_hist in enumerate(jet_hists):
            # jet is Higgs-matched AND has two b's matched to either H1 or H2
            is_matched = (higgs_match[:, jet] == 1) & (
                (nb_h1[:, jet] == 2) | (nb_h2[:, jet] == 2)
            )
            jet_hist.fill(get_feat(events, f"{var}{jet}")[is_matched], version)

    for i, h in enumerate(jet_hists):
        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        for version, colour in (("v12", "r"), ("v11", "b")):
            hep.histplot(
                h[{"version": version}], ax=ax, label=version, density=True, color=colour
            )
        ax.legend(title=f"bb AK8 Jet {i}")
        ax.set_ylabel("Density")
        ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
        for grid_axis in (ax.xaxis, ax.yaxis):
            grid_axis.grid(True, which="major")

Take a look at the mass after successive Xbb cuts

In [None]:
# Xbb thresholds applied successively; each one gets its own curve per version.
cuts = [0, 0.95]
# One colour per cut index, separately for each NanoAOD version.
colors = {
    "v11": ["red", "salmon", "lightcoral"],
    "v12": ["blue", "cornflowerblue", "navy"],
}

# plot key -> (axis to bin on, column prefix; the jet index is appended when reading)
hists = {
    "mass": (mass_axis, "bbFatJetPNetMass"),
    "msd": (msd_axis, "bbFatJetMsd"),
}

for key, (axs, var) in hists.items():
    print(var)
    h1 = hist.Hist(axs, version_axis, cut_axis)
    h2 = hist.Hist(axs, version_axis, cut_axis)
    for version in ["v11", "v12"]:
        events = events_dict[f"hh4b_{version}"]
        # is fatjet (0,1) matched
        higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
        mask_1 = higgs_match[:, 0] == 1
        mask_2 = higgs_match[:, 1] == 1
        # is fatjet (0,1) matched to 2bs (of either Higgs)
        nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
        nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
        mask_1 = mask_1 & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
        mask_2 = mask_2 & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

        # the Xbb scores do not depend on the cut value: read them once
        xbb = events["bbFatJetPNetXbb"].to_numpy()
        xbb_0 = xbb[:, 0]
        xbb_1 = xbb[:, 1]

        for cut in cuts:
            mask_1_cut = mask_1 & (xbb_0 > cut)
            mask_2_cut = mask_2 & (xbb_1 > cut)
            h1.fill(get_feat(events, f"{var}0")[mask_1_cut], version, str(cut))
            h2.fill(get_feat(events, f"{var}1")[mask_2_cut], version, str(cut))

    for i in range(2):
        h = h1 if i == 0 else h2
        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        for ic, cut in enumerate(cuts):
            # v12 first, then v11, so the legend order stays v12/v11 per cut.
            for version in ["v12", "v11"]:
                hep.histplot(
                    h[{"version": version, "xbbcut": str(cut)}],
                    ax=ax,
                    # label built from the version actually plotted
                    # (the v11 curve used to be labelled "v12")
                    label=f"{version}, Xbb$_{i}$ > {cut}",
                    density=True,
                    color=colors[version][ic],
                )
        ax.legend(title=f"bb AK8 Jet {i}")
        ax.set_ylabel("Density")
        ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
        ax.xaxis.grid(True, which="major")
        ax.yaxis.grid(True, which="major")

Take a look at the mass ratios

In [None]:
# Mass ratio m / 125 for matched jets with 400 < pT < 600, comparing softdrop
# and PNet mass between v11 and v12. One histogram per Xbb-ordered jet.
h1 = hist.Hist(mratio_axis, version_axis, massversion_axis)
h2 = hist.Hist(mratio_axis, version_axis, massversion_axis)

# label on the massversion axis -> column prefix (jet index appended when reading)
mass_versions = {
    "msd": "bbFatJetMsd",
    "pnet_mass": "bbFatJetPNetMass",
}

# (mean, std) of the ratio per version and mass definition, for jet 0 and jet 1
params_0 = {"v11": {}, "v12": {}}
params_1 = {"v11": {}, "v12": {}}

for version in ["v11", "v12"]:
    events = events_dict[f"hh4b_{version}"]
    # is fatjet (0,1) matched
    higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
    mask_1 = higgs_match[:, 0] == 1
    mask_2 = higgs_match[:, 1] == 1
    # is fatjet (0,1) matched to 2bs (of either Higgs)
    nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
    nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
    mask_1 = mask_1 & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
    mask_2 = mask_2 & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

    # mask fatjet pt
    pt = events["bbFatJetPt"].to_numpy()
    mask_1 = mask_1 & ((pt[:, 0] > 400) & (pt[:, 0] < 600))
    mask_2 = mask_2 & ((pt[:, 1] > 400) & (pt[:, 1] < 600))

    for mass_label, var in mass_versions.items():
        # 125 is the reference ("truth") mass of the ratio axis
        val_0 = get_feat(events, f"{var}0")[mask_1] / 125
        val_1 = get_feat(events, f"{var}1")[mask_2] / 125

        params_0[version][mass_label] = (val_0.mean(), val_0.std())
        params_1[version][mass_label] = (val_1.mean(), val_1.std())

        h1.fill(val_0, version, mass_label)
        h2.fill(val_1, version, mass_label)

# (version, mass definition, legend label, colour) of each curve, in draw order
curves = [
    ("v11", "msd", "Softdrop v11", "k"),
    ("v11", "pnet_mass", "PNet mass v11", "r"),
    ("v12", "pnet_mass", "PNet mass v12", "b"),
]

for i in range(2):
    h = h1 if i == 0 else h2
    params = params_0 if i == 0 else params_1
    fig, ax = plt.subplots(1, 1, figsize=(6, 5))
    for version, mass_label, label, colour in curves:
        hep.histplot(
            h[{"version": version, "massversion": mass_label}],
            ax=ax,
            label=label,
            density=True,
            color=colour,
        )
    ax.legend(title=f"Xbb AK8 Jet {i} \n " + r"400 < p$_T$ < 600")
    ax.set_ylabel("Density")
    ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")

    # Annotate mean / std of the PNet mass ratio. The y-limits are bound to
    # ymin/ymax so the builtins min()/max() are not shadowed in the kernel.
    ymin, ymax = ax.get_ylim()
    mu, std = params["v11"]["pnet_mass"]
    ax.text(2, ymax - 0.5, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="r")
    mu, std = params["v12"]["pnet_mass"]
    ax.text(2, ymax - 0.8, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="b")

Add in Xbb cut

In [None]:
# Same mass-ratio comparison as without the tagger cut, but additionally
# requiring Xbb > 0.95 on the plotted jet. One histogram per Xbb-ordered jet.
h1 = hist.Hist(mratio_axis, version_axis, massversion_axis)
h2 = hist.Hist(mratio_axis, version_axis, massversion_axis)

# label on the massversion axis -> column prefix (jet index appended when reading)
mass_versions = {
    "msd": "bbFatJetMsd",
    "pnet_mass": "bbFatJetPNetMass",
}

# (mean, std) of the ratio per version and mass definition, for jet 0 and jet 1
params_0 = {"v11": {}, "v12": {}}
params_1 = {"v11": {}, "v12": {}}

for version in ["v11", "v12"]:
    events = events_dict[f"hh4b_{version}"]
    # is fatjet (0,1) matched
    higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
    mask_1 = higgs_match[:, 0] == 1
    mask_2 = higgs_match[:, 1] == 1
    # is fatjet (0,1) matched to 2bs (of either Higgs)
    nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
    nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
    mask_1 = mask_1 & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
    mask_2 = mask_2 & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

    # mask fatjet pt
    pt = events["bbFatJetPt"].to_numpy()
    mask_1 = mask_1 & ((pt[:, 0] > 400) & (pt[:, 0] < 600))
    mask_2 = mask_2 & ((pt[:, 1] > 400) & (pt[:, 1] < 600))

    # mask fatjet xbb (each jet is cut on its own score)
    xbb = events["bbFatJetPNetXbb"].to_numpy()
    mask_1 = mask_1 & (xbb[:, 0] > 0.95)
    mask_2 = mask_2 & (xbb[:, 1] > 0.95)

    for mass_label, var in mass_versions.items():
        # 125 is the reference ("truth") mass of the ratio axis
        val_0 = get_feat(events, f"{var}0")[mask_1] / 125
        val_1 = get_feat(events, f"{var}1")[mask_2] / 125

        params_0[version][mass_label] = (val_0.mean(), val_0.std())
        params_1[version][mass_label] = (val_1.mean(), val_1.std())

        h1.fill(val_0, version, mass_label)
        h2.fill(val_1, version, mass_label)

# (version, mass definition, legend label, colour) of each curve, in draw order
curves = [
    ("v11", "msd", "Softdrop v11", "k"),
    ("v11", "pnet_mass", "PNet mass v11", "r"),
    ("v12", "pnet_mass", "PNet mass v12", "b"),
]

for i in range(2):
    h = h1 if i == 0 else h2
    params = params_0 if i == 0 else params_1
    fig, ax = plt.subplots(1, 1, figsize=(6, 5))
    for version, mass_label, label, colour in curves:
        hep.histplot(
            h[{"version": version, "massversion": mass_label}],
            ax=ax,
            label=label,
            density=True,
            color=colour,
        )
    ax.legend(title=f"Xbb AK8 Jet {i} \n " + r"400 < p$_T$ < 600" + f"\n Xbb$_{i}$ > 0.95")
    ax.set_ylabel("Density")
    ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
    ax.xaxis.grid(True, which="major")
    ax.yaxis.grid(True, which="major")

    # Annotate mean / std of the PNet mass ratio. The y-limits are bound to
    # ymin/ymax so the builtins min()/max() are not shadowed in the kernel.
    ymin, ymax = ax.get_ylim()
    mu, std = params["v11"]["pnet_mass"]
    ax.text(2, ymax - 0.5, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="r")
    mu, std = params["v12"]["pnet_mass"]
    ax.text(2, ymax - 0.8, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="b")

Now look at different pt bins

In [None]:
# Mass-ratio comparison in bins of the jet pT, with Xbb > 0.95 on the plotted jet.
# legend text -> [lower, upper] pT edge; the dict order fixes the index in the file name
ptbins = {
    r"300 < p$_T$ < 400": [300, 400],
    r"270 < p$_T$ < 300": [270, 300],
    r"400 < p$_T$ < 600": [400, 600],
    r"600 < p$_T$": [600, 1200],
}

# label on the massversion axis -> column prefix (jet index appended when reading)
mass_versions = {
    "msd": "bbFatJetMsd",
    "pnet_mass": "bbFatJetPNetMass",
}

# (version, mass definition, legend label, colour) of each curve, in draw order
curves = [
    ("v11", "msd", "Softdrop v11", "k"),
    ("v11", "pnet_mass", "PNet mass v11", "r"),
    ("v12", "pnet_mass", "PNet mass v12", "b"),
]

for ptbini, (ptbin_str, ptbin) in enumerate(ptbins.items()):
    # Fresh histograms for every pT bin: creating them once outside the loop
    # made each plot accumulate the entries of all previously processed bins,
    # while the quoted mean / std were computed for the current bin only.
    h1 = hist.Hist(mratio_axis, version_axis, massversion_axis)
    h2 = hist.Hist(mratio_axis, version_axis, massversion_axis)

    # (mean, std) of the ratio per version and mass definition, for jet 0 and jet 1
    params_0 = {"v11": {}, "v12": {}}
    params_1 = {"v11": {}, "v12": {}}

    for version in ["v11", "v12"]:
        events = events_dict[f"hh4b_{version}"]
        # is fatjet (0,1) matched
        higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
        mask_1 = higgs_match[:, 0] == 1
        mask_2 = higgs_match[:, 1] == 1
        # is fatjet (0,1) matched to 2bs (of either Higgs)
        nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
        nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
        mask_1 = mask_1 & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
        mask_2 = mask_2 & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

        # mask fatjet pt: lower edge < pT < upper edge
        # (the lower edge used to be compared with "<", selecting pT below the bin)
        pt = events["bbFatJetPt"].to_numpy()
        mask_1 = mask_1 & (pt[:, 0] > ptbin[0]) & (pt[:, 0] < ptbin[1])
        mask_2 = mask_2 & (pt[:, 1] > ptbin[0]) & (pt[:, 1] < ptbin[1])

        # mask fatjet xbb (each jet is cut on its own score)
        xbb = events["bbFatJetPNetXbb"].to_numpy()
        mask_1 = mask_1 & (xbb[:, 0] > 0.95)
        mask_2 = mask_2 & (xbb[:, 1] > 0.95)

        for mass_label, var in mass_versions.items():
            # 125 is the reference ("truth") mass of the ratio axis
            val_0 = get_feat(events, f"{var}0")[mask_1] / 125
            val_1 = get_feat(events, f"{var}1")[mask_2] / 125

            params_0[version][mass_label] = (val_0.mean(), val_0.std())
            params_1[version][mass_label] = (val_1.mean(), val_1.std())

            h1.fill(val_0, version, mass_label)
            h2.fill(val_1, version, mass_label)

    for i in range(2):
        h = h1 if i == 0 else h2
        params = params_0 if i == 0 else params_1

        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        for version, mass_label, label, colour in curves:
            hep.histplot(
                h[{"version": version, "massversion": mass_label}],
                ax=ax,
                label=label,
                density=True,
                color=colour,
            )
        ax.legend(title=f"Xbb AK8 Jet {i} \n " + ptbin_str + f"\n Xbb$_{i}$ > 0.95")
        ax.set_ylabel("Density")
        ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
        ax.xaxis.grid(True, which="major")
        ax.yaxis.grid(True, which="major")

        # ymin/ymax instead of min/max so the builtins are not shadowed.
        # NOTE: the text offsets are hand-tuned data coordinates.
        ymin, ymax = ax.get_ylim()
        mu, std = params["v11"]["pnet_mass"]
        ax.text(
            1.27, ymax - 2.4, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="r"
        )
        mu, std = params["v12"]["pnet_mass"]
        ax.text(
            1.27, ymax - 2.8, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="b"
        )
        fig.tight_layout()
        fig.savefig(f"mratio_jet{i}_ptbin{ptbini}.png")
        plt.show()
        plt.close(fig)

In [None]:
# Mass-ratio comparison in bins of the jet pT. Unlike the per-jet tagger cut,
# BOTH jets are selected with the Xbb score of jet 0 (> 0.95) here, as stated
# in the legend and in the output file name.
# legend text -> [lower, upper] pT edge; the dict order fixes the index in the file name
ptbins = {
    r"300 < p$_T$ < 400": [300, 400],
    r"270 < p$_T$ < 300": [270, 300],
    r"400 < p$_T$ < 600": [400, 600],
    r"600 < p$_T$": [600, 1200],
}

# label on the massversion axis -> column prefix (jet index appended when reading)
mass_versions = {
    "msd": "bbFatJetMsd",
    "pnet_mass": "bbFatJetPNetMass",
}

# (version, mass definition, legend label, colour) of each curve, in draw order
curves = [
    ("v11", "msd", "Softdrop v11", "k"),
    ("v11", "pnet_mass", "PNet mass v11", "r"),
    ("v12", "pnet_mass", "PNet mass v12", "b"),
]

for ptbini, (ptbin_str, ptbin) in enumerate(ptbins.items()):
    # Fresh histograms for every pT bin: creating them once outside the loop
    # made each plot accumulate the entries of all previously processed bins,
    # while the quoted mean / std were computed for the current bin only.
    h1 = hist.Hist(mratio_axis, version_axis, massversion_axis)
    h2 = hist.Hist(mratio_axis, version_axis, massversion_axis)

    # (mean, std) of the ratio per version and mass definition, for jet 0 and jet 1
    params_0 = {"v11": {}, "v12": {}}
    params_1 = {"v11": {}, "v12": {}}

    for version in ["v11", "v12"]:
        events = events_dict[f"hh4b_{version}"]
        # is fatjet (0,1) matched
        higgs_match = events["bbFatJetHiggsMatch"].to_numpy()
        mask_1 = higgs_match[:, 0] == 1
        mask_2 = higgs_match[:, 1] == 1
        # is fatjet (0,1) matched to 2bs (of either Higgs)
        nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
        nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
        mask_1 = mask_1 & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
        mask_2 = mask_2 & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

        # mask fatjet pt: lower edge < pT < upper edge
        # (the lower edge used to be compared with "<", selecting pT below the bin)
        pt = events["bbFatJetPt"].to_numpy()
        mask_1 = mask_1 & (pt[:, 0] > ptbin[0]) & (pt[:, 0] < ptbin[1])
        mask_2 = mask_2 & (pt[:, 1] > ptbin[0]) & (pt[:, 1] < ptbin[1])

        # mask on the Xbb score of jet 0 -- deliberately applied to both jets
        xbb_0 = events["bbFatJetPNetXbb"].to_numpy()[:, 0]
        mask_1 = mask_1 & (xbb_0 > 0.95)
        mask_2 = mask_2 & (xbb_0 > 0.95)

        for mass_label, var in mass_versions.items():
            # 125 is the reference ("truth") mass of the ratio axis
            val_0 = get_feat(events, f"{var}0")[mask_1] / 125
            val_1 = get_feat(events, f"{var}1")[mask_2] / 125

            params_0[version][mass_label] = (val_0.mean(), val_0.std())
            params_1[version][mass_label] = (val_1.mean(), val_1.std())

            h1.fill(val_0, version, mass_label)
            h2.fill(val_1, version, mass_label)

    for i in range(2):
        h = h1 if i == 0 else h2
        params = params_0 if i == 0 else params_1

        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        for version, mass_label, label, colour in curves:
            hep.histplot(
                h[{"version": version, "massversion": mass_label}],
                ax=ax,
                label=label,
                density=True,
                color=colour,
            )
        ax.legend(title=f"Xbb AK8 Jet {i} \n " + ptbin_str + "\n Xbb$_0$ > 0.95")
        ax.set_ylabel("Density")
        ax.set_title("GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00 (2022EE)")
        ax.xaxis.grid(True, which="major")
        ax.yaxis.grid(True, which="major")

        # ymin/ymax instead of min/max so the builtins are not shadowed.
        # NOTE: the text offsets are hand-tuned data coordinates.
        ymin, ymax = ax.get_ylim()
        mu, std = params["v11"]["pnet_mass"]
        ax.text(
            1.27, ymax - 2.4, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="r"
        )
        mu, std = params["v12"]["pnet_mass"]
        ax.text(
            1.27, ymax - 2.6, r"$\mu$" + f"= {mu:.3f}," + r"$\sigma$ =" + f"{std:.3f}", color="b"
        )
        fig.tight_layout()
        fig.savefig(f"mratio_jet{i}_ptbin{ptbini}_xbb0_095.png")
        plt.show()
        plt.close(fig)

What do the 0 values correspond to?

In [None]:
# Inspect v12 jets whose PNet mass is close to zero: plot their PNet mass next
# to their softdrop mass. Only the second Xbb-ordered jet (index 1) is studied.
ptbin_str = r"300 < p$_T$ < 400"
ptbin = [300, 400]
# other bins to try: [270, 300], [400, 600], [600, 1200]

# Explicit jet index; the legend used to read a loop variable leaked from the
# previous cell, which only worked by accident of execution order.
jet = 1

h1 = hist.Hist(mass_axis, massversion_axis)

events = events_dict["hh4b_v12"]

# jet is Higgs-matched and has two b's matched to either H1 or H2
higgs_match = events["bbFatJetHiggsMatch"].to_numpy()[:, jet]
nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()[:, jet]
nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()[:, jet]
selected = (higgs_match == 1) & ((nb_h1 == 2) | (nb_h2 == 2))

# jet pT inside the bin: lower edge < pT < upper edge
# (the lower edge used to be compared with "<", selecting pT below the bin)
pt = events["bbFatJetPt"].to_numpy()[:, jet]
selected = selected & (pt > ptbin[0]) & (pt < ptbin[1])

# tagger cut on the Xbb score of jet 0, as stated in the legend
xbb_0 = events["bbFatJetPNetXbb"].to_numpy()[:, 0]
selected = selected & (xbb_0 > 0.95)

pnet_mass = get_feat(events, f"bbFatJetPNetMass{jet}")[selected]
msd = get_feat(events, f"bbFatJetMsd{jet}")[selected]

# keep only jets with a PNet mass in (-1, 20); show both masses for those jets
low_pnet_mass = (pnet_mass < 20) & (pnet_mass > -1)
h1.fill(pnet_mass[low_pnet_mass], "pnet_mass")
h1.fill(msd[low_pnet_mass], "msd")

fig, ax = plt.subplots(1, 1, figsize=(6, 5))
hep.histplot(h1[{"massversion": "msd"}], ax=ax, label="Softdrop v12", color="k", density=True)
hep.histplot(
    h1[{"massversion": "pnet_mass"}], ax=ax, label="PNet mass v12", color="b", density=True
)
ax.legend(title=f"Xbb AK8 Jet {jet} \n " + ptbin_str + "\n Xbb$_0$ > 0.95")
ax.set_ylabel("Density")
ax.set_xlabel("Mass [GeV]")
ax.set_title("Corresponding mass values (2022EE)")
ax.xaxis.grid(True, which="major")
ax.yaxis.grid(True, which="major")

fig.tight_layout()
plt.show()
plt.close(fig)

Now let's try to do a ROC curve

In [None]:
MAIN_DIR = "../../../../"
year = "2022EE"

# Pre-selection passed to load_samples: both Xbb-ordered jets with pT >= 270.
filters = [
    [
        ("('bbFatJetPt', '0')", ">=", 270),
        ("('bbFatJetPt', '1')", ">=", 270),
        # ("('bbFatJetMsd', '0')", ">=", 60),
        # ("('bbFatJetMsd', '1')", ">=", 60),
        # ("('bbFatJetPNetXbb', '0')", ">=", 0.8),
    ],
]

# dictionary that will contain all information (from all samples)
events_dict = {}

# ---------------------------------------------------------------- background (QCD)
# NOTE(review): "QCD_HT-800to1200" overlaps the two bins listed after it --
# possibly meant to be "QCD_HT-800to1000"; confirm against the skim directory.
sample_dirs = {
    f"{MAIN_DIR}/data/skimmer/Feb10_v12_pre-sel/": {
        "qcd_v12": [
            "QCD_HT-800to1200",
            "QCD_HT-1000to1200",
            "QCD_HT-1200to1500",
        ],
    },
    f"{MAIN_DIR}/data/skimmer/Feb14_v11_pre-sel/": {
        "qcd_v11": [
            "QCD_HT-800to1200",
            "QCD_HT-1000to1200",
            "QCD_HT-1200to1500",
        ],
    },
}

# (column name, number of sub-columns) to load
load_columns = [
    ("weight", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMass", 2),
    ("bbFatJetPNetXbb", 2),
]
# reformat into ("column name", "idx") format for reading multiindex columns
columns = [
    f"('{name}', '{idx}')" for name, num_columns in load_columns for idx in range(num_columns)
]

for input_dir, samples_dict in sample_dirs.items():
    # load only the selected columns, with the pT pre-selection applied
    events_dict.update(
        load_samples(input_dir, samples_dict, year, filters=filters, columns_mc=columns)
    )

# ---------------------------------------------------------------- signal (HH4b)
sample_dirs = {
    f"{MAIN_DIR}/data/skimmer/Feb10_v12_pre-sel": {
        "hh4b_v12": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    },
    f"{MAIN_DIR}/data/skimmer/Feb10_v11_private_pre-sel": {
        "hh4b_v11": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    },
}

# same columns as for the background, plus the generator-matching information
load_columns = [
    ("weight", 1),
    ("bbFatJetPt", 2),
    ("bbFatJetMsd", 2),
    ("bbFatJetPNetMass", 2),
    ("bbFatJetPNetXbb", 2),
    ("bbFatJetHiggsMatch", 2),
    ("bbFatJetNumBMatchedH1", 2),
    ("bbFatJetNumBMatchedH2", 2),
]
# reformat into ("column name", "idx") format for reading multiindex columns
columns = [
    f"('{name}', '{idx}')" for name, num_columns in load_columns for idx in range(num_columns)
]

for input_dir, samples_dict in sample_dirs.items():
    # load only the selected columns, with the pT pre-selection applied
    events_dict.update(
        load_samples(input_dir, samples_dict, year, filters=filters, columns_mc=columns)
    )

In [None]:
# Show which sample labels were loaded (QCD and HH4b entries, v11 and v12).
events_dict.keys()

In [None]:
from sklearn.metrics import roc_curve

# ROC ingredients per NanoAOD version, using both Xbb-ordered jets of each event.
fpr_dict, tpr_dict, thresholds_dict = {}, {}, {}

for version in ["v11", "v12"]:
    # ---- signal: jets matched to a Higgs and to both of its b quarks, pT > 270
    # (alternative selection that was tried: 400 < pT < 600)
    events = events_dict[f"hh4b_{version}"]
    higgs_matched = events["bbFatJetHiggsMatch"].to_numpy() == 1
    two_b_matched = (events["bbFatJetNumBMatchedH1"].to_numpy() == 2) | (
        events["bbFatJetNumBMatchedH2"].to_numpy() == 2
    )
    mask = higgs_matched & two_b_matched & (events["bbFatJetPt"].to_numpy() > 270)

    sig_jets_score = events["bbFatJetPNetXbb"].to_numpy()[mask]
    # the per-event weight is duplicated so that each of the two jets carries it
    weights_sig = np.tile(events["weight"].to_numpy(), (1, 2))[mask]

    # ---- background: every QCD jet with pT > 270
    events = events_dict[f"qcd_{version}"]
    mask = events["bbFatJetPt"].to_numpy() > 270

    bkg_jets_score = events["bbFatJetPNetXbb"].to_numpy()[mask]
    weights_bkg = np.tile(events["weight"].to_numpy(), (1, 2))[mask]

    print(sig_jets_score)
    print(bkg_jets_score)

    # ---- stack signal (label 1) on top of background (label 0)
    scores = np.concatenate((sig_jets_score, bkg_jets_score))
    weights = np.concatenate([weights_sig, weights_bkg])
    y_true = np.concatenate([np.ones(len(sig_jets_score)), np.zeros(len(bkg_jets_score))])

    (
        fpr_dict[version],
        tpr_dict[version],
        thresholds_dict[version],
    ) = roc_curve(y_true, scores, sample_weight=weights)

In [None]:
def find_nearest(array, value):
    """Return the index of the element of ``array`` closest to ``value``."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx


# Xbb working points highlighted on the ROC curves, and one colour for each.
plot_thresholds = [0.975, 0.985, 0.99]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3", "#ff7f00", "#7CB518", "#EDB458", "#36213E"]

plt.figure(figsize=(6, 6))
for version in ["v11", "v12"]:
    plt.plot(
        tpr_dict[version], fpr_dict[version], linewidth=2, color="r" if version == "v11" else "b"
    )

    # threshold -> [[signal efficiency], [background efficiency]] at the ROC
    # point whose threshold is closest to the requested working point
    pths = {th: [[], []] for th in plot_thresholds}
    for th in plot_thresholds:
        idx = find_nearest(thresholds_dict[version], th)
        pths[th][0].append(tpr_dict[version][idx])
        pths[th][1].append(fpr_dict[version][idx])

    for k, th in enumerate(plot_thresholds):
        plt.scatter(
            *pths[th],
            marker="o",
            s=40,
            # label the markers of one version only, so the legend has one
            # entry per working point
            label=rf"$T_{{Xbb}}$ > {th}" if version == "v12" else None,
            color=th_colours[k],
            zorder=100,
        )

        # dashed guide lines from the marker down / left to the axes
        plt.vlines(
            x=pths[th][0],
            ymin=0,
            ymax=pths[th][1],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

        plt.hlines(
            y=pths[th][1],
            xmin=0,
            xmax=pths[th][0],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

# Axes decoration is independent of the version: apply it once after the loop
# so the CMS label and text annotations are not drawn twice on top of each other.
hep.cms.label(data=False, rlabel="")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.suptitle("Xbb FatJets (0, 1) PNetXbb", y=0.95)
plt.xlim([0.1, 1])
plt.ylim([0.001, 1])
plt.legend(title=r"270 < p$_T$")
plt.text(0.6, 0.5, "NanoAOD v11", color="red")
plt.text(0.6, 0.2, "NanoAOD v12", color="b")

Only using jet 0

In [None]:
from sklearn.metrics import roc_curve


def jet0_column(frame, column):
    """Return the index-0 entries of ``frame[column]`` (i.e. fat jet 0) as a NumPy array."""
    return frame[column].to_numpy()[:, 0]


# ROC ingredients per NanoAOD version, built from fat jet 0 only.
fpr_dict, tpr_dict, thresholds_dict = {}, {}, {}

for version in ["v11", "v12"]:
    # Signal: HH4b jets that are Higgs-matched, contain both b quarks of either Higgs,
    # and fall in the 400 < pT < 600 window.
    sig_events = events_dict[f"hh4b_{version}"]
    sig_pt = jet0_column(sig_events, "bbFatJetPt")
    is_matched = jet0_column(sig_events, "bbFatJetHiggsMatch") == 1
    has_two_b = (jet0_column(sig_events, "bbFatJetNumBMatchedH1") == 2) | (
        jet0_column(sig_events, "bbFatJetNumBMatchedH2") == 2
    )
    sig_mask = is_matched & has_two_b & (sig_pt > 400) & (sig_pt < 600)
    sig_jets_score = jet0_column(sig_events, "bbFatJetPNetXbb")[sig_mask]

    # Background: QCD jets in the same pT window (no matching requirement).
    bkg_events = events_dict[f"qcd_{version}"]
    bkg_pt = jet0_column(bkg_events, "bbFatJetPt")
    bkg_mask = (bkg_pt > 400) & (bkg_pt < 600)
    bkg_jets_score = jet0_column(bkg_events, "bbFatJetPNetXbb")[bkg_mask]

    print(sig_jets_score)
    print(bkg_jets_score)

    # Truth labels: 1 for every signal jet followed by 0 for every background jet.
    n_sig, n_bkg = len(sig_jets_score), len(bkg_jets_score)
    scores = np.concatenate((sig_jets_score, bkg_jets_score))
    y_true = np.concatenate([np.ones(n_sig), np.zeros(n_bkg)])

    # NOTE(review): roc_curve is called without sample_weight, so every jet counts
    # equally — confirm that per-event weights are not needed here.
    fpr, tpr, thresholds = roc_curve(y_true, scores)

    fpr_dict[version], tpr_dict[version], thresholds_dict[version] = fpr, tpr, thresholds

In [None]:
def find_nearest(array, value):
    """Return the index of the entry of ``array`` that is closest to ``value``."""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx


# Xbb working points highlighted on each ROC curve, and one colour per working point.
plot_thresholds = [0.975, 0.985, 0.99]
th_colours = ["#9381FF", "#1f78b4", "#a6cee3", "#ff7f00", "#7CB518", "#EDB458", "#36213E"]
# Curve colour per NanoAOD version (matches the text annotations drawn below).
version_colours = {"v11": "r", "v12": "b"}

plt.figure(figsize=(6, 6))
for version in ["v11", "v12"]:
    # pths must be (re)created here: pths[th] = [[signal eff.], [background eff.]].
    # Without this the cell would append to whatever `pths` an earlier cell left behind
    # and draw stale working points from a different ROC curve.
    pths = {th: [[], []] for th in plot_thresholds}

    plt.plot(tpr_dict[version], fpr_dict[version], linewidth=2, color=version_colours[version])

    # Locate the ROC point whose threshold is closest to each working point.
    for th in plot_thresholds:
        idx = find_nearest(thresholds_dict[version], th)
        pths[th][0].append(tpr_dict[version][idx])
        pths[th][1].append(fpr_dict[version][idx])

    for k, th in enumerate(plot_thresholds):
        plt.scatter(
            *pths[th],
            marker="o",
            s=40,
            # label only once (for v12) so the legend has a single entry per working point
            label=rf"$T_{{Xbb}}$ > {th}" if version == "v12" else None,
            color=th_colours[k],
            zorder=100,
        )

        # dashed guide lines from the working point down to / across to the axes
        plt.vlines(
            x=pths[th][0],
            ymin=0,
            ymax=pths[th][1],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

        plt.hlines(
            y=pths[th][1],
            xmin=0,
            xmax=pths[th][0],
            color=th_colours[k],
            linestyles="dashed",
            alpha=0.5,
        )

# Axes decoration is done once, after both curves are drawn, so that the CMS label
# and the text annotations are not stacked on top of themselves.
hep.cms.label(data=False, rlabel="")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
plt.suptitle("Xbb FatJets (0) PNetXbb", y=0.95)
plt.text(0.6, 0.5, "NanoAOD v11", color="red")
plt.text(0.6, 0.3, "NanoAOD v12", color="b")

plt.xlim([0.1, 1])
plt.ylim([0.001, 1])
plt.legend(title=r"400 < p$_T$ < 600", loc="upper left")

Now compare the shapes of the Xbb discriminant, the ParticleNet mass and the soft-drop mass between v11 and v12

In [None]:
# Variables to compare: key -> (histogram axis, column prefix; the jet index 0/1 is appended).
hists = {
    "xbb": (xbb_axis, "bbFatJetPNetXbb"),
    "mass": (mass_axis, "bbFatJetPNetMass"),
    "msd": (msd_axis, "bbFatJetMsd"),
}

# One colour per NanoAOD version, consistent with the ROC plots (v11 red, v12 blue);
# one (legend label, line style) per sample.
version_colours = {"v11": "r", "v12": "b"}
sample_styles = {"hh4b": ("SM HH4b", "solid"), "qcd": ("QCD", "dashed")}

for key, hinfo in hists.items():
    axs, var = hinfo
    # h1 holds fat jet 0, h2 holds fat jet 1; both are split by version and sample.
    h1 = hist.Hist(axs, version_axis, sample_axis)
    h2 = hist.Hist(axs, version_axis, sample_axis)

    for version in ["v11", "v12"]:
        for sample in ["qcd", "hh4b"]:
            events = events_dict[f"{sample}_{version}"]
            pt = events["bbFatJetPt"].to_numpy()

            mask_1 = pt[:, 0] > 270
            mask_2 = pt[:, 1] > 270

            if "hh4b" in sample:
                # signal jets must additionally be Higgs-matched and contain
                # both b quarks of either Higgs
                matched = events["bbFatJetHiggsMatch"].to_numpy()
                nb_h1 = events["bbFatJetNumBMatchedH1"].to_numpy()
                nb_h2 = events["bbFatJetNumBMatchedH2"].to_numpy()
                mask_1 = mask_1 & (matched[:, 0] == 1) & ((nb_h1[:, 0] == 2) | (nb_h2[:, 0] == 2))
                mask_2 = mask_2 & (matched[:, 1] == 1) & ((nb_h1[:, 1] == 2) | (nb_h2[:, 1] == 2))

            h1.fill(get_feat(events, f"{var}0")[mask_1], version, sample)
            h2.fill(get_feat(events, f"{var}1")[mask_2], version, sample)

    # One figure per jet index, overlaying every (version, sample) combination.
    for i, h in enumerate((h1, h2)):
        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        legend_elements = []
        for version in ["v12", "v11"]:
            for sample in ["hh4b", "qcd"]:
                sample_label, line_style = sample_styles[sample]
                hep.histplot(
                    h[{"version": version, "sample": sample}],
                    ax=ax,
                    density=True,
                    color=version_colours[version],
                    linestyle=line_style,
                )
                legend_elements.append(
                    Line2D(
                        [0],
                        [0],
                        color=version_colours[version],
                        lw=2,
                        label=f"{sample_label} {version}",
                        ls=line_style,
                    )
                )
        ax.legend(handles=legend_elements, title=f"bb AK8 Jet {i}")
        ax.set_ylabel("Density")
        ax.set_title("2022EE")
        ax.set_yscale("log")
        ax.xaxis.grid(True, which="major")
        ax.yaxis.grid(True, which="major")
        plt.show()
        # close this specific figure so figures do not pile up across iterations
        plt.close(fig)

Checks on NanoAOD directly

In [None]:
# NOTE(review): this rebinds MAIN_DIR, which the first section of the notebook set to
# "../../../../" — cells of that section re-run after this one will see the new value.
MAIN_DIR = "../../../"

# Output directory for the plots produced by the NanoAOD checks below.
plot_dir = f"{MAIN_DIR}/plots/v11v12Checks/23Nov10"
# Create the directory (and any missing parents) without shelling out to `mkdir -p`.
os.makedirs(plot_dir, exist_ok=True)

In [None]:
# One remote (XRootD) NanoAOD file per NanoAOD version and sample: paths[version][sample].
# NOTE(review): the "v11" hh4b entry points to a Run3Summer22EENanoAODv10 / 124X dataset,
# not a NanoAODv11 one — confirm this is the intended stand-in.
# NOTE(review): the "data" entries come from different eras (Run2022F for v11,
# Run2022E for v12), so they do not contain the same events — confirm this is acceptable.
paths = {
    "v11": {
        "qcd": "root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/mc/Run3Summer22EENanoAODv11/QCD-4Jets_HT-2000_TuneCP5_13p6TeV_madgraphMLM-pythia8/NANOAODSIM/126X_mcRun3_2022_realistic_postEE_v1-v2/2810000/02d3ed0c-74c1-464d-bff9-8345ae4a6dd5.root",
        "hh4b": "root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/mc/Run3Summer22EENanoAODv10/GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_powheg-pythia8/NANOAODSIM/Poisson60KeepRAW_124X_mcRun3_2022_realistic_postEE_v1-v2/2540000/46c288ba-4f36-450b-9108-8070eca82d95.root",
        "data": "root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2022F/JetMET/NANOAOD/PromptNanoAODv11_v1-v2/2540000/02b8ddba-9103-4801-bb74-cab4d1fcc6ca.root",
    },
    "v12": {
        "qcd": "root://storage01.lcg.cscs.ch:1096//pnfs/lcg.cscs.ch/cms/trivcat//store/mc/Run3Summer22EENanoAODv12/QCD-4Jets_HT-2000_TuneCP5_13p6TeV_madgraphMLM-pythia8/NANOAODSIM/130X_mcRun3_2022_realistic_postEE_v6-v2/2520000/05e001cc-8902-4fa0-b706-98fc89013dc2.root",
        "hh4b": "root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/mc/Run3Summer22EENanoAODv12/GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_powheg-pythia8/NANOAODSIM/Poisson60KeepRAW_130X_mcRun3_2022_realistic_postEE_v6-v2/2540000/00d98799-ada3-4a26-8558-5052891a8d23.root",
        "data": "root://cmsdcadisk.fnal.gov//dcache/uscmsdisk/store/data/Run2022E/JetMET/NANOAOD/22Sep2023-v1/30000/002cde4c-efa1-430d-b651-61b6dff4d208.root",
    },
}

In [None]:
# Open every file listed in `paths` with the NanoAOD schema:
# events_dict[version][sample] -> events object returned by NanoEventsFactory.
# NOTE(review): this replaces the flat `events_dict` built by load_samples at the top
# of the notebook with a differently shaped (nested) dictionary.
events_dict = {}
for v, sample_paths in paths.items():
    events_dict[v] = {}
    for key, path in sample_paths.items():
        factory = nanoevents.NanoEventsFactory.from_root(
            path, schemaclass=nanoevents.NanoAODSchema
        )
        events_dict[v][key] = factory.events()

In [None]:
# List the field names of the FatJet collection in the v12 data file.
events_dict["v12"]["data"].FatJet.fields

In [None]:
import sys

In [None]:
# NOTE: sys.getsizeof only reports the memory directly attributed to this Python object,
# not the memory of the objects it refers to, so this is not the size of the event data.
sys.getsizeof(events_dict["v11"]["qcd"])

In [None]:
# Number of Muon.pt entries in each event of the v11 QCD file.
ak.count(events_dict["v11"]["qcd"].Muon.pt, axis=1)

In [None]:
# Overlay the per-event muon multiplicity of the v11 and v12 QCD files,
# using the same number of leading events from each file.
# NOTE(review): 157660 is presumably the event count of the smaller file — confirm.
n_events_compared = 157660

fig, ax = plt.subplots()
for v in ["v11", "v12"]:
    n_muons = ak.count(events_dict[v]["qcd"][:n_events_compared].Muon.pt, axis=1)
    ax.hist(n_muons, histtype="step", label=v)
ax.set_xlabel("Number of muons per event")
ax.set_ylabel("Events")
ax.legend(title="QCD")
plt.show()

In [None]:
# FatJet pT of the v11 QCD file (read from events_dict; `qcdv11_events` is not defined
# by any cell of this section).
events_dict["v11"]["qcd"].FatJet.pt

In [None]:
# FatJet pT of the v12 QCD file (read from events_dict; `qcdv12_events` is not defined
# by any cell of this section).
events_dict["v12"]["qcd"].FatJet.pt

In [None]:
import matplotlib.pyplot as plt

# Normalised pT distribution of the first FatJet of each event, v11 vs v12 QCD.
bins = np.linspace(0, 2000, 31)
for v in ["v11", "v12"]:
    # [:, 0:1] keeps at most the first jet per event (events without a FatJet give an
    # empty entry); flatten the jagged result into the 1D array plt.hist expects.
    first_jet_pt = ak.flatten(events_dict[v]["qcd"].FatJet.pt[:, 0:1])
    plt.hist(first_jet_pt, bins=bins, histtype="step", density=True, label=v)
plt.xlabel("FatJet pT")
plt.legend(title="QCD")
plt.show()

In [None]:
# Normalised soft-drop mass distribution of the first FatJet of each event, v11 vs v12 QCD.
bins = np.linspace(0, 300, 31)
for v in ["v11", "v12"]:
    # [:, 0:1] keeps at most the first jet per event (events without a FatJet give an
    # empty entry); flatten the jagged result into the 1D array plt.hist expects.
    first_jet_msd = ak.flatten(events_dict[v]["qcd"].FatJet.msoftdrop[:, 0:1])
    plt.hist(first_jet_msd, bins=bins, histtype="step", density=True, label=v)
plt.xlabel("FatJet msoftdrop")
plt.legend(title="QCD")
plt.show()

In [None]:
# Display the loaded events, keyed by NanoAOD version and then by sample.
events_dict

In [None]:
# ParticleNet mass of the first FatJet of each event, per mass definition and sample:
#   "v11"          -> the particleNet_mass branch
#   "v12 JEC Mass" -> mass * particleNet_massCorr
#   "v12 Raw Mass" -> mass * (1 - rawFactor) * particleNet_massCorr
pnet_masses = {"v11": {}, "v12 JEC Mass": {}, "v12 Raw Mass": {}}

for key, events in events_dict["v11"].items():
    pnet_masses["v11"][key] = events.FatJet.particleNet_mass[:, 0:1]

for key, events in events_dict["v12"].items():
    fatjets = events.FatJet
    jec_mass = fatjets.mass * fatjets.particleNet_massCorr
    raw_mass = fatjets.mass * (1 - fatjets.rawFactor) * fatjets.particleNet_massCorr
    # [:, 0:1] keeps at most the first jet per event
    pnet_masses["v12 JEC Mass"][key] = jec_mass[:, 0:1]
    pnet_masses["v12 Raw Mass"][key] = raw_mass[:, 0:1]

In [None]:
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Apply the CMS style and the line/grid defaults used by the plots below.
# NOTE(review): the top of the notebook uses ["CMS", "firamath"]; only "CMS" is applied here.
hep.style.use("CMS")

formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
plt.rcParams.update(
    {
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)

# Plot title for each sample key.
labels = dict(qcd="QCD-HT2000", hh4b="ggF HH4b", data="Data")

In [None]:
# Set the font size before any figure is created so that every figure of the loop
# (including the first one) is built with the same settings.
plt.rcParams.update({"font.size": 24})

# One figure per sample, overlaying the ParticleNet mass definitions stored in pnet_masses.
for key in pnet_masses["v11"]:
    # QCD gets a wider mass range than the other samples
    bins = np.linspace(0, 300, 31) if key == "qcd" else np.linspace(0, 200, 31)
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    for v, samples in pnet_masses.items():
        masses = samples[key]
        ax.hist(ak.flatten(masses), bins=bins, histtype="step", density=True, label=v)

    ax.set_title(labels[key], x=0.45)
    ax.legend()
    # NOTE(review): data=True (and a luminosity) is used for the simulated samples
    # as well — confirm this labelling is intended.
    hep.cms.label(data=True, label="Internal", year="2022EE", lumi="21")
    plt.savefig(f"{plot_dir}/{key}.pdf", bbox_inches="tight")
    plt.show()
    # release the figure; otherwise one open figure per sample accumulates
    plt.close(fig)

In [None]:
# Normalised ParticleNet mass of the first FatJet of each event in the QCD files:
# v11 reads the particleNet_mass branch, v12 rebuilds it as mass * particleNet_massCorr.
# The events are read from events_dict; `qcdv11_events` / `qcdv12_events` are not
# defined by any cell of this section.
bins = np.linspace(0, 300, 31)
qcd_v11 = events_dict["v11"]["qcd"]
qcd_v12 = events_dict["v12"]["qcd"]
plt.hist(
    # [:, 0:1] is jagged (at most one jet per event); flatten it for plt.hist
    ak.flatten(qcd_v11.FatJet.particleNet_mass[:, 0:1]),
    bins=bins,
    histtype="step",
    density=True,
    label="v11",
)
plt.hist(
    ak.flatten((qcd_v12.FatJet.mass * qcd_v12.FatJet.particleNet_massCorr)[:, 0:1]),
    bins=bins,
    histtype="step",
    density=True,
    label="v12",
)
plt.xlabel("ParticleNet Mass")
plt.legend()
plt.show()