In [None]:
import numpy as np
import pandas as pd
from os import listdir
from os.path import exists
from typing import Union, List
from hist import Hist
from hist.intervals import clopper_pearson_interval
import vector

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import hist
import mplhep as hep

# Apply the mplhep "CMS" and "firamath" styles first so that the overrides below win.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter using math text, with power limits set to (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Notebook-wide matplotlib overrides, applied in a single call.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)


import sys

In [None]:
def check_selector(sample: str, selector: Union[str, List[str]]):
    """Tell whether ``sample`` is picked up by ``selector``.

    Args:
        sample: name of the sample to test.
        selector: a single pattern, or a list/tuple of patterns. A pattern
            starting with ``*`` matches when the remainder of the pattern occurs
            anywhere in ``sample``; any other pattern must be a prefix of
            ``sample``.

    Returns:
        ``True`` if at least one pattern matches, ``False`` otherwise (including
        for an empty list/tuple of patterns).
    """
    # normalise the single-pattern case so both forms share one code path
    patterns = selector if isinstance(selector, (list, tuple)) else [selector]

    def _matches(pattern):
        # leading "*" => substring match on the rest, otherwise prefix match
        if pattern.startswith("*"):
            return pattern[1:] in sample
        return sample.startswith(pattern)

    return any(_matches(pattern) for pattern in patterns)

In [None]:
# automatically reloads imported files on edits
# (loads IPython's autoreload extension and switches it to mode 2)
%load_ext autoreload
%autoreload 2

In [None]:
# Location of the input directory; one sub-directory per data-taking period is expected.
data_dir = "/eos/uscms/store/user/cmantill/bbbb/trigger_boosted/23Nov12_v11/"
# Period to process; also selects the entry of the mapping below.
y = "2022EE"
# label -> selector(s) understood by ``check_selector``
samples = {
    "2022EE": {
        # "hh4b": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_TSG_Pu60"],
        # "vbfhh4b": ["VBFHHto4B_CV_1_C2V_1_C3_1_TuneCP5_13p6TeV_madgraph-pythia8"],
        "vbfhh4b-c2v0": ["VBFHHto4B_CV_1_C2V_0_C3_1_TuneCP5_13p6TeV_madgraph-pythia8"],
    },
}[y]


# dictionary that will contain all information (from all samples)
full_samples_list = listdir(f"{data_dir}/{y}")
events_dict = {}
for label, selector in samples.items():
    print(selector)

    # non-empty dataframes of every sample directory matching this label
    loaded_frames = []
    for sample in full_samples_list:
        if not check_selector(sample, selector):
            continue

        parquet_path = f"{data_dir}/{y}/{sample}/parquet"
        if not exists(parquet_path):
            print(f"No parquet file for {sample}")
            continue

        events = pd.read_parquet(parquet_path, columns=None)
        if len(events) > 0:
            loaded_frames.append(events)

        print(f"Loaded {sample: <50}: {len(events)} entries")

    # labels for which nothing was loaded are left out of ``events_dict``
    if loaded_frames:
        events_dict[label] = pd.concat(loaded_frames)

In [None]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array for ``obj`` out of the dataframe columns.

    The ``{obj}Pt``, ``{obj}Phi`` and ``{obj}Eta`` columns are used together with
    a mass column: ``{obj}PNetMass`` when ``obj`` is ``"ak8FatJet"`` and
    ``{obj}Mass`` for every other object.

    Args:
        events: dataframe holding the columns listed above.
        obj: column-name prefix of the object.

    Returns:
        Whatever ``vector.array`` returns for the ``pt``/``phi``/``eta``/``M`` fields.
    """
    if obj == "ak8FatJet":
        mass_column = f"{obj}PNetMass"
    else:
        mass_column = f"{obj}Mass"

    components = {
        "pt": events[f"{obj}Pt"],
        "phi": events[f"{obj}Phi"],
        "eta": events[f"{obj}Eta"],
        "M": events[mass_column],
    }
    return vector.array(components)

In [None]:
# Trigger groups to study.
# key   -> short title, used as the histogram category name
# value -> (list of HLT column names that are OR-ed together, legend label)
# Legend labels list one short name per HLT path, in the same number as the paths,
# separated by " |\n ".
trigger_dict = {
    "Resolved": (
        [
            "QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
        ],
        "QuadPFJet",
    ),
    "HT": (["PFHT1050"], "HT1050"),
    "BoostedJet": (
        [
            "AK8PFJet425_SoftDropMass40",
        ],
        "PFJet425_MSD40",
    ),
    "BoostedDiJet": (
        [
            "AK8DiPFJet250_250_MassSD50",
            "AK8DiPFJet260_260_MassSD30",
        ],
        "DiPFJet250-MSD50 |\n DiPFJet260-MSD30",
    ),
    "BoostedHbb": (
        [
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
        ],
        # the path has a 250 threshold in its name, so the label says PFJet250 (not PFJet425)
        "PFJet250_MSD40_Xbb0p35",
    ),
    "Combined": (
        [
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "AK8PFJet425_SoftDropMass40",
            "AK8DiPFJet250_250_MassSD50",
            "AK8DiPFJet260_260_MassSD30",
        ],
        "PFJet250_MSD40_Xbb0p35 |\n PFJet425_MSD40 |\n DiPFJet250-MSD50 |\n DiPFJet260-MSD30",
    ),
    "Combined_noquad": (
        [
            "PFHT1050",
            "AK8DiPFJet250_250_MassSD50",
            "AK8DiPFJet260_260_MassSD30",
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "AK8PFJet425_SoftDropMass40",
        ],
        "PFJet250_MSD40_Xbb0p35 |\n PFJet425_MSD40 |\n DiPFJet250-MSD50 |\n DiPFJet260-MSD30 |\n HT1050",
    ),
    "Combined_all": (
        [
            "QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
            "PFHT1050",
            "AK8DiPFJet250_250_MassSD50",
            "AK8DiPFJet260_260_MassSD30",
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "AK8PFJet425_SoftDropMass40",
        ],
        "PFJet250_MSD40_Xbb0p35 |\n PFJet425_MSD40 |\n DiPFJet250-MSD50 |\n DiPFJet260-MSD30 |\n HT1050 |\n QuadPFJet",
    ),
    "Combined_ht": (
        [
            "PFHT1050",
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "AK8PFJet425_SoftDropMass40",
        ],
        "PFJet250_MSD40_Xbb0p35 |\n PFJet425_MSD40 |\n HT1050",
    ),
    "Combined_nodijet": (
        [
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "AK8PFJet425_SoftDropMass40",
        ],
        "PFJet250_MSD40_Xbb0p35 |\n PFJet425_MSD40",
    ),
}

In [None]:
# Axis definitions. ``ht_bins_fine`` is (number of bins, lower edge, upper edge).
# ht_bins_fine = (25, 200, 3000)
ht_bins_fine = (25, 200, 2000)
mhh_axis = hist.axis.Regular(40, 250, 1500, name="mhh", label=r"$m_{HH}$ [GeV]")
ht_axis = hist.axis.Regular(*ht_bins_fine, name="ht", label=r"HT [GeV]")
fjpt_axis = hist.axis.Regular(10, 250, 600, name="pt", label=r"fj pT$^0$ [GeV]")
# growing string axes: "cat" is "denominator" or a trigger title, "sample_name" the sample key
cat_axis = hist.axis.StrCategory([], name="cat", growth=True)
sample_axis = hist.axis.StrCategory([], name="sample_name", growth=True)

# one histogram per observable, each split by category and sample
hht = hist.Hist(ht_axis, cat_axis, sample_axis)
hmhh = hist.Hist(mhh_axis, cat_axis, sample_axis)
hpt = hist.Hist(fjpt_axis, cat_axis, sample_axis)

for sample, events in events_dict.items():
    # entry 0 of each column group -- presumably the leading object; TODO confirm
    ht = events["ht"][0]
    pt_0 = events["ak8FatJetPt"][0]
    msd_0 = events["ak8FatJetMsd"][0]
    mreg_0 = events["ak8FatJetPNetMass"][0]

    # invariant mass of the sum of the first two GenHiggs entries
    gen_higgs = make_vector(events, "GenHiggs")
    mhh = (gen_higgs[:, 0] + gen_higgs[:, 1]).m

    # ``selection`` only serves as the boolean template for the per-trigger masks below
    selection = pt_0 > 200
    passes_mass = (msd_0 > 40) | (mreg_0 > 40)
    sel_pt = selection & passes_mass

    # denominators: all events for HT and mHH, ``sel_pt`` events for the fat-jet pT
    hht.fill(ht=ht, cat="denominator", sample_name=sample)
    hmhh.fill(mhh=mhh, cat="denominator", sample_name=sample)
    hpt.fill(pt=pt_0[sel_pt], cat="denominator", sample_name=sample)

    # numerators: events firing at least one of the HLT paths of each trigger group
    for trigger_title, (triggers, trigger_label) in trigger_dict.items():
        fired = np.zeros_like(selection)
        for hlt in triggers:
            fired |= (events[hlt].values == 1).squeeze()

        hht.fill(ht=ht[fired], cat=trigger_title, sample_name=sample)
        hmhh.fill(mhh=mhh[fired], cat=trigger_title, sample_name=sample)
        hpt.fill(pt=pt_0[sel_pt & fired], cat=trigger_title, sample_name=sample)

In [None]:
def plot_1d(include_combined, hist_to_plot, sample, sample_label, selection_label="no selection"):
    """Draw the trigger efficiency of one sample as a function of one observable.

    For every entry of ``trigger_dict`` the ratio of the ``cat=<trigger title>``
    histogram to the ``cat="denominator"`` histogram is drawn on a new figure,
    with Clopper-Pearson intervals as error bars.

    Args:
        include_combined: if truthy, draw only the trigger groups whose title
            contains ``"Combined"``; otherwise draw only the other groups.
        hist_to_plot: histogram with one plotted axis plus ``cat`` and
            ``sample_name`` category axes.
        sample: value of the ``sample_name`` axis to plot.
        sample_label: human-readable sample name shown in the legend title.
        selection_label: text appended to ``sample_label`` in the legend title.
            The default keeps the previous hard-coded ``"no selection"``; pass
            another string for histograms that were filled with a selection.
    """
    fig, ax = plt.subplots(1, 1, figsize=(6, 4))

    # the denominator is the same for every trigger group: look it up once
    den = hist_to_plot[{"cat": "denominator", "sample_name": sample}]
    den_counts = den.view()

    for trigger_title, (triggers, trigger_label) in trigger_dict.items():
        is_combined = "Combined" in trigger_title
        # keep combined groups only when asked for them, and vice versa
        if bool(include_combined) != is_combined:
            continue

        num = hist_to_plot[{"cat": trigger_title, "sample_name": sample}]
        num_counts = num.view()

        # Bins with an empty denominator give 0/0 = NaN, which is what we want
        # to draw (nothing); only the floating-point warnings are silenced.
        with np.errstate(divide="ignore", invalid="ignore"):
            efficiency = num / den
            interval = clopper_pearson_interval(num_counts, den_counts)
            # distance from the central value to the interval bounds
            yerr = abs(interval - num_counts / den_counts)

        hep.histplot(
            efficiency,
            yerr=yerr,
            label=trigger_label,
            ax=ax,
            flow="none",
        )

    leg = ax.legend(fontsize=8, bbox_to_anchor=(1.05, 1), loc="upper left")
    leg.set_title(f"{sample_label} {selection_label}", prop={"size": 10})
    ax.set_ylabel("Signal Trig. Eff.")
    ax.set_ylim(0, 1)


# (sample key as used when filling the histograms, label shown in the legend title)
to_plot = [
    # ("hh4b", "ggF HH4b"),
    # ("vbfhh4b", "VBF HH4b"),
    ("vbfhh4b-c2v0", "VBF HH4b c2v=0"),
]
for sample, sample_label in to_plot:
    # first the individual trigger groups, then the "Combined*" ones
    for include_combined in [False, True]:
        plot_1d(include_combined, hht, sample, sample_label)
        plot_1d(include_combined, hmhh, sample, sample_label)
        # the fat-jet pT efficiency is only drawn for samples with "vbf" in their key
        if "vbf" in sample:
            plot_1d(include_combined, hpt, sample, sample_label)