In [12]:
import coffea
from hist import Hist
from hist.intervals import clopper_pearson_interval

from os import listdir
from pathlib import Path
from typing import Union, List
import pandas as pd
import numpy as np

import math
from HH4b import utils, hh_vars
from HH4b.utils import ShapeVar

import os

import matplotlib.pyplot as plt
import mplhep as hep

# Global plotting defaults for every figure produced in this notebook.
# NOTE(review): the CMS style sheet is applied *after* the font-size override, so if that
# style defines its own font.size the 24pt setting here is replaced — confirm the order.
plt.rcParams.update({"font.size": 24})
plt.style.use(hep.style.CMS)

In [15]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

In [7]:
# Base directory (relative path) under which the plot folder is created.
MAIN_DIR = "../../../"

# Output folder handed to the plotting helpers below; it is created up front
# (parents included, no error if it already exists) so saving figures cannot fail
# on a missing directory.
plot_dir = Path(f"{MAIN_DIR}/plots/Triggers/23Nov11")
plot_dir.mkdir(parents=True, exist_ok=True)

## Load Data

In [67]:
# Luminosity per data-taking period. mesh2d looks the value up with `year` and
# passes it, truncated with int(), as the `lumi` argument of the CMS label.
# presumably in fb^-1 — TODO confirm units.
# NOTE(review): there is no entry for "2022EE-noE", although the notebook offers
# that value as a choice for `year`.
LUMI = {
    "2022": 7.97,
    "2022EE": 26.3,
}

In [26]:
# Data-taking period to process; pick one by (un)commenting.
# year = "2022"
# year = "2022EE-noE"
year = "2022EE"

# Directory holding one sub-folder per period.
data_dir = "/eos/uscms/store/user/cmantill/bbbb/trigger_boosted/Aug15/"
# data_dir = f"/eos/uscms/store/user/cmantill/bbbb/trigger_boosted/Nov7_v12/"

# label -> sample-name selectors, for every supported period
samples_by_year = {
    "2022EE": {
        "data": ["Run2022E", "Run2022F", "Run2022G"],
        "ttbar": ["TTtoLNu2Q"],
    },
    "2022EE-noE": {
        "data": ["Run2022F", "Run2022G"],
        "ttbar": ["TTtoLNu2Q"],
    },
    "2022": {
        "data": ["Run2022C_single", "Run2022C", "Run2022D"],
        "ttbar": ["TTtoLNu2Q"],
    },
}
samples = samples_by_year[year]

# `y` is the sub-folder name under data_dir: the "noE" variant reads from the
# same folder as "2022EE" and only differs in which runs are selected.
y = "2022EE" if year == "2022EE-noE" else year

In [31]:
# Columns to load.
# The parquet files are too big, so we can only load a few columns at a time without
# consuming too much memory.
# Each entry is a (column name, integer) pair that is passed through
# utils.format_columns before reading; presumably the integer is the number of
# sub-columns (e.g. leading jets) to keep — TODO confirm against utils.format_columns.
load_columns = [
    ("ak8FatJetPt", 1),
    ("ak8FatJetPNetXbb", 1),
    ("ak8FatJetMsd", 1),
    ("AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35", 1),
    ("AK8PFJet425_SoftDropMass40", 1),
]

In [32]:
# Read every sample folder matching one of the configured selectors and merge the
# per-sample frames into a single DataFrame per label.
full_samples_list = listdir(f"{data_dir}/{y}")
events_dict = {}
for label, selector in samples.items():
    loaded_frames = []
    for sample in full_samples_list:
        if not utils.check_selector(sample, selector):
            continue

        parquet_dir = f"{data_dir}/{y}/{sample}/parquet"
        if not Path(parquet_dir).exists():
            print(f"No parquet file for {sample}")
            continue

        events = pd.read_parquet(parquet_dir, columns=utils.format_columns(load_columns))
        # empty samples are reported below but not kept
        if len(events) > 0:
            loaded_frames.append(events)

        print(f"Loaded {sample: <50}: {len(events)} entries")

    # a label with no non-empty sample gets no entry in events_dict at all
    if loaded_frames:
        events_dict[label] = pd.concat(loaded_frames)

Loaded Run2022E                                          : 205074 entries
Loaded Run2022F                                          : 916121 entries
Loaded Run2022G                                          : 176783 entries
Loaded TTtoLNu2Q                                         : 6880444 entries


## Fill Histograms

Histogram variables

In [87]:
# Bin specifications for the three histogram axes.
# mass_bins is used with reg=True below: presumably [number of bins, low edge, high edge]
# — TODO confirm against ShapeVar.
mass_bins = [11, 20, 240]
# pt_bins and txbb_bins are used with reg=False below: presumably explicit bin edges — TODO confirm.
pt_bins = [250, 275, 300, 325, 350, 375, 400, 450, 500, 600, 800]
txbb_bins = [0.0, 0.8, 0.95, 0.98, 1.0]

# One ShapeVar per histogram axis; the dict order (msd, pt, txbb) fixes the axis order
# of base_hist, and the keys are the names used when filling.
shape_vars = {
    # "mreg": ShapeVar(var="mreg", label=r"AK8 Jet Regressed Mass (GeV)", bins=mass_bins),
    "msd": ShapeVar(var="msd", label=r"AK8 Jet SD Mass (GeV)", bins=mass_bins, reg=True),
    "pt": ShapeVar(var="pt", label=r"AK8 Jet $p_T$ (GeV)", bins=pt_bins, reg=False),
    "txbb": ShapeVar(var="txbb", label=r"AK8 Jet $T_{Xbb}$", bins=txbb_bins, reg=False),
}

# Empty template histogram with one axis per shape variable; it is copied before every fill.
base_hist = Hist(*[shape_var.axis for shape_var in shape_vars.values()], storage="Double")

Triggers

In [88]:
from dataclasses import dataclass
from numpy.typing import ArrayLike


@dataclass
class Trigger:
    """One trigger: the events column holding its decision and a short display label."""

    name: str  # column used to index the events DataFrame
    label: str  # short text joined into TriggerSet.label


@dataclass
class TriggerSet:
    """A group of triggers whose decisions are combined into one selection."""

    triggers: list[Trigger]

    def __post_init__(self):
        # combined display label, e.g. "A | B"
        member_labels = (trig.label for trig in self.triggers)
        self.label = " | ".join(member_labels)

    def get_sel(self, events: pd.DataFrame) -> ArrayLike:
        """Return a boolean array that is True where the summed trigger columns are non-zero.

        Args:
            events: frame containing one column per ``Trigger.name`` in this set.

        Returns:
            The per-event sum over all trigger columns, squeezed and cast to bool.
        """
        decisions = [events[trig.name] for trig in self.triggers]
        n_fired = np.sum(decisions, axis=0)
        return n_fired.squeeze().astype(bool)

In [89]:
# Individual triggers: `name` is the column in the events frame, `label` the short display name.
triggers = {
    "BoostedJet": Trigger(
        name="AK8PFJet425_SoftDropMass40",
        label="PFJet425_MSD40",
    ),
    "BoostedHbb": Trigger(
        name="AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
        label="PFJet250_MSD40_Xbb0p35",
    ),
}

# triggers and combinations of triggers whose efficiency we're measuring:
# first every trigger on its own ...
trigger_dict = {key: TriggerSet([trigger]) for key, trigger in triggers.items()}
# ... then the combinations
trigger_dict["BoostedJetHbb"] = TriggerSet([triggers["BoostedJet"], triggers["BoostedHbb"]])

Fill

In [90]:
# sample key -> display label
fill_events = {
    "data": "Data",
    "ttbar": r"$t\bar{t}$",
}

# hists[sample]["den"], hists[sample][trigger] and hists[sample][f"{trigger}_effs"]
hists = {}

for ev_key in fill_events:
    all_events = events_dict[ev_key]

    # pre-selection on the first jet column: pT >= 250 and soft-drop mass >= 20
    passes_pt = all_events["ak8FatJetPt"][0].to_numpy() >= 250
    passes_msd = all_events["ak8FatJetMsd"][0].to_numpy() >= 20
    events = all_events[passes_pt & passes_msd]

    fill_vars = {
        "msd": events["ak8FatJetMsd"][0],
        "pt": events["ak8FatJetPt"][0],
        "txbb": events["ak8FatJetPNetXbb"][0],
    }

    # denominator (i.e. all events passing pre-selection)
    denominator = base_hist.copy().fill(**fill_vars)
    sample_hists = {"den": denominator}

    # numerator and efficiency for each set of triggers
    for trigger_key, trigger_set in trigger_dict.items():
        fired = trigger_set.get_sel(events)
        numerator = base_hist.copy().fill(**{key: var[fired] for key, var in fill_vars.items()})
        sample_hists[trigger_key] = numerator
        sample_hists[f"{trigger_key}_effs"] = numerator / denominator
        # TODO: save errors from Clopper Pearson as well

    hists[ev_key] = sample_hists

## Plotting

In [118]:
def mesh2d(
    xbins,
    ybins,
    vals,
    year,
    vmax=1,
    ax=None,
    title=None,
    title_params=None,
    xlabel="AK8 Jet SD Mass [GeV]",
    ylabel=r"AK8 Jet $p_T$ [GeV]",
    plot_dir="",
    name="",
    show=False,
    data=True,
    fontsize=28,
):
    """Plot 2D trigger efficiencies as a colour mesh with the value printed in every cell.

    Args:
        xbins: bin edges along x (length ``vals.shape[0] + 1``).
        ybins: bin edges along y (length ``vals.shape[1] + 1``).
        vals: 2D numpy array indexed as ``vals[x_bin, y_bin]``; NaN cells get no text.
        year: period shown in the CMS label; also the key used to look up ``LUMI``.
            A period without a ``LUMI`` entry is labelled without a luminosity value.
        vmax: upper end of the colour scale (the lower end is fixed at 0).
        ax: existing axes to draw on. If given, nothing is saved or shown here and the
            mesh is returned so the caller can attach a colour bar.
        title: axes title.
        title_params: keyword arguments for ``ax.set_title``; defaults to
            ``{"x": 0.35, "y": 1.005}``.
        xlabel: x-axis label.
        ylabel: y-axis label.
        plot_dir: directory for the saved PDF (only used when ``ax`` is None); an empty
            value means the current working directory.
        name: file name without extension; nothing is saved if empty.
        show: show the figure instead of closing it (only used when ``ax`` is None).
        data: forwarded to ``hep.cms.label``; pass False for simulated samples.
        fontsize: font size of the per-cell text.

    Returns:
        The ``pcolormesh`` artist when ``ax`` was supplied, otherwise None.
    """
    in_ax = ax is not None
    if not in_ax:
        fig, ax = plt.subplots(1, 1, figsize=(18, 17))

    mesh = ax.pcolormesh(xbins, ybins, vals.T, cmap="turbo", vmin=0, vmax=vmax)
    for i in range(len(ybins) - 1):
        for j in range(len(xbins) - 1):
            val = vals[j, i]
            if math.isnan(val):
                continue

            # black text in the middle of the colour scale, white within 10% of either end
            text_color = "black" if 0.1 * vmax < val < 0.9 * vmax else "white"
            ax.text(
                (xbins[j] + xbins[j + 1]) / 2,
                (ybins[i] + ybins[i + 1]) / 2,
                val.round(2),
                color=text_color,
                ha="center",
                va="center",
                fontsize=fontsize,
            )

    title_params = {"x": 0.35, "y": 1.005} if title_params is None else title_params
    ax.set_title(title, **title_params)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Not every selectable period has a LUMI entry (e.g. "2022EE-noE"); omit the value
    # in that case instead of raising KeyError or printing a luminosity that does not apply.
    # NOTE(review): int() truncates (7.97 -> 7); confirm truncation rather than rounding is intended.
    lumi = LUMI.get(year)
    hep.cms.label(
        ax=ax,
        data=data,
        year=year,
        lumi=None if lumi is None else int(lumi),
        com="13.6",
    )

    if in_ax:
        return mesh

    fig.colorbar(mesh, ax=ax, pad=0.01)

    if name:
        # Path join so the default plot_dir="" saves next to the notebook rather than
        # resolving to "/<name>.pdf" at the filesystem root.
        fig.savefig(Path(plot_dir) / f"{name}.pdf", bbox_inches="tight")

    # operate on this figure explicitly, not on whichever figure happens to be current
    if show:
        plt.show()
    else:
        plt.close(fig)


def plot_all_wrapper(tbins, xbins, ybins, w, year, plot_dir, name, vmax=1, bins="txbb"):
    """Plot the efficiencies of the first four bins of a third axis on one 2x2 figure.

    Args:
        tbins: edges of the axis being split into panels; at least 5 edges are needed.
        xbins: bin edges along x, forwarded to ``mesh2d``.
        ybins: bin edges along y, forwarded to ``mesh2d``.
        w: 3D array of efficiencies; ``w[..., k]`` is drawn in panel ``k``.
        year: period forwarded to ``mesh2d`` for the CMS label.
        plot_dir: directory the PDF is written to.
        name: file name without extension.
        vmax: upper end of the shared colour scale.
        bins: which axis ``tbins`` describes, ``"txbb"`` or ``"pt"``; selects the panel titles.

    Raises:
        ValueError: if ``bins`` is neither ``"txbb"`` nor ``"pt"``.

    The figure is saved but left open; closing it is up to the caller.
    """
    # Validate before touching matplotlib: previously an unknown value only failed inside
    # the panel loop with an UnboundLocalError on `title`, after the figure was created.
    if bins == "txbb":

        def panel_title(lo, hi):
            return rf"{lo:.2f} < $T_{{Xbb}}$ < {hi:.2f}"

    elif bins == "pt":

        def panel_title(lo, hi):
            return rf"{lo:.0f} < Jet 2 $p_T$ < {hi:.0f} GeV"

    else:
        raise ValueError(f"Unsupported bins={bins!r}; expected 'txbb' or 'pt'")

    # Scope the larger font to this figure so it does not leak into every later plot.
    with plt.rc_context({"font.size": 36}):
        fig, axs = plt.subplots(2, 2, figsize=(18 * 2, 17 * 2), constrained_layout=True)

        # axs.flat walks the grid row by row, i.e. panel k sits at (k // 2, k % 2)
        for k, ax in enumerate(axs.flat):
            mesh = mesh2d(
                xbins,
                ybins,
                w[..., k],
                year,
                vmax=vmax,
                ax=ax,
                title=panel_title(tbins[k], tbins[k + 1]),
                fontsize=28,
            )

        # one colour bar per row; all panels share vmin/vmax so any mesh can be used
        for row_axes in axs:
            fig.colorbar(mesh, ax=row_axes.ravel().tolist(), pad=0.01)

        fig.savefig(Path(plot_dir) / f"{name}.pdf", bbox_inches="tight")

### 2D Plots

All txbb bins

In [125]:
# One efficiency map per sample, trigger set and txbb bin.
for ev_key in fill_events:
    # only the "data" sample is real data; everything else must be labelled as simulation
    is_data = ev_key == "data"

    for trigger_key in trigger_dict:
        w, msdbins, ptbins, txbbbins = hists[ev_key][f"{trigger_key}_effs"].to_numpy()
        # plot_all_wrapper(txbbbins, msdbins, ptbins, w, year, plot_dir, f"{year}_all_txbb_{ev_key}_{trigger_key}")
        # plt.close()

        # take the number of bins and their edges from the histogram itself, so the file
        # name and the title always describe the slice that is actually drawn
        for k in range(len(txbbbins) - 1):
            txbb_lo, txbb_hi = txbbbins[k], txbbbins[k + 1]
            mesh2d(
                msdbins,
                ptbins,
                w[..., k],
                year,
                plot_dir=plot_dir,
                # float() keeps the plain "0.8"-style formatting in the file name
                name=f"{year}_txbb_{float(txbb_lo)}_{ev_key}_{trigger_key}",
                title=rf"{txbb_lo:.2f} < $T_{{Xbb}}$ < {txbb_hi:.2f}",
                title_params={"x": 0.315, "y": 1.007, "fontsize": 36},
                show=False,
                data=is_data,
            )

Sum over txbb

In [104]:
# One efficiency map per sample and trigger set, with the txbb axis summed out.
for ev_key in fill_events:
    # only the "data" sample is real data; everything else must be labelled as simulation
    is_data = ev_key == "data"

    # sum over the last (txbb) axis before dividing, so the ratio is taken on the totals
    den = hists[ev_key]["den"][..., sum]
    for trigger_key in trigger_dict:
        num = hists[ev_key][trigger_key][..., sum]
        effs = num / den
        w, msdbins, ptbins = effs.to_numpy()
        mesh2d(
            msdbins,
            ptbins,
            w,
            year,
            plot_dir=plot_dir,
            name=f"{year}_sum_txbb_{ev_key}_{trigger_key}",
            show=False,
            data=is_data,
        )