In [None]:
from __future__ import annotations

from copy import deepcopy
from os import listdir
from pathlib import Path

import matplotlib.pyplot as plt
import mplhep as hep
import numpy as np
import pandas as pd
from hist import Hist
from hist.intervals import clopper_pearson_interval

from HH4b import hh_vars, plotting, utils
from HH4b.utils import ShapeVar

# Apply the CMS style sheet first: a style sheet overwrites every rcParam it
# defines, so notebook-level overrides only take effect when set afterwards.
plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 24})

In [None]:
# Automatically reload imported modules before each cell is executed, so edits
# to imported files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# base directory (relative path) under which the plots folder is created
MAIN_DIR = "../../../"

# output directory for every plot made in this notebook
plot_dir = Path(MAIN_DIR) / "plots" / "Triggers" / "23Nov15"
plot_dir.mkdir(parents=True, exist_ok=True)

## Load Data

In [None]:
# period to process; keep exactly one of these active
# year = "2022"
# year = "2022EE-noE"
year = "2022EE"

# data_dir = f"/eos/uscms/store/user/cmantill/bbbb/trigger_boosted/Aug15/"
data_dir = "/eos/uscms/store/user/cmantill/bbbb/trigger_boosted/23Nov9_v11_v11/"

# data sample selectors for each period; the ttbar selector is the same for all
data_runs_by_year = {
    "2022EE": ["Run2022E", "Run2022F", "Run2022G"],
    "2022EE-noE": ["Run2022F", "Run2022G"],
    "2022": ["Run2022C_single", "Run2022C", "Run2022D"],
}

samples = {
    "data": data_runs_by_year[year],
    "ttbar": ["TTtoLNu2Q"],
}

# sub-directory of data_dir to read from: the "noE" variant reads the 2022EE files
y = "2022EE" if year == "2022EE-noE" else year

In [None]:
# Columns to load. The parquet files are too big to read in full, so only a few
# columns can be loaded at a time without consuming much memory.
# NOTE(review): the second tuple entry is presumably the number of sub-columns
# per name - confirm against utils.format_columns.
load_columns = [
    (column, 1)
    for column in (
        "ak8FatJetPt",
        "ak8FatJetPNetXbb",
        "ak8FatJetMsd",
        # "ak8FatJetPNetMass",
        "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
        "AK8PFJet425_SoftDropMass40",
        "PFHT1050",
    )
]

In [None]:
# Read every sample directory under {data_dir}/{y} whose name matches one of a
# group's selectors, and concatenate the non-empty frames into one DataFrame
# per group ("data", "ttbar").
full_samples_list = listdir(f"{data_dir}/{y}")
events_dict = {}
for label, selector in samples.items():
    frames = []
    for sample in full_samples_list:
        if not utils.check_selector(sample, selector):
            continue

        parquet_path = f"{data_dir}/{y}/{sample}/parquet"
        if not Path(parquet_path).exists():
            print(f"No parquet file for {sample}")
            continue

        events = pd.read_parquet(
            parquet_path,
            # columns=utils.format_columns(load_columns),
        )
        if len(events) > 0:
            frames.append(events)

        print(f"Loaded {sample: <50}: {len(events)} entries")

    # a group without any non-empty sample gets no entry at all
    if frames:
        events_dict[label] = pd.concat(frames)

In [None]:
# display the columns available in the loaded data frame
events_dict["data"].columns

## Fill Histograms

Histogram variables

In [None]:
# Fine binning for the initial fill; the histograms are rebinned below as needed.
# NOTE(review): with reg=True each list is presumably
# [number of bins, low edge, high edge] - confirm against ShapeVar.
mass_bins = [44, 20, 240]
pt_bins = [160, 200, 1000]
txbb_bins = [100, 0, 1]

# one ShapeVar per histogram axis, in axis order
# "mreg": ShapeVar(var="mreg", label=r"AK8 Jet Regressed Mass (GeV)", bins=mass_bins),
shape_vars = {
    var: ShapeVar(var=var, label=label, bins=bins, reg=True)
    for var, label, bins in (
        ("msd", r"AK8 Jet SD Mass (GeV)", mass_bins),
        ("pt", r"AK8 Jet $p_T$ (GeV)", pt_bins),
        ("txbb", r"AK8 Jet $T_{Xbb}$", txbb_bins),
    )
}

# empty template histogram; it is copied before every fill
base_hist = Hist(*(shape_var.axis for shape_var in shape_vars.values()), storage="Double")

Triggers

In [None]:
from dataclasses import dataclass, field

from numpy.typing import ArrayLike


@dataclass
class Trigger:
    """A single trigger, identified by its column name and a display label."""

    # name of the column in the events DataFrame holding this trigger's decision
    name: str
    # short label; TriggerSet joins these into its own label
    label: str


@dataclass
class TriggerSet:
    """A group of triggers treated as one selection: the logical OR of its members.

    Attributes:
        triggers: the triggers in the set.
        label: set in ``__post_init__``; the member labels joined by a newline
            followed by ``"| "``.
    """

    triggers: list[Trigger]

    def __post_init__(self):
        self.label = "\n| ".join(trigger.label for trigger in self.triggers)

    def get_sel(self, events: pd.DataFrame) -> ArrayLike:
        """Return a boolean array over events, True where any trigger in the set fired.

        Args:
            events: frame with one column (or one single-column block) per
                trigger name.

        Returns:
            Boolean array with one entry per event. An empty set selects nothing.
        """
        if not self.triggers:
            # the OR of no triggers selects nothing
            return np.zeros(len(events), dtype=bool)

        # shape (n_triggers, n_events) when a column comes back as a Series, or
        # (n_triggers, n_events, 1) when it comes back as a one-column DataFrame
        fired = np.asarray([events[trigger.name] for trigger in self.triggers])
        passed = fired.astype(bool).any(axis=0)

        # Drop only the trailing length-1 axis. A bare ``squeeze()`` would also
        # collapse the event axis when there is exactly one event, giving a 0-d
        # value that cannot be used as a mask.
        if passed.ndim > 1 and passed.shape[-1] == 1:
            passed = passed[..., 0]

        return passed

In [None]:
# individual triggers, keyed by a short name
triggers = {
    key: Trigger(name=name, label=label)
    for key, name, label in (
        ("HT", "PFHT1050", "HT1050"),
        ("BoostedJet", "AK8PFJet425_SoftDropMass40", "PFJet425_MSD40"),
        (
            "BoostedHbb",
            "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
            "PFJet250_MSD40_Xbb0p35",
        ),
    )
}

# triggers and combinations of triggers whose efficiency we're measuring:
# every individual trigger on its own first ...
trigger_dict = {key: TriggerSet([trigger]) for key, trigger in triggers.items()}
# ... followed by the combinations
trigger_dict["BoostedJetHbb"] = TriggerSet([triggers["BoostedJet"], triggers["BoostedHbb"]])
trigger_dict["HTBoostedJetHbb"] = TriggerSet(
    [triggers["HT"], triggers["BoostedJet"], triggers["BoostedHbb"]]
)

Fill

In [None]:
# event groups to fill, mapped to their display labels
fill_events = {
    "data": "Data",
    "ttbar": r"$t\bar{t}$",
}

# hists[event group][key], where key is "den" or a key of trigger_dict
hists = {}

for ev_key, ev_label in fill_events.items():
    events = events_dict[ev_key]

    # pre-selection on sub-column 0 of the AK8 jet pT and soft-drop mass
    jet_pt = events["ak8FatJetPt"][0].to_numpy()
    jet_msd = events["ak8FatJetMsd"][0].to_numpy()
    events = events[(jet_pt >= 250) & (jet_msd >= 20)]

    # values for each histogram axis, taken from the pre-selected events
    fill_vars = {
        key: events[column][0]
        for key, column in (
            ("msd", "ak8FatJetMsd"),
            ("pt", "ak8FatJetPt"),
            ("txbb", "ak8FatJetPNetXbb"),
        )
    }

    # denominator (i.e. all events passing pre-selection)
    filled = {"den": base_hist.copy().fill(**fill_vars)}

    # numerators: the same fill restricted to events passing each trigger set
    for trigger_key, trigger_set in trigger_dict.items():
        trig_sel = trigger_set.get_sel(events)
        passing_vars = {key: var[trig_sel] for key, var in fill_vars.items()}
        filled[trigger_key] = base_hist.copy().fill(**passing_vars)
        # TODO: save errors from Clopper Pearson as well

    hists[ev_key] = filled

## Plotting

In [None]:
def plot_all_wrapper(tbins, xbins, ybins, w, year, plot_dir, name, vmax=1, bins="txbb", show=False):
    """Wrapper for plotting efficiencies in all 4 Txbb bins"""
    plt.rcParams.update({"font.size": 36})
    fig, axs = plt.subplots(2, 2, figsize=(18 * 2, 17 * 2), constrained_layout=True)

    for k in range(4):
        row = k // 2
        col = k % 2
        ax = axs[row][col]

        match bins:
            case "txbb":
                title = rf"{tbins[k]:.2f} < $T_{{Xbb}}$ < {tbins[k + 1]:.2f}"
            case "pt":
                title = rf"{tbins[k]:.0f} < Jet 2 $p_T$ < {tbins[k + 1]:.0f} GeV"

        mesh = plotting.mesh2d(
            xbins, ybins, w[..., k], year, vmax=vmax, ax=ax, title=title, fontsize=28
        )

    for i in range(2):
        fig.colorbar(mesh, ax=axs[i].ravel().tolist(), pad=0.01)

    plt.savefig(f"{plot_dir}/{name}.pdf", bbox_inches="tight")

    if show:
        plt.show()
    else:
        plt.close()

### 2D Plots

In [None]:
# sub-directory of plot_dir for the 2D plots (plot_dir itself must already exist)
plot2d_dir = plot_dir / "2D"
plot2d_dir.mkdir(exist_ok=True)

In [None]:
# coarser bin edges for the 2D plots, one entry per histogram axis
coarse_bins = {
    "msd": np.linspace(20, 240, 12),
    "pt": [250, 275, 300, 325, 350, 375, 400, 450, 500, 600, 800],
    "txbb": [0.0, 0.8, 0.95, 0.98, 1.0],
}

# work on a copy so the finely binned histograms stay available
coarse_hists = deepcopy(hists)

for ev_key in fill_events:
    ev_hists = coarse_hists[ev_key]

    # rebin every filled histogram along each axis in turn
    for hkey in hists[ev_key]:
        rebinned = ev_hists[hkey]
        for var, bins in coarse_bins.items():
            rebinned = utils.rebin_hist(rebinned, var, bins)
        ev_hists[hkey] = rebinned

    # efficiency per bin: events passing the trigger set over all pre-selected events
    for trigger_key in trigger_dict:
        ev_hists[f"{trigger_key}_effs"] = ev_hists[trigger_key] / ev_hists["den"]

In bins of Txbb

In [None]:
# only trigger sets whose key contains "Hbb" are split into txbb bins
hbb_trigger_keys = [key for key in trigger_dict if "Hbb" in key]

for ev_key in fill_events:
    for trigger_key in hbb_trigger_keys:
        effs_hist = coarse_hists[ev_key][f"{trigger_key}_effs"]
        w, msdbins, ptbins, txbbbins = effs_hist.to_numpy()

        # all four txbb bins on a single 2x2 figure
        plot_all_wrapper(
            txbbbins,
            msdbins,
            ptbins,
            w,
            year,
            plot2d_dir,
            f"{year}_all_txbb_{ev_key}_{trigger_key}",
            show=False,
        )

        # plus one stand-alone figure per txbb bin
        tplot_dir = plot2d_dir / f"{trigger_key}_txbb_bins"
        tplot_dir.mkdir(exist_ok=True)

        for k in range(4):
            low_edge, high_edge = txbbbins[k], txbbbins[k + 1]
            plotting.mesh2d(
                msdbins,
                ptbins,
                w[..., k],
                year,
                plot_dir=tplot_dir,
                name=f"{year}_txbb_{coarse_bins['txbb'][k]}_{ev_key}_{trigger_key}",
                title=rf"{low_edge:.2f} < $T_{{Xbb}}$ < {high_edge:.2f}",
                title_params={"x": 0.315, "y": 1.007, "fontsize": 36},
                show=False,
            )

Sum over Txbb

In [None]:
# efficiencies in (msd, pt) after summing the last (txbb) axis
for ev_key in fill_events:
    ev_hists = coarse_hists[ev_key]
    den = ev_hists["den"][..., sum]

    for trigger_key in trigger_dict:
        num = ev_hists[trigger_key][..., sum]
        w, msdbins, ptbins = (num / den).to_numpy()
        plotting.mesh2d(
            msdbins,
            ptbins,
            w,
            year,
            plot_dir=plot2d_dir,
            name=f"{year}_sum_txbb_{ev_key}_{trigger_key}",
            show=False,
        )

#### Checking mass dip

In [None]:
# Binning for the mass-dip check: 5 GeV steps in msd and pt, one txbb bin from 0.9 to 1.
# Kept under its own name so the `coarse_bins` used by the txbb-binned 2D plots
# is not overwritten; otherwise re-running that cell after this one would index
# a two-element txbb list with k up to 3.
mass_check_bins = {
    "msd": np.arange(20, 241, 5),
    "pt": np.arange(300, 501, 5),
    "txbb": [0.9, 1.0],
}

# work on a copy so the finely binned histograms stay available
mass_check_hists = deepcopy(hists)

for ev_key in fill_events:
    ev_hists = mass_check_hists[ev_key]

    for hkey in hists[ev_key]:
        rebinned = utils.multi_rebin_hist(ev_hists[hkey], mass_check_bins, flow=False)
        ev_hists[hkey] = rebinned[..., sum]  # sum over txbb axis

    # efficiency per (msd, pt) bin for every trigger set
    for trigger_key in trigger_dict:
        ev_hists[f"{trigger_key}_effs"] = ev_hists[trigger_key] / ev_hists["den"]

In [None]:
# mass-check efficiency map for the BoostedHbb trigger only, one per event group
for ev_key in fill_events:
    print(ev_key)

    trigger_key = "BoostedHbb"
    print(trigger_key)

    effs_hist = mass_check_hists[ev_key][f"{trigger_key}_effs"]
    w, msdbins, ptbins = effs_hist.to_numpy()
    plotting.mesh2d(
        msdbins,
        ptbins,
        w,
        year,
        print_vals=False,
        title=r"$T_{Xbb} \geq 0.9$",
        plot_dir=plot2d_dir,
        name=f"{year}_mass_check_{ev_key}_{trigger_key}",
        data=ev_key == "data",
        show=True,
    )

### 1D Plots

In [None]:
# sub-directory of plot_dir for the 1D plots (plot_dir itself must already exist)
plot1d_dir = plot_dir / "1D"
plot1d_dir.mkdir(exist_ok=True)

In [None]:
# Binning used for the 1D efficiency plots.
# NOTE(review): with reg=False the bins are presumably explicit bin edges -
# confirm against ShapeVar.
plot_vars = {
    var: ShapeVar(var=var, label=label, bins=bins, reg=False)
    for var, label, bins in (
        ("msd", r"AK8 Jet SD Mass (GeV)", np.arange(20, 241, 10)),
        (
            "pt",
            r"AK8 Jet $p_T$ (GeV)",
            [250, 275, 300, 325, 350, 375, 400, 425, 450, 475, 500, 550, 600, 700, 800, 1000],
        ),
        ("txbb", r"AK8 Jet $T_{Xbb}$", np.arange(0, 1.001, 0.05)),
    )
}


# LaTeX symbol for each variable, used when building selection labels
var_label_map = dict(msd=r"m_{SD}", pt=r"p_T", txbb=r"T_{Xbb}")


@dataclass
class TriggerSelection:
    """A selection on the histogram variables plus the variables to plot after applying it.

    Attributes:
        selection: maps a variable key to a sequence of bounds. Only the first
            element (``sel[0]``) is used in the generated label and name.
        name: identifier used in output file names. Built from the selection
            as ``{var}_{sel[0]}`` parts joined by ``_`` when omitted.
        label: legend title. Built from the selection as
            ``$<symbol> \\geq <sel[0]>$`` parts joined by `` & `` when omitted.
        plot_vars: variable keys on input (default: every key of ``shape_vars``).
            Replaced in ``__post_init__`` by the matching entries of the
            module-level ``plot_vars`` dict, in that dict's order.
    """

    selection: dict
    name: str | None = None
    label: str | None = None
    plot_vars: list[str] = field(default_factory=lambda: list(shape_vars.keys()))

    def __post_init__(self):
        # swap the requested keys for the matching module-level plot_vars entries
        self.plot_vars = [
            shape_var for key, shape_var in plot_vars.items() if key in self.plot_vars
        ]

        if self.label is None:
            # raw f-string: "\g" in a normal string literal is an invalid escape sequence
            self.label = " & ".join(
                rf"${var_label_map[var]} \geq {sel[0]}$" for var, sel in self.selection.items()
            )

        if self.name is None:
            self.name = "_".join(f"{var}_{sel[0]}" for var, sel in self.selection.items())

In [None]:
def get_effs_errs(hist_den, hist_num, pvar, pbins):
    """Compute the efficiency in ``pvar`` with bins ``pbins`` and its Clopper-Pearson errors.

    Both histograms are projected onto ``pvar`` and rebinned to ``pbins``
    before the numerator is divided by the denominator.

    Returns:
        ``(heffs, errs)``: the efficiency histogram with NaN values replaced
        by 0, and the absolute differences between the efficiencies and the
        Clopper-Pearson interval bounds.
    """
    den_1d = utils.rebin_hist(hist_den.project(pvar), pvar, pbins)
    num_1d = utils.rebin_hist(hist_num.project(pvar), pvar, pbins)

    heffs = num_1d / den_1d
    # write the cleaned values back into the histogram's values
    cleaned = np.nan_to_num(heffs.values(), nan=0)
    heffs.values()[...] = cleaned

    cp_bounds = clopper_pearson_interval(num_1d.values(), den_1d.values())
    errs = np.abs(heffs.values() - cp_bounds)
    return heffs, errs


# selections to apply and variables to plot for those selections
selections = [
    TriggerSelection(selection={"txbb": [0.9, 1], "pt": [300, 1000]}, plot_vars=["msd"]),
    # TriggerSelection(selection={"txbb": [0.9, 1], "pt": [400, 1000]}, plot_vars=["msd"]),
    # TriggerSelection(selection={"txbb": [0.9, 1], "msd": [60, 240]}, plot_vars=["pt"]),
    # TriggerSelection(selection={"pt": [300, 1000], "msd": [60, 240]}, plot_vars=["txbb"]),
    # TriggerSelection(selection={"pt": [300, 1000]}, plot_vars=["msd"]),
]

# which set of triggers to plot on each plot
plot_triggers = {
    "jethbbcombined": ["BoostedHbb", "BoostedJet", "BoostedJetHbb"],
    # "jethbbhtcombined": ["BoostedHbb", "BoostedJet", "HT", "BoostedJetHbb", "HTBoostedJetHbb"],
}

# Set the font size once, before any figure is created, so that every plot
# (including the first) is drawn with it.
plt.rcParams.update({"font.size": 24})

# One figure per (event group, selection, trigger group, plotted variable).
# Every entry of plot_triggers is plotted.
for ev_key, ev_label in fill_events.items():
    print(ev_key)
    for sel in selections:
        # denominator restricted to the selection; shared by all triggers below
        hist_den = utils.multi_rebin_hist(hists[ev_key]["den"], sel.selection, flow=False)
        for pt_key, ptriggers in plot_triggers.items():
            for pvar in sel.plot_vars:
                fig, ax = plt.subplots(1, 1, figsize=(12, 8))

                # overlay the efficiency of each trigger set in this group
                for ptrigger in ptriggers:
                    hist_num = utils.multi_rebin_hist(
                        hists[ev_key][ptrigger], sel.selection, flow=False
                    )
                    heffs, herrs = get_effs_errs(hist_den, hist_num, pvar.var, pvar.bins)
                    hep.histplot(
                        heffs,
                        label=trigger_dict[ptrigger].label,
                        yerr=herrs,
                        ax=ax,
                        histtype="errorbar",
                    )

                ax.grid(axis="y")
                ax.set_ylim([0, 1.01])
                ax.set_xlim([pvar.bins[0], pvar.bins[-1]])
                ax.set_ylabel("Efficiency")
                ax.set_xlabel(pvar.label, fontsize=20)

                # the selection is shown as the legend title
                leg = ax.legend()
                leg.set_title(sel.label, prop={"size": 20})
                leg.get_title().set_multialignment("center")

                # NOTE(review): LUMI is looked up with `year`, not the directory
                # key `y` - confirm it has an entry for "2022EE-noE" before
                # selecting that period.
                hep.cms.label(
                    ax=ax,
                    data=ev_key == "data",
                    year=year,
                    lumi=round(hh_vars.LUMI[year] / 1e3),
                    com="13.6",
                )
                fig.savefig(
                    f"{plot1d_dir}/{sel.name}_{ev_key}_{pt_key}_{pvar.var}.pdf", bbox_inches="tight"
                )
                plt.show()
                # release the figure so repeated runs do not accumulate open figures
                plt.close(fig)