# Postprocessing

Makes control plots and templates.

Authors: Raghav Kansal, Cristina Suarez

In [1]:
from __future__ import annotations

import os
import pickle
from copy import deepcopy
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import mplhep as hep
import numpy as np
import pandas as pd
from hist import Hist

from HH4b import plotting, postprocessing, utils
from HH4b.hh_vars import LUMI, bg_keys, data_key, samples, sig_keys
from HH4b.postprocessing import Region
from HH4b.utils import CUT_MAX_VAL, ShapeVar

# Apply the CMS style from mplhep to every figure made in this notebook.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Scalar tick formatter rendered with math text; power-of-ten notation is used
# for values outside the 1e-3 .. 1e3 range.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Base font size for the plots (some cells below raise it again for large figures).
plt.rcParams.update({"font.size": 16})

In [2]:
# IPython magics: automatically reload imported modules before running a cell,
# so that edits to the imported files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
MAIN_DIR = "../../../"
# Directory holding the skimmed parquet files (read below as {path_to_dir}/<year>/<sample>/parquet).
# path_to_dir = f"{MAIN_DIR}/../data/skimmer/Oct26/"
path_to_dir = "/eos/uscms/store/user/ddiaz/bbbb/skimmer/24Mar31_v12_semilep-tt"
# path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/skimmer/24Feb1_v12_semilep-tt"
year = "All"

# Make the plot and template directories.
# Path.mkdir is used instead of shelling out to `mkdir -p` so that a failure
# (e.g. missing permissions) raises here rather than being silently ignored.
date = "24Mar31_v12_semilep-tt"
plot_dir = Path(f"{MAIN_DIR}/plots/PostProcessing/{date}/{year}")
template_dir = f"templates/{date}/"
plot_dir.mkdir(parents=True, exist_ok=True)
Path(template_dir, "cutflows", year).mkdir(parents=True, exist_ok=True)

In [4]:
# Define, for every data-taking period, a {directory: {sample key: sample names}}
# dictionary restricted to the samples to load
# (this can be configured in a yaml file later in the script).
sig_keys = []


def _select_samples(year_samples, keep_keys):
    """Return a deep copy of ``year_samples`` restricted to ``keep_keys``.

    Every key of ``year_samples`` is printed, kept or not, as a record of what
    is available for that period.
    """
    selected = {}
    for key, sample_names in year_samples.items():
        print(key)
        if key in keep_keys:
            selected[key] = deepcopy(sample_names)
    return selected


# Only backgrounds and data are kept; add sig_keys here to load signal as well.
keep_keys = bg_keys + [data_key]
# keep_keys = bg_keys + sig_keys + [data_key]

# ---2022
# (the previous version read `samples_2022` before assigning it and looped over the
# year keys of `samples` rather than over the sample keys of the 2022 dictionary)
samples_2022 = _select_samples(samples["2022"], keep_keys)
sample_dirs_2022 = {path_to_dir: samples_2022}
print("------------")

# ---2022EE
samples_2022EE = _select_samples(samples["2022EE"], keep_keys)
sample_dirs_2022EE = {path_to_dir: samples_2022EE}
print("------------")

# ---2023
samples_2023_ = _select_samples(samples["2023"], keep_keys)
sample_dirs_2023_ = {path_to_dir: samples_2023_}
print("------------")

# ---2023BPix
samples_2023BPix = _select_samples(samples["2023BPix"], keep_keys)
sample_dirs_2023BPix = {path_to_dir: samples_2023BPix}
print("------------")
type(sample_dirs_2022)

hh4b
qcd
data
ttbar
gghtobb
vbfhtobb
vhtobb
tthtobb
diboson
vjetslnu
vjets
------------
hh4b
qcd
data
ttbar
gghtobb
vbfhtobb
vhtobb
tthtobb
diboson
vjetslnu
vjets
------------
hh4b
qcd
data
ttbar
gghtobb
vbfhtobb
vhtobb
tthtobb
diboson
vjetslnu
vjets
------------
hh4b
qcd
data
ttbar
gghtobb
vbfhtobb
vhtobb
tthtobb
diboson
vjetslnu
vjets
------------


dict

In [None]:
# Print the size and content of the 2022 and 2023 directory -> samples mappings.
for _sample_dirs in (sample_dirs_2022, sample_dirs_2023_):
    print(len(_sample_dirs))
    print(_sample_dirs)

In [None]:
# Show the columns stored in one of the skimmed parquet datasets.
# Other datasets that can be inspected the same way:
# pd.read_parquet(f"{path_to_dir}/2022EE/QCD_HT-2000/parquet").columns
# pd.read_parquet(f"{path_to_dir}/2022EE/GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_TSG/parquet").columns
_qcd_frame = pd.read_parquet(f"{path_to_dir}/2022EE/QCD_HT-1200to1500/parquet")
list(_qcd_frame.columns)

In [5]:
# Columns to load, given as (column name, count) pairs.
# The parquet files are too big to be read entirely, so only a few columns are
# loaded at a time to avoid consuming much memory.
# NOTE(review): the count presumably is the number of sub-columns per event
# (e.g. 2 for the two fat jets) — confirm against utils.format_columns.
load_columns = [
    ("weight", 1),
    ("MET_pt", 1),
    ("nFatJets", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetPNetXbb", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetTau3OverTau2", 2),
    ("bbFatJetMass", 2),
    ("bbFatJetPt", 2),
    ("bbFatJetEta", 2),
    ("bbFatJetPhi", 2),
    ("ht", 1),
    ("nPV", 1),
]

if "v12" in path_to_dir:
    load_columns += [("ak8FatJetPNetMassRaw", 2)]

# Additional columns for MC. Every entry has to be a (name, count) tuple:
# a parenthesised string without a trailing comma, e.g. ("x"), is just the string "x".
# NOTE(review): other cells read "single_weight_genweight" (lower-case "w") —
# confirm which spelling the skimmer writes.
load_columns_mc = load_columns + [
    ("single_weight_pileup", 1),
    ("single_weight_genWeight", 1),
    ("single_weight_trigsf_2jet", 1),
]
# + [(f"weight_{syst}_{shift}", 1) for syst in weight_shifts for shift in ["up", "down"]]

In [6]:
# Thresholds used in the selection below.
pt_cut = 300  # lower bound on the pT of fat jet 0
pt_veto = 200  # upper bound on the pT of fat jet 1
msd_cut = 50  # soft-drop mass threshold (>= for fat jet 0, < for fat jet 1)
eta_cut = 2.5  # only referenced by the commented-out eta requirements

# Selection handed to utils.load_samples via its `filters` argument.
# Each entry is a (column, comparison operator, value) tuple; the column is
# written as the string form of a (name, index) pair.
# NOTE(review): presumably the conditions inside one inner list are AND-ed and
# the inner lists are OR-ed — confirm against utils.load_samples.
filters = [
    [
        ("('ak8FatJetPt', '0')", ">=", pt_cut),
        ("('ak8FatJetPt', '1')", "<", pt_veto),
        ("('ak8FatJetMsd', '0')", ">=", msd_cut),
        ("('ak8FatJetMsd', '1')", "<", msd_cut),
        # ("('ak8FatJetEta', '0')", "<=", eta_cut),
        # ("('ak8FatJetEta', '0')", ">=", -1*eta_cut),
        # ("('ak8FatJetPNetXbb', '0')", ">=", 0.8),
    ],
    #    [
    #        ("('ak8FatJetPt', '0')", ">=", pt_cut),
    #        ("('ak8FatJetPt', '1')", "<", pt_veto),
    #        ("('ak8FatJetEta', '0')", "<=", eta_cut),
    #        ("('ak8FatJetEta', '1')", "<=", eta_cut),
    #        ("('ak8FatJetEta', '0')", ">=", -1*eta_cut),
    #        ("('ak8FatJetEta', '1')", ">=", -1*eta_cut),
    #        ("('ak8FatJetMsd', '0')", ">=", msd_cut),
    #        ("('ak8FatJetMsd', '1')", "<", msd_cut),
    #        ("('ak8FatJetPNetXbb', '1')", ">=", 0.8),
    #    ],
]

# save cutflow as pandas table
# cutflow = pd.DataFrame(index=list(samples.keys()))

# Dictionary that will contain all information (from all loaded samples).
# `sample_dirs_2022` is iterated because it matches the hard-coded "2022" passed
# to load_samples (a plain `sample_dirs` is never defined in this notebook).
# The loop variable is deliberately not called `samples`: that would overwrite
# the per-year sample dictionary imported from HH4b.hh_vars.
events_dict = {}
for input_dir, dir_samples in sample_dirs_2022.items():
    events_dict.update(
        # this function will load files (only the columns selected), apply filters
        # and compute a weight per event
        utils.load_samples(
            input_dir,
            dir_samples,
            "2022",
            filters=filters,
            columns=utils.format_columns(load_columns),
            variations=False,
            # columns_mc=utils.format_columns(load_columns_mc),
        )
    )

# Keys of every loaded sample except data; used as the backgrounds in the plots.
sel_bg_keys = [key for key in events_dict if key != "data"]
# print(events_dict)
# utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)
# print("\n", cutflow)

In [None]:
# Show which sample keys ended up in events_dict.
print(events_dict.keys())

### Control plots before applying corrections

In [None]:
# Upper y-axis limit passed to ratioHistPlot, per data-taking period.
ylims = {
    "2022": 5e4,
    "2022EE": 5e4,
}
# tau3/tau2 bin edges: a wide first bin [0, 0.1] followed by bins of width ~1/30 up to 1.
# fmt: off
tau32Bins = [
    0.0, 0.1, 0.1333, 0.1667, 0.2, 0.2333, 0.2667, 0.3, 0.3333, 0.3667,
    0.4, 0.4333, 0.4667, 0.5, 0.5333, 0.5667, 0.6, 0.6333, 0.6667, 0.7,
    0.7333, 0.7667, 0.8, 0.8333, 0.8667, 0.9, 0.9333, 0.9667, 1.0,
]
# fmt: on
# {var: (bins, label)}
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    #    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[66, 0, 1000]),
    #    ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[66, 0, 1000]),
    #    ShapeVar(var="ak8FatJetMsd0", label=r"$M_{SD}^{j1}$ (GeV)", bins=[40, 0, 500]),
    #    ShapeVar(var="ak8FatJetEta0", label=r"$\eta^{j1}$", bins=[20, -5, 5]),
    #    ShapeVar(var="ak8FatJetMsd1", label=r"$M_{SD}^{j1}$ (GeV)", bins=[40, 0, 500]),
    #    ShapeVar(var="ak8FatJetEta1", label=r"$\eta^{j1}$", bins=[20, -5, 5]),
    #    ShapeVar(var="ak8FatJetPNetXbb0", label=r"$X_{bb}^{j1}$ ", bins=[25, 0, 1.0]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}_$ ", bins=[20, 0.1, 0.5]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[30, 0., 1.]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[20, 0., 1.]),
    ShapeVar(
        var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=tau32Bins, reg=False
    ),
]

# One histogram per variable, filled from every sample in events_dict.
hists = {}
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.png"
    label = name.replace(".png", "_pre.png")
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        sel_bg_keys,
        name=label,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=[0.0, 2.0],
        ylim_low=1e-3,
        **ylim_kwargs,
    )
    # break

In [None]:
# Sample groups used to derive the per-bin ttbar scale factors.
samples_Datattbar = {"data", "ttbar"}
samples_Nottbar = {"diboson", "qcd", "vjets", "vjetslnu"}
samples_Data = {"data"}
samples_ttbar = {"ttbar"}
varBins = tau32Bins
# varBins=[0.1,0.16,0.18,0.2,0.22,0.24,0.26,0.28,0.3,0.32,0.34,0.36,0.38,0.4,0.42,0.44,0.46]
# theShapeVar = ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}_$ ", bins=[20, 0.1, 0.5])
theShapeVar = ShapeVar(
    bins=varBins,
    var="ak8FatJetTau3OverTau20",
    label=r"$[\tau_3/\tau_2]^{j1}$",
    # bins=[20, 0.1, 0.5])
    reg=False,
)
# NOTE(review): `vars` shadows the builtin of the same name and is not used in
# the cells visible here; kept only in case another cell reads it.
vars = [theShapeVar.var]

# Histograms of the variable for ttbar MC, all other MC, and data.
histos = {
    hist_key: utils.singleVarHistSel(
        events_dict,
        theShapeVar,
        sample_group,
        weight_key="weight",
    )
    for hist_key, sample_group in (
        ("ttbar", samples_ttbar),
        ("Nottbar", samples_Nottbar),
        ("data", samples_Data),
    )
}
binContent_ttbar = np.array(histos["ttbar"].values())
binContent_Data = np.array(histos["data"].values())
# sum over the first axis, i.e. over the individual non-ttbar samples
binContent_Nottbar = np.sum(np.array(histos["Nottbar"].values()), axis=0)
print("TTBar only: ", binContent_ttbar)
print("Data only: ", binContent_Data)
print("MC No TTbar: ", binContent_Nottbar)
print("*************")

# SF = (data - non-ttbar MC) / ttbar MC, per bin.
# Bins without ttbar MC would give inf/NaN and later turn the scaled histogram
# into NaN; the SF is set to 1 there (scaling an empty bin has no effect) and
# the uncertainty is reported as NaN.
has_ttbar = binContent_ttbar > 0
with np.errstate(divide="ignore", invalid="ignore"):
    tau32BinnedSFs = np.where(
        has_ttbar, (binContent_Data - binContent_Nottbar) / binContent_ttbar, 1.0
    )
    # Error propagation using sqrt(N) as the uncertainty of each bin content.
    term_data = (1 / binContent_ttbar) * np.sqrt(binContent_Data)
    term_nottbar = (-1 / binContent_ttbar) * np.sqrt(binContent_Nottbar)
    term_ttbar = ((binContent_Data - binContent_Nottbar) / binContent_ttbar**2) * np.sqrt(
        binContent_ttbar
    )
    sigma_f = np.sqrt(term_data**2 + term_nottbar**2 + term_ttbar**2)
sigma_f = np.where(has_ttbar, sigma_f, np.nan)
print("(Data - NonTTbarMC)/TTbar: ", (tau32BinnedSFs))
print("sigma ", sigma_f)
##plotting.ratioHistPlot(
##    histos[theShapeVar.var],
##    year,
##    sig_keys,
##    samples_ttbar,
##    name=theShapeVar.label,
##    show=True,
##    log=True,
##    ratio_ylims=[0.0, 2.0],
##    ylim=ylims[year],
##    ylim_low=1e-1,
###    )

### Control plots after applying corrections

In [None]:
# Evaluate the 4th-order polynomial tau32 scale factor on the ttbar events
# with tau32 < 0.5 and show it as a scatter plot.
events = events_dict["ttbar"]
# print(events)
# tau32 = events["ak8FatJetTau3OverTau20"].to_numpy()
tau32 = utils.get_feat(events, "ak8FatJetTau3OverTau20")
print(tau32)
ftau32 = tau32[tau32 < 0.5]
tau32FittedSF_4 = (
    18.4912 - 235.086 * ftau32 + 1098.94 * ftau32**2 - 2163 * ftau32**3 + 1530.59 * ftau32**4
)
# print(events["weight"].to_numpy().squeeze())
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(ftau32, tau32FittedSF_4, alpha=0.5)
ax.set_title("4th Order Polynomial Fit for tau32FittedSF")
ax.set_xlabel("tau32")
ax.set_ylabel("tau32FittedSF_4")
ax.grid(True)
plt.show()

### Plot with the Binned SF

In [None]:
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    # ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[66, 0, 1000]),
    # ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[66, 0, 1000]),
    # ShapeVar(var="ak8FatJetMsd0", label=r"$M_{SD}^{j1}$ (GeV)", bins=[40, 0, 500]),
    # ShapeVar(var="ak8FatJetEta0", label=r"$\eta^{j1}$", bins=[20, -5, 5]),
    # ShapeVar(var="ak8FatJetPNetXbb0", label=r"$X_{bb}^{j1}$ ", bins=[25, 0, 1.0]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}_$ ", bins=[20, 0.1, 0.5]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[30, 0., 1.]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[20, 0., 1.]),
    ShapeVar(
        var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=tau32Bins, reg=False
    ),
]

hists = {}
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# Scale the ttbar entries of the tau32 histogram, bin by bin, with the binned SFs.
hists["ak8FatJetTau3OverTau20"].view()[
    utils.get_key_index(hists["ak8FatJetTau3OverTau20"], "ttbar"), ...
] *= tau32BinnedSFs.squeeze()

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.png"
    label = name.replace(".png", "_postTau32binnedSF.png")
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        sel_bg_keys,
        name=label,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=[0.0, 2.0],
        ylim_low=1e-3,
        **ylim_kwargs,
    )
    # break

### Fitted SFs

In [None]:
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[66, 0, 1000]),
    ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[66, 0, 1000]),
    ShapeVar(var="ak8FatJetMsd0", label=r"$M_{SD}^{j1}$ (GeV)", bins=[40, 0, 500]),
    ShapeVar(var="ak8FatJetEta0", label=r"$\eta^{j1}$", bins=[20, -5, 5]),
    ShapeVar(var="ak8FatJetPNetXbb0", label=r"$X_{bb}^{j1}$ ", bins=[25, 0, 1.0]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}_$ ", bins=[20, 0.1, 0.5]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[30, 0., 1.]),
    # ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[20, 0., 1.]),
    ShapeVar(
        var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=tau32Bins, reg=False
    ),
]

# Histograms filled with the "tau32SF" scale factor requested from singleVarHist.
hists = {}
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict, shape_var, weight_key="weight", sf=["tau32SF"]
        )

# hists["ak8FatJetTau3OverTau20"].view()[utils.get_key_index(hists["ak8FatJetTau3OverTau20"], "ttbar"), ...] *= tau32BinnedSFs.squeeze()

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.png"
    label = name.replace(".png", "_postTau32FittedSF.png")
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        sel_bg_keys,
        name=label,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=[0.0, 2.0],
        ylim_low=1e-3,
        **ylim_kwargs,
    )
    # break

### Weights

Remove QCD events with anomalously high event weights

In [None]:
# Compare the gen weights of QCD events whose fat jet 0 has pT > 300 with the
# full QCD event weights, on log-log axes.
qcd_events = events_dict["qcd"]
pt0_above_300 = qcd_events["ak8FatJetPt"][0] > 300
# plt.hist(events_dict["qcd"]["weight"][events_dict["qcd"]["ak8FatJetPt"][0] > 1250], bins=np.power(10, np.linspace(-4, 5, 100)))
plt.hist(
    qcd_events["single_weight_genweight"][pt0_above_300],
    bins=np.power(10, np.linspace(-4, 7, 100)),
    label="Filtered QCD single_weight_genweight",
)
# plt.hist(events_dict["qcd"]["single_weight_genweight"], bins=np.power(10, np.linspace(-4, 7, 100)))
plt.hist(
    qcd_events["weight"],
    bins=np.power(10, np.linspace(-4, 5, 100)),
    label="QCD weight",
)
plt.yscale("log")
plt.xscale("log")
plt.legend()
plt.show()

In [None]:
# Keep only QCD events with weight below 1e2 ...
qcd_events = events_dict["qcd"]
qcd_events = qcd_events[(qcd_events["weight"] < 1e2).values]
events_dict["qcd"] = qcd_events
# ... and remove super high gen weight events for high pt qcd
high_pt = qcd_events["ak8FatJetPt"][0] > 400
high_genweight = qcd_events["single_weight_genweight"][0] > 1e6
events_dict["qcd"] = qcd_events[~(high_pt & high_genweight).values]

Checking pileup weights and removing from signal

In [None]:
# Overlay the pileup-weight distributions of the QCD and ttbar samples.
fig = plt.figure(figsize=(12, 12))
plt.rcParams.update({"font.size": 24})
# plt.hist(
#    events_dict["hh4b"]["single_weight_pileup"],
#    np.linspace(0, 10, 21),
#    histtype="step",
#    label="HH4b",
# )
for sample_key, sample_label in (("qcd", "QCD"), ("ttbar", "TT")):
    plt.hist(
        events_dict[sample_key]["single_weight_pileup"],
        np.linspace(0, 10, 21),
        histtype="step",
        label=sample_label,
    )
plt.legend()
plt.yscale("log")
plt.xlabel("Pileup Weights")
plt.ylabel("Events")
hep.cms.label(data=False, year=year, lumi=round(LUMI[year] / 1e3))
plt.savefig(f"{plot_dir}/pileup_weights.pdf", bbox_inches="tight")
plt.show()

In [None]:
# For the Oct26 skims, undo the pileup weight that was applied to the signal.
if "Oct26" in path_to_dir:
    events_dict["hh4b"]["weight"] /= events_dict["hh4b"]["single_weight_pileup"]

# The cutflow table is not created by any earlier cell (its creation is
# commented out), so build it here if needed: one row per loaded sample.
if "cutflow" not in globals():
    cutflow = pd.DataFrame(index=list(events_dict.keys()))
utils.add_to_cutflow(events_dict, "Fix Weights", "weight", cutflow)
cutflow

### Updated Control Plots

In [None]:
# Upper y-axis limit passed to ratioHistPlot, per data-taking period.
ylims = {
    "2022": 5e4,
    "2022EE": 4e5,
}

# {var: (bins, label)}
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[50, 300, 1000]),
    ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[50, 300, 1000]),
    ShapeVar(var="ak8FatJetMsd0", label=r"$M_{SD}^{j1}$ (GeV)", bins=[50, 0, 500]),
    ShapeVar(var="ak8FatJetEta0", label=r"$\eta^{j1}$", bins=[50, -2.6, 2.6]),
    ShapeVar(var="ak8FatJetPNetXbb0", label=r"$X_{bb}^{j1}$ ", bins=[100, 0, 1.0]),
    ShapeVar(var="ak8FatJetTau3OverTau20", label=r"$[\tau_3/\tau_2]^{j1}$ ", bins=[100, 0, 1.0]),
    # ShapeVar(
    #     var="ak8FatJetPNetMass0",
    #     label=r"$m_{reg}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetMass1",
    #     label=r"$m_{reg}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd0",
    #     label=r"$m_{SD}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd1",
    #     label=r"$m_{SD}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb0",
    #     label=r"$T_{Xbb}^{j1}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb1",
    #     label=r"$T_{Xbb}^{j2}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ht",
    #     label=r"HT (GeV)",
    #     bins=[50, 0, 4000],
    # ),
]

hists = {}
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.png"
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        sel_bg_keys,
        name=name,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=[0.0, 2.0],
        ylim_low=1e-1,
        **ylim_kwargs,
    )
    # break

### Control Plots

In [None]:
# Upper y-axis limit passed to ratioHistPlot, per data-taking period.
ylims = {
    "2022": 5e4,
    "2022EE": 4e5,
}

# {var: (bins, label)}
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(
    #     var="ak8FatJetPNetMass0",
    #     label=r"$m_{reg}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetMass1",
    #     label=r"$m_{reg}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd0",
    #     label=r"$m_{SD}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd1",
    #     label=r"$m_{SD}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb0",
    #     label=r"$T_{Xbb}^{j1}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb1",
    #     label=r"$T_{Xbb}^{j2}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ht",
    #     label=r"HT (GeV)",
    #     bins=[50, 0, 4000],
    # ),
]

hists = {}
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.pdf"
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        bg_keys,
        name=name,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=[0.2, 1.8],
        **ylim_kwargs,
    )
    # break

mSD vs mReg

In [None]:
# Fill soft-drop and regressed mass histograms for both fat jets of each
# sample and pickle them, so they can be compared across skim versions.
mplot_dir = Path(plot_dir) / "msd_mreg"
mplot_dir.mkdir(exist_ok=True)

plot_events = {
    #    "hh4b": "HH4b",
    "qcd": "QCD",
    "ttbar": "TT",
    "data": "Data",
}

bins_hh4b = np.linspace(30, 250, 23)
bins_all = np.linspace(30, 330, 31)


def _mass_shape_vars(bins):
    """Return the mass ShapeVars to histogram with the bin edges ``bins``."""
    shape_vars = [
        ShapeVar(
            var="ak8FatJetPNetMass",
            label="Regressed (JEC-Corrected)" if "v12" in path_to_dir else "Regressed",
            bins=bins,
            reg=False,
            plot_args={"color": plotting.colours["lightred"]},
        ),
        ShapeVar(
            var="ak8FatJetMsd",
            label="Soft Drop",
            bins=bins,
            reg=False,
            plot_args={"color": plotting.colours["darkblue"]},
        ),
    ]
    # the raw regressed mass is only loaded for the v12 skims
    if "v12" in path_to_dir:
        shape_vars.append(
            ShapeVar(
                var="ak8FatJetPNetMassRaw",
                label="Regressed (Raw)",
                bins=bins,
                reg=False,
                plot_args={"color": plotting.colours["orange"]},
            )
        )
    return shape_vars


# the signal gets its own (narrower) binning
mass_vars = {
    ev_key: _mass_shape_vars(bins_hh4b if ev_key == "hh4b" else bins_all)
    for ev_key in plot_events
}

hists = {}

for ev_key in plot_events:
    events = events_dict[ev_key]
    ev_hists = hists[ev_key] = {}

    for i in range(2):
        for mvar in mass_vars[ev_key]:
            hist_name = f"{mvar.var}{i}"
            values = events[mvar.var][i]

            h = Hist(mvar.axis, storage="weight")
            h.fill(values, weight=events["weight"])
            ev_hists[hist_name] = h

            # mean and spread of the mass are stored for the signal only
            if ev_key == "hh4b":
                ev_hists[f"{hist_name}_mean"] = np.mean(values)
                ev_hists[f"{hist_name}_std"] = np.std(values)

with mplot_dir.joinpath("hists.pkl").open("wb") as f:
    pickle.dump(hists, f)

In [None]:
# Mass definitions to draw from the v11 and from the v12 histogram files.
mass_vars_v11 = [
    ShapeVar(
        var="ak8FatJetMsd",
        label="Soft Drop",
        plot_args={"color": plotting.colours["darkblue"]},
    ),
    ShapeVar(
        var="ak8FatJetPNetMass",
        label="Regressed v11",
        plot_args={"color": plotting.colours["lightred"]},
    ),
]

mass_vars_v12 = [
    ShapeVar(
        var="ak8FatJetPNetMass",
        label="Regressed v12",
        plot_args={"color": plotting.colours["orange"]},
    ),
    # ShapeVar(
    #     var="ak8FatJetPNetMassRaw",
    #     label="Regressed v12 (Raw)",
    #     plot_args={"color": plotting.colours["forestgreen"]},
    # ),
]

# NOTE: pickle executes arbitrary code on load; only open files produced by
# this notebook (or another trusted source).
with plot_dir.parent.parent.joinpath("23Nov14_v11/2022EE/msd_mreg/hists.pkl").open("rb") as f:
    hists_v11 = pickle.load(f)

with plot_dir.parent.parent.joinpath("23Nov14_v12/2022/msd_mreg/hists.pkl").open("rb") as f:
    hists_v12 = pickle.load(f)

# One figure per sample and per jet, overlaying the v11 and v12 mass shapes.
for ev_key, ev_label in plot_events.items():
    for i in range(2):
        fig, ax = plt.subplots(1, 1, figsize=(12, 12))
        plt.rcParams.update({"font.size": 24})

        for version_vars, version_hists in (
            (mass_vars_v11, hists_v11),
            (mass_vars_v12, hists_v12),
        ):
            for mvar in version_vars:
                hist_name = f"{mvar.var}{i}"
                h = version_hists[ev_key][hist_name]
                if ev_key == "hh4b":
                    mean = version_hists[ev_key][f"{hist_name}_mean"]
                    std = version_hists[ev_key][f"{hist_name}_std"]
                    # "\\sigma": a single backslash before "s" is an invalid
                    # escape sequence in a non-raw string literal
                    add_label = (
                        f"\n$\\overline{{m}} = {mean:.1f}, \\sigma_m = {std:.1f}$ GeV"
                    )
                else:
                    add_label = ""

                hep.histplot(
                    h,
                    histtype="step",
                    label=f"{mvar.label}{add_label}",
                    linewidth=2,
                    yerr=False,
                    density=True,
                    **mvar.plot_args,
                )

        ax.set_title(ev_label, x=0.42, y=1.002)
        ax.set_xlabel(f"AK8 Jet {i + 1} Mass")
        ax.set_ylabel("Events (A.U.)")
        ax.set_ylim(0)

        # hep.cms.label(
        #     data=True, label="Internal", year=year, lumi=round(LUMI[year] / 1e3), com="13.6"
        # )

        hep.cms.label(
            data=True,
            label="Internal",
            rlabel="",
        )

        ax.legend(fontsize=18)
        plt.savefig(mplot_dir / f"{ev_key}_j{i}_msd_mreg_noraw.pdf", bbox_inches="tight")
        plt.show()

    #     break
    # break

In [None]:
# Plot the soft-drop and regressed masses of both fat jets for each sample.
mplot_dir = Path(plot_dir) / "msd_mreg"
mplot_dir.mkdir(exist_ok=True)

plot_events = {
    "hh4b": "HH4b",
    "qcd": "QCD",
    "ttbar": "TT",
    "data": "Data",
}

# binning used for the signal; every other sample uses `bins_non_signal`
bins = np.linspace(30, 250, 23)
bins_non_signal = np.linspace(30, 330, 31)

mass_vars = [
    ShapeVar(
        var="ak8FatJetPNetMass",
        label="Regressed (JEC-Corrected)" if "v12" in path_to_dir else "Regressed",
        bins=bins,
        reg=False,
        plot_args={"color": plotting.colours["lightred"]},
    ),
    ShapeVar(
        var="ak8FatJetMsd",
        label="Soft Drop",
        bins=bins,
        reg=False,
        plot_args={"color": plotting.colours["darkblue"]},
    ),
]

if "v12" in path_to_dir:
    mass_vars.append(
        ShapeVar(
            var="ak8FatJetPNetMassRaw",
            label="Regressed (Raw)",
            bins=bins,
            reg=False,
            plot_args={"color": plotting.colours["orange"]},
        )
    )

for ev_key, ev_label in plot_events.items():
    # Samples that were not loaded (e.g. the signal when sig_keys is empty)
    # are skipped instead of aborting the whole cell with a KeyError.
    if ev_key not in events_dict:
        print(f"Skipping {ev_key}: not in events_dict")
        continue

    events = events_dict[ev_key]

    for i in range(2):
        fig, ax = plt.subplots(1, 1, figsize=(12, 12))
        plt.rcParams.update({"font.size": 24})

        for mvar in mass_vars:
            if ev_key == "hh4b":
                mean = np.mean(events[mvar.var][i])
                std = np.std(events[mvar.var][i])
                # "\\sigma": a single backslash before "s" is an invalid
                # escape sequence in a non-raw string literal
                add_label = f"\n$\\overline{{m}} = {mean:.1f}, \\sigma_m = {std:.1f}$ GeV"
            else:
                add_label = ""

            ax.hist(
                events[mvar.var][i],
                bins=mvar.bins if ev_key == "hh4b" else bins_non_signal,
                histtype="step",
                label=f"{mvar.label}{add_label}",
                linewidth=2,
                # alpha=0.5,
                # hatch="//",
                **mvar.plot_args,
            )

        ax.set_title(ev_label, x=0.42, y=1.002)
        ax.set_xlabel(f"AK8 Jet {i + 1} Mass")
        ax.set_ylabel("Events")

        hep.cms.label(
            data=True, label="Internal", year=year, lumi=round(LUMI[year] / 1e3), com="13.6"
        )

        ax.legend(fontsize=18)
        plt.savefig(mplot_dir / f"{ev_key}_j{i}_msd_mreg.pdf", bbox_inches="tight")
        plt.show()

    #     break
    # break

Pileup and trigger weight checks

In [None]:
# Add a "weight_notrig" column: the event weight with the trigger scale factor
# divided out (data has no such factor, so its weight is copied unchanged).
for key, events in events_dict.items():
    events["weight_notrig"] = (
        events["weight"]
        if key == data_key
        else events["weight"] / events["single_weight_trigsf_2jet"]
    )

In [None]:
# Add a "weight_nopileup" column: the event weight with the pileup weight
# divided out (data and signal weights are copied unchanged).
unchanged_keys = [data_key] + sig_keys
for key, events in events_dict.items():
    events["weight_nopileup"] = (
        events["weight"]
        if key in unchanged_keys
        else events["weight"] / events["single_weight_pileup"]
    )

In [None]:
# {var: (bins, label)}
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[20, 300, 400]),
    # ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(
    #     var="ak8FatJetPNetMass0",
    #     label=r"$m_{reg}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetMass1",
    #     label=r"$m_{reg}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd0",
    #     label=r"$m_{SD}^{j1}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd1",
    #     label=r"$m_{SD}^{j2}$ (GeV)",
    #     bins=[19, 60, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb0",
    #     label=r"$T_{Xbb}^{j1}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb1",
    #     label=r"$T_{Xbb}^{j2}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(var="nPV", label=r"nPV", bins=[20, 0, 80])
    # ShapeVar(
    #     var="ht",
    #     label=r"HT (GeV)",
    #     bins=[50, 0, 4000],
    # ),
]

# Pass an explicit y-limit only when one is configured for `year`; `ylims` has
# no entry for e.g. year = "All", and ylims[year] would raise a KeyError.
ylim_kwargs = {"ylim": ylims[year]} if year in ylims else {}

# Remake the control plots once per alternative weight column.
# for weight_key in ["weight", "weight_nopileup"]:
for weight_key in ["weight_nopileup"]:
    # for weight_key in ["weight", "weight_notrig"]:
    hists = {}
    for shape_var in control_plot_vars:
        if shape_var.var not in hists:
            hists[shape_var.var] = utils.singleVarHist(
                events_dict,
                shape_var,
                weight_key=weight_key,
            )

    for shape_var in control_plot_vars:
        name = f"{plot_dir}/{shape_var.var}_{weight_key}.pdf"
        plotting.ratioHistPlot(
            hists[shape_var.var],
            year,
            sig_keys,
            bg_keys,
            name=name,
            show=True,
            log=True,
            ratio_ylims=[0.2, 1.8],
            # title="With Pileup Reweighting"
            # if weight_key == "weight"
            # else "Without Pileup Reweighting",
            **ylim_kwargs,
        )
        # break

In [None]:
# signal only: soft-drop mass of fat jet 0 with and without the trigger scale factors
events = events_dict["hh4b"]
bins = np.arange(60, 201, 8)
jet0_msd = events["ak8FatJetMsd"][0]

fig, ax = plt.subplots(figsize=(12, 12))
ax.hist(
    jet0_msd,
    bins=bins,
    label="No Correction",
    # histtype="step",
    weights=events["weight_notrig"],
    color=plotting.colours["slategray"],
    linewidth=2,
    alpha=0.5,
    hatch="//",
)
ax.hist(
    jet0_msd,
    bins=bins,
    label="With Corrections",
    histtype="step",
    weights=events["weight"],
    color=plotting.colours["orange"],
    linewidth=2,
)
ax.legend()
hep.cms.label(data=False, year=year, lumi=round(LUMI[year] / 1e3), com="13.6")
ax.set_title("HH4b", x=0.45, y=1.005)
ax.set_ylabel("Events")
ax.set_xlabel(r"Jet 1 $m_{SD}$(GeV)")
ax.set_ylim(0)
plt.savefig(f"{plot_dir}/msd_trig_weights.pdf", bbox_inches="tight")
plt.show()

Mass plots for signal

In [None]:
# Overlay the regressed and soft-drop masses of both bb jets for the signal sample.
sig_key = "hh4b"
plots = {
    "bb0FatJetPNetMass": r"$m_{reg}^{j1}$ (GeV)",
    "bb1FatJetPNetMass": r"$m_{reg}^{j2}$ (GeV)",
    "bb0FatJetMsd": r"$m_{SD}^{j1}$ (GeV)",
    "bb1FatJetMsd": r"$m_{SD}^{j2}$ (GeV)",
}

# options shared by all four histograms
hist_style = {
    "bins": 20,
    "histtype": "step",
    "range": [50, 250],
    "linestyle": "--",
    "linewidth": 2,
}

for key, label in plots.items():
    sig_events = events_dict[sig_key]
    plt.hist(
        utils.get_feat(sig_events, key),
        label=label,
        weights=utils.get_feat(sig_events, "weight"),
        **hist_style,
    )

plt.legend()
plt.xlabel("Mass (GeV)")
plt.ylabel("Events")
plt.show()

Control plots without trigger weights

In [None]:
# Control plots filled with the "weight_notrig" column instead of the nominal weight.
hists_notrigweights = {}
for shape_var in control_plot_vars:
    if shape_var.var in hists_notrigweights:
        continue  # already filled for this variable
    hists_notrigweights[shape_var.var] = utils.singleVarHist(
        events_dict,
        shape_var,
        weight_key="weight_notrig",
    )

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}_notrigweights.pdf"
    notrig_hist = hists_notrigweights[shape_var.var]
    plotting.ratioHistPlot(
        notrig_hist,
        year,
        sig_keys,
        bg_keys,
        name=name,
        show=True,
        log=True,
        ratio_ylims=None,
    )

Pt plots for different Txbb cuts

In [None]:
# {var: (bins, label)}
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[50, 300, 1500]),
    ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[50, 300, 1500]),
    ShapeVar(var="ak8FatJetMsd0", label=r"$m_{SD}^{j1}$ (GeV)", bins=[19, 60, 250]),
    ShapeVar(var="ak8FatJetMsd1", label=r"$m_{SD}^{j2}$ (GeV)", bins=[19, 60, 250]),
]

# Remake the control plots for a series of increasingly tight Txbb requirements.
txbb_cuts = [0, 0.35, 0.6, 0.8, 0.9, 0.95, 0.985]
for txbb_cut in txbb_cuts:
    print(txbb_cut)
    hists = {}
    txbb_selection = {"ak8FatJetPNetXbb0+ak8FatJetPNetXbb1": [txbb_cut, CUT_MAX_VAL]}
    sel, _ = utils.make_selection(txbb_selection, events_dict)
    # mean of the QCD selection mask
    print(np.mean(sel["qcd"]))
    for shape_var in control_plot_vars:
        if shape_var.var in hists:
            continue  # already filled for this variable
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
            selection=sel,
        )

    for shape_var in control_plot_vars:
        name = f"{plot_dir}/{shape_var.var}_txbb{txbb_cut}.pdf"
        plotting.ratioHistPlot(
            hists[shape_var.var],
            year,
            sig_keys,
            bg_keys,
            name=name,
            show=True,
            log=True,
            plot_significance=False,
            significance_dir=shape_var.significance_dir,
            ratio_ylims=[0.2, 1.8],
            title=rf"At least one Jet's $T_{{Xbb}}$ > {txbb_cut}",
        )
        # break

ROC Curve

In [None]:
# Signal: one score per event, the larger of the Xbb scores in each row.
sig_xbb = events_dict["gghtobb"]["ak8FatJetPNetXbb"].values
sig_jets_score = np.max(sig_xbb, axis=1)
# # copy array values to second column
# sig_jets_score = np.stack((sig_jets_score, sig_jets_score), axis=1)

# Background: every Xbb score of every event, flattened into one array.
bg_xbb = events_dict["qcd"]["ak8FatJetPNetXbb"].values
bg_jets_score = bg_xbb.reshape(-1)

In [None]:
from sklearn.metrics import roc_curve

# Only every `bg_skip`-th background entry is kept (see the [::bg_skip] strides below).
bg_skip = 4
sig_key = "gghtobb"
# Deliberately NOT named `bg_keys`: that name is imported from HH4b.hh_vars and is used by
# other cells (control plots, template making), which would silently change behaviour if it
# were overwritten here.
# NOTE: `bg_jets_score` (built in an earlier cell) only holds "qcd" scores, so it has to be
# extended consistently before adding more keys here (ttbar?).
roc_bg_keys = ["qcd"]

# Truth labels: 1 for each entry of sig_jets_score, 0 for each background entry that
# survives the stride (ceil(len / bg_skip) == len(bg_jets_score[::bg_skip])).
y_true = np.concatenate(
    [
        np.ones(len(sig_jets_score)),
        np.zeros(int(np.ceil(len(bg_jets_score) / bg_skip))),
    ]
)

# Signal weights are used once per event. Background scores are flattened row by row
# (one row per event), so each event weight is repeated once per score column so that it
# lines up with the flattened scores, then strided the same way as the scores.
weights = np.concatenate(
    [events_dict[sig_key]["weight"].values.reshape(-1)]
    + [
        np.repeat(
            events_dict[bg_key]["weight"].values.reshape(-1),
            events_dict[bg_key]["ak8FatJetPNetXbb"].values.shape[1],
        )[::bg_skip]
        for bg_key in roc_bg_keys
    ],
)

scores = np.concatenate((sig_jets_score, bg_jets_score[::bg_skip]))

fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)

In [None]:
# Bundle the ROC arrays into one dictionary and pickle it into the plot directory.
roc = dict(fpr=fpr, tpr=tpr, thresholds=thresholds)

with Path(f"{plot_dir}/roc.pkl").open("wb") as f:
    pickle.dump(roc, f)

In [None]:
def find_nearest(array, value):
    """Return the index of the entry in ``array`` that is closest to ``value``.

    ``array`` is converted with ``np.asarray``; the index refers to the flattened
    array, and the first occurrence wins when several entries are equally close.
    """
    distances = np.abs(np.asarray(array) - value)
    return np.argmin(distances)

In [None]:
# Working points to highlight on the curve, and one colour per working point.
# plot_thresholds = [0.35, 0.8, 0.95, 0.975, 0.985, 0.99]  # v11
plot_thresholds = [0.1, 0.35, 0.8, 0.9, 0.94, 0.96]  # v12
th_colours = ["#9381FF", "#1f78b4", "#a6cee3", "#ff7f00", "#7CB518", "#EDB458", "#36213E"]

# pths[threshold] = [[signal efficiency], [background efficiency]]
pths = {th: [[], []] for th in plot_thresholds}

plt.figure(figsize=(12, 12))
plt.plot(tpr, fpr, linewidth=2)

# Styling shared by the dashed guide lines drawn from each working point to the axes.
guide_style = {"linestyles": "dashed", "alpha": 0.5}

for k, th in enumerate(plot_thresholds):
    # Closest point on the ROC curve to the requested threshold.
    idx = find_nearest(thresholds, th)
    sig_eff, bg_eff = pths[th]
    sig_eff.append(tpr[idx])
    bg_eff.append(fpr[idx])

    colour = th_colours[k]

    plt.scatter(
        sig_eff,
        bg_eff,
        marker="o",
        s=40,
        label=rf"$T_{{Xbb}}$ > {th}",
        color=colour,
        zorder=100,
    )
    plt.vlines(x=sig_eff, ymin=0, ymax=bg_eff, color=colour, **guide_style)
    plt.hlines(y=bg_eff, xmin=0, xmax=sig_eff, color=colour, **guide_style)


hep.cms.label(data=False, year=year, com="13.6")
# plt.hlines(y=0.01, xmin=0, xmax=1, colors="lightgrey", linestyles="dashed")
plt.yscale("log")
plt.xlabel("Signal efficiency")
plt.ylabel("Background efficiency")
# plt.suptitle(f"HVV FatJet {pvars['title']} ROC", y=0.95)
# plt.title(cut_labels[cutstr], fontsize=20)
plt.xlim([0, 1])
plt.ylim([1e-3, 1])
plt.legend(loc="upper left")
plt.savefig(f"{plot_dir}/roccurve.pdf", bbox_inches="tight")

In [None]:
# Display the stored [[signal efficiency], [background efficiency]] pair for each threshold.
pths

### Cut-based selection

In [None]:
# Compute the bb-candidate assignment for all samples in events_dict; the result is passed
# to postprocessing.get_templates below.
# NOTE(review): presumably these masks decide which jet is "bb0"/"bb1" - confirm in postprocessing.
bb_masks = postprocessing.bb_assignment(events_dict)

In [None]:
# Thresholds shared by both regions.
txbb_cut = 0.985
pt_cut = 300

# {region name: Region(cuts={cutvar: [min, max], ...}, label=...)}
# "pass" bounds both bb-candidate Xbb scores to [txbb_cut, CUT_MAX_VAL], "fail" bounds them
# to [-CUT_MAX_VAL, txbb_cut]; the pT and bb0 mass-window cuts are the same in both regions.
selection_regions = {
    region: Region(
        cuts={
            "bb0FatJetPNetXbb": list(txbb_range),
            "bb1FatJetPNetXbb": list(txbb_range),
            "bb0FatJetPt": [pt_cut, CUT_MAX_VAL],
            "bb1FatJetPt": [pt_cut, CUT_MAX_VAL],
            "bb0FatJetPNetMass": [100, 150],
        },
        label=label,
    )
    for region, label, txbb_range in (
        ("pass", "Pass", (txbb_cut, CUT_MAX_VAL)),
        ("fail", "Fail", (-CUT_MAX_VAL, txbb_cut)),
    )
}

# Variable used for the templates: the bb1 mass, with the [100, 150] window marked as blinded.
fit_shape_var = ShapeVar(
    var="bb1FatJetPNetMass",
    label=r"$m^{2}_\mathrm{Reg}$ (GeV)",
    bins=[19, 60, 250],
    reg=True,
    blind_window=[100, 150],
)

In [None]:
# Build the templates for the regions in `selection_regions` using `fit_shape_var` as the
# only shape variable; output goes to `template_dir` and `plot_dir`.
# NOTE(review): `cutflow` is not defined in this part of the notebook - presumably it is
# created in an earlier cell; confirm before running this cell in isolation.
templates = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    sig_keys,
    selection_regions,
    [fit_shape_var],
    {},  # NOTE(review): empty dict passed positionally - check get_templates for its meaning
    template_dir,
    bg_keys,
    plot_dir,
    cutflow,
    weight_key="weight",
    # weight_shifts=weight_shifts,
    show=True,
)

In [None]:
# Display the object returned by postprocessing.get_templates.
templates

In [None]:
# Save the templates to {template_dir}/{year}_templates.pkl; the fit shape variable is
# passed along to postprocessing.save_templates.
postprocessing.save_templates(
    templates,
    f"{template_dir}/{year}_templates.pkl",
    fit_shape_var,
)

In [None]:
import pickle

# Load the templates pickle stored under the hard-coded templates/23Oct17/ directory
# (note: this is not `template_dir`) into `templates2`.
# NOTE: pickle.load can execute arbitrary code - only open files from a trusted source.
with open(f"templates/23Oct17/{year}_templates.pkl", "rb") as f:
    templates2 = pickle.load(f)