# Postprocessing

Makes control plots and templates.

Authors: Raghav Kansal, Cristina Suarez

In [None]:
import os
import utils
import plotting
import postprocessing
from postprocessing import Region, weight_shifts
from utils import ShapeVar, CUT_MAX_VAL
from hh_vars import samples, data_key, bg_keys, sig_keys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Use the CMS style shipped with mplhep for every figure in this notebook
plt.style.use(hep.style.CMS)
hep.style.use("CMS")

# Scalar tick formatter with math-text exponents; scientific notation is used
# outside the 10^-3 .. 10^3 range
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Global default font size
plt.rcParams["font.size"] = 16

In [None]:
# automatically reloads imported files on edits
# (IPython magics: these lines only work inside an IPython/Jupyter session,
# not when the file is run as a plain Python script)
%load_ext autoreload
%autoreload 2

In [None]:
# define dictionary with directories of files (this can be configured in a yaml file later in the script)
# Only the signal, background and data samples are kept in the imported `samples` dictionary.
sig_keys = ["hh4b"]
keys_to_keep = bg_keys + sig_keys + [data_key]
# iterate over a copy of the keys because entries are deleted inside the loop
for key in list(samples):
    if key not in keys_to_keep:
        del samples[key]

MAIN_DIR = "../../../"
# this is the directory to the files
# path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/skimmer/Oct13Test/"
path_to_dir = f"{MAIN_DIR}/../data/skimmer/Oct19wSelection/"
sample_dirs = {path_to_dir: samples}
year = "2022EE"

# make plot and template directory
date = "23Oct23QCDPt"
plot_dir = f"{MAIN_DIR}/plots/PostProcessing/{date}/"
template_dir = f"templates/{date}/"
# os.makedirs creates missing parents like `mkdir -p`, but raises an OSError on
# failure instead of silently returning a non-zero shell exit status
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(f"{template_dir}/cutflows/{year}", exist_ok=True)

In [None]:
# Columns to load, given as (column name, count) pairs.
# The parquet files are too big, so only a few columns can be loaded at a time
# without consuming much memory.
# NOTE(review): the count is presumably the number of entries per event
# (e.g. 2 for the two fat jets) -- confirm against utils.format_columns.
event_columns = [
    ("weight", 1),
    # ("DijetMass", 1),
]
fatjet_columns = [
    ("ak8FatJetPt", 2),
    ("ak8FatJetPNetXbb", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetMass", 2),
]
load_columns = event_columns + fatjet_columns

# MC samples additionally need the trigger scale-factor weight column
load_columns_mc = [*load_columns, ("single_weight_trigsf_2jet", 1)]
# + [(f"weight_{syst}_{shift}", 1) for syst in weight_shifts for shift in ["up", "down"]]

In [None]:
# Load one signal sample in full and list the columns available in the parquet files.
# The path is built from `path_to_dir` and `year` so it follows the configuration
# above instead of duplicating it as a hard-coded string.
signal_sample = "GluGlutoHHto4B_kl-0p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_TSG"
df = pd.read_parquet(os.path.join(path_to_dir, year, signal_sample, "parquet"))
list(df.columns)

In [None]:
# Preselection passed to utils.load_samples when reading the files.
# NOTE(review): presumably the conditions within an inner list are AND-ed and the
# inner lists are OR-ed (parquet-style filters) -- confirm in utils.load_samples.
kinematic_filters = [
    ("('ak8FatJetPt', '0')", ">=", 300),
    ("('ak8FatJetPt', '1')", ">=", 300),
    ("('ak8FatJetMsd', '0')", ">=", 60),
    ("('ak8FatJetMsd', '1')", ">=", 60),
]
# same kinematic cuts in both branches; they differ only in which jet has Xbb >= 0.8
filters = [
    kinematic_filters + [("('ak8FatJetPNetXbb', '0')", ">=", 0.8)],
    kinematic_filters + [("('ak8FatJetPNetXbb', '1')", ">=", 0.8)],
]

# save cutflow as pandas table
cutflow = pd.DataFrame(index=list(samples))

# the column lists do not depend on the input directory, so format them once
columns = utils.format_columns(load_columns)
columns_mc = utils.format_columns(load_columns_mc)

# dictionary that will contain all information (from all samples)
events_dict = {}
# the loop variable is deliberately NOT called `samples`: that would rebind the
# imported `samples` dictionary to whatever the last directory maps to
for input_dir, dir_samples in sample_dirs.items():
    # this function will load files (only the columns selected), apply filters and compute a weight per event
    events_dict.update(
        utils.load_samples(
            input_dir,
            dir_samples,
            year,
            filters=filters,
            columns=columns,
            columns_mc=columns_mc,
        )
    )

utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)
print("\n", cutflow)

Checking weights without the trigger SFs

In [None]:
# Add a "weight_notrig" column to every sample: the nominal weight with the
# trigger scale factor divided out. Data has no trigger scale factor applied
# here, so its nominal weight is copied unchanged.
for key, events in events_dict.items():
    notrig_weight = events["weight"]
    if key != data_key:
        notrig_weight = notrig_weight / events["single_weight_trigsf_2jet"]
    events["weight_notrig"] = notrig_weight

In [None]:
# Histogram of the trigger scale-factor weight column for the "qcd" sample
qcd_trigsf = events_dict["qcd"]["single_weight_trigsf_2jet"].values
_ = plt.hist(qcd_trigsf, bins=100, histtype="step")

### Control Plots

In [None]:
# List of variables to plot, each described by a ShapeVar (variable, axis label, bins).
# Commented-out entries can be re-enabled to plot additional variables.
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    ShapeVar(var="ak8FatJetPt0", label=r"$p_T^{j1}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(var="ak8FatJetPt1", label=r"$p_T^{j2}$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(
    #     var="ak8FatJetPNetMass0",
    #     label=r"$m_{reg}^{j1}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetMass1",
    #     label=r"$m_{reg}^{j2}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd0",
    #     label=r"$m_{SD}^{j1}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetMsd1",
    #     label=r"$m_{SD}^{j2}$ (GeV)",
    #     bins=[20, 50, 250],
    #     significance_dir="bin",
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb0",
    #     label=r"$T_{Xbb}^{j1}$",
    #     bins=[50, 0.0, 1],
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetXbb1",
    #     label=r"$T_{Xbb}^{j2}$",
    #     bins=[50, 0.0, 1],
    # ),
]

# Histogram cache keyed by variable name. It is kept across re-runs of this cell
# so already-filled histograms are not recomputed, but it must exist on the first
# run: without this guard a fresh kernel fails with NameError on `hists`.
# To force a refill, reset it manually with `hists = {}`.
if "hists" not in globals():
    hists = {}

# fill the histograms that are not cached yet
for shape_var in control_plot_vars:
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# draw one ratio plot per variable and save it as a pdf in the plot directory
for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}.pdf"
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        bg_keys,
        name=name,
        show=True,
        log=True,
        plot_significance=False,
        significance_dir=shape_var.significance_dir,
        ratio_ylims=None,
    )
    # break
    # break

Mass plots for signal

In [None]:
# Overlay the weighted mass distributions of the signal sample on a single axis.
# NOTE(review): the "bb*" features may depend on the bb assignment, which is only
# computed in a later cell -- confirm that utils.get_feat can resolve them here.
sig_key = "hh4b"

# {feature name: legend label}
plots = {
    "bb0FatJetPNetMass": r"$m_{reg}^{j1}$ (GeV)",
    "bb1FatJetPNetMass": r"$m_{reg}^{j2}$ (GeV)",
    "bb0FatJetMsd": r"$m_{SD}^{j1}$ (GeV)",
    "bb1FatJetMsd": r"$m_{SD}^{j2}$ (GeV)",
}

sig_events = events_dict[sig_key]
for feature, legend_label in plots.items():
    values = utils.get_feat(sig_events, feature)
    weights = utils.get_feat(sig_events, "weight")
    plt.hist(
        values,
        bins=20,
        range=[50, 250],
        weights=weights,
        label=legend_label,
        histtype="step",
        linestyle="--",
        linewidth=2,
    )

plt.xlabel("Mass (GeV)")
plt.ylabel("Events")
plt.legend()
plt.show()

Control plots without trigger weights

In [None]:
# Control plots filled with the "weight_notrig" weights, saved with a
# "_notrigweights" suffix in the plot directory.
hists_notrigweights = {}
for shape_var in control_plot_vars:
    if shape_var.var in hists_notrigweights:
        # variable listed more than once: keep the histogram already filled
        continue
    hists_notrigweights[shape_var.var] = utils.singleVarHist(
        events_dict, shape_var, weight_key="weight_notrig"
    )

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}_notrigweights.pdf"
    hist_to_plot = hists_notrigweights[shape_var.var]
    plotting.ratioHistPlot(
        hist_to_plot, year, sig_keys, bg_keys, name=name, show=True, log=True, ratio_ylims=None
    )

### Cut-based selection

In [None]:
# Compute the bb assignment from the loaded events; the result is passed to
# postprocessing.get_templates when building the templates.
bb_masks = postprocessing.bb_assignment(events_dict)

In [None]:
# Selection regions in the form {region name: Region(cuts={cutvar: [min, max], ...}, label=...)}.
# "pass" requires both bb jets above the Txbb cut and "fail" requires both below it;
# both regions apply the same [100, 150] window on the bb0 regressed mass.
txbb_cut = 0.985

# {region name: (label, [min, max] Txbb range)}
txbb_ranges = {
    "pass": ("Pass", [txbb_cut, CUT_MAX_VAL]),
    "fail": ("Fail", [-CUT_MAX_VAL, txbb_cut]),
}

selection_regions = {
    region: Region(
        cuts={
            # list(...) gives every cut its own [min, max] list object
            "bb0FatJetPNetXbb": list(txbb_range),
            "bb1FatJetPNetXbb": list(txbb_range),
            "bb0FatJetPNetMass": [100, 150],
        },
        label=region_label,
    )
    for region, (region_label, txbb_range) in txbb_ranges.items()
}

# Variable used for the templates: the bb1 regressed mass, with a [100, 150] blind window
fit_shape_var = ShapeVar(
    var="bb1FatJetPNetMass",
    label=r"$m^{2}_\mathrm{Reg}$ (GeV)",
    bins=[20, 60, 260],
    reg=True,
    blind_window=[100, 150],
)

In [None]:
# Build the templates for every selection region using the fit variable defined
# above. The template directory, plot directory and cutflow table are passed in
# as well.
templates = postprocessing.get_templates(
    events_dict,
    bb_masks,
    year,
    sig_keys,
    selection_regions,
    [fit_shape_var],
    {},  # NOTE(review): empty positional dict, presumably systematics -- confirm against get_templates' signature
    template_dir,
    bg_keys,
    plot_dir,
    cutflow,
    weight_key="weight",
    # weight_shifts=weight_shifts,
    show=True,
)

In [None]:
# Display the object returned by get_templates as the cell output
templates

In [None]:
# Save the templates to "<template_dir>/<year>_templates.pkl"; the fit variable
# is passed along to save_templates.
postprocessing.save_templates(
    templates,
    f"{template_dir}/{year}_templates.pkl",
    fit_shape_var,
)

In [None]:
import pickle

# Read back a templates file for this year from the "23Oct17" template directory.
# Note that this is a fixed directory, not the `date` directory written by this notebook.
# pickle executes arbitrary code on load, so only open files produced locally.
saved_templates_path = f"templates/23Oct17/{year}_templates.pkl"
with open(saved_templates_path, "rb") as pkl_file:
    templates2 = pickle.load(pkl_file)