# Postprocessing

Makes control plots and templates.

Authors: Raghav Kansal, Cristina Suarez

In [None]:
import os
import utils
import plotting
import postprocessing
from postprocessing import Region, weight_shifts
from utils import ShapeVar, CUT_MAX_VAL
from hh_vars import samples, data_key, bg_keys, sig_keys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# Global plotting configuration: select the CMS style shipped with mplhep.
# NOTE(review): the two style calls below both select the CMS style and look
# redundant — confirm whether one of them can be dropped.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Scalar tick formatter rendered with math text; the power limits (-3, 3) set the
# exponent range outside of which scientific notation is used.
# NOTE(review): `formatter` is not referenced in the cells visible here — verify it is still needed.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Set the default font size for all figures (applied after the style, so it overrides it).
plt.rcParams.update({"font.size": 16})

In [None]:
# Load IPython's autoreload extension in mode 2, so that imported modules
# (e.g. utils, plotting, postprocessing) are reloaded automatically when their
# source files are edited, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Configuration cell: which samples to keep, where the skimmed files live and
# where plots / templates are written.
# (This could be moved to a yaml configuration file later.)

# Restrict the signal to a single key and drop every sample that is neither
# background, signal nor data. `samples` is modified in place.
sig_keys = ["hh4b"]
for key in list(samples.keys()):  # copy the keys: the dict is mutated while looping
    if key not in bg_keys + sig_keys + [data_key]:
        del samples[key]

MAIN_DIR = "../../../"
# this is the directory to the files
# path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/skimmer/Oct13Test/"
path_to_dir = f"{MAIN_DIR}/../data/skimmer/Oct13Test/"
# {input directory: samples to load from that directory}
sample_dirs = {path_to_dir: samples}
year = "2022EE"

# make plot and template directory
date = "23Oct16Preselection"
plot_dir = f"{MAIN_DIR}/plots/PostProcessing/{date}/"
template_dir = f"templates/{date}/"
# os.makedirs replaces the shelled-out `mkdir -p`: no shell interpolation of the
# paths, portable, and a failure raises instead of being silently ignored.
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(f"{template_dir}/cutflows/{year}", exist_ok=True)

In [None]:
# Columns to read from the parquet files, as (column name, count) pairs.
# The parquet files are too big to load in full, so only a few columns are read
# at a time to keep memory consumption low.
load_columns = [
    ("weight", 1),
    ("DijetMass", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetPNetXbb", 2),
    ("ak8FatJetPNetMass", 2),
]

# MC samples additionally need the trigger scale-factor weight.
load_columns_mc = [*load_columns, ("single_weight_trigsf_2jet", 1)]
# + [(f"weight_{syst}_{shift}", 1) for syst in weight_shifts for shift in ["up", "down"]]

In [None]:
# Inspect which columns are available in the skimmed signal sample.
# The location is built from `path_to_dir` and `year` so that it follows the
# configuration instead of duplicating it in a hard-coded path.
signal_parquet_dir = os.path.join(
    path_to_dir,
    year,
    "GluGlutoHHto4B_kl-0p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_TSG",
    "parquet",
)
df = pd.read_parquet(signal_parquet_dir)
list(df.columns)

In [None]:
# Filter handed to the loader: require the Xbb score stored under index '0' to be >= 0.8.
filters = [("('ak8FatJetPNetXbb', '0')", ">=", 0.8)]

# save cutflow as pandas table
cutflow = pd.DataFrame(index=list(samples.keys()))

# dictionary that will contain all information (from all samples)
events_dict = {}
# The loop variable is deliberately NOT called `samples`: reusing that name would
# rebind the global `samples` dict used for the cutflow index above, making this
# cell non-idempotent as soon as `sample_dirs` holds more than one directory.
for input_dir, dir_samples in sample_dirs.items():
    # this function will load files (only the columns selected), apply filters and compute a weight per event
    events_dict.update(
        utils.load_samples(
            input_dir,
            dir_samples,
            year,
            filters=filters,
            columns=utils.format_columns(load_columns),
            columns_mc=utils.format_columns(load_columns_mc),
        )
    )

# Record the yields after the preselection in the cutflow table and show it.
utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)
print("\n", cutflow)

Checking weights without the trigger SFs

In [None]:
# Add a "weight_notrig" column to every sample: the event weight with the
# trigger scale factor divided out. Data is left unchanged.
for key, events in events_dict.items():
    if key != data_key:
        events["weight_notrig"] = events["weight"] / events["single_weight_trigsf_2jet"]
    else:
        events["weight_notrig"] = events["weight"]

In [None]:
# Distribution of the trigger scale-factor weights in the QCD sample.
qcd_trigsf = events_dict["qcd"]["single_weight_trigsf_2jet"].values
_ = plt.hist(qcd_trigsf, bins=100, histtype="step")

Control Plots

In [None]:
# List of variables to make control plots for, one ShapeVar(var, label, bins) each.
control_plot_vars = [
    # var must match key in events dictionary (i.e. as saved in parquet file)
    # ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[30, 600, 4000]),
    # ShapeVar(var="ak8FatJetPt0", label=r"$p_T^1$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(var="ak8FatJetPt1", label=r"$p_T^2$ (GeV)", bins=[50, 300, 1500]),
    # ShapeVar(
    #     var="ak8FatJetPNetMass0", label=r"$m_{reg}^{1}$ (GeV)", bins=[20, 50, 250], significance_dir="bin"
    # ),
    # ShapeVar(
    #     var="ak8FatJetPNetMass1", label=r"$m_{reg}^{2}$ (GeV)", bins=[20, 50, 250], significance_dir="bin"
    # ),
    ShapeVar(var="ak8FatJetPNetXbb0", label=r"$TX_{bb}^{1}$", bins=[10, 0.8, 1]),
    ShapeVar(var="ak8FatJetPNetXbb1", label=r"$TX_{bb}^{2}$", bins=[50, 0.0, 1]),
]

# Fill one histogram per control-plot variable using the nominal event weight.
hists = {}
for shape_var in control_plot_vars:
    if shape_var.var in hists:
        continue  # this variable was already histogrammed
    hists[shape_var.var] = utils.singleVarHist(events_dict, shape_var, weight_key="weight")

# hists_notrigweights = {}
# for shape_var in control_plot_vars:
#     if shape_var.var not in hists_notrigweights:
#         hists_notrigweights[shape_var.var] = utils.singleVarHist(
#             events_dict,
#             shape_var,
#             weight_key="weight_notrig",
#         )

In [None]:
# Draw and save (as PDF in plot_dir) one ratio plot per control variable,
# in log scale and with the significance panel enabled.
control_plot_options = dict(show=True, log=True, plot_significance=True, ratio_ylims=None)

for shape_var in control_plot_vars:
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        sig_keys,
        bg_keys,
        name=f"{plot_dir}/{shape_var.var}.pdf",
        significance_dir=shape_var.significance_dir,
        **control_plot_options,
    )

In [None]:
# Control plots made with the weights that exclude the trigger scale factors.
# The histograms are built in this cell: no other executed cell defines
# `hists_notrigweights`, so plotting it directly raises a NameError on a fresh kernel.
# Relies on the "weight_notrig" column having been added to every sample in events_dict.
hists_notrigweights = {
    shape_var.var: utils.singleVarHist(events_dict, shape_var, weight_key="weight_notrig")
    for shape_var in control_plot_vars
}

for shape_var in control_plot_vars:
    name = f"{plot_dir}/{shape_var.var}_notrigweights.pdf"
    plotting.ratioHistPlot(
        hists_notrigweights[shape_var.var],
        year,
        sig_keys,
        bg_keys,
        name=name,
        show=True,
        log=True,
        ratio_ylims=None,
    )

Cut-based selection

In [None]:
# Selection regions, keyed by region name; each Region holds {cut variable: [min, max]}.
txbb_cut = 0.985
txbb_vars = ["ak8FatJetPNetXbb0", "ak8FatJetPNetXbb1"]

# Pass: both Xbb scores in [txbb_cut, CUT_MAX_VAL], second-jet mass in [100, 150].
pass_cuts = {var: [txbb_cut, CUT_MAX_VAL] for var in txbb_vars}
pass_cuts["ak8FatJetPNetMass1"] = [100, 150]

# Fail: both Xbb scores in [-CUT_MAX_VAL, txbb_cut], same mass window.
fail_cuts = {var: [-CUT_MAX_VAL, txbb_cut] for var in txbb_vars}
fail_cuts["ak8FatJetPNetMass1"] = [100, 150]

selection_regions = {
    "pass": Region(cuts=pass_cuts, label="Pass"),
    "fail": Region(cuts=fail_cuts, label="Fail"),
}

# Variable the templates are binned in, with a blind window of [100, 150].
fit_shape_var = ShapeVar(
    "ak8FatJetPNetMass0",
    r"$m^{1}_\mathrm{Reg}$ (GeV)",
    [20, 60, 260],
    reg=True,
    blind_window=[100, 150],
)

In [None]:
# Call postprocessing.get_templates with the loaded events, the pass/fail
# selection regions and the fit variable; the result is kept in `templates`.
# `template_dir`, `plot_dir` and `cutflow` are passed along as well.
# NOTE(review): the meaning of the positional `{}` argument is not visible in this
# notebook — confirm against the get_templates signature.
templates = postprocessing.get_templates(
    events_dict,
    year,
    sig_keys,
    selection_regions,
    [fit_shape_var],
    {},
    template_dir,
    bg_keys,
    plot_dir,
    cutflow,
    weight_key="weight",
    # weight_shifts=weight_shifts,
    show=False,
)

In [None]:
# Write the templates to a per-year pickle file inside the template directory.
templates_file = f"{template_dir}/{year}_templates.pkl"
postprocessing.save_templates(templates, templates_file, fit_shape_var)