In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

import plotting
import postprocessing
import utils
from hh_vars import samples, data_key, bg_keys, sig_keys
from utils import ShapeVar

In [None]:
# Reload imported modules automatically whenever their source files are edited,
# so changes to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Choose which samples to load and where to find them. Everything is hard-coded
# here for now (this can be configured in a yaml file later in the script).
# NOTE: these two assignments override the `sig_keys` / `bg_keys` names imported
# from `hh_vars` for the remainder of the notebook.
sig_keys = ["hh4b"]
bg_keys = ["qcd", "ttbar"]

# Keep only the selected signal, background and data samples. Build a filtered
# *copy* instead of deleting entries from the dict imported from `hh_vars`:
# deleting in place would permanently alter the module's own dictionary, so
# re-running the import cell could not bring the dropped samples back.
selected_keys = bg_keys + sig_keys + [data_key]
samples = {key: val for key, val in samples.items() if key in selected_keys}

# this is the directory to the files
path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/skimmer/Oct13Test/"
# maps each input directory to the samples that should be read from it
sample_dirs = {path_to_dir: samples}
year = "2022EE"

In [None]:
# Columns to load, given as (column name, count) pairs.
# The parquet files are too big, so only a few columns can be loaded at a time
# without consuming much memory.
# NOTE(review): the count is presumably the number of per-event entries of that
# column (e.g. 2 for the two AK8 fat jets) — confirm against utils.format_columns.
load_columns = [
    ("weight", 1),
    # ("DijetMass", 1),
    *((name, 2) for name in ("ak8FatJetPt", "ak8FatJetPNetXbb", "ak8FatJetPNetMass")),
]

# Same columns plus one extra weight column; passed as `columns_mc` when loading.
load_columns_mc = [*load_columns, ("single_weight_trigsf_2jet", 1)]

In [None]:
# Cutflow is kept as a pandas table with one row per selected sample;
# `utils.add_to_cutflow` fills it in below.
cutflow = pd.DataFrame(index=list(samples.keys()))

# dictionary that will contain all information (from all samples)
events_dict = {}
# The loop variable is deliberately NOT called `samples`: reusing that name would
# rebind the notebook-level `samples` dict to whichever directory is iterated last.
for input_dir, dir_samples in sample_dirs.items():
    # this function will load files (only the columns selected), apply filters
    # and compute a weight per event
    events_dict.update(
        utils.load_samples(
            input_dir,
            dir_samples,
            year,
            columns=utils.format_columns(load_columns),
            columns_mc=utils.format_columns(load_columns_mc),
        )
    )

# record the "Preselection" step in the cutflow using the "weight" key
utils.add_to_cutflow(events_dict, "Preselection", "weight", cutflow)
print("\n", cutflow)

Control Plots

In [None]:
# Variables to draw in the control plots: {var: ShapeVar(var, label, bins)}.
# var must match key in events dictionary (i.e. as saved in parquet file).
# NOTE(review): `bins` looks like [number of bins, lower edge, upper edge] — confirm
# against utils.ShapeVar.
_control_plot_specs = [
    # (var, label, bins)
    # ("DijetMass", r"$m^{jj}$ (GeV)", [30, 600, 4000]),
    ("ak8FatJetPt0", r"$p_T^0$ (GeV)", [30, 300, 1500]),
    ("ak8FatJetPt1", r"$p_T^1$ (GeV)", [30, 300, 1500]),
    ("ak8FatJetPNetMass0", r"$m_{reg}^{0}$ (GeV)", [20, 50, 250]),
    ("ak8FatJetPNetMass1", r"$m_{reg}^{1}$ (GeV)", [20, 50, 250]),
    # ("ak8FatJetPNetXbb0", r"$TX_{bb}^{0}$", [50, 0.0, 1]),
]

# Use the variable name both as the dictionary key and as the ShapeVar's `var`,
# so the two cannot drift apart.
control_plot_vars = {
    var: ShapeVar(var=var, label=label, bins=bins) for var, label, bins in _control_plot_specs
}

# --- plotting options -------------------------------------------------------
# These names are used by the plotting loop below but were not defined anywhere
# in the notebook, so the cell failed on a fresh kernel. They are defined here
# with explicit defaults.
# NOTE(review): the default values are a choice made in review — adjust as needed.
log = False  # draw the plots with a log scale
show = True  # display each figure inline
plot_significance = False
cutstr = ""  # optional prefix for the output file names
logstr = "_log" if log else ""
plot_sig_keys = sig_keys  # signal samples to overlay
tsig_scale_dict = None  # optional per-signal scale factors; None passes no scaling dict
tplot_dir = Path("plots") / "ControlPlots" / year
tplot_dir.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists

# --- fill one histogram per variable ----------------------------------------
# Iterate over the ShapeVar *values*: iterating the dict itself yields the string
# keys, which have no `.var` attribute.
hists = {}
for shape_var in control_plot_vars.values():
    if shape_var.var not in hists:
        hists[shape_var.var] = utils.singleVarHist(
            events_dict,
            shape_var,
            weight_key="weight",
        )

# --- draw and save one ratio plot per variable ------------------------------
for shape_var in control_plot_vars.values():
    name = f"{tplot_dir}/{cutstr}{shape_var.var}{logstr}.pdf"
    plotting.ratioHistPlot(
        hists[shape_var.var],
        year,
        plot_sig_keys,
        bg_keys,
        name=name,
        # no signal scaling is passed when plotting in log scale
        sig_scale_dict=tsig_scale_dict if not log else None,
        plot_significance=plot_significance,
        significance_dir=shape_var.significance_dir,
        show=show,
        log=log,
        ylim=None if not log else 1e15,
    )


# Produce control plots through the `postprocessing.control_plots` helper; its
# return value replaces `hists`.
# NOTE(review): `bb_masks` and `plot_dir` are not defined anywhere in the visible
# notebook, so this call fails with a NameError on a fresh kernel unless they are
# provided elsewhere — confirm where they should come from.
# NOTE(review): the signal keys, `bg_keys` and `sig_scale_dict` below use names
# ("HHbbVV", "QCD", "TT", ...) that differ from the "hh4b" / "qcd" / "ttbar" keys
# selected earlier in this notebook — presumably carried over from another
# analysis; verify before running.
hists = postprocessing.control_plots(
    events_dict,
    bb_masks,
    ["HHbbVV", "qqHH_CV_1_C2V_1_kl_1_HHbbVV"],
    control_plot_vars,
    f"{plot_dir}/ControlPlots/{year}/",
    year,
    bg_keys=["QCD", "TT", "ST", "V+Jets", "Diboson"],
    # bg_keys=["QCD", "TT", "ST", "V+Jets", "Hbb"],
    sig_scale_dict={"HHbbVV": 2e5, "qqHH_CV_1_C2V_1_kl_1_HHbbVV": 2e6},
    show=True,
)