# Data/MC plots for 2018

### Post-process Data and MC samples

- Sum all MC samples that belong to the same process
- Scale the number of events by the total sum of weights

In [None]:
# import utilities for post-process
import utils

import vector
import pandas as pd
import numpy as np

In [None]:
# Map each process label to the list of MC dataset names that are summed into it.
samples = {
    "hh4b": ["GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl0": ["GluGlutoHHto4B_cHHH0_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl2p45": ["GluGlutoHHto4B_cHHH2p45_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl5": ["GluGlutoHHto4B_cHHH5_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "qcd": [
        "QCD_HT-200to300-13TeV",
        # one name per line: adjacent string literals without a comma are
        # silently concatenated into a single (non-existent) dataset name
        "QCD_HT-300to500-13TeV",
        "QCD_HT-500to700-13TeV",
        "QCD_HT-700to1000-13TeV",
        "QCD_HT-1000to1500-13TeV",
        "QCD_HT-1500to2000-13TeV",
        "QCD_HT-2000toInf-13TeV",
    ],
    # ttbar is currently disabled
    # "ttbar": [
    #    "TTToSemiLeptonic_13TeV",
    #    "TTToHadronic_13TeV",
    #    "TTTo2L2Nu_13TeV",
    # ],
    "vjets": [
        "WJetsToQQ_HT-200to400_13TeV",
        "WJetsToQQ_HT-400to600_13TeV",
        "WJetsToQQ_HT-600to800_13TeV",
        "WJetsToQQ_HT-800toInf_13TeV",
        "ZJetsToQQ_HT-200to400_13TeV",
        "ZJetsToQQ_HT-400to600_13TeV",
        "ZJetsToQQ_HT-600to800_13TeV",
        "ZJetsToQQ_HT-800toInf_13TeV",
    ],
    "hbb": [
        "GluGluHToBB_Pt-200ToInf_M-125_TuneCP5_MINLO_13TeV-powheg-pythia8",
        "VBFHToBB_M-125_dipoleRecoilOn_TuneCP5_13TeV-powheg-pythia8",
        "WminusH_HToBB_WToQQ_M-125_TuneCP5_13TeV-powheg-pythia8",
        "WminusH_HToBB_WToLNu_M-125_TuneCP5_13TeV-powheg-pythia8",
        "WplusH_HToBB_WToQQ_M-125_TuneCP5_13TeV-powheg-pythia8",
        "WplusH_HToBB_WToLNu_M-125_TuneCP5_13TeV-powheg-pythia8",
        "ZH_HToBB_ZToQQ_M-125_TuneCP5_13TeV-powheg-pythia8",
        "ZH_HToBB_ZToLL_M-125_TuneCP5_13TeV-powheg-pythia8",
        "ZH_HToBB_ZToNuNu_M-125_TuneCP5_13TeV-powheg-pythia8",
        "ggZH_HToBB_ZToBB_M-125_TuneCP5_13TeV-powheg-pythia8",
        "ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8",
    ],
    "diboson": [
        "ZZTo4B01j_5f_TuneCP5_13TeV-amcatnloFXFX-pythia8",
    ],
}

# subset of processes that is actually loaded in this notebook
samples_to_use = {s: samples[s] for s in ["qcd", "hh4b", "hh4b-kl2p45", "hh4b-kl5", "hh4b-kl0"]}

# map each input directory to the samples read from it
# (this can be configured in a yaml file later in the script)
path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/matching/Oct10"
dirs = {path_to_dir: samples_to_use}
year = "2018"

In [None]:
# filters are sequences of strings that can be used to place a selection or mask in the parquet files
# e.g. https://github.com/rkansal47/HHbbVV/blob/main/src/HHbbVV/postprocessing/postprocessing.py#L80
# Each entry is (column, operator, value); the column string is built in the
# "('name', 'idx')" form from a (branch, index) pair.
filters = [
    [
        (f"('{branch}', '{idx}')", op, threshold)
        for branch, idx, op, threshold in (
            ("ak8FatJetPt", 0, ">=", 300),
            ("ak8FatJetMsd", 0, ">=", 60),
            ("ak8FatJetPNetXbb", 0, ">=", 0.975),
            ("ak8FatJetPNetXbb", 1, "<", 0.975),
        )
    ]
]

In [None]:
# Columns to load, given as (column name, number of sub-columns).
# The parquet files are too big to read in full, so only a few columns are
# loaded at a time to keep memory consumption low.
load_columns = [
    ("weight", 1),
    ("ak4JetPt", 4),
    ("ak4JetPhi", 4),
    ("ak4JetEta", 4),
    ("ak4JetMass", 4),
    ("ak4JetbtagDeepFlavB", 4),
    ("ak4Pair0", 2),
    ("ak4Pair1", 2),
    ("ak8FatJetPt", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetPNetXbb", 2),
]
# Expand every entry into one "('column name', 'idx')" string per sub-column,
# the format used for reading multiindex columns.
columns = [
    f"('{name}', '{idx}')"
    for name, count in load_columns
    for idx in range(count)
]

In [None]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array from the kinematic columns of ``obj``.

    Reads ``{obj}Pt``, ``{obj}Phi``, ``{obj}Eta`` and a mass column from
    ``events``. The mass column is ``{obj}PNetMass`` when ``obj`` is
    ``"ak8FatJet"`` and ``{obj}Mass`` for any other ``obj``.
    """
    if obj == "ak8FatJet":
        mass_column = f"{obj}PNetMass"
    else:
        mass_column = f"{obj}Mass"

    components = {
        "pt": events[f"{obj}Pt"],
        "phi": events[f"{obj}Phi"],
        "eta": events[f"{obj}Eta"],
        "M": events[mass_column],
    }
    return vector.array(components)


def add_jet_variables(events):
    """Add AK4-dijet and HH candidate variables to ``events``.

    The two AK4 jet pairs stored in ``ak4Pair0`` and ``ak4Pair1`` are ordered by
    the deltaR between their dijet system and the leading AK8 fat jet, largest
    deltaR first. The first pair after ordering fills the new columns
    ``ak4DijetPt``, ``ak4DijetMass`` and ``ak4DijetdRFatJet``; ``HHMass`` is the
    mass of that dijet plus the leading fat jet.

    Args:
        events: dataframe providing the ``ak8FatJet*``, ``ak4Jet*`` and
            ``ak4Pair0``/``ak4Pair1`` columns.

    Returns:
        The same ``events`` object, with the new columns added in place.
    """
    fatjets = make_vector(events, "ak8FatJet")
    fatjet_0 = fatjets[:, 0]

    jets = make_vector(events, "ak4Jet")
    # one row index per event, reused for every per-event jet lookup
    event_idx = np.arange(len(jets.pt))

    def dijet_from_pair(pair):
        """Sum the two jets picked per event by ``pair`` (two jet indices per row)."""
        return jets[event_idx, pair[:, 0]] + jets[event_idx, pair[:, 1]]

    # get unordered pairs and stack them: axis 1 runs over the two pairs
    first_bb_pair = events.ak4Pair0.to_numpy()
    second_bb_pair = events.ak4Pair1.to_numpy()
    bb_pairs = np.stack([first_bb_pair, second_bb_pair], axis=1)

    # deltaR of each dijet candidate with the leading fatjet, one column per pair
    bbs_dRfj = np.concatenate(
        [
            dijet_from_pair(first_bb_pair).deltaR(fatjet_0).reshape(-1, 1),
            dijet_from_pair(second_bb_pair).deltaR(fatjet_0).reshape(-1, 1),
        ],
        axis=1,
    )
    # sort from larger dR to smaller
    sort_by_dR = np.argsort(-bbs_dRfj, axis=-1)

    # Reorder the pairs of every event in one vectorized call. This keeps the
    # 3-D shape for zero events, where a per-event Python loop would
    # collapse to a 1-D array and break the [:, 0] indexing below.
    bb_pairs_sorted = np.take_along_axis(bb_pairs, sort_by_dR[:, :, np.newaxis], axis=1)

    # pair with the largest dR to the leading fatjet
    first_bb_dijet = dijet_from_pair(bb_pairs_sorted[:, 0])

    hh = first_bb_dijet + fatjet_0

    events["ak4DijetPt"] = first_bb_dijet.pt
    events["ak4DijetMass"] = first_bb_dijet.mass
    events["ak4DijetdRFatJet"] = first_bb_dijet.deltaR(fatjet_0)
    events["HHMass"] = hh.mass

    return events

In [None]:
# dictionary that will contain all information (from all samples)
events_dict = {}
# NOTE: the loop variable is deliberately not named ``samples`` so that the
# process -> datasets mapping defined at the top of the notebook is not overwritten.
for input_dir, dir_samples in dirs.items():
    # this function will load files (only the columns selected), apply filters
    # and compute a weight per event
    events_dict.update(
        utils.load_samples(input_dir, dir_samples, year, filters=filters, columns=columns)
    )

In [None]:
# attach the derived jet variables to every loaded sample
for sample in events_dict:
    events = events_dict[sample]
    events_dict[sample] = add_jet_variables(events)

In [None]:
# display the entry stored for the "hh4b" sample (notebook cell output)
events_dict["hh4b"]

In [None]:
# name of the weight that is expected in events_dict once utils.load_samples is done;
# it is passed as ``weight_key`` when the histograms are filled
weight_key = ["finalWeight"]

In [None]:
# names of the samples that were loaded, and the keys available for the first of them
samples_loaded = list(events_dict)
keys_loaded = list(events_dict[samples_loaded[0]].keys())
print("Keys in events_dict")
print(keys_loaded)

In [None]:
# display the list of loaded sample names (notebook cell output)
samples_loaded

In [None]:
# samples selected for filling; commented-out entries are currently disabled
samples_to_fill = [
    # "data",
    "qcd",
    # "vjets",
    # "ttbar",
    # "hbb",
    # "diboson",
]
# variables to histogram; each must have an entry in var_to_shapevar
vars_to_plot = [
    "ak4DijetPt",
    "ak4DijetMass",
    "ak4DijetdRFatJet",
    # "HHMass",
    "ak8FatJetPt0",
    "ak8FatJetPNetXbb0",
    "ak8FatJetPNetMass0",
]

# define ShapeVar (label and bins for a given variable)
from utils import ShapeVar

var_to_shapevar = {
    # var must match key in events dictionary (i.e. as saved in parquet file)
    "HHMass": ShapeVar(var="HHMass", label=r"AK8 + jj mass (GeV)", bins=[30, 600, 4000]),
    # the b-tag score is binned between 0 and 1 and is dimensionless: no unit in the label
    "ak4JetbtagDeepFlavB0": ShapeVar(
        var="ak4JetbtagDeepFlavB0",
        label=r"AK4 deepFlavB $^0$",
        bins=[30, 0, 1],
        significance_dir="right",
    ),
    "ak4JetbtagDeepFlavB1": ShapeVar(
        var="ak4JetbtagDeepFlavB1",
        label=r"AK4 deepFlavB $^1$",
        bins=[30, 0, 1],
        significance_dir="right",
    ),
    "ak4DijetPt": ShapeVar(
        var="ak4DijetPt",
        label=r"AK4 jj $p_T$ (GeV)",
        bins=[30, 300, 1500],
        significance_dir="right",
    ),
    "ak4DijetMass": ShapeVar(
        var="ak4DijetMass", label=r"AK4 jj mass (GeV)", bins=[30, 0, 200], significance_dir="right"
    ),
    "ak4DijetdRFatJet": ShapeVar(
        var="ak4DijetdRFatJet", label=r"AK4 dR(fj)", bins=[30, 0, 5], significance_dir="right"
    ),
    # "AK8" is kept outside math mode, matching the other AK8 labels
    "ak8FatJetPt0": ShapeVar(
        var="ak8FatJetPt0",
        label=r"AK8 $p_T^0$ (GeV)",
        bins=[30, 300, 1500],
        significance_dir="right",
    ),
    "ak8FatJetPNetMass0": ShapeVar(
        var="ak8FatJetPNetMass0", label=r"AK8 $m_{reg}^{0}$ (GeV)", bins=[20, 0, 260]
    ),
    "ak8FatJetPNetXbb0": ShapeVar(
        var="ak8FatJetPNetXbb0",
        label=r"AK8 $TX_{bb}^{0}$",
        bins=[50, 0.8, 1],
    ),
}

In [None]:
# build one histogram per requested variable, keyed by the ShapeVar's ``var``
hists = {}
for var in vars_to_plot:
    print(var)
    shape_var = var_to_shapevar[var]
    if shape_var.var in hists:
        # this variable already has a histogram; do not fill it twice
        continue
    hists[shape_var.var] = utils.singleVarHist(
        events_dict, shape_var, weight_key=weight_key, selection=None
    )

In [None]:
# display the dictionary of filled histograms (notebook cell output)
hists

In [None]:
# display the "ak4DijetMass" histogram sliced to the "hh4b" entry of the "Sample" axis
hists["ak4DijetMass"][{"Sample": "hh4b"}]

In [None]:
# quick cross-check: draw the "ak8FatJetPt0" histogram of the "qcd" sample on its own
import mplhep as hep

# to_numpy() is unpacked into two arrays, used below as the values and the bins
hist, bins = hists["ak8FatJetPt0"][{"Sample": "qcd"}].to_numpy()
print(hist)
hep.histplot(hist, bins=bins, stack=True)

In [None]:
# make a stacked plot of every variable in vars_to_plot
from plotting import plot_hists

plot_hists(
    year,
    hists,
    vars_to_plot,
    58.6,  # NOTE(review): presumably the integrated luminosity in fb^-1 — confirm against plot_hists
    add_data=False,
    # alternative settings, kept for quick toggling:
    # mult_factor=100,
    # logy=True,
    mult_factor=1,
    logy=False,
    density=True,
    stack=True,
    bbox_to_anchor=None,
)