### 1. Background
Investigate how many reconstructed events (boosted/resolved) fall into each gen-level category

In [None]:
from __future__ import annotations

import hist
import numpy as np
import pandas as pd
import vector

# Register the vector library's Awkward Array integration once, at import time.
vector.register_awkward()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# mplhep for CMS-style plots
import mplhep as hep
from matplotlib.lines import Line2D

# Apply the CMS plotting style from mplhep.
# NOTE(review): the style is applied twice (through matplotlib and through
# mplhep); presumably one of the two calls is enough -- confirm before removing.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Tick formatter that renders exponents with math text; the power limits are
# set to (-3, 3). It is not referenced by the cells visible in this notebook.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Set the font size after the style calls so this value takes precedence.
plt.rcParams.update({"font.size": 16})

In [None]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array from the ``{obj}*`` columns of ``events``.

    The ``pt``, ``phi`` and ``eta`` coordinates are read from ``{obj}Pt``,
    ``{obj}Phi`` and ``{obj}Eta``. The mass coordinate ``M`` is read from
    ``{obj}Msd`` when ``obj`` is ``"ak8FatJet"`` and from ``{obj}Mass``
    for every other object name.
    """
    mass_suffix = "Mass"
    if obj == "ak8FatJet":
        mass_suffix = "Msd"

    # coordinate name understood by ``vector`` -> column-name suffix
    suffix_by_coordinate = {"pt": "Pt", "phi": "Phi", "eta": "Eta", "M": mass_suffix}
    coordinates = {
        coordinate: events[f"{obj}{suffix}"] for coordinate, suffix in suffix_by_coordinate.items()
    }
    return vector.array(coordinates)

In [None]:
def e2np(e, key):
    """Return the selection ``e[key]`` converted with its ``to_numpy()`` method."""
    selection = e[key]
    return selection.to_numpy()

In [None]:
# Load the parquet output under this directory into one dataframe (all events).
# NOTE(review): the path is an absolute /eos location, so this cell presumably
# only runs where that filesystem is mounted -- confirm.
eAll = pd.read_parquet(
    "/eos/uscms/store/user/cmantill/bbbb/matching/Oct30/2018/GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8/parquet/"
)

In [None]:
# Show the available column keys of the loaded dataframe.
list(eAll.keys())

In [None]:
# Four-vectors for the reco AK4 jets, gen Higgs bosons, gen b quarks and reco
# AK8 fatjets, built in that order from the full event sample.
jets, gen_higgs, gen_bs, fjs = (
    make_vector(eAll, collection) for collection in ("ak4Jet", "GenHiggs", "Genb", "ak8FatJet")
)

### 2. Categorize events by reco information

- Boosted
    - two good fatjets (fatjet with index 0 has the largest Xbb score)
    - ("('ak8FatJetPt', '0')", ">=", 300)
    - ("('ak8FatJetPNetMass', '0')", ">=", 60)
    - ("('ak8FatJetPNetXbb', '0')", ">=", 0.9734)  # medium working point
    - ("('ak8FatJetPt', '1')", ">=", 300)
    - ("('ak8FatJetPNetMass', '1')", ">=", 60)
    - ("('ak8FatJetPNetXbb', '1')", ">=", 0.9734) # medium working point
        
- Resolved
    - veto boosted selection
    - require at least 3 AK4 jets each with pt>40 GeV and b-tag (ak4JetbtagDeepFlavB) > 0.2783 # medium  working point

#### 2.1 Boosted

In [None]:
# Per-fatjet quantities used by the boosted selection.
ak8FatJetPt = eAll["ak8FatJetPt"].to_numpy()
ak8FatJetPNetMass = eAll["ak8FatJetPNetMass"].to_numpy()
ak8FatJetPNetXbb = eAll["ak8FatJetPNetXbb"].to_numpy()

# A "good" fatjet passes the pT, mass and Xbb thresholds simultaneously.
fjPtPass = ak8FatJetPt >= 300
fjMassPass = ak8FatJetPNetMass >= 60
fjXbbPass = ak8FatJetPNetXbb >= 0.9734
fjPass = np.logical_and(np.logical_and(fjPtPass, fjMassPass), fjXbbPass)

# Reco-boosted: at least two good fatjets in the event.
# NOTE(review): this counts any two passing fatjets; it matches "fatjets 0 and
# 1 both pass" only if each event stores exactly two fatjets -- confirm.
recoBoosted = fjPass.sum(axis=-1) >= 2
eRecoBoosted = eAll.loc[recoBoosted]

#### 2.2 Resolved

In [None]:
# Per-jet quantities used by the resolved selection.
ak4JetPt = eAll["ak4JetPt"].to_numpy()
ak4JetbtagDeepFlavB = eAll["ak4JetbtagDeepFlavB"].to_numpy()

# At least four AK4 jets above the pT threshold ...
jPtPass = ak4JetPt > 40
sufficientPtPass = jPtPass.sum(axis=-1) >= 4
# ... and at least three AK4 jets above the b-tag threshold.
# NOTE(review): the two counts are independent, so the b-tagged jets are not
# required to be the ones passing the pT cut -- confirm this is intended.
jBTagPass = ak4JetbtagDeepFlavB > 0.2783
sufficientBTagPass = jBTagPass.sum(axis=-1) >= 3

sufficientJPass = np.logical_and(sufficientPtPass, sufficientBTagPass)

# Reco-resolved: passes the jet requirements and fails the boosted selection.
recoResolved = np.logical_and(np.logical_not(recoBoosted), sufficientJPass)
eRecoResolved = eAll.loc[recoResolved]

### 3. Sort events of each reco category into gen-level categories

In [None]:
# This categorization prioritizes boosted, then semi-boosted, then resolved.
# see ooc-semiboosted.ipynb
def categorize_events_by_gen(events):
    """Classify events by how reco jets are matched to the two gen-level Higgs.

    Parameters
    ----------
    events : pandas.DataFrame
        Must provide ``ak4JetHiggsMatchIndex`` and ``ak8FatJetHiggsMatchIndex``.
        Each selection must convert (``to_numpy()``) to a 2D array with one
        row per event and one column per jet, holding ``0`` for a jet matched
        to H1 and ``1`` for a jet matched to H2. No other column is read.

    Returns
    -------
    tuple
        ``(boosted, semi_boosted, resolved, not_categorized)``, four boolean
        arrays with one entry per event. The four masks are mutually
        exclusive and together cover every event:

        * boosted: exactly one AK8 jet matched to H1 and exactly one to H2;
        * semi_boosted: not boosted, one Higgs matched to exactly one AK8 jet
          and the other Higgs matched to exactly two AK4 jets;
        * resolved: both Higgs matched to exactly two AK4 jets and no AK8 jet
          matched to either Higgs;
        * not_categorized: everything else.
    """
    indexak8 = events["ak8FatJetHiggsMatchIndex"].to_numpy()
    indexak4 = events["ak4JetHiggsMatchIndex"].to_numpy()

    # number of AK4 jets matched to H1 / H2 in each event
    num_ak4m2h1 = (indexak4 == 0).sum(axis=1)
    num_ak4m2h2 = (indexak4 == 1).sum(axis=1)
    h1m2ak4 = num_ak4m2h1 == 2
    h2m2ak4 = num_ak4m2h2 == 2

    # number of AK8 jets matched to H1 / H2 in each event
    num_ak8m2h1 = (indexak8 == 0).sum(axis=1)
    num_ak8m2h2 = (indexak8 == 1).sum(axis=1)
    h1m1ak8 = num_ak8m2h1 == 1
    h2m1ak8 = num_ak8m2h2 == 1

    boosted = h1m1ak8 & h2m1ak8
    # Boosted is vetoed explicitly so that it takes priority over semi-boosted.
    semi_boosted_h1 = h1m2ak4 & h2m1ak8 & ~boosted
    semi_boosted_h2 = h2m2ak4 & h1m1ak8 & ~boosted
    semi_boosted = semi_boosted_h1 | semi_boosted_h2
    resolved = (
        (h1m2ak4 & h2m2ak4) & ~boosted & ~semi_boosted & (num_ak8m2h1 == 0) & (num_ak8m2h2 == 0)
    )
    not_categorized = ~(resolved | boosted | semi_boosted)

    return boosted, semi_boosted, resolved, not_categorized

In [None]:
# Gen-level category masks for the reco-boosted and reco-resolved samples;
# each result is the (boosted, semi_boosted, resolved, not_categorized) tuple.
recoBoosted_genCategories, recoResolved_genCategories = (
    categorize_events_by_gen(sample) for sample in (eRecoBoosted, eRecoResolved)
)

### 4. Plot m_HH for each reco category and each gen category

In [None]:
def plot_mhh_per_category(gen_higgs, categories, title="mhh Plot", bins=None):
    """Plot the gen-level di-Higgs mass, one histogram per gen-level category.

    Parameters
    ----------
    gen_higgs
        Gen-level Higgs four-vectors indexed as ``[event, higgs]``; the sum of
        the first two Higgs per event gives ``m_HH``.
    categories
        ``(boosted, semi_boosted, resolved, not_categorized)`` boolean masks,
        one entry per event, as returned by ``categorize_events_by_gen``.
    title : str
        Title of the axes.
    bins : array-like, optional
        Bin edges of the ``m_HH`` axis. Defaults to ``np.arange(100, 2000, 50)``.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure.
    """
    boosted, semi_boosted, resolved, not_categorized = categories
    mhh = (gen_higgs[:, 0] + gen_higgs[:, 1]).m

    # Fixed default binning; it does not depend on the data, so an empty
    # selection can still be plotted.
    if bins is None:
        bins = np.arange(100, 2000, 50)

    # plotting code adapted from Christina's plot_h1h2_fj
    var_axis = hist.axis.Variable(bins, name="var", label="variable")
    cat_axis = hist.axis.StrCategory([], name="cat", growth=True)
    hist_mhh = hist.Hist(var_axis, cat_axis)

    # (histogram category, event mask, plot label, legend label, color, line style)
    category_styles = [
        ("boosted", boosted, "Boosted", "boosted", "y", "-"),
        ("semi_boosted", semi_boosted, "Semi-boosted", "semi-boosted", "r", "-."),
        ("resolved", resolved, "Resolved", "resolved", "k", ":"),
        ("non-categorized", not_categorized, "Outside of these categories", "ooc", "c", "--"),
    ]

    leg_elems = []
    _, ax = plt.subplots(1, 1, figsize=(8, 4))
    for cat, mask, plot_label, legend_label, color, ls in category_styles:
        mhh_cat = mhh[mask]
        hist_mhh.fill(var=mhh_cat, cat=cat)
        # Empty categories are filled but neither drawn nor listed in the legend.
        if mhh_cat.size == 0:
            continue
        hist_mhh[{"cat": cat}].plot1d(ax=ax, label=plot_label, color=color, ls=ls)
        leg_elems.append(Line2D([0], [0], color=color, lw=2, ls=ls, label=legend_label))

    # The legend is built from the explicit proxy handles, not the plot labels.
    leg = ax.legend(handles=leg_elems)
    leg.set_title("Gen-Level categories", prop={"size": 10})
    ax.set_xlabel(r"$m_{HH}$ (GeV)")
    ax.set_title(title)

In [None]:
# Gen Higgs four-vectors of the reco-boosted events, then their m_HH spectrum
# split by gen-level category.
genH_recoBoosted = make_vector(events=eRecoBoosted, obj="GenHiggs")
plot_mhh_per_category(
    gen_higgs=genH_recoBoosted,
    categories=recoBoosted_genCategories,
    title="Gen-Level Categories of Reco Boosted Events",
)

In [None]:
# Gen Higgs four-vectors of the reco-resolved events, then their m_HH spectrum
# split by gen-level category.
genH_recoResolved = make_vector(events=eRecoResolved, obj="GenHiggs")
plot_mhh_per_category(
    gen_higgs=genH_recoResolved,
    categories=recoResolved_genCategories,
    title="Gen-Level Categories of Reco Resolved Events",
)

### 5. Investigate the Gen-Level OOC Subset of Reco Resolved Events

In [None]:
# Build the inputs of a pie chart of the gen-matching patterns of OOC events.
# input: events dataframe with the AK4 / AK8 Higgs match-index columns
# output: pie values and labels
def make_ooc_pie(events):
    """Count the jet-to-Higgs matching patterns of out-of-category (OOC) events.

    A pattern is the tuple ``(nAK4->H1, nAK4->H2, nAK8->H1, nAK8->H2)`` of
    matched-jet counts for one event. H1 and H2 are treated as
    interchangeable: a pattern and the one obtained by swapping H1 and H2 for
    both the AK4 and the AK8 counts are accumulated together, under the
    orientation that appears first in ``events``.

    Parameters
    ----------
    events : pandas.DataFrame
        Must provide ``ak4JetHiggsMatchIndex`` and ``ak8FatJetHiggsMatchIndex``
        (0 = matched to H1, 1 = matched to H2), with one row per event.

    Returns
    -------
    ys_pie : numpy.ndarray
        Number of OOC events per pattern, in descending order.
    labels_pie : numpy.ndarray
        Pattern labels in the same order; each label is the four counts
        concatenated as ``AK4H1 AK4H2 AK8H1 AK8H2`` without separators.
    """
    # mask of the events that fall outside the boosted/semi-boosted/resolved categories
    _, _, _, not_categorized = categorize_events_by_gen(events)

    indexak4 = events["ak4JetHiggsMatchIndex"].to_numpy()
    indexak8 = events["ak8FatJetHiggsMatchIndex"].to_numpy()

    # one row per OOC event: [nAK4->H1, nAK4->H2, nAK8->H1, nAK8->H2]
    patterns = np.stack(
        [
            (indexak4 == 0).sum(axis=1)[not_categorized],
            (indexak4 == 1).sum(axis=1)[not_categorized],
            (indexak8 == 0).sum(axis=1)[not_categorized],
            (indexak8 == 1).sum(axis=1)[not_categorized],
        ],
        axis=1,
    ).tolist()

    # Single pass over the events. The dict keeps insertion order, so each
    # pattern is stored under its first-seen orientation.
    counts = {}
    for num_ak4m2h1, num_ak4m2h2, num_ak8m2h1, num_ak8m2h2 in patterns:
        pattern = (num_ak4m2h1, num_ak4m2h2, num_ak8m2h1, num_ak8m2h2)
        if pattern not in counts:
            # If the AK4 pair is swapped, the AK8 pair must be swapped the same way.
            swapped = (num_ak4m2h2, num_ak4m2h1, num_ak8m2h2, num_ak8m2h1)
            if swapped in counts:
                pattern = swapped
            else:
                counts[pattern] = 0
        counts[pattern] += 1

    labels_pie = np.array(["".join(str(count) for count in pattern) for pattern in counts])
    ys_pie = np.array(list(counts.values()))

    # largest wedge first
    order = np.argsort(ys_pie)[::-1]
    return ys_pie[order], labels_pie[order]

In [None]:
# Pie-chart values and labels for the out-of-category subset of the
# reco-resolved events.
ys_pie, labels_pie = make_ooc_pie(eRecoResolved)

In [None]:
# Draw the OOC matching-pattern breakdown; wedges are annotated with
# whole-number percentages.
fig, ax = plt.subplots()
ax.pie(ys_pie, labels=labels_pie, autopct="%1.0f%%")
ax.set_title("Pie Chart of OOC AK4RecoJet Matching Patterns")

# Key for reading the four-digit wedge labels.
convention_note = fig.text(
    0.5, 0.85, "Number convention: AK4H1, AK4H2, AK8H1 AK8H2", ha="center", c="b"
)
plt.show()