# Investigate gen Jet matching with gen Higgs

In [None]:
import numpy as np
import pandas as pd
import vector
import os
import hist
import awkward as ak

# Enable vector's Awkward Array integration before any arrays are built.
vector.register_awkward()

from itertools import permutations

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.lines import Line2D

# mplhep for CMS-style plots
import mplhep as hep

# Select the CMS plotting style.
# NOTE(review): the two calls below both select mplhep's CMS style and look
# redundant — confirm before dropping either one.
plt.style.use(hep.style.CMS)
hep.style.use("CMS")
# Scalar tick formatter rendered with math text; power limits are (-3, 3).
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Global default font size for every figure created afterwards.
plt.rcParams.update({"font.size": 16})

In [None]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array for one object collection stored in ``events``.

    The columns ``{obj}Pt``, ``{obj}Phi`` and ``{obj}Eta`` are read together
    with a mass column: ``{obj}Msd`` when ``obj`` is ``"ak8FatJet"`` and
    ``{obj}Mass`` for every other collection.

    Args:
        events: dataframe holding the per-object kinematic columns.
        obj: column-name prefix of the collection (e.g. ``"ak4Jet"``).

    Returns:
        The result of ``vector.array`` built from the pt/phi/eta/M components.
    """
    mass_suffix = "Mass"
    if obj == "ak8FatJet":
        mass_suffix = "Msd"

    components = {
        "pt": events[f"{obj}Pt"],
        "phi": events[f"{obj}Phi"],
        "eta": events[f"{obj}Eta"],
        "M": events[f"{obj}{mass_suffix}"],
    }
    return vector.array(components)

In [None]:
# Load the parquet dataset found under this hard-coded /eos path into a
# pandas DataFrame; every later cell reads its columns from ``events``.
events = pd.read_parquet(
    "/eos/uscms/store/user/cmantill/bbbb/matching/Oct30/2018/GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8/parquet/"
)

In [None]:
# Build one vector array per object collection stored in ``events``
# (column prefixes: ak4Jet, GenHiggs, Genb, ak8FatJet).
jets, gen_higgs, gen_bs, fjs = (
    make_vector(events, collection)
    for collection in ("ak4Jet", "GenHiggs", "Genb", "ak8FatJet")
)

In [None]:
# Vector arrays for the AK4 gen jets and for the gen-level b quarks
# (genBs is built from the same "Genb" columns as gen_bs).
genJs, genBs = (
    make_vector(events, "ak4GenJet"),
    make_vector(events, "Genb"),
)

In [None]:
# Pull the AK8 fat-jet matching columns out of ``events`` as NumPy arrays.
# They are needed to identify the boosted events, which in turn defines the
# out-of-category (ooc) events.
drbh1ak8, drbh2ak8, indexak8, nbh1ak8, nbh2ak8 = (
    events[column].to_numpy()
    for column in (
        "ak8FatJetMaxdRH1",
        "ak8FatJetMaxdRH2",
        "ak8FatJetHiggsMatchIndex",
        "ak8FatJetNumBMatchedH1",
        "ak8FatJetNumBMatchedH2",
    )
)

In [None]:
# Split the gen b quarks by Higgs: columns 0-1 go to H1, columns 2-3 to H2
b_m2h1, b_m2h2 = genBs[:, 0:2], genBs[:, 2:4]

In [None]:
# Calculate dR between each gen b and every gen jet.
# One b is selected per event and ``[:, np.newaxis]`` appends a length-1 axis
# so that ``deltaR`` is evaluated against all entries of genJs
# (assumes genJs is laid out as (event, jet) — TODO confirm).
dR_h1b1_genJs = b_m2h1[:, 0][:, np.newaxis].deltaR(genJs)
dR_h1b2_genJs = b_m2h1[:, 1][:, np.newaxis].deltaR(genJs)
dR_h2b1_genJs = b_m2h2[:, 0][:, np.newaxis].deltaR(genJs)
dR_h2b2_genJs = b_m2h2[:, 1][:, np.newaxis].deltaR(genJs)

In [None]:
# Flag, for every gen jet, whether it lies within dR < 0.4 of each gen b.
# A trailing length-1 axis is appended so the four flags can be joined below.
GenJm2b1h1 = (dR_h1b1_genJs < 0.4)[..., np.newaxis]
GenJm2b2h1 = (dR_h1b2_genJs < 0.4)[..., np.newaxis]
GenJm2b1h2 = (dR_h2b1_genJs < 0.4)[..., np.newaxis]
GenJm2b2h2 = (dR_h2b2_genJs < 0.4)[..., np.newaxis]
print(GenJm2b1h1.shape)

# Join the four flag arrays along the last axis:
#   axis 0 -> event
#   axis 1 -> gen jet
#   axis 2 -> matched to (H1 b1, H1 b2, H2 b1, H2 b2): True / False
GenJm2b = np.concatenate(
    (GenJm2b1h1, GenJm2b2h1, GenJm2b1h2, GenJm2b2h2),
    axis=-1,
)
print(GenJm2b.shape)

In [None]:
# Construct the ak4GenJet matching info.
#   ak4GenJetHiggsMatch      : bool per gen jet, True when the jet is matched
#   ak4GenJetHiggsMatchIndex : 0 (H1) / 1 (H2) for matched jets, -1 otherwise
ak4GenJetHiggsMatch = np.zeros(shape=genJs.shape, dtype=bool)
ak4GenJetHiggsMatchIndex = np.full(genJs.shape, -1, dtype=int)

# A gen jet matched to one and only one gen b is considered matched to that
# b's mother Higgs and receives an ak4GenJetHiggsMatchIndex.
GenJm2H = np.sum(GenJm2b, axis=2) == 1
ak4GenJetHiggsMatch[GenJm2H] = True

# Find which gen b (and hence which Higgs) each gen jet matched.
# "draft": argmax also returns 0 for jets with no match at all, so only the
# entries selected by ak4GenJetHiggsMatch are meaningful and used.
bIdx_draft = np.argmax(GenJm2b, axis=2)
# b indices 0,1 belong to H1 and 2,3 to H2; integer floor division keeps the
# Higgs index an integer instead of going through float64.
hIdx_draft = bIdx_draft // 2
ak4GenJetHiggsMatchIndex[ak4GenJetHiggsMatch] = hIdx_draft[ak4GenJetHiggsMatch]

## Plot each category of the GenJ-to-gen-b matching

In [None]:
# Per-jet Higgs match indices used by the categorisation that follows.
# NOTE(review): indexak8 is already read from this same column in an earlier cell.
indexak8 = events["ak8FatJetHiggsMatchIndex"].to_numpy()
# AK4 gen-jet indices built above: 0 / 1 for jets matched to H1 / H2, -1 otherwise.
indexak4 = ak4GenJetHiggsMatchIndex

In [None]:
# --- AK4 gen jets: per-jet Higgs assignment and per-event counts ---
h1ak4, h2ak4 = (indexak4 == 0), (indexak4 == 1)
num_ak4m2h1, num_ak4m2h2 = h1ak4.sum(axis=1), h2ak4.sum(axis=1)
# event-level flag: exactly two AK4 gen jets matched to the given Higgs
h1m2ak4, h2m2ak4 = (num_ak4m2h1 == 2), (num_ak4m2h2 == 2)

# --- AK8 fat jets: per-jet Higgs assignment and per-event counts ---
h1ak8, h2ak8 = (indexak8 == 0), (indexak8 == 1)
num_ak8m2h1, num_ak8m2h2 = h1ak8.sum(axis=1), h2ak8.sum(axis=1)
# event-level flag: exactly one AK8 jet matched to the given Higgs
h1m1ak8, h2m1ak8 = (num_ak8m2h1 == 1), (num_ak8m2h2 == 1)

# --- Event categories ---
# boosted: each Higgs has exactly one matched AK8 jet
boosted = h1m1ak8 & h2m1ak8
# semi-resolved: one Higgs has two AK4 matches, the other one AK8 match
semi_resolved_h1 = ~boosted & h1m2ak4 & h2m1ak8
semi_resolved_h2 = ~boosted & h2m2ak4 & h1m1ak8
semi_resolved = semi_resolved_h1 | semi_resolved_h2
# resolved: both Higgs have two AK4 matches and no AK8 jet is matched at all
resolved = (
    (num_ak8m2h1 == 0)
    & (num_ak8m2h2 == 0)
    & h1m2ak4
    & h2m2ak4
    & ~boosted
    & ~semi_resolved
)
# everything that falls in none of the three categories above
not_categorized = ~resolved & ~boosted & ~semi_resolved

In [None]:
# Per AK8 jet: matched to H1 / H2 with NumBMatched equal to 2 (nb2) or 1 (nb1)
h1ak8nb2, h1ak8nb1 = (
    (indexak8 == 0) & (nbh1ak8 == 2),
    (indexak8 == 0) & (nbh1ak8 == 1),
)
h2ak8nb2, h2ak8nb1 = (
    (indexak8 == 1) & (nbh2ak8 == 2),
    (indexak8 == 1) & (nbh2ak8 == 1),
)

# Per event: exactly one AK8 jet satisfies the corresponding per-jet flag
h1m1ak8b2, h2m1ak8b2 = (
    h1ak8nb2.sum(axis=1) == 1,
    h2ak8nb2.sum(axis=1) == 1,
)
h1m1ak8b1, h2m1ak8b1 = (
    h1ak8nb1.sum(axis=1) == 1,
    h2ak8nb1.sum(axis=1) == 1,
)

# Boosted sub-categories by the number of b's matched in each Higgs' AK8 jet
boosted_nb2 = h1m1ak8b2 & h2m1ak8b2
boosted_nb1 = h1m1ak8b1 & h2m1ak8b1
boosted_nb1nb2 = (h1m1ak8b2 & h2m1ak8b1) | (h1m1ak8b1 & h2m1ak8b2)

In [None]:
# For every not-categorized event build the nested list
# [[num_ak4m2h1, num_ak4m2h2], [num_ak8m2h1, num_ak8m2h2]].
num_ak4N8m2h_nc = (
    np.stack(
        (num_ak4m2h1, num_ak4m2h2, num_ak8m2h1, num_ak8m2h2),
        axis=1,
    )[not_categorized]
    .reshape(-1, 2, 2)
    .tolist()
)

In [None]:
# helper function that prepares the pie chart inputs
# input: 3d array of shape (N_event, N_jetType, N_Higgs)
# output: pie values and labels
def make_ooc_pie(num_ak4N8m2h_nc):
    """Tally the matching patterns of the given events for a pie chart.

    Each entry of ``num_ak4N8m2h_nc`` is
    ``[[num_ak4m2H1, num_ak4m2H2], [num_ak8m2H1, num_ak8m2H2]]``.
    H1 and H2 are treated as interchangeable: a pattern and the one obtained
    by swapping H1 <-> H2 in *both* the AK4 and the AK8 pair are counted
    together, under the label of whichever of the two was seen first.

    Returns:
        ``(ys_pie, labels_pie)``: NumPy arrays with the event count of every
        distinct pattern and its four-character label (AK4H1, AK4H2, AK8H1,
        AK8H2 concatenated), ordered by ``np.argsort(ys_pie)[::-1]``
        (descending count).
    """
    representatives = set()  # first-seen orientation of every distinct pattern
    tally = {}  # label -> number of events, in first-seen order

    for entry in num_ak4N8m2h_nc:
        as_given = (entry[0][0], entry[0][1], entry[1][0], entry[1][1])
        # H1 <-> H2 swap applied to the AK4 and the AK8 pair simultaneously
        swapped = (as_given[1], as_given[0], as_given[3], as_given[2])

        if as_given in representatives:
            chosen = as_given
        elif swapped in representatives:
            chosen = swapped
        else:
            representatives.add(as_given)
            chosen = as_given

        label = f"{chosen[0]}{chosen[1]}{chosen[2]}{chosen[3]}"
        tally[label] = tally.get(label, 0) + 1

    labels_pie = np.array(list(tally))
    ys_pie = np.array(list(tally.values()))
    descending = np.argsort(ys_pie)[::-1]

    return ys_pie[descending], labels_pie[descending]

In [None]:
# Pattern counts and labels of the not-categorized events, ready for ax.pie.
ys_pie, labels_pie = make_ooc_pie(num_ak4N8m2h_nc)

In [None]:
# Pie chart of the out-of-category matching patterns, one wedge per pattern
fig, ax = plt.subplots()
ax.pie(ys_pie, labels=labels_pie, autopct="%1.0f%%")
ax.set_title("Pie Chart of OOC AK4GenJet Matching Patterns")

# Explain how to read the four-digit wedge labels
fig.text(
    0.5,
    0.85,
    'Number convention: AK4H1, AK4H2, AK8H1 AK8H2 (AK4=AK8)',
    horizontalalignment="center",
    color="b",
)
plt.show()

In [None]:
# Calculate m_HH for every event from the two gen-level Higgs vectors
mhh = (gen_higgs[:, 0] + gen_higgs[:, 1]).m

# Split m_HH by gen-level category
mhh_boosted = mhh[boosted]
mhh_resolved = mhh[resolved]
mhh_semi = mhh[semi_resolved]
mhh_out = mhh[not_categorized]

# Plotting code adapted from Christina's plot_h1h2_fj.
# Fixed binning: edges from 100 up to (but excluding) 2000 in steps of 50.
bins = np.arange(100, 2000, 50)
var_axis = hist.axis.Variable(bins, name="var", label="variable")
cat_axis = hist.axis.StrCategory([], name="cat", growth=True)

hist_mhh = hist.Hist(var_axis, cat_axis)
hist_mhh.fill(var=mhh_boosted, cat="boosted")
hist_mhh.fill(var=mhh_semi, cat="semi_resolved")
hist_mhh.fill(var=mhh_resolved, cat="resolved")
hist_mhh.fill(var=mhh_out, cat="non-categorized")

# One row per category so the histogram line and its legend proxy always share
# colour and line style:
# (category key, plot1d label, legend label, colour, line style)
cat_styles = [
    ("boosted", "Boosted", "boosted", "y", "-"),
    ("semi_resolved", "Semi-Resolved", "semi-resolved", "r", "-."),
    ("resolved", "Resolved", "resolved", "k", ":"),
    ("non-categorized", "Outside of these categories", "ooc", "c", "--"),
]

leg_elems = []
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
for cat_key, plot_label, legend_label, color, ls in cat_styles:
    hist_mhh[{"cat": cat_key}].plot1d(ax=ax, label=plot_label, color=color, ls=ls)
    # The legend is drawn from these explicit proxy lines, not from plot1d's labels
    leg_elems.append(Line2D([0], [0], color=color, lw=2, ls=ls, label=legend_label))
leg = ax.legend(handles=leg_elems)
leg.set_title("Gen-Level categories", prop={"size": 10})
ax.set_xlim(0, 2000)
ax.set_ylim(0, 30000)
ax.set_xlabel(r"$m_{HH}$")
ax.set_title("Categories of GenJet->GenH Matching (AK8>AK4)")