In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

import mplhep as hep
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import hist

# Apply the mplhep "CMS" style together with its "firamath" variant.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter rendered as math text; power-of-ten notation is used
# outside the exponent range (-3, 3).
# NOTE(review): `formatter` is not attached to any axis in the cells below — confirm it is still needed.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# Global matplotlib overrides applied after the CMS style has been loaded.
plt.rcParams.update({"font.size": 12})
plt.rcParams["lines.linewidth"] = 2
plt.rcParams["grid.color"] = "#CCCCCC"
plt.rcParams["grid.linewidth"] = 0.5
plt.rcParams["figure.edgecolor"] = "none"

In [None]:
import onnxruntime

# Relative path prefix used to locate the ONNX model files.
MAIN_DIR = "../../../"
# Session options shared by every InferenceSession created in this notebook.
sess_options = onnxruntime.SessionOptions()
# NOTE(review): thread count is hard-coded; presumably tuned for one specific machine — confirm.
sess_options.intra_op_num_threads = 23
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL
# Session for the "spanet_pnet_all_vars_v0" model; its outputs feed the
# classification and assignment cells below.
session = onnxruntime.InferenceSession(
    f"{MAIN_DIR}/../data/spanet-inference/spanet_pnet_all_vars_v0.onnx", sess_options
)

In [None]:
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

In [None]:
# Load the event table from a parquet dataset.
# NOTE(review): the path is spelled out instead of being built from MAIN_DIR — confirm both
# point at the same data directory.
events = pd.read_parquet(
    "../../../../data/matching/23Nov18_WSel_v9_private/2018/GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8/parquet"
)
# Display the available column labels.
list(events.columns)

In [None]:
nevents = len(events.ak4JetPt[0])
nevents

In [None]:
import vector


def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array for one object collection of the dataframe.

    The columns ``{obj}Pt``, ``{obj}Phi``, ``{obj}Eta`` and a mass column are
    looked up in ``events`` and handed to ``vector.array`` under the coordinate
    names ``pt``, ``phi``, ``eta`` and ``M``. The mass column is
    ``{obj}PNetMass`` when ``obj`` is ``"ak8FatJet"`` and ``{obj}Mass`` for
    every other collection.
    """
    mass_suffix = "Mass" if obj != "ak8FatJet" else "PNetMass"
    # coordinate name passed to ``vector`` -> column suffix in ``events``
    suffix_by_coord = {"pt": "Pt", "phi": "Phi", "eta": "Eta", "M": mass_suffix}

    components = {}
    for coord, suffix in suffix_by_coord.items():
        components[coord] = events[f"{obj}{suffix}"]
    return vector.array(components)


jets = make_vector(events, "ak4Jet")
fatjets = make_vector(events, "ak8FatJet")

In [None]:
# Build the AK4 jet inputs: one feature table per jet slot, stacked into a single array.
njets = 10
# Feature order along the last axis of Jets_data.
jet_vars = ["PtCorr", "Eta", "SinPhi", "CosPhi", "PNetB", "Mass"]
arrays = []
for i in range(njets):
    # One row per event, one column per feature, for jet slot i.
    pdf = pd.DataFrame(0, index=np.arange(nevents), columns=jet_vars)
    # NOTE(review): "PtCorr" is filled from ak4JetPt — confirm this is the corrected pT the model expects.
    pdf["PtCorr"] = events.ak4JetPt[i]
    pdf["Eta"] = events.ak4JetEta[i]
    pdf["SinPhi"] = np.sin(events.ak4JetPhi[i])
    pdf["CosPhi"] = np.cos(events.ak4JetPhi[i])
    pdf["Mass"] = events.ak4JetMass[i]
    # PNetB = (b + bb) / (b + bb + c + cc + g + uds) of the ParticleNet probabilities.
    num = events.ak4JetbtagPNetProbb[i] + events.ak4JetbtagPNetProbbb[i]
    den = (
        events.ak4JetbtagPNetProbb[i]
        + events.ak4JetbtagPNetProbbb[i]
        + events.ak4JetbtagPNetProbc[i]
        + events.ak4JetbtagPNetProbcc[i]
        + events.ak4JetbtagPNetProbg[i]
        + events.ak4JetbtagPNetProbuds[i]
    )
    # -1 is used where the denominator is not positive.
    pdf["PNetB"] = np.where(den > 0, num / den, -1)
    # (features, events) float32 block for this jet slot.
    np_arr = pdf.values.T.astype(np.float32)
    arrays.append(np_arr)

# (jets, features, events) -> (features, jets, events) -> (events, jets, features)
Jets_data = np.transpose(np.transpose(arrays, (1, 0, 2)))
# Feature 0 is "PtCorr".
Jets_Pt = Jets_data[:, :, 0]
MIN_PT = 20
# True for jet slots whose pT exceeds MIN_PT.
Jets_mask = Jets_Pt > MIN_PT

In [None]:
# Build the AK8 fat-jet inputs in the same way as the AK4 jet inputs.
boosted_arrays = []
# Feature order along the last axis of BoostedJets_data.
fatjet_vars = ["Pt", "Eta", "SinPhi", "CosPhi", "PNetXbb", "PNetXjj", "PNetQCD", "Mass"]
nfatjets = 3
for i in range(nfatjets):
    # One row per event, one column per feature, for fat-jet slot i.
    pdf = pd.DataFrame(0, index=np.arange(nevents), columns=fatjet_vars)
    pdf["Pt"] = events.ak8FatJetPt[i]
    pdf["Eta"] = events.ak8FatJetEta[i]
    pdf["SinPhi"] = np.sin(events.ak8FatJetPhi[i])
    pdf["CosPhi"] = np.cos(events.ak8FatJetPhi[i])
    pdf["PNetXbb"] = events.ak8FatJetPNetXbb[i]
    pdf["PNetXjj"] = events.ak8FatJetPNetXjj[i]
    pdf["PNetQCD"] = events.ak8FatJetPNetQCD[i]
    # The "Mass" feature is taken from the PNetMass column.
    pdf["Mass"] = events.ak8FatJetPNetMass[i]

    # (features, events) float32 block for this fat-jet slot.
    np_arr = pdf.values.T.astype(np.float32)
    boosted_arrays.append(np_arr)

# (fatjets, features, events) -> (events, fatjets, features)
BoostedJets_data = np.transpose(np.transpose(boosted_arrays, (1, 0, 2)))
MIN_FJPT = 200
# Feature 0 is "Pt"; the mask keeps fat-jet slots above MIN_FJPT.
BoostedJets_Pt = BoostedJets_data[:, :, 0]
BoostedJets_mask = BoostedJets_Pt > MIN_FJPT

In [None]:
# Build the lepton inputs: (Pt, Eta, SinPhi, CosPhi) for the first `nleptons` lepton slots.
lep_arrays = []
lep_vars = ["Pt", "Eta", "SinPhi", "CosPhi"]
nleptons = 2
for i in range(nleptons):
    # One row per event, one column per feature, for lepton slot i.
    pdf = pd.DataFrame(0, index=np.arange(nevents), columns=lep_vars)
    pdf["Pt"] = events.LeptonPt[i]
    pdf["Eta"] = events.LeptonEta[i]
    pdf["SinPhi"] = np.sin(events.LeptonPhi[i])
    pdf["CosPhi"] = np.cos(events.LeptonPhi[i])

    # (features, events) float32 block for this lepton slot.
    np_arr = pdf.values.T.astype(np.float32)
    lep_arrays.append(np_arr)

# (leptons, features, events) -> (events, leptons, features)
Leptons_data = np.transpose(np.transpose(lep_arrays, (1, 0, 2)))
# Feature 0 is "Pt"; the mask keeps lepton slots with Pt above 20.
Leptons_Pt = Leptons_data[:, :, 0]
Leptons_mask = Leptons_Pt > 20

In [None]:
# Build the tau inputs: (Pt, Eta, SinPhi, CosPhi) for the first `ntaus` tau slots.
tau_arrays = []
tau_vars = ["Pt", "Eta", "SinPhi", "CosPhi"]
ntaus = 2
for i in range(ntaus):
    # One row per event, one column per feature, for tau slot i.
    pdf = pd.DataFrame(0, index=np.arange(nevents), columns=tau_vars)
    pdf["Pt"] = events.tauPt[i]
    pdf["Eta"] = events.tauEta[i]
    pdf["SinPhi"] = np.sin(events.tauPhi[i])
    pdf["CosPhi"] = np.cos(events.tauPhi[i])

    # Fixed: this line read `pvalues.T...`, an undefined name that raised NameError.
    # The transposed frame values are what the other input cells use.
    np_arr = pdf.values.T.astype(np.float32)
    tau_arrays.append(np_arr)

# (taus, features, events) -> (events, taus, features)
Taus_data = np.transpose(np.transpose(tau_arrays, (1, 0, 2)))
# Feature 0 is "Pt"; the mask keeps tau slots with Pt above 20.
Taus_Pt = Taus_data[:, :, 0]
Taus_mask = Taus_Pt > 20

In [None]:
# MET input: a single feature (MET_pt) for a single "object" per event.
# NOTE(review): after the transpose the array is presumably (events, 1, 1) — confirm
# that events.MET_pt squeezes to one value per event.
met_arrays = [np.array([events.MET_pt.values.squeeze()])]
MET_data = np.transpose(met_arrays)
# The mask is True where the MET value is positive.
MET_mask = MET_data[:, :, 0] > 0

In [None]:
# HT input, built exactly like the MET input: one feature (ht) for one "object" per event.
# NOTE(review): presumably (events, 1, 1) after the transpose — confirm.
ht_arrays = [np.array([events.ht.values.squeeze()])]
HT_data = np.transpose(ht_arrays)
# The mask is True where the HT value is positive.
HT_mask = HT_data[:, :, 0] > 0

In [None]:
# Di-jet features for every unordered AK4 jet pair (i, j) with i < j, grouped by
# the lower jet index i.
Higgs_vars = ["mass", "pt", "eta", "sinphi", "cosphi", "dr"]
Jets_arrays = {}
for i in range(njets):
    Higgs_list = []
    # Only partners with a larger index, so each pair is built exactly once.
    for j in range(i + 1, njets):
        j_i, j_j = jets[:, i], jets[:, j]
        jj = j_i + j_j
        pdf = pd.DataFrame(
            {
                "mass": jj.mass,
                "pt": jj.pt,
                "eta": jj.eta,
                "sinphi": np.sin(jj.phi),
                "cosphi": np.cos(jj.phi),
                "dr": j_i.deltaR(j_j),
            },
            index=np.arange(nevents),
            columns=Higgs_vars,
        )
        # NaNs are zeroed; each entry is a (features, events) float32 block.
        Higgs_list.append(pdf.fillna(0).values.T.astype(np.float32))
    Jets_arrays[f"Jet{i}"] = Higgs_list

# The last jet has no higher-index partner, so only njets - 1 groups are kept.
Jet_data = {}
Jet_mask = {}
for i in range(njets - 1):
    # (pairs, features, events) -> (events, pairs, features)
    Jet_data[i] = np.transpose(Jets_arrays[f"Jet{i}"], (2, 0, 1))
    # Feature 0 is "mass" (see Higgs_vars), so this cut acts on the pair mass.
    Jet_mask[i] = Jet_data[i][:, :, 0] > 20

In [None]:
# Feed dictionary passed to `session.run`: object-level inputs first ...
input_dict = {
    "Jets_data": Jets_data,
    "Jets_mask": Jets_mask,
    "BoostedJets_data": BoostedJets_data,
    "BoostedJets_mask": BoostedJets_mask,
    "Leptons_data": Leptons_data,
    "Leptons_mask": Leptons_mask,
    "Taus_data": Taus_data,
    "Taus_mask": Taus_mask,
    "MET_data": MET_data,
    "MET_mask": MET_mask,
    "HT_data": HT_data,
    "HT_mask": HT_mask,
}
# ... then the nine pair inputs. Keys are 1-based ("Jet1" .. "Jet9") while
# Jet_data / Jet_mask are keyed 0 .. 8.
input_dict.update(
    (f"Jet{slot}_{kind}", store[slot - 1])
    for slot in range(1, 10)
    for kind, store in (("data", Jet_data), ("mask", Jet_mask))
)

In [None]:
output_nodes = session.get_outputs()
output_names = [node.name for node in output_nodes]
output_values = session.run(output_names, input_dict)

In [None]:
output_names

In [None]:
# 0-8: SPANET matching
# assignment probabilities
#  max_h1, index_h1: 0
#  max_h2, index_h2: 1
#  max_h3, index_h3: 2
# detection probabilities
#  h1Det: 6
#  h2Det: 7
#  h3Det: 8
# boosted assignment probabilities
#  bh1: 3
#  bh2: 4
#  bh3: 5
# boosted detection probabilities
#  9-11
# 12
#  0 ?
#  prob_hhh: 1
#  prob_qcd: 2
#  prob_tt: 3
#  prob_vjets: 4
#  prob_vv: 5
#  prob_hhh4b2tau: 6
#  prob_hh4b: 7
#  prob_hh2b2tau: 8
classification = output_values[12]

In [None]:
# Columns 1..8 of output 12 are the per-class scores, in this order.
(
    prob_hhh,
    prob_qcd,
    prob_tt,
    prob_vjets,
    prob_vv,
    prob_hhh4b2tau,
    prob_hh4b,
    prob_hh2b2tau,
) = (output_values[12][:, column] for column in range(1, 9))

In [None]:
# Histogram each class score of the classification head (40 bins on [0, 1]).
spanet_discr_axis = hist.axis.Regular(40, 0, 1, name="discr", label="SPANET Prob Discriminator")
class_axis = hist.axis.StrCategory([], name="class", growth=True)
h = hist.Hist(spanet_discr_axis, class_axis)
# Same fill order as the scores were extracted in.
for label, scores in (
    ("hhh", prob_hhh),
    ("qcd", prob_qcd),
    ("tt", prob_tt),
    ("vjets", prob_vjets),
    ("vv", prob_vv),
    ("hhh4b2tau", prob_hhh4b2tau),
    ("hh4b", prob_hh4b),
    ("hh2b2tau", prob_hh2b2tau),
):
    h.fill(scores, label)

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
# class -> (colour, line style)
style_by_class = {
    "hh4b": ("red", "solid"),
    "hhh": ("green", "dashed"),
    "hhh4b2tau": ("grey", "dashdot"),
    "hh2b2tau": ("black", "dashed"),
    "qcd": ("orange", "dashdot"),
    "tt": ("blue", "dotted"),
    "vv": ("teal", "dotted"),
    "vjets": ("violet", "dashed"),
}
linestyles = {name: style for name, (_, style) in style_by_class.items()}
color_by_prob = {name: colour for name, (colour, _) in style_by_class.items()}

legend_elements = []
for key in ["hhh", "qcd", "tt", "vv", "vjets", "hhh4b2tau", "hh2b2tau", "hh4b"]:
    colour, style = color_by_prob[key], linestyles[key]
    # Unit-normalised curve for this class, drawn on the current axes.
    hep.histplot(h[{"class": key}], density=True, lw=2, ls=style, color=colour)
    legend_elements.append(Line2D([0], [0], color=colour, lw=2, label=key, ls=style))
ax.legend(handles=legend_elements)
ax.set_ylabel("Density")
ax.set_yscale("log")
ax.set_title("HH4b sample")

In [None]:
session_assignment = onnxruntime.InferenceSession(
    f"{MAIN_DIR}/../data/spanet-inference/spanet_categorisation_v6.onnx", sess_options
)

In [None]:
output_nodes_assignment = session_assignment.get_outputs()
output_names_assignment = [node.name for node in output_nodes_assignment]
output_values_assignment = session_assignment.run(output_names_assignment, input_dict)

In [None]:
output_names_assignment

In [None]:
# Output 12 of the categorisation model: column 0 is the "0bh0h" score and
# columns 1..9 are the remaining categories in the order unpacked below.
(
    prob_3bh0h,
    prob_2bh1h,
    prob_1bh2h,
    prob_0bh3h,
    prob_2bh0h,
    prob_1bh1h,
    prob_0bh2h,
    prob_1bh0h,
    prob_0bh1h,
) = (output_values_assignment[12][:, column] for column in range(1, 10))
prob_0bh0h = output_values_assignment[12][:, 0]

In [None]:
input_dict["Jet9_data"][0]

In [None]:
np.argmax(np.stack(output_values_assignment[12], axis=1), axis=0)

In [None]:
prob_3bh0h

In [None]:
# Histogram every category score of the categorisation model (40 bins on [0, 1]).
spanet_discr_axis = hist.axis.Regular(40, 0, 1, name="discr", label="SPANET Assignment")
class_axis = hist.axis.StrCategory([], name="class", growth=True)
h = hist.Hist(spanet_discr_axis, class_axis)
for category, scores in (
    ("3bh0h", prob_3bh0h),
    ("2bh1h", prob_2bh1h),
    ("1bh2h", prob_1bh2h),
    ("0bh3h", prob_0bh3h),
    ("2bh0h", prob_2bh0h),
    ("1bh1h", prob_1bh1h),
    ("0bh2h", prob_0bh2h),
    ("1bh0h", prob_1bh0h),
    ("0bh1h", prob_0bh1h),
    ("0bh0h", prob_0bh0h),
):
    h.fill(scores, category)

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
# Category -> line colour; the insertion order is also the drawing order.
color_by_prob = {
    **dict.fromkeys(["3bh0h", "2bh1h", "1bh2h", "0bh3h"], "grey"),
    "2bh0h": "blue",
    "1bh1h": "red",
    "0bh2h": "green",
    **dict.fromkeys(["1bh0h", "0bh1h", "0bh0h"], "black"),
}
legend_elements = []
for key, colour in color_by_prob.items():
    hep.histplot(h[{"class": key}], density=True, lw=2, color=colour)
    legend_elements.append(Line2D([0], [0], color=colour, lw=2, label=key))
ax.legend(handles=legend_elements)
ax.set_ylabel("Density")
ax.set_yscale("log")
ax.set_title("HH4b sample")

In [None]:
input_dict_func.keys()

In [None]:
input_dict["Jets_data"][1]

In [None]:
# cross check w functions
from HH4b.matching_study import predict_spanet_hhh

input_dict_func = predict_spanet_hhh.build_inputs(events)
output_values_assignment_func = session_assignment.run(output_names_assignment, input_dict_func)

In [None]:
output_values_assignment_func[12][0]

In [None]:
output_values_assignment[12][0]

In [None]:
input_dict_func.keys()

In [None]:
input_dict.keys()

In [None]:
# Same column layout as for the hand-built inputs, but taken from the run that
# used the inputs produced by `predict_spanet_hhh.build_inputs`.
(
    prob_3bh0h_f,
    prob_2bh1h_f,
    prob_1bh2h_f,
    prob_0bh3h_f,
    prob_2bh0h_f,
    prob_1bh1h_f,
    prob_0bh2h_f,
    prob_1bh0h_f,
    prob_0bh1h_f,
) = (output_values_assignment_func[12][:, column] for column in range(1, 10))
prob_0bh0h_f = output_values_assignment_func[12][:, 0]

In [None]:
# Cross-check plot: category scores obtained from the function-built inputs,
# on the axes defined for the hand-built version.
h = hist.Hist(spanet_discr_axis, class_axis)
for category, scores in (
    ("3bh0h", prob_3bh0h_f),
    ("2bh1h", prob_2bh1h_f),
    ("1bh2h", prob_1bh2h_f),
    ("0bh3h", prob_0bh3h_f),
    ("2bh0h", prob_2bh0h_f),
    ("1bh1h", prob_1bh1h_f),
    ("0bh2h", prob_0bh2h_f),
    ("1bh0h", prob_1bh0h_f),
    ("0bh1h", prob_0bh1h_f),
    ("0bh0h", prob_0bh0h_f),
):
    h.fill(scores, category)

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
# Category -> line colour; the insertion order is also the drawing order.
color_by_prob = {
    **dict.fromkeys(["3bh0h", "2bh1h", "1bh2h", "0bh3h"], "grey"),
    "2bh0h": "blue",
    "1bh1h": "red",
    "0bh2h": "green",
    **dict.fromkeys(["1bh0h", "0bh1h", "0bh0h"], "black"),
}
legend_elements = []
for key, colour in color_by_prob.items():
    hep.histplot(h[{"class": key}], density=True, lw=2, color=colour)
    legend_elements.append(Line2D([0], [0], color=colour, lw=2, label=key))
ax.legend(handles=legend_elements)
ax.set_ylabel("Density")
ax.set_yscale("log")
ax.set_title("HH4b sample")

In [None]:
import awkward as ak

In [None]:
arr = np.triu(output_values[0][0][0:10, 0:10])
# np.argsort(arr.flatten())[::-1][:45]
arr = ak.from_numpy(arr)
# ak.argsort(arr, ascending=False, axis=1).to_numpy()
arr.to_numpy()

In [None]:
np.argsort(arr.to_numpy().flatten())[::-1][:45]

In [None]:
def get_maximas(assignment_prob):
    """
    Rank the AK4 jet pairings of one Higgs candidate by assignment probability.

    ``assignment_prob`` is indexed as ``[event, jet_a, jet_b]``. Only the leading
    ``njets`` x ``njets`` block is used, and only its upper triangle is kept so
    that a pairing and its mirror image are not both counted.

    Jet pairings are an int ``jet_a * njets + jet_b`` (position in the flattened
    block), e.g. 1 or 12, which for ``njets = 10`` should be converted to a
    two-character string:
    - 1 => 01 pairs
    - 12 => 12 pairs

    Returns
    -------
    max_indices : np.ndarray
        Per event, the ``njets * (njets - 1) / 2`` (= 45 for 10 jets) pairing
        codes with the largest assignment probability, best first.
    max_values : np.ndarray
        The assignment probabilities at ``max_indices``, same shape.
    """
    n_pairs = njets * (njets - 1) // 2
    # get njets*njets assignment probabilities
    # get upper triangle to avoid pairing repetitions
    pair_prob = np.triu(assignment_prob[:, 0:njets, 0:njets])
    arr_flat = ak.flatten(ak.from_numpy(pair_prob), axis=2)
    # sort pairings by maximum assignment probabilities
    max_indices = ak.argsort(arr_flat, ascending=False, axis=1).to_numpy()[:, :n_pairs]
    # Gather event by event. Indexing `arr_flat[max_indices]` with a plain NumPy
    # integer array selects whole events along axis 0 rather than the ranked
    # entries of each event.
    flat_np = pair_prob.reshape(pair_prob.shape[0], -1)
    max_values = np.take_along_axis(flat_np, max_indices, axis=1)
    return max_indices, max_values


# h1 - h3 assignment probability: ranked pairing codes (and their probabilities)
# from outputs 0, 1 and 2
index_h1, prob_h1 = get_maximas(output_values[0][:])
index_h2, prob_h2 = get_maximas(output_values[1][:])
index_h3, prob_h3 = get_maximas(output_values[2][:])
# stacked along a new axis 1: [event, higgs, rank]
hIndex = ak.from_numpy(np.stack([index_h1, index_h2, index_h3], axis=1))

# h1 - h3 detection probability (outputs 6, 7 and 8)
h1Det = output_values[6][:]
h2Det = output_values[7][:]
h3Det = output_values[8][:]
# NOTE(review): presumably one value per event in each output, giving (events, 3) — confirm
hDet = np.stack([h1Det, h2Det, h3Det]).T
# sort detection probability: per event, Higgs indices ordered from most to least detectable
hDetMax = ak.argsort(ak.from_numpy(hDet), ascending=False, axis=1)

In [None]:
# boosted h1 - h3 assignment probability (outputs 3, 4 and 5)
bh1 = output_values[3][:]
bh2 = output_values[4][:]
bh3 = output_values[5][:]

# boosted_higgs = find_boosted_higgs(bh1,bh2,bh3)
# SPANET creates assignment matrices keeping both AK4 and AK8 jets, so 10 + 3 entries.
# For the boosted assignment only the AK8 jets are wanted, hence only elements 10, 11, 12
# are used. An AK8 jet is assigned when its probability exceeds 0.5.
boosted_h1 = ak.from_regular(ak.from_numpy(bh1[:, 10:13])) > 0.5
# h2 may only take AK8 jets not already taken by h1; h3 only those taken by neither.
boosted_h2 = (ak.from_regular(ak.from_numpy(bh2[:, 10:13])) > 0.5) & ~boosted_h1
boosted_h3 = (ak.from_regular(ak.from_numpy(bh3[:, 10:13])) > 0.5) & (~boosted_h2) & (~boosted_h1)

In [None]:
boosted = np.stack([boosted_h1, boosted_h2, boosted_h3], axis=1)

In [None]:
boosted_h1.to_numpy()

In [None]:
boosted_h2.to_numpy()

In [None]:
boosted_h3.to_numpy()

In [None]:
positions, index = ak.where(boosted_h1)
index

In [None]:
positions, index = ak.where(boosted_h2)
index

In [None]:
positions, index = ak.where(boosted_h3)
index

In [None]:
higgs_reconstructed_index = ak.from_numpy(
    np.repeat([[0, 1, 2]], boosted.to_numpy().shape[0], axis=0)
)
higgs_reconstructed_index

In [None]:
h_index_1_char = np.char.mod("%02d", index_h1)
h_index_2_char = np.char.mod("%02d", index_h2)
h_index_3_char = np.char.mod("%02d", index_h3)
h_index_char = np.stack([h_index_1_char, h_index_2_char, h_index_3_char], axis=1)


def remove_elements_with_pd(h_index_char, selected_pairs):
    """Flag every candidate pairing that shares a jet with the selected pairing.

    Parameters
    ----------
    h_index_char : array of two-character strings, indexed ``[event, higgs, rank]``
        Candidate pairings; the first character is one jet index and the second
        character the other.
    selected_pairs : array of int, one entry per event
        Pairing already chosen for each event, encoded as a two-digit number.

    Returns
    -------
    np.ndarray of bool, indexed ``[event, higgs, rank]``
        True where the candidate pairing uses at least one jet of the selected
        pairing of that event.
    """
    pairs_pd = pd.DataFrame()
    pairs_pd["pairs_str"] = np.char.mod("%02d", selected_pairs)
    jet0 = pairs_pd["pairs_str"].str[0].astype(int)
    jet1 = pairs_pd["pairs_str"].str[1].astype(int)

    h_index_char = np.asarray(h_index_char)
    n_higgs, n_pairs = h_index_char.shape[1], h_index_char.shape[2]

    # just not smart enough to figure this out w/o a loop
    pairs_used = []
    for j in range(n_higgs):
        used_j = []
        for i in range(n_pairs):
            # Fixed: read the candidates of Higgs j. The earlier code always read
            # Higgs 0, so all three masks were computed from the same ranking.
            x = pd.Series(h_index_char[:, j, i]).astype(str)
            first = x.str[0].astype(int)
            second = x.str[1].astype(int)
            used = (first == jet0) | (second == jet0) | (first == jet1) | (second == jet1)
            used_j.append(used.values)
        # (rank, event) -> (event, rank)
        pairs_used.append(np.array(used_j).T)
    # (higgs, event, rank) -> (event, higgs, rank)
    return np.transpose(np.array(pairs_used), (1, 0, 2))


# Greedy selection: take the Higgs candidates in order of decreasing detection
# probability and, after each choice, mask every pairing that reuses one of its jets.

# get pairings of higgs with max detection probability
higgs_1 = hIndex[hDetMax[:, 0:1]]
# select the first pairs (sorted by assignment probability)
higgs_1_pairs = ak.flatten(higgs_1[:, :, 0]).to_numpy()

# get mask for pairings that are already in use
is_higgs_1_pair = remove_elements_with_pd(h_index_char, higgs_1_pairs)
hIndex_wo1 = ak.mask(hIndex, ~is_higgs_1_pair)

# get pairings of higgs with 2nd max detection probability
higgs_2 = hIndex_wo1[hDetMax[:, 1:2]]
# select the first pairs (that are not masked)
# NOTE(review): `.compressed()[0]` raises IndexError for an event in which every pairing is masked.
higgs_2_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_2])

# get mask for pairings that are already in use (by either of the first two choices)
is_higgs_2_pair = remove_elements_with_pd(h_index_char, higgs_2_pairs)
hIndex_wo2 = ak.mask(hIndex_wo1, (~is_higgs_2_pair) & (~is_higgs_1_pair))

# get pairings of higgs with 3rd max (i.e. lowest) detection probability
higgs_3 = hIndex_wo2[hDetMax[:, 2:3]]
higgs_3_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_3])

In [None]:
resolved_higgs = np.stack([higgs_1_pairs, higgs_2_pairs, higgs_3_pairs], axis=1)
resolved_higgs

In [None]:
higgs_reconstructed_index_fill = ak.where(boosted_h1, higgs_reconstructed_index, resolved_higgs)
higgs_reconstructed_index_fill

In [None]:
higgs_reconstructed_index_fill = ak.where(
    (boosted_h1 | boosted_h2 | boosted_h3), higgs_reconstructed_index, resolved_higgs
)
higgs_reconstructed_index_fill

In [None]:
# Per-slot kinematics: the AK8 fat-jet value where any boosted assignment is set.
# Elsewhere the entry is the resolved pairing code from `resolved_higgs`, which is a
# placeholder and not a physical quantity.
boosted_any_h = boosted_h1 | boosted_h2 | boosted_h3
higgs_jet_mass = ak.where(boosted_any_h, fatjets.mass, resolved_higgs)
# Fixed: pt / eta / phi were filled from fatjets.eta / fatjets.phi / fatjets.pt respectively.
higgs_jet_pt = ak.where(boosted_any_h, fatjets.pt, resolved_higgs)
higgs_jet_eta = ak.where(boosted_any_h, fatjets.eta, resolved_higgs)
higgs_jet_phi = ak.where(boosted_any_h, fatjets.phi, resolved_higgs)

In [None]:
is_boosted_1 = ak.any(boosted_h1, axis=1)
is_boosted_2 = ak.any(boosted_h2, axis=1)
is_boosted_3 = ak.any(boosted_h3, axis=1)
is_boosted_all = ak.any((boosted_h1 | boosted_h2 | boosted_h3), axis=1)
is_boosted_stack = boosted_h1 | boosted_h2 | boosted_h3

In [None]:
is_boosted_stack.to_numpy()

In [None]:
fatjets[pairs_pd["higgs_1_rec"]].mass

In [None]:
fatjets.mass.shape

In [None]:
fatjets.mass.shape

In [None]:
fatjets.mass

In [None]:
is_boosted_stack

In [None]:
fatjets.mass

In [None]:
higgs_reconstructed_index_fill

In [None]:
pairs_pd = pd.DataFrame()
for i in range(1, 4):
    pairs_pd[f"higgs_{i}_rec"] = higgs_reconstructed_index_fill[:, i - 1].to_numpy()
    pairs_pd[f"higgs_{i}_isboosted"] = is_boosted_stack[:, i - 1].to_numpy()
    pairs_pd[f"higgs_{i}_pairs_str"] = np.char.mod("%02d", resolved_higgs[:, i - 1])
    pairs_pd[f"higgs_{i}_jet0"] = pairs_pd[f"higgs_{i}_pairs_str"].str[0].astype(int)
    pairs_pd[f"higgs_{i}_jet1"] = pairs_pd[f"higgs_{i}_pairs_str"].str[1].astype(int)
    pairs_pd[f"higgs_{i}_jet_mass"] = higgs_jet_mass[:, i - 1]

pairs_pd

In [None]:
pairs_pd

In [None]:
# For each Higgs candidate, add the four-vectors of its two assigned AK4 jets
# event by event and store mass, eta, phi and pt of the sum.
for i in range(1, 4):
    first_idx = pairs_pd[f"higgs_{i}_jet0"]
    second_idx = pairs_pd[f"higgs_{i}_jet1"]
    jjs = [
        jets[event, a] + jets[event, b]
        for event, (a, b) in enumerate(zip(first_idx, second_idx))
    ]
    for quantity in ("mass", "eta", "phi", "pt"):
        pairs_pd[f"higgs_resolved_{i}_{quantity}"] = [getattr(jj, quantity) for jj in jjs]

In [None]:
(jets[3, 1] + jets[3, 4])

In [None]:
jets[3, 1]

In [None]:
jets[3, 4]

In [None]:
jet0_mass

In [None]:
jet1_mass

In [None]:
# Where a Higgs candidate is not boosted, replace its placeholder mass by the mass of
# the resolved di-jet system. (The same assignment used to be pasted three times.)
for i in range(1, 4):
    resolved = ~pairs_pd[f"higgs_{i}_isboosted"]
    pairs_pd.loc[resolved, f"higgs_{i}_jet_mass"] = pairs_pd[f"higgs_resolved_{i}_mass"]

# Show mass and boosted flag side by side for the three candidates.
columns = [
    column
    for higgs in range(1, 4)
    for column in (f"higgs_{higgs}_jet_mass", f"higgs_{higgs}_isboosted")
]

pairs_pd[columns]

In [None]:
spanet_higgsmass_axis = hist.axis.Regular(40, 0, 250, name="mass", label="SPANET Higgs mass")
higgs_axis = hist.axis.StrCategory([], name="higgs", growth=True)
h = hist.Hist(spanet_higgsmass_axis, higgs_axis)
h.fill(pairs_pd["higgs_1_jet_mass"], "h1")
h.fill(pairs_pd["higgs_2_jet_mass"], "h2")
h.fill(pairs_pd["higgs_3_jet_mass"], "h3")


fig, ax = plt.subplots(1, 1, figsize=(7, 5))
legend_elements = []
linestyle_by_class = {
    "h1": "solid",
    "h2": "dashed",
    "h3": "dotted",
}
color_by_class = {
    "h1": "gray",
    "h2": "gray",
    "h3": "gray",
}
for key in linestyle_by_class.keys():
    hep.histplot(
        h[{"higgs": key}], density=True, lw=2, ls=linestyle_by_class[key], color=color_by_class[key]
    )
    legend_elements.append(
        Line2D([0], [0], ls=linestyle_by_class[key], lw=2, label=key, color=color_by_class[key])
    )
ax.legend(handles=legend_elements)
ax.set_ylabel("Density")
ax.set_yscale("log")
ax.set_title("HH4b sample")

In [None]:
all_probs = np.stack(
    [
        prob_3bh0h,
        prob_2bh1h,
        prob_1bh2h,
        prob_0bh3h,
        prob_2bh0h,
        prob_1bh1h,
        prob_0bh2h,
        prob_1bh0h,
        prob_0bh1h,
        prob_0bh0h,
    ],
    axis=1,
)
all_probs

In [None]:
np.argmax(np.stack(output_values_assignment[12], axis=1), axis=0)

In [None]:
probs_str = [
    "3bh0h",
    "2bh1h",
    "1bh2h",
    "0bh3h",
    "2bh0h",
    "1bh1h",
    "0bh2h",
    "1bh0h",
    "0bh1h",
    "0bh0h",
]

In [None]:
max_probs = np.argmax(all_probs, axis=1)
max_probs.shape

In [None]:
max_probs

In [None]:
probs_axis = hist.axis.IntCategory(
    range(10), name="prob", label="SPANET max probability", growth=True
)
h = hist.Hist(probs_axis)
h.fill(max_probs)

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
hep.histplot(
    h,
    lw=2,
)
ax.set_ylabel("Events")
ax.set_yscale("log")
xticks = [i + 0.5 for i in range(10)]
ax.set_xticks(xticks, probs_str, size="small", rotation="vertical")
ax.set_title("HH4b sample")

In [None]:
h = hist.Hist(spanet_higgsmass_axis, higgs_axis, probs_axis)
h.fill(pairs_pd["higgs_1_jet_mass"], "h1", max_probs)
h.fill(pairs_pd["higgs_2_jet_mass"], "h2", max_probs)
h.fill(pairs_pd["higgs_3_jet_mass"], "h3", max_probs)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
linestyle_by_class = {
    "h1": "solid",
    "h2": "dashed",
    # "h3": "dotted",
}
color_by_class = {
    4: {
        "h1": "green",
        "h2": "green",
        # "h3": "green",
    },
    5: {
        "h1": "b",
        "h2": "b",
        # "h3": "b",
    },
}
for i, prob in enumerate([4, 5]):
    legend_elements = []
    for key in linestyle_by_class.keys():
        hep.histplot(
            h[{"higgs": key, "prob": prob}],
            density=True,
            lw=2,
            ax=ax[i],
            ls=linestyle_by_class[key],
            color=color_by_class[prob][key],
        )
        legend_elements.append(
            Line2D(
                [0],
                [0],
                ls=linestyle_by_class[key],
                lw=2,
                label=key,
                color=color_by_class[prob][key],
            )
        )
    ax[i].legend(handles=legend_elements)
    ax[i].set_ylabel("Density")
ax[0].set_title("Max prob 2bh0h")
ax[1].set_title("Max prob 1bh1h")

In [None]:
h[{"prob": 4}]

In [None]:
"""
SPANET gives a prediction for each Higgs separately, so effectively you can have overlaps between jets, say 
  H1: highest assignment prob = 02 (jet1 and jet3)
  H2: highest assignment prob = 12 (jet2 and jet3) 
This is an ambiguous case, so we need to select which Higgs to reconstruct first, then remove the overlaps.
For that, we use the highest detection probability.
Then, we would remove jet1 and jet3 from the possible jets for H2 and H3
"""


def remove_elements(all_pairs, selected_pairs):
    """Broadcast the selected pairing of each event to a (events, 3, 45) float array.

    ``all_pairs`` is only used for its number of events
    (``all_pairs.to_numpy().shape[0]``). Every one of the 3 x 45 entries of an
    event holds that event's value from ``selected_pairs``, so the result can be
    compared element-wise with the array of candidate pairings.
    """
    # one row per event, the selected pair repeated for all 45 possible pairings
    pairs_like = np.empty([selected_pairs.shape[0], 45])
    pairs_like[...] = np.asarray(selected_pairs).reshape(-1, 1)

    # same rows copied to each of the three Higgs slots
    is_pair = np.empty([all_pairs.to_numpy().shape[0], 3, 45])
    is_pair[...] = pairs_like[:, np.newaxis, :]
    return is_pair


# Earlier attempt at the greedy selection, masking by exact pairing value only.

# get pairings of higgs with max detection probability
higgs_1 = hIndex[hDetMax[:, 0:1]]
# select the first pairs (sorted by assignment probability)
higgs_1_pairs = ak.flatten(higgs_1[:, :, 0]).to_numpy()

# get mask for pairings that are already in use
is_higgs_1_pair = remove_elements(higgs_1, higgs_1_pairs)
hIndex_wo1 = ak.mask(hIndex, (hIndex != is_higgs_1_pair))

# get pairings of higgs with 2nd max detection probability
higgs_2 = hIndex_wo1[hDetMax[:, 1:2]]
# select the first pairs (that are not masked)
higgs_2_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_2])

# get mask for pairings that are already in use
is_higgs_2_pair = remove_elements(higgs_2, higgs_2_pairs)
hIndex_wo2 = ak.mask(hIndex_wo1, (hIndex_wo1 != is_higgs_2_pair))

# get pairings of higgs with 3rd max (i.e. lowest) detection probability
higgs_3 = hIndex_wo2[hDetMax[:, 2:3]]
higgs_3_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_3])

# This only sort of works: it removes exact pairing matches only. If pairing 12 is
# selected, every pairing containing jet 1 or jet 2 should be removed, not just 12 itself.
# So this is not it yet.

Using Pandas dataframe

In [None]:
h_index_1_char = np.char.mod("%02d", index_h1)
h_index_2_char = np.char.mod("%02d", index_h2)
h_index_3_char = np.char.mod("%02d", index_h3)
h_index_char = np.stack([h_index_1_char, h_index_2_char, h_index_3_char], axis=1)


def remove_elements_with_pd(h_index_char, selected_pairs):
    """Flag every candidate pairing that shares a jet with the selected pairing.

    Parameters
    ----------
    h_index_char : array of two-character strings, indexed ``[event, higgs, rank]``
        Candidate pairings; the first character is one jet index and the second
        character the other.
    selected_pairs : array of int, one entry per event
        Pairing already chosen for each event, encoded as a two-digit number.

    Returns
    -------
    np.ndarray of bool, indexed ``[event, higgs, rank]``
        True where the candidate pairing uses at least one jet of the selected
        pairing of that event.
    """
    pairs_pd = pd.DataFrame()
    pairs_pd["pairs_str"] = np.char.mod("%02d", selected_pairs)
    jet0 = pairs_pd["pairs_str"].str[0].astype(int)
    jet1 = pairs_pd["pairs_str"].str[1].astype(int)

    h_index_char = np.asarray(h_index_char)
    n_higgs, n_pairs = h_index_char.shape[1], h_index_char.shape[2]

    # just not smart enough to figure this out w/o a loop
    pairs_used = []
    for j in range(n_higgs):
        used_j = []
        for i in range(n_pairs):
            # Fixed: read the candidates of Higgs j. The earlier code always read
            # Higgs 0, so all three masks were computed from the same ranking.
            x = pd.Series(h_index_char[:, j, i]).astype(str)
            first = x.str[0].astype(int)
            second = x.str[1].astype(int)
            used = (first == jet0) | (second == jet0) | (first == jet1) | (second == jet1)
            used_j.append(used.values)
        # (rank, event) -> (event, rank)
        pairs_used.append(np.array(used_j).T)
    # (higgs, event, rank) -> (event, higgs, rank)
    return np.transpose(np.array(pairs_used), (1, 0, 2))


# Greedy selection: take the Higgs candidates in order of decreasing detection
# probability and, after each choice, mask every pairing that reuses one of its jets.

# get pairings of higgs with max detection probability
higgs_1 = hIndex[hDetMax[:, 0:1]]
# select the first pairs (sorted by assignment probability)
higgs_1_pairs = ak.flatten(higgs_1[:, :, 0]).to_numpy()

# get mask for pairings that are already in use
is_higgs_1_pair = remove_elements_with_pd(h_index_char, higgs_1_pairs)
hIndex_wo1 = ak.mask(hIndex, ~is_higgs_1_pair)

# get pairings of higgs with 2nd max detection probability
higgs_2 = hIndex_wo1[hDetMax[:, 1:2]]
# select the first pairs (that are not masked)
# NOTE(review): `.compressed()[0]` raises IndexError for an event in which every pairing is masked.
higgs_2_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_2])

# get mask for pairings that are already in use (by either of the first two choices)
is_higgs_2_pair = remove_elements_with_pd(h_index_char, higgs_2_pairs)
hIndex_wo2 = ak.mask(hIndex_wo1, (~is_higgs_2_pair) & (~is_higgs_1_pair))

# get pairings of higgs with 3rd max (i.e. lowest) detection probability
higgs_3 = hIndex_wo2[hDetMax[:, 2:3]]
higgs_3_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_3])

In [None]:
higgs_1_pairs

In [None]:
higgs_1_pairs.shape

In [None]:
higgs_2_pairs

In [None]:
higgs_3_pairs

Matching

In [None]:
# Split each selected pairing code into its two jet indices: the code is formatted
# as a two-character string, first character -> jet0, second character -> jet1.
pairs_pd = pd.DataFrame()
for higgs_label, pair_codes in (
    ("higgs_1", higgs_1_pairs),
    ("higgs_2", higgs_2_pairs),
    ("higgs_3", higgs_3_pairs),
):
    pairs_pd[f"{higgs_label}_pairs_str"] = np.char.mod("%02d", pair_codes)
    pairs_pd[f"{higgs_label}_jet0"] = pairs_pd[f"{higgs_label}_pairs_str"].str[0].astype(int)
    pairs_pd[f"{higgs_label}_jet1"] = pairs_pd[f"{higgs_label}_pairs_str"].str[1].astype(int)

# Plain NumPy copies of the jet indices of each candidate.
higgs_1_jet0, higgs_1_jet1 = pairs_pd["higgs_1_jet0"].values, pairs_pd["higgs_1_jet1"].values
higgs_2_jet0, higgs_2_jet1 = pairs_pd["higgs_2_jet0"].values, pairs_pd["higgs_2_jet1"].values
higgs_3_jet0, higgs_3_jet1 = pairs_pd["higgs_3_jet0"].values, pairs_pd["higgs_3_jet1"].values

In [None]:
boosted_index

Pandas playground

In [None]:
is_higgs_1_pair

In [None]:
higgs_1_pd = pd.DataFrame()
higgs_1_pd["pairs"] = higgs_1_pairs
higgs_1_pd["pairs_str"] = np.char.mod("%02d", higgs_1_pairs)
higgs_1_pd["jet0"] = higgs_1_pd["pairs_str"].str[0].astype(int)
higgs_1_pd["jet1"] = higgs_1_pd["pairs_str"].str[1].astype(int)
higgs_1_pd

In [None]:
h_index_1_char = np.char.mod("%02d", index_h1)
h_index_2_char = np.char.mod("%02d", index_h2)
h_index_3_char = np.char.mod("%02d", index_h3)
h_index_char = np.stack([h_index_1_char, h_index_2_char, h_index_3_char], axis=1)

In [None]:
h_index_char[:, 0][:, i]

In [None]:
# now create another hIndex that works
# index_noh1[j, event, rank] is True when that candidate pairing of Higgs j shares a jet
# with the pairing selected for the first Higgs (higgs_1_pd) in the same event.
index_noh1 = []
for j in range(3):
    remove = []
    for i in range(45):
        # Fixed: read the candidates of Higgs j; this used to read Higgs 0 for every j.
        x = pd.Series(h_index_char[:, j][:, i]).astype(str)
        # x.str.split('', expand=True, n=2).drop(columns=0).astype(int)
        to_remove = (
            (x.str[0].astype(int) == higgs_1_pd["jet0"])
            | (x.str[1].astype(int) == higgs_1_pd["jet0"])
            | (x.str[0].astype(int) == higgs_1_pd["jet1"])
            | (x.str[1].astype(int) == higgs_1_pd["jet1"])
        )
        remove.append(to_remove.values)
    # (rank, event) -> (event, rank)
    remove = np.array(remove).T
    index_noh1.append(remove)
index_noh1 = np.array(index_noh1)

In [None]:
hIndex.to_numpy().shape

In [None]:
index_h1removed = ak.mask(hIndex, (~np.transpose(index_noh1, (1, 0, 2))))

In [None]:
index_h1removed

In [None]:
ak.mask(h_index_2_char, ~remove_2)

higgs_2 = hIndex_wo1[hDetMax[:, 1:2]]
# select the first pairs (that are not masked)
higgs_2_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_2])

In [None]:
higgs_1_pd["jet0"]

In [None]:
index_h1.shape

In [None]:
hindex_1_pd.str.split("")

In [None]:
(hindex_2_pd[0].str[0].astype(int) == higgs_1_pd["jet0"])

In [None]:
higgs_1_pd["jet0"]

In [None]:
higgs_1_pd = pd.DataFrame()
higgs_1_pd["pairs"] = higgs_1_pairs
higgs_1_pd["pairs_str"] = np.char.mod("%02d", higgs_1_pairs)
higgs_1_pd["jet0"] = higgs_1_pd["pairs_str"].str[0].astype(int)
higgs_1_pd["jet1"] = higgs_1_pd["pairs_str"].str[1].astype(int)
higgs_1_pd

Playground

In [None]:
higgs_1_pairs

In [None]:
higgs_2_pairs

In [None]:
higgs_3_pairs

In [None]:
# all possible pairings
hIndex

In [None]:
hIndex[0].to_numpy()

In [None]:
hIndexStr = np.char.zfill(np.char.mod("%d", hIndex.to_numpy()), 2)

In [None]:
hIndexStr

In [None]:
higgs_1_pairs_like = np.empty([45, higgs_1_pairs.shape[0]])
higgs_1_pairs_like[:] = higgs_1_pairs
higgs_1_pairs_like = higgs_1_pairs_like.T
higgs_1_pairs_like

In [None]:
x = np.char.mod("%02d", higgs_1_pairs_like)
np.char.split(np.char.join(",", x), ",")

In [None]:
higgs_1_pairs_like_str = np.char.zfill(np.char.mod("%d", higgs_1_pairs_like), 2)
higgs_1_pairs_like_str

In [None]:
np.array([list(word) for word in higgs_1_pairs_like_str])

In [None]:
higgs_1_pairs_str = np.char.mod("%02d", higgs_1_pairs)

In [None]:
higgs_1_pairs_str_1 = (
    np.array([s[::-1] for s in higgs_1_pairs_str.tolist()]).astype("<U1").astype(str)
)
higgs_1_pairs_str_1

In [None]:
higgs_1_pairs_str_0 = higgs_1_pairs_str.astype("<U1").astype("int32").astype(str)
higgs_1_pairs_str_0

In [None]:
np.char.split(np.char.join(",", x), ",")

In [None]:
np.char.(higgs_1_pairs_str)

In [None]:
# get jet indices used
np.char.mod("%02d", higgs_1_pairs)

In [None]:
is_pair = np.empty([higgs_1.to_numpy().shape[0], 3, 45])
is_pair[:, 0, :] = higgs_1_pairs_like
is_pair[:, 1, :] = higgs_1_pairs_like
is_pair[:, 2, :] = higgs_1_pairs_like
is_pair = np.char.zfill(np.char.mod("%d", is_pair), 2)

In [None]:
hIndex[1].to_numpy()

In [None]:
# detection probability for each higgs
hDet

In [None]:
hDetMax[:, 0:1]

In [None]:
higgs_1 = hIndex[hDetMax[:, 0:1]]

In [None]:
hIndex[hDetMax[:, 1:2]][0].to_numpy()

In [None]:
hDetMax[:, 1:2]

In [None]:
hDetMax[:, 2:3]

In [None]:
# get the indices of the higgs with maximum detection probability
higgs_1 = hIndex[hDetMax[:, 0:1]]
higgs_1

In [None]:
higgs_1[:, :, 0]

In [None]:
higgs_1.to_numpy()[0]

In [None]:
higgs_1.to_numpy()[1]

In [None]:
# get the indices of the first pair of jets (for the Higgs with maximum detection probability)
higgs_1_selected = ak.flatten(higgs_1[:, :, 0])
higgs_1_selected

In [None]:
higgs_1_selected.to_numpy().shape

In [None]:
hIndex.to_numpy().shape

In [None]:
# remove higgs_1 pairings from hIndex
# e.g. need to remove 12 from hIndex[0]
# or remove 13 from hIndex[1]

# first build booleans where pairing matches

In [None]:
# need to get array of 45
b = np.empty([45, higgs_1_selected.to_numpy().shape[0]])
b[:] = h1
b.T.shape

In [None]:
# build boolean array
x = np.empty([higgs_1.to_numpy().shape[0], 3, 45])
x[:, 0, :] = b.T
x[:, 1, :] = b.T
x[:, 2, :] = b.T
x.shape

In [None]:
hIndex[0].to_numpy()

In [None]:
(hIndex == x)[0].to_numpy()

In [None]:
(hIndex == x)[1].to_numpy()

In [None]:
hIndex[ak.any((hIndex != x), axis=2)]

In [None]:
# now remove elements
hIndex_wo1 = ak.mask(hIndex, (hIndex != x))

higgs_2 = hIndex_wo1[hDetMax[:, 1:2]]

higgs_2_pairs = np.array([h.to_numpy().compressed()[0] for h in higgs_2])

In [None]:
ak.singletons(higgs_2[0])[:, 0]

In [None]:
higgs_2_pairs

In [None]:
ak.flatten(higgs_1[:, :, 0]).to_numpy()

In [None]:
ak.flatten(higgs_2[:, :, 0]).to_numpy()

In [None]:
hDetMax[:, 1:2]

In [None]:
hIndex_wo1[0]

In [None]:
higgs_2[0].to_numpy()

In [None]:
hwo1 = hIndex_wo1[hDetMax[:, 1:2]].to_numpy()
# hwo1 = hwo1[~hwo1.mask]
hwo1.shape

In [None]:
hwo1[:, 0, :].shape

In [None]:
hwo1[:, 0, :]

In [None]:
x = np.ma.array(np.arange(9).reshape(3, 3), mask=[[1, 0, 0], [1, 0, 0], [0, 0, 0]])
np.ma.compress_rows(x)

In [None]:
np.ma.compress_rowcols(hwo1[:, 0, :], 1)

In [None]:
hIndex_masked.shape

In [None]:
higgs_1