# Study overlap

In [None]:
import uproot
import pandas as pd
import numpy as np

from HH4b.utils import load_samples, format_columns

import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
import hist
import matplotlib.pyplot as plt
import mplhep as hep

import matplotlib.ticker as mticker

# Activate the "CMS" and "firamath" mplhep styles.
hep.style.use(["CMS", "firamath"])

# Scalar tick formatter with math-text exponents outside 1e-3..1e3.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Notebook-wide matplotlib overrides, applied in one call.
plt.rcParams.update(
    {
        "font.size": 12,
        "lines.linewidth": 2,
        "grid.color": "#CCCCCC",
        "grid.linewidth": 0.5,
        "figure.edgecolor": "none",
    }
)
# NOTE(review): this style is applied after the overrides above, so any key it
# defines replaces the value just set -- confirm this ordering is intended.
plt.style.use(hep.style.CMS)

In [None]:
# Load the trained BDT used to score events in the boosted selection.
# Previous training, kept for reference:
# bdt_model_name = "v1_msd30_nomulticlass"
bdt_model_name = "v2_msd30_noovertraining"

# Create an empty classifier, then restore its state from the training directory.
bdt_model = XGBClassifier()
bdt_model.load_model(f"../boosted/bdt_trainings_run3/{bdt_model_name}/trained_bdt.model")

from resolved_config import bdt_dataframe_resolved

Load samples from the Resolved group

In [None]:
# Directory holding the input ROOT files.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# the sample loop further down refers to it by this name.
dir = "../../../../data/overlap/Main_PNet_MinDiag_w4j35_w2bj30_dHHjw30_withoutSyst_25April2024_2022_0L/mc/parts/"
# dir = "/Users/daniel/Documents/UCSD/Research/HH4b"

# Short sample label -> ROOT file name inside `dir`.
samples = {
    "hh4b": "GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_powheg-pythia8_tree.root",
}

# Branches read for the resolved selection.
columns_to_load_resolved = [
    # event-quality flags
    "passmetfilters",
    "passjetvetomap",
    # resolved (quad-jet) trigger decisions
    "passTrig_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    "passL1unprescaled_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    "passTrigObjMatching_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    # selection variables
    "avgbdisc_twoldgbdiscjets",
    "alljets_ht",
    "dHH_NbtagM",
    "dHH_H1_regmass",
    "dHH_H2_regmass",
    "event",
    # weights and scale factors
    "lumiwgt",  # luminosity in fb 26.6717 for 2022EE
    "xsecWeight",  #  xsec * 1000 / sum('genEventSumw'), xsec in pb
    "genWeight",
    "puWeight",
    "trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central",
    "btagSF_central",
    # boosted (AK8) trigger decisions
    "passTrig_HLT_AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
    "passTrig_HLT_AK8PFJet425_SoftDropMass40",
]

# The boosted selection reads every resolved branch plus the AK8 jet branches.
columns_to_load_boosted = [
    *columns_to_load_resolved,
    "n_ak8",
    "ak8_pt",
    "ak8_eta",
    "ak8_phi",
    "ak8_jetId",
    "ak8_msoftdrop",
    "ak8_mass",
    "ak8_tau3",
    "ak8_tau2",
    "ak8_Txbb",
    "ak8_PQCDb",
    "ak8_PQCDbb",
    "ak8_PQCDothers",
    "ak8_particleNet_mass",
    "pass_resolved_skim",  # trigger & >=4 jets with some pt cuts and >= 2 bjets above 30 GeV
    "pass_boosted_skim",  # trigger & >=2  tight AK8 jets with pT > 250 GeV and |eta|<2.4
    "met",
]

# Region naming reference:
#     2b: dHH_NbtagM == 2
#     4b: dHH_NbtagM == 4
#     asr_4b: ASR_4b
#     asr_2b: ASR_2b
#     acr_4b: ACR_4b
#     acr_2b: ACR_2b
#     vsr_4b: VSR_4b
#     vsr_2b: VSR_2b
#     vcr_4b: VCR_4b
#     vcr_2b: VCR_2b


def get_resolved_masks(df):
    """Build the boolean masks that define the resolved-analysis regions.

    As a side effect a ``dHiggsDeltaRegMass`` column is written to ``df``:
    the distance of (``dHH_H1_regmass``, ``dHH_H2_regmass``) from (125, 120).

    Args:
        df: DataFrame with the quad-jet trigger flags, ``passmetfilters``,
            ``passjetvetomap``, ``avgbdisc_twoldgbdiscjets``, ``alljets_ht``,
            ``dHH_NbtagM`` and the two ``dHH_H*_regmass`` columns.

    Returns:
        dict mapping region name to a row mask on ``df``:
            ``"RES"``     -- trigger, filters, avg. b-disc > 0.65 and HT > 0;
            ``"RES4b"``   -- ``"RES"`` and ``dHH_NbtagM == 4``;
            ``"RES4bSR"`` -- ``"RES4b"`` and ``dHiggsDeltaRegMass < 30``.
    """
    # Flags that must all be set for the baseline resolved selection.
    baseline_flags = (
        "passTrig_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
        "passL1unprescaled_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
        "passTrigObjMatching_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
        "passmetfilters",
        "passjetvetomap",
    )
    first_flag, *other_flags = baseline_flags
    baseline = df[first_flag]
    for flag in other_flags:
        baseline = baseline & df[flag]
    baseline = baseline & (df["avgbdisc_twoldgbdiscjets"] > 0.65) & (df["alljets_ht"] > 0)

    # Not implemented here (kept as a reminder of the fuller region scheme):
    #   AR_dHM = distance from (125, 120), VR_dHM = distance from (185, 182);
    #   ASR/VSR_4b: dHM < 30 with 4 b-tags, ACR/VCR_4b: 30 <= dHM < 55 with 4 b-tags.

    # Distance of the two regressed Higgs-candidate masses from (125, 120).
    h1_offset = df["dHH_H1_regmass"] - 125.0
    h2_offset = df["dHH_H2_regmass"] - 120.0
    df["dHiggsDeltaRegMass"] = np.sqrt((h1_offset * h1_offset) + (h2_offset * h2_offset))

    has_four_btags = df["dHH_NbtagM"] == 4
    in_mass_window = df["dHiggsDeltaRegMass"] < 30.0

    return {
        "RES": baseline,
        "RES4b": baseline & has_four_btags,
        "RES4bSR": baseline & in_mass_window & has_four_btags,
    }


# For every sample: read the tree once, evaluate the resolved selection on the
# event-level frame, evaluate the boosted selection on the AK8-jet frame, and
# tabulate yields/counts for the resolved, boosted and overlapping regions.
# NOTE(review): `df_yields` / `df_counts` are rebuilt on each iteration, so with
# more than one entry in `samples` only the last sample's tables survive.
for sample, sample_file in samples.items():
    print(f"{dir}/{sample_file}")
    tree = uproot.open(f"{dir}/{sample_file}:Events")
    print("Number of raw events: ", len(tree.arrays(["event"])["event"]))

    #########################################
    # Load resolved data as a pandas DataFrame
    pdf = tree.arrays(columns_to_load_resolved, library="pd")
    # Fixed: the resolved frame is `pdf`; `df` is not defined in this notebook.
    print("Number of resolved dataframe entries: ", len(pdf["event"]))

    # Weights
    pdf["resolved_weight"] = (
        pdf["lumiwgt"]
        * pdf["xsecWeight"]
        * pdf["genWeight"]
        * pdf["puWeight"]
        * pdf["trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central"]
        * pdf["btagSF_central"]
    )

    # Define resolved regions on the same frame the yields are read from,
    # so that every mask lines up with `pdf`.
    regions = get_resolved_masks(pdf)

    # Get resolved yields and counts
    resolved_yields = {
        region: [np.sum(pdf["resolved_weight"][region_mask])]
        for region, region_mask in regions.items()
    }
    resolved_counts = {
        region: int(pdf["event"][region_mask].shape[0]) for region, region_mask in regions.items()
    }

    #########################################
    # Load boosted data as a pandas DataFrame
    df_b = tree.arrays(columns_to_load_boosted, library="pd")
    # Ask for at least 2 ak8 jets in boosted pandas dataframe
    df_b = df_b[
        (df_b["n_ak8"] >= 2)
        & (df_b["passmetfilters"])
        & (df_b["passjetvetomap"])
        & (
            (df_b["passTrig_HLT_AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35"])
            | (df_b["passTrig_HLT_AK8PFJet425_SoftDropMass40"])
        )
    ].copy()
    print("Number of boosted dataframe entries: ", len(df_b["event"]))

    # Add weights
    df_b["boosted_weight"] = (
        df_b["lumiwgt"] * df_b["xsecWeight"] * df_b["genWeight"] * df_b["puWeight"]
    )
    df_b["resolved_weight"] = (
        df_b["lumiwgt"]
        * df_b["xsecWeight"]
        * df_b["genWeight"]
        * df_b["puWeight"]
        * df_b["trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central"]
        * df_b["btagSF_central"]
    )

    # Order jets by fatjet Xbb: sort each entry's rows by descending ak8_Txbb,
    # then relabel `subentry` so that 0 is the highest-Xbb jet of the entry.
    df_ak8 = df_b.reset_index()
    df_ak8 = df_ak8.sort_values(by=["entry", "ak8_Txbb"], ascending=[True, False]).set_index(
        ["entry", "subentry"]
    )
    subindex = df_ak8.sort_index().index.get_level_values(1)
    df_ak8 = df_ak8.reset_index()
    df_ak8["subentry"] = subindex
    df_ak8 = df_ak8.set_index(["entry", "subentry"])

    # For boosted, yields must be obtained for one of the entries.
    # Explicit copies: new columns are assigned to both frames below.
    jet0 = df_ak8.query("subentry == 0").copy()
    jet1 = df_ak8.query("subentry == 1").copy()

    # Perform BDT inference
    # NOTE(review): assumes bdt_dataframe_resolved returns one row per event,
    # in the same order as jet1 -- confirm against resolved_config.
    dtest = bdt_dataframe_resolved(df_ak8)
    jet1["bdt_score"] = bdt_model.predict_proba(dtest)[:, 1]

    # Define resolved regions for this dataframe
    resolved_regions = {
        f"BST-{region}": region_mask for region, region_mask in get_resolved_masks(jet0).items()
    }

    # NOTE!!!: jet0 and jet1 carry different (entry, subentry) index labels, so
    # the per-jet conditions are converted with .to_numpy() and combined by
    # position rather than by pandas index alignment.
    # Each region tightens the previous one, so the masks are built cumulatively.
    boosted_presel = (
        (jet0["ak8_pt"] >= 300).to_numpy()
        & (jet1["ak8_pt"] >= 300).to_numpy()
        & (jet0["ak8_msoftdrop"] >= 60).to_numpy()
        & (jet1["ak8_msoftdrop"] >= 60).to_numpy()
        & (jet0["ak8_Txbb"] >= 0.8).to_numpy()
    )
    boosted_xbb = boosted_presel & (jet1["ak8_Txbb"] >= 0.92).to_numpy()
    boosted_mass = (
        boosted_xbb
        & (jet1["ak8_msoftdrop"] >= 110).to_numpy()
        & (jet1["ak8_msoftdrop"] <= 140).to_numpy()
    )
    boosted_bdt = boosted_mass & (jet1["bdt_score"] >= 0.94).to_numpy()

    boosted_regions = {
        "BST30060-X0bb08": boosted_presel,
        "BST30060-X0bb08-Xbb092": boosted_xbb,
        "BST30060-X0bb08-Xbb092-mass": boosted_mass,
        "BST30060-X0bb08-Xbb092-mass-BDT094": boosted_bdt,
        # Alternative working points, currently disabled:
        # "BST30060-X0bb08-Xbb08": pre-selection & 0.8 <= jet1 ak8_Txbb < 0.92
        # "BST30060-X0bb08-Xbbless08": pre-selection & jet1 ak8_Txbb < 0.8
        # "BST25060-X0bb08": pre-selection with ak8_pt >= 250 (instead of 300) on both jets
    }

    # compute overlap: events passing RES4b (evaluated on jet0 rows) and each boosted region
    overlap_regions = {
        f"RES4b-{region}": (resolved_regions["BST-RES4b"] & region_mask)
        for region, region_mask in boosted_regions.items()
    }

    boosted_yields = {
        region: [np.sum(jet0["boosted_weight"][region_mask])]
        for region, region_mask in boosted_regions.items()
    }
    boosted_counts = {
        region: int(jet0["event"][region_mask].shape[0])
        for region, region_mask in boosted_regions.items()
    }

    overlap_yields = {
        region: [np.sum(jet0["resolved_weight"][region_mask])]
        for region, region_mask in overlap_regions.items()
    }
    overlap_counts = {
        region: int(jet0["event"][region_mask].shape[0])
        for region, region_mask in overlap_regions.items()
    }
    # xbb2: 0.92
    # xbb2: 0.8

    # make yields and  counts dataframe
    # (yield values are one-element lists, which gives the frame its single row)
    df_yields = pd.DataFrame(
        {
            "sample": sample,
            **resolved_yields,
            **boosted_yields,
            **overlap_yields,
        }
    )

    # Count values are scalars, so an explicit one-row index is required.
    df_counts = pd.DataFrame(
        {
            "sample": sample,
            "all": int(pdf["event"].shape[0]),  # fixed: was the undefined `df`
            **resolved_counts,
            **boosted_counts,
            **overlap_counts,
        },
        index=[0],
    )

In [None]:
# Print the weighted yields table (one column per region) as Markdown text.
print(df_yields.to_markdown())

In [None]:
# Print the unweighted event-count table (one column per region) as Markdown text.
print(df_counts.to_markdown())

In [None]:
# Legend text for each overlap region, keyed by the region name.
labels = {
    "RES4b-BST30060-X0bb08": "& Boosted Pre-sel",
    "RES4b-BST30060-X0bb08-Xbb092": r"& Boosted Pre-sel + Xbb$^1$ > 0.92",
    "RES4b-BST30060-X0bb08-Xbb092-mass": r"& Boosted Pre-sel + Xbb$^1$ > 0.92 + m$_{SD}^2$:[110-140] GeV",
    "RES4b-BST30060-X0bb08-Xbb092-mass-BDT094": r"& Boosted Pre-sel + Xbb$^1$ > 0.92 + m$_{SD}^2$:[110-140] GeV & BDT>0.94",
}

# Histogram axes shared by the plotting cells: 40 bins each, plus a growing
# string-category axis that receives one category per selection.
msd_axis = hist.axis.Regular(bins=40, start=0, stop=250, name="msd", label=r"m$_{SD}^2$ GeV")
cat_axis = hist.axis.StrCategory(categories=[], name="cat", growth=True)
xbb2_axis = hist.axis.Regular(bins=40, start=0.0, stop=1, name="xbb", label=r"Xbb Jet 2")
bdt_axis = hist.axis.Regular(bins=40, start=0.0, stop=1, name="bdt", label=r"BDT score")

In [None]:
# Histograms of the second jet's soft-drop mass, Xbb score and BDT score,
# with one category per overlap region plus an inclusive "all" category.
h_mass = hist.Hist(msd_axis, cat_axis)
h_xbb = hist.Hist(xbb2_axis, cat_axis)
h_bdt = hist.Hist(bdt_axis, cat_axis)

# Each histogram is paired with the jet1 column it displays.
# NOTE(review): values come from jet1 while weights come from jet0; this relies
# on both frames holding the same events in the same order.
for target, column in ((h_xbb, "ak8_Txbb"), (h_mass, "ak8_msoftdrop"), (h_bdt, "bdt_score")):
    target.fill(jet1[column], cat="all", weight=jet0["resolved_weight"])

for region, mask in overlap_regions.items():
    # Overlap masks are pandas Series; convert so they select rows by position.
    mask = mask.to_numpy()
    for target, column in ((h_xbb, "ak8_Txbb"), (h_mass, "ak8_msoftdrop"), (h_bdt, "bdt_score")):
        target.fill(jet1[column][mask], cat=region, weight=jet0["resolved_weight"][mask])

In [None]:
# One figure per variable, overlaying every overlap region as a step histogram.
for h in (h_xbb, h_mass, h_bdt):
    fig, ax = plt.subplots(figsize=(12, 8))
    # hep.histplot(h_mass[{"cat": "all"}], ax=ax, label="& Boosted Skim", histtype="step", linewidth=1)
    for region in overlap_regions:
        region_hist = h[{"cat": region}]
        hep.histplot(region_hist, ax=ax, label=labels[region], histtype="step", linewidth=1)
    # Anchor the legend outside the axes, to the right of the plot.
    ax.legend(
        title=r"Resolved Selection",
        loc="upper left",
        bbox_to_anchor=(1.05, 1),
        fontsize=15,
    )

In [None]:
# Same three histograms as for the overlap study, now with one category per
# boosted-only region plus an inclusive "all" category.
h_mass = hist.Hist(msd_axis, cat_axis)
h_xbb = hist.Hist(xbb2_axis, cat_axis)
h_bdt = hist.Hist(bdt_axis, cat_axis)

# NOTE(review): boosted regions are weighted with `resolved_weight` here, while
# a separate `boosted_weight` column exists -- confirm this is intended.
for target, column in ((h_xbb, "ak8_Txbb"), (h_mass, "ak8_msoftdrop"), (h_bdt, "bdt_score")):
    target.fill(jet1[column], cat="all", weight=jet0["resolved_weight"])

for region, mask in boosted_regions.items():
    print(region)
    # Boosted masks are already positional boolean arrays, so they index directly.
    for target, column in ((h_xbb, "ak8_Txbb"), (h_mass, "ak8_msoftdrop"), (h_bdt, "bdt_score")):
        target.fill(jet1[column][mask], cat=region, weight=jet0["resolved_weight"][mask])

# One figure per variable, overlaying every boosted region as a step histogram.
for h in (h_xbb, h_mass, h_bdt):
    fig, ax = plt.subplots(figsize=(12, 8))
    # hep.histplot(h_mass[{"cat": "all"}], ax=ax, label="& Boosted Skim", histtype="step", linewidth=1)
    for region in boosted_regions:
        region_hist = h[{"cat": region}]
        # The region name itself serves as the legend entry.
        hep.histplot(region_hist, ax=ax, label=f"{region}", histtype="step", linewidth=1)
    # Anchor the legend outside the axes, to the right of the plot.
    ax.legend(
        title=r"Boosted Selection",
        loc="upper left",
        bbox_to_anchor=(1.05, 1),
        fontsize=15,
    )