# Study overlap

In [2]:
import uproot
import pandas as pd
import numpy as np
from HH4b.utils import load_samples, format_columns


Load samples from Resolved group

In [224]:
dir = "../../../../data/overlap/Main_PNet_MinDiag_w4j35_w2bj30_dHHjw30_withoutSyst_25April2024_2022_0L/mc/parts/"
samples = {
    "hh4b": "GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_powheg-pythia8_tree.root",
}

ak8_columns = [
    "ak8_pt",
    "ak8_eta",
    "ak8_phi",
    "ak8_jetId",
    "ak8_msoftdrop",
    "ak8_mass",
    "ak8_tau3",
    "ak8_tau2",
    "ak8_Txbb",
    "ak8_PQCDb",
    "ak8_PQCDbb",
    "ak8_PQCDothers",
    "ak8_particleNet_mass",
]

columns_to_load_resolved = [
    "passmetfilters",
    "passjetvetomap",
    "passTrig_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    "passL1unprescaled_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    "passTrigObjMatching_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65",
    "avgbdisc_twoldgbdiscjets",
    "alljets_ht",
    "dHH_NbtagM",
    "dHH_H1_regmass",
    "dHH_H2_regmass",
    "event",
    "lumiwgt", # luminosity in fb 26.6717 for 2022EE
    "xsecWeight", #  xsec * 1000 / sum('genEventSumw'), xsec in pb
    "genWeight",
    "puWeight",
    "trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central",
    "btagSF_central",
    "passTrig_HLT_AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
    "passTrig_HLT_AK8PFJet425_SoftDropMass40",
]
columns_to_load_boosted = columns_to_load_resolved + [
    "n_ak8",
    "ak8_pt",
    "ak8_eta",
    "ak8_phi",
    "ak8_jetId",
    "ak8_msoftdrop",
    "ak8_mass",
    "ak8_tau3",
    "ak8_tau2",
    "ak8_Txbb",
    "ak8_PQCDb",
    "ak8_PQCDbb",
    "ak8_PQCDothers",
    "ak8_particleNet_mass",
    "pass_resolved_skim", # trigger & >=4 jets with some pt cuts and >= 2 bjets above 30 GeV
    "pass_boosted_skim", # trigger & >=2  tight AK8 jets with pT > 250 GeV and |eta|<2.4
]

"""
    2b: dHH_NbtagM == 2
    4b: dHH_NbtagM == 4
    asr_4b: ASR_4b
    asr_2b: ASR_2b
    acr_4b: ACR_4b
    acr_2b: ACR_2b
    vsr_4b: VSR_4b
    vsr_2b: VSR_2b
    vcr_4b: VCR_4b
    vcr_2b: VCR_2b
"""

def get_resolved_masks(df):
    regions = {}
    regions["RES"] = (
        (df["passTrig_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65"]) &
        (df["passL1unprescaled_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65"]) &
        (df["passTrigObjMatching_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65"]) &
        (df["passmetfilters"]) &
        (df["passjetvetomap"]) &
        (df["avgbdisc_twoldgbdiscjets"] > 0.65) &
        (df["alljets_ht"] > 0)
    )

    # Calculate variables
    #df['AR_dHM'] = np.sqrt((df['dHH_H1_regmass'] - 125)**2 + (df['dHH_H2_regmass'] - 120)**2)
    #df['VR_dHM'] = np.sqrt((df['dHH_H1_regmass'] - 185)**2 + (df['dHH_H2_regmass'] - 182)**2)
    # Define additional regions based on these variables
    #df['ASR_4b'] = (df['AR_dHM'] < 30) & (df['dHH_NbtagM'] == 4)
    #df['ACR_4b'] = (df['AR_dHM'] >= 30) & (df['AR_dHM'] < 55) & (df['dHH_NbtagM'] == 4)
    #df['VSR_4b'] = (df['VR_dHM'] < 30) & (df['dHH_NbtagM'] == 4)
    #df['VCR_4b'] = (df['VR_dHM'] >= 30) & (df['VR_dHM'] < 55) & (df['dHH_NbtagM'] == 4)

    df["dHiggsDeltaRegMass"] = np.sqrt(
        ((df["dHH_H1_regmass"]-125.0)*(df["dHH_H1_regmass"]-125.0)) + 
        ((df["dHH_H2_regmass"]-120.0)*(df["dHH_H2_regmass"]-120.0))
    )

    regions = {
        **regions,
        "RES4b": (regions["RES"] & (df["dHH_NbtagM"] == 4)),
        "RES4bSR": (regions["RES"] & (df["dHiggsDeltaRegMass"] < 30.) & (df["dHH_NbtagM"] == 4)),
    }

    return regions

for sample, sample_file in samples.items():
    print(f"{dir}/{sample_file}")
    tree = uproot.open(f"{dir}/{sample_file}:Events")         
    print("Number of raw events: ", len(tree.arrays(["event"])["event"]))
    #########################################
    # Load resolved data as a pandas DataFrame
    df = tree.arrays(columns_to_load_resolved, library="pd")
    print("Number of resolved dataframe entries: ",len(df["event"]))

    # Weights
    df['resolved_weight'] = df['lumiwgt'] * df['xsecWeight'] * df['genWeight'] * df["puWeight"] * df['trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central'] * df["btagSF_central"]
    
    # Define resolved regions
    regions = get_resolved_masks(df)

    # Get resolved yields and counts
    resolved_yields = {region: [np.sum(df['resolved_weight'][region_mask])] for region, region_mask in regions.items()}
    resolved_counts = {region: int(df['event'][region_mask].shape[0]) for region, region_mask in regions.items()}

    #########################################
    # Load boosted data as a pandas DataFrame
    df_b = tree.arrays(columns_to_load_boosted, library="pd")
    # Ask for at least 2 ak8 jets in boosted pandas dataframe
    df_b = df_b[
        (df_b["n_ak8"]>=2) &
        (df_b["passmetfilters"]) & (df_b["passjetvetomap"]) &
        ( (df_b["passTrig_HLT_AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35"]) |
           (df_b["passTrig_HLT_AK8PFJet425_SoftDropMass40"]) )
    ].copy()
    print("Number of boosted dataframe entries: ",len(df_b["event"]))

    # Add weights
    df_b["boosted_weight"] = df_b['lumiwgt'] * df_b['xsecWeight'] * df_b['genWeight'] * df_b["puWeight"]
    df_b["resolved_weight"] = df_b['lumiwgt'] * df_b['xsecWeight'] * df_b['genWeight'] * df_b["puWeight"] * df_b['trgSF_HLT_QuadPFJet70_50_40_35_PFBTagParticleNet_2BTagSum0p65_central'] * df_b["btagSF_central"]

    # Order jets by fatjet Xbb
    df_ak8 = df_b.reset_index()
    df_ak8 = df_ak8.sort_values(by=['entry', 'ak8_Txbb'], ascending=[True, False]).set_index(['entry', 'subentry'])
    subindex = df_ak8.sort_index().index.get_level_values(1)
    df_ak8 = df_ak8.reset_index()
    df_ak8['subentry'] = subindex
    df_ak8 = df_ak8.set_index(['entry', 'subentry'])

    # For boosted, yields must be obtained for one of the entries
    jet0 = df_ak8.query("subentry == 0")
    jet1 = df_ak8.query("subentry == 1")

    # Define resolved regions for this dataframe
    resolved_regions = {
        **{f"BST-{region}": region_mask for region, region_mask in get_resolved_masks(jet0).items()}
    }

    # NOTE!!!: you must use .to_numpy() to get the masks with jet0 otherwise you cannot do an OR
    boosted_regions = {
        "BST30060-X0bb08": (
            (jet0['ak8_pt'] >= 300).to_numpy() & (jet1['ak8_pt'] >= 300).to_numpy() &
            (jet0['ak8_msoftdrop'] >= 60).to_numpy() & (jet1['ak8_msoftdrop'] >= 60).to_numpy() &
            (jet0['ak8_Txbb'] >= 0.8).to_numpy()
        ),
        "BST25060-X0bb08": (
            (jet0['ak8_pt'] >= 250).to_numpy() & (jet1['ak8_pt'] >= 250).to_numpy() &
            (jet0['ak8_msoftdrop'] >= 60).to_numpy() & (jet1['ak8_msoftdrop'] >= 60).to_numpy() &
            (jet0['ak8_Txbb'] >= 0.8).to_numpy()
        ),
    }


    # compute overlap
    overlap_regions = {
        "RES4b-BST30060-X0bb08": (resolved_regions["BST-RES4b"] & boosted_regions["BST30060-X0bb08"]),
    }

    boosted_yields = {region: [np.sum(jet0['boosted_weight'][region_mask])] for region, region_mask in boosted_regions.items()}
    boosted_counts = {region: int(jet0['event'][region_mask].shape[0]) for region, region_mask in boosted_regions.items()}

    overlap_yields = {region: [np.sum(jet0['resolved_weight'][region_mask])] for region, region_mask in overlap_regions.items()}
    overlap_counts = {region: int(jet0['event'][region_mask].shape[0]) for region, region_mask in overlap_regions.items()}
    
    # make yields and  counts dataframe
    df_yields = pd.DataFrame({
        "sample": key,
        **resolved_yields,
        **boosted_yields,
        **overlap_yields,
    })
    
    df_counts = pd.DataFrame({
        "sample": key,
        "all": int(df['event'].shape[0]),
        **resolved_counts,
        **boosted_counts,
        **overlap_counts,
    }, index=[0])

../../../../data/overlap/Main_PNet_MinDiag_w4j35_w2bj30_dHHjw30_withoutSyst_25April2024_2022_0L/mc/parts//GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV_powheg-pythia8_tree.root
Number of raw events:  2377354
Number of resolved dataframe entries:  2377354
Number of boosted dataframe entries:  571933


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dHiggsDeltaRegMass"] = np.sqrt(


In [221]:
print(df_yields.to_markdown())

|    | sample           |     RES |   RES4b |   RES4bSR |   BST30060 |   RES4b-BST30060 |
|---:|:-----------------|--------:|--------:|----------:|-----------:|-----------------:|
|  0 | hh4b_v12_private | 68.2276 | 19.9474 |   13.2769 |     3.0976 |          1.22037 |


In [222]:
print(df_counts.to_markdown())

|    | sample           |     all |     RES |   RES4b |   RES4bSR |   BST30060 |   RES4b-BST30060 |
|---:|:-----------------|--------:|--------:|--------:|----------:|-----------:|-----------------:|
|  0 | hh4b_v12_private | 2377354 | 1741977 |  559983 |    385524 |      85355 |            39097 |


In [217]:
np.any(df['genWeight'] < 0)

True

Cross check yield with boosted ntuples

In [214]:
year = "2022EE"

sample_dirs = {
    f"../../../../data/skimmer/24Apr23LegacyLowerThresholds_v12_private_signal/": {
        "hh4b_v12_private": ["GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV"],
    },
}

load_columns = [
    ("bbFatJetPNetTXbbLegacy", 2),
    ("bbFatJetPNetMassLegacy", 2),
    ("bbFatJetPNetTXbb", 2),
    ("bbFatJetPNetMass", 2),  
    ("bbFatJetMsd", 2), 
    ("bbFatJetPt", 2),
    ("weight", 1),    
    ("trigger_sf", 1),
    ("AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35", 1),
    ("AK8PFJet425_SoftDropMass40", 1),
]

events_dict = {}
for input_dir, samples_dict in sample_dirs.items():
    print(input_dir)
    events_dict = {
        **events_dict,
        # this function will load files (only the columns selected), apply filters and compute a weight per event
        **load_samples(
            input_dir,
            samples_dict,
            year,
            variations=False,
            columns=format_columns(load_columns),
            reorder_legacy_txbb=True,
        ),
    }


for key in ["hh4b_v12_private"]:
    print(key)
    events = events_dict[key]

    boosted_mask = (
        (events["bbFatJetPt"][0] >300) &
        (events["bbFatJetPt"][1] >300) &
        (events["bbFatJetPNetTXbb"][0] >0.8) &
        (events["bbFatJetMsd"][0] >60) &
        (events["bbFatJetMsd"][1] >60) &
        np.any([events["AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35"], events["AK8PFJet425_SoftDropMass40"]])
    )
    print(np.sum(events["finalWeight"][boosted_mask]), len(events["finalWeight"][boosted_mask]))

../../../../data/skimmer/24Apr23LegacyLowerThresholds_v12_private_signal/
Loaded GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV: 169191 entries
hh4b_v12_private
2.935299445451215 82763


In [220]:
events['weight']

Unnamed: 0,0
0,505.399161
1,155.528530
2,134.532464
3,-399.780456
4,532.347110
...,...
169186,-219.166446
169187,399.780456
169188,505.399161
169189,437.040691
