In [None]:
import os
import pandas as pd
import uproot
import numpy as np
import pickle
import vector

from HH4b.postprocessing import bb_assignment
from HH4b.utils import ShapeVar, CUT_MAX_VAL, load_samples
from HH4b.utils import get_feat, make_vector

In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

Load Run2 HH4b sample (v9_privatepfnano)

In [None]:
# relative prefix from which the data and plot locations below are built
MAIN_DIR = "../../../"
# input directory handed to load_samples (through sample_dirs)
path_to_dir = f"{MAIN_DIR}/../data/skimmer/23Nov16_v9_privatepfnano/"
# data-taking year: passed to load_samples and used as a sub-directory of the ROOT inputs
year = "2018"
# tag that only enters the plot output path
date = "24Feb2_2018"

plot_dir = f"{MAIN_DIR}/plots/PostProcessing/{date}/{year}"
# create the plot directory (and any missing parents) without spawning a shell;
# exist_ok=True makes re-running the cell a no-op
os.makedirs(plot_dir, exist_ok=True)

# short sample label -> list of dataset names to load for it
samples = {"hh4b": ["GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8"]}
# input directory -> samples found there; iterated below to call load_samples once per directory
sample_dirs = {path_to_dir: samples}

# Pre-selection handed to load_samples: one group per (mass branch, Xbb-tagged jet)
# combination, each requiring pT >= 300 and mass >= 50 on both fat jets plus
# Xbb >= 0.8 on one of them.
# NOTE(review): presumably the groups are OR-ed and the cuts inside a group AND-ed
# (parquet-style DNF filters) — confirm against load_samples.
filters = [
    [
        ("('ak8FatJetPt', '0')", ">=", 300),
        ("('ak8FatJetPt', '1')", ">=", 300),
        (f"('{mass_branch}', '0')", ">=", 50),
        (f"('{mass_branch}', '1')", ">=", 50),
        (f"('ak8FatJetPNetXbb', '{xbb_jet}')", ">=", 0.8),
    ]
    for mass_branch in ("ak8FatJetMsd", "ak8FatJetPNetMass")
    for xbb_jet in ("0", "1")
]

# branches to load, as (branch name, number of indexed entries to read)
load_columns = [
    ("run", 1),
    ("lumi", 1),
    ("event", 1),
    ("weight", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetQCDb", 2),
    ("ak8FatJetPNetQCDbb", 2),
    ("ak8FatJetPNetQCDothers", 2),
    ("ak8FatJetPNetXbb", 2),
    ("ak8FatJetTau3OverTau2", 2),
    ("GenHiggsPt", 2),
    ("GenHiggsEta", 2),
    ("GenHiggsPhi", 2),
]
# expand every entry into the "('column name', 'idx')" strings used to address
# the multi-index columns when reading
columns = [
    f"('{branch}', '{idx}')"
    for branch, n_entries in load_columns
    for idx in range(n_entries)
]

# single dictionary collecting the events of every sample from every input directory
events_dict = {}
for input_dir, dir_samples in sample_dirs.items():
    # load_samples reads the files (only the selected columns), applies the filters
    # and computes a weight per event
    loaded = load_samples(
        input_dir,
        dir_samples,
        year,
        filters=filters,
        columns_mc=columns,
    )
    events_dict.update(loaded)

bb_masks = bb_assignment(events_dict)

In [None]:
list(events_dict["hh4b"].columns)

Load Run2 HH4b sample (directly from old skimmer)

In [None]:
# From lxplus: /eos/cms/store/group/phys_susy/razor/Run2Analysis/HH/HHTo4BNtupler/20230207/option5/combined/BDT/2018/
# From lxplus: /eos/cms/store/group/phys_susy/razor/Run2Analysis/HH/HHTo4BNtupler/20211209_regression/option5/combined/BDT/2018/

# path_to_dir_run2 = f"{MAIN_DIR}/../data/skimmer/20230207_BDT/"
path_to_dir_run2 = f"{MAIN_DIR}/../data/skimmer/20211209_regression/"
samples_run2 = {
    "hh4b_run2": [
        "GluGluToHHTo4B_node_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8_1pb_weighted_Testing_BDTs.root"
    ],
}

# flat branches to read from the "Events" tree of the old-skimmer ntuple
columns = [
    "run",
    "luminosityBlock",
    "event",
    "fatJet1Pt",
    "fatJet1Eta",
    "fatJet1Phi",
    "fatJet1Mass",
    "fatJet1MassSD",
    "fatJet1PNetXbb",
    "fatJet1PNetQCDb",
    "fatJet1PNetQCDbb",
    "fatJet1PNetQCDothers",
    "fatJet1Tau3OverTau2",
    "fatJet2Pt",
    "fatJet2Eta",
    "fatJet2Phi",
    "fatJet2Mass",
    "fatJet2MassSD",
    "fatJet2PNetXbb",
    "fatJet2PNetQCDb",
    "fatJet2PNetQCDbb",
    "fatJet2PNetQCDothers",
    "fatJet2Tau3OverTau2",
    "fatJet1PtOverMHH",
    "fatJet2PtOverMHH",
    # "fatJet1MassSD_noJMS",
    "ptj2_over_ptj1",
    "hh_pt",
    "hh_eta",
    "hh_mass",
    "met",
    "genHiggs1Pt",
    "genHiggs1Eta",
    "genHiggs1Phi",
    "disc_qcd_and_ttbar_Run2_enhanced_v8p2",
]
for key, datasets in samples_run2.items():
    frames = []
    for dset in datasets:
        # the context manager closes the ROOT file once the branches are in memory
        with uproot.open(f"{path_to_dir_run2}/{year}/{dset}:Events") as tree:
            frames.append(tree.arrays(columns, library="pd"))
    if frames:
        # keep every dataset listed under a key instead of only the last one read
        pdf = pd.concat(frames, ignore_index=True)
        # rename to the "lumi" name used for the other samples in events_dict
        events_dict[key] = pdf.rename(columns={"luminosityBlock": "lumi"})

In [None]:
list(events_dict["hh4b_run2"].columns)

In [None]:
# pick one reference event (event number 877647) from the old-skimmer ntuple
pdf = events_dict["hh4b_run2"]
df_ev_run2 = pdf[pdf.event == 877647]

In [None]:
# pick the same reference event (event number 877647) from the sample read with load_samples
pdf = events_dict["hh4b"]
# ravel() always yields a 1-D boolean mask; squeeze() would collapse a one-row
# comparison to a 0-d scalar and break the row selection
df_ev = pdf[(pdf.event == 877647).to_numpy().ravel()]

In [None]:
df_ev["ak8FatJetPNetXbb"]

In [None]:
df_ev["ak8FatJetPt"]

In [None]:
df_ev_run2["fatJet1Pt"]

In [None]:
df_ev_run2["fatJet1PNetXbb"]

Load BDT from run2

In [None]:
# location of the pickled Run2 BDT model
_model_name = (
    f"{MAIN_DIR}/../data/model_xgboost_training_weights_qcd_and_ttbar_Run2_bdt_enhanced_v8p2.pkl"
)

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source
with open(_model_name, "rb") as pkl_file:
    model = pickle.load(pkl_file)

In [None]:
model.get_booster().feature_names

In [None]:
model.feature_importances_

In [None]:
variables = [
    # [ntuple branch name, BDT feature name, axis label, <3 numbers>]
    # NOTE(review): the last three entries look like histogram binning (nbins, min, max);
    # only the first entry of each row is used in this notebook — confirm before relying on them
    ["hh_pt", "hh_pt", "$p_{T}^{HH}$ (GeV)", 40, 0, 5000],
    # raw string: "\e" is not a valid escape sequence in a normal string literal
    ["hh_eta", "hh_eta", r"$\eta^{HH}$", 40, -5.0, 5.0],
    ["hh_mass", "hh_mass", "$m_{HH}$ (GeV)", 40, 0, 1500],
    ["met", "met", "$MET$ (GeV)", 60, 0, 600],
    ["fatJet1Tau3OverTau2", "fatJet1Tau3OverTau2", "fatJet1Tau3OverTau2", 50, 0.0, 1.0],
    ["fatJet2Tau3OverTau2", "fatJet2Tau3OverTau2", "fatJet2Tau3OverTau2", 50, 0.0, 1.0],
    ["fatJet1MassSD", "j1_mass_sd", "$M_{j1}$ (GeV)", 40, 0.0, 5000.0],
    ["fatJet1Pt", "j1_pt", "$p_{T}^{j1}$ (GeV)", 40, 0.0, 5000.0],
    ["fatJet1Eta", "j1_eta", r"$\eta^{j1}$", 40, -2.5, 2.5],
    ["fatJet1PNetXbb", "fatJet1PNetXbb", "fatJet1PNetXbb", 40, -100, 100],
    ["fatJet1PNetQCDb", "fatJet1PNetQCDb", "fatJet1PNetQCDb", 40, -100, 100],
    ["fatJet1PNetQCDbb", "fatJet1PNetQCDbb", "fatJet1PNetQCDbb", 40, -100, 100],
    ["fatJet1PNetQCDothers", "fatJet1PNetQCDothers", "fatJet1PNetQCDothers", 40, -100, 100],
    ["fatJet2Pt", "j2_pt", "$p_{T}^{j2}$ (GeV)", 40, 0.0, 500.0],
    ["fatJet1PtOverMHH", "ptj1Omhh", "$p_{T}^{j1}/m_{HH}$", 40, 0.0, 1.0],
    ["fatJet2PtOverMHH", "ptj2Omhh", "$p_{T}^{j2}/m_{HH}$", 40, 0.0, 0.7],
    ["ptj2_over_ptj1", "ptj2Optj1", "$p_{T}^{j2}/p_{T}^{j1}$", 40, 0.5, 1.0],
]
# branch names in the order in which the feature columns are fed to the model
var_names = [x[0] for x in variables]


def bdt_dataframe(key):
    """Build the BDT input features of one sample and attach the model prediction.

    The per-jet quantities are read with ``get_feat`` / ``make_vector`` using the
    ``bb_masks`` entry of the sample; the di-jet system is the sum of the two
    jet vectors built with the ``Msd`` mass.

    Args:
        key: name of the sample, used to index both ``events_dict`` and ``bb_masks``.

    Returns:
        A DataFrame holding the ``var_names`` columns (in that order) followed by
        ``bdt_prediction`` (second column of ``model.predict_proba``) and ``event``.
    """
    events = events_dict[key]
    bb_mask = bb_masks[key]
    events_bdt = pd.DataFrame()
    events_bdt["fatJet1Pt"] = get_feat(events, "bb0FatJetPt", bb_mask)
    events_bdt["fatJet1Eta"] = get_feat(events, "bb0FatJetEta", bb_mask)
    events_bdt["fatJet1Phi"] = get_feat(events, "bb0FatJetPhi", bb_mask)
    events_bdt["fatJet1Mass"] = get_feat(events, "bb0FatJetMsd", bb_mask)

    events_bdt["fatJet2Pt"] = get_feat(events, "bb1FatJetPt", bb_mask)
    events_bdt["fatJet2Eta"] = get_feat(events, "bb1FatJetEta", bb_mask)
    events_bdt["fatJet2Phi"] = get_feat(events, "bb1FatJetPhi", bb_mask)
    events_bdt["fatJet2Mass"] = get_feat(events, "bb1FatJetMsd", bb_mask)

    events_bdt["fatJet1PNetXbb"] = get_feat(events, "bb0FatJetPNetXbb", bb_mask)
    events_bdt["fatJet1PNetQCDb"] = get_feat(events, "bb0FatJetPNetQCDb", bb_mask)
    events_bdt["fatJet1PNetQCDbb"] = get_feat(events, "bb0FatJetPNetQCDbb", bb_mask)
    events_bdt["fatJet1PNetQCDothers"] = get_feat(events, "bb0FatJetPNetQCDothers", bb_mask)

    events_bdt["fatJet1MassSD"] = get_feat(events, "bb0FatJetMsd", bb_mask)

    # di-jet system from the two jet vectors
    h1 = make_vector(events, "bb0FatJet", bb_mask=bb_mask, mstring="Msd")
    h2 = make_vector(events, "bb1FatJet", bb_mask=bb_mask, mstring="Msd")
    hh = h1 + h2
    events_bdt["hh_pt"] = hh.pt
    events_bdt["hh_eta"] = hh.eta
    events_bdt["hh_mass"] = hh.mass

    events_bdt["met"] = get_feat(events, "MET_pt")
    events_bdt["fatJet1Tau3OverTau2"] = get_feat(events, "bb0FatJetTau3OverTau2", bb_mask)
    events_bdt["fatJet2Tau3OverTau2"] = get_feat(events, "bb1FatJetTau3OverTau2", bb_mask)
    events_bdt["fatJet1PtOverMHH"] = events_bdt["fatJet1Pt"] / (hh.mass)
    events_bdt["fatJet2PtOverMHH"] = events_bdt["fatJet2Pt"] / (hh.mass)
    events_bdt["ptj2_over_ptj1"] = events_bdt["fatJet2Pt"] / events_bdt["fatJet1Pt"]

    # keep only the model inputs, in var_names order; .copy() makes the selection an
    # independent frame so adding columns below does not raise SettingWithCopyWarning
    events_bdt = events_bdt[var_names].copy()
    # feature matrix for the model
    x_test = events_bdt.values
    # second column of predict_proba is used as the BDT score
    y_pred = model.predict_proba(x_test)[:, 1]

    events_bdt["bdt_prediction"] = y_pred
    events_bdt["event"] = get_feat(events, "event")

    return events_bdt


# same function w/o bb masks
def bdt_dataframe_nobb(key):
    """Build the BDT input features of one sample from flat ``bbFatJet*`` branches.

    Same output as ``bdt_dataframe`` but without a bb mask: the two jets are read
    directly from the ``bbFatJet<Var>0`` / ``bbFatJet<Var>1`` columns.

    Args:
        key: name of the sample in ``events_dict``.

    Returns:
        A DataFrame holding the ``var_names`` columns (in that order) followed by
        ``bdt_prediction`` (second column of ``model.predict_proba``) and ``event``.
    """
    events = events_dict[key]
    events_bdt = pd.DataFrame()
    events_bdt["fatJet1Pt"] = events["bbFatJetPt0"]
    events_bdt["fatJet1Eta"] = events["bbFatJetEta0"]
    events_bdt["fatJet1Phi"] = events["bbFatJetPhi0"]
    # note: the "Mass" columns are filled from the Msd branches
    events_bdt["fatJet1Mass"] = events["bbFatJetMsd0"]

    events_bdt["fatJet2Pt"] = events["bbFatJetPt1"]
    events_bdt["fatJet2Eta"] = events["bbFatJetEta1"]
    events_bdt["fatJet2Phi"] = events["bbFatJetPhi1"]
    events_bdt["fatJet2Mass"] = events["bbFatJetMsd1"]

    events_bdt["fatJet1PNetXbb"] = events["bbFatJetPNetXbb0"]
    events_bdt["fatJet1PNetQCDb"] = events["bbFatJetPNetQCDb0"]
    events_bdt["fatJet1PNetQCDbb"] = events["bbFatJetPNetQCDbb0"]
    events_bdt["fatJet1PNetQCDothers"] = events["bbFatJetPNetQCDothers0"]

    events_bdt["fatJet1MassSD"] = events["bbFatJetMsd0"]

    # di-jet system from the two jet four-vectors
    h1 = vector.array(
        {
            "pt": events_bdt["fatJet1Pt"],
            "phi": events_bdt["fatJet1Phi"],
            "eta": events_bdt["fatJet1Eta"],
            "M": events_bdt["fatJet1Mass"],
        }
    )
    h2 = vector.array(
        {
            "pt": events_bdt["fatJet2Pt"],
            "phi": events_bdt["fatJet2Phi"],
            "eta": events_bdt["fatJet2Eta"],
            "M": events_bdt["fatJet2Mass"],
        }
    )

    hh = h1 + h2
    events_bdt["hh_pt"] = hh.pt
    events_bdt["hh_eta"] = hh.eta
    events_bdt["hh_mass"] = hh.mass

    events_bdt["met"] = events["MET_pt"]
    events_bdt["fatJet1Tau3OverTau2"] = events["bbFatJetTau3OverTau20"]
    events_bdt["fatJet2Tau3OverTau2"] = events["bbFatJetTau3OverTau21"]
    events_bdt["fatJet1PtOverMHH"] = events_bdt["fatJet1Pt"] / (hh.mass)
    events_bdt["fatJet2PtOverMHH"] = events_bdt["fatJet2Pt"] / (hh.mass)
    events_bdt["ptj2_over_ptj1"] = events_bdt["fatJet2Pt"] / events_bdt["fatJet1Pt"]

    # keep only the model inputs, in var_names order; .copy() makes the selection an
    # independent frame so adding columns below does not raise SettingWithCopyWarning
    events_bdt = events_bdt[var_names].copy()
    # feature matrix for the model
    x_test = events_bdt.values
    # second column of predict_proba is used as the BDT score
    y_pred = model.predict_proba(x_test)[:, 1]

    events_bdt["bdt_prediction"] = y_pred
    events_bdt["event"] = get_feat(events, "event")

    return events_bdt


def bdt_dataframe_run2(key):
    """Build the BDT input features of an old-skimmer ntuple and attach the prediction.

    Jet quantities and the di-Higgs kinematics (``hh_pt``, ``hh_eta``, ``hh_mass``)
    are copied from the ntuple branches; the pT/m_HH and pT ratios are recomputed
    locally from a di-jet vector built with the ``MassSD`` branches.

    Args:
        key: name of the sample in ``events_dict``.

    Returns:
        A DataFrame holding the ``var_names`` columns (in that order) followed by
        ``bdt_prediction`` (second column of ``model.predict_proba``), ``event`` and
        the score stored in the ntuple (``disc_qcd_and_ttbar_Run2_enhanced_v8p2``).
    """
    events = events_dict[key]
    events_bdt = pd.DataFrame()
    events_bdt["fatJet1Pt"] = events["fatJet1Pt"]
    events_bdt["fatJet1Eta"] = events["fatJet1Eta"]
    events_bdt["fatJet1Phi"] = events["fatJet1Phi"]
    events_bdt["fatJet1Mass"] = events["fatJet1Mass"]

    events_bdt["fatJet2Pt"] = events["fatJet2Pt"]
    events_bdt["fatJet2Eta"] = events["fatJet2Eta"]
    events_bdt["fatJet2Phi"] = events["fatJet2Phi"]
    events_bdt["fatJet2Mass"] = events["fatJet2Mass"]

    events_bdt["fatJet1PNetXbb"] = events["fatJet1PNetXbb"]
    events_bdt["fatJet1PNetQCDb"] = events["fatJet1PNetQCDb"]
    events_bdt["fatJet1PNetQCDbb"] = events["fatJet1PNetQCDbb"]
    events_bdt["fatJet1PNetQCDothers"] = events["fatJet1PNetQCDothers"]

    # the "fatJet1MassSD_noJMS" branch was tried here as an alternative
    events_bdt["fatJet1MassSD"] = events["fatJet1MassSD"]

    h1 = vector.array(
        {
            "pt": events["fatJet1Pt"],
            "phi": events["fatJet1Phi"],
            "eta": events["fatJet1Eta"],
            "M": events["fatJet1MassSD"],
        }
    )
    h2 = vector.array(
        {
            "pt": events["fatJet2Pt"],
            "phi": events["fatJet2Phi"],
            "eta": events["fatJet2Eta"],
            "M": events["fatJet2MassSD"],
        }
    )

    # the locally built di-jet vector is only used for the pT/m_HH ratios below;
    # hh_pt, hh_eta and hh_mass themselves come from the ntuple branches
    hh = h1 + h2
    events_bdt["hh_pt"] = events["hh_pt"]
    events_bdt["hh_eta"] = events["hh_eta"]
    events_bdt["hh_mass"] = events["hh_mass"]

    events_bdt["met"] = events["met"]
    events_bdt["fatJet1Tau3OverTau2"] = events["fatJet1Tau3OverTau2"]
    events_bdt["fatJet2Tau3OverTau2"] = events["fatJet2Tau3OverTau2"]
    # ratios are recomputed here rather than read from the fatJet1PtOverMHH,
    # fatJet2PtOverMHH and ptj2_over_ptj1 branches of the ntuple
    events_bdt["fatJet1PtOverMHH"] = events_bdt["fatJet1Pt"] / (hh.mass)
    events_bdt["fatJet2PtOverMHH"] = events_bdt["fatJet2Pt"] / (hh.mass)
    events_bdt["ptj2_over_ptj1"] = events_bdt["fatJet2Pt"] / events_bdt["fatJet1Pt"]

    # keep only the model inputs, in var_names order; .copy() makes the selection an
    # independent frame so adding columns below does not raise SettingWithCopyWarning
    events_bdt = events_bdt[var_names].copy()
    # feature matrix for the model
    x_test = events_bdt.values
    # second column of predict_proba is used as the BDT score
    y_pred = model.predict_proba(x_test)[:, 1]

    events_bdt["bdt_prediction"] = y_pred
    events_bdt["event"] = get_feat(events, "event")

    # score already stored in the ntuple, kept for comparison with the local inference
    events_bdt["disc_qcd_and_ttbar_Run2_enhanced_v8p2"] = events[
        "disc_qcd_and_ttbar_Run2_enhanced_v8p2"
    ]

    return events_bdt

In [None]:
# BDT feature frames (with predictions), keyed by sample name
events_bdt_dict = {}
# NOTE(review): the "hh4b" entry is disabled; bdt_dataframe reads "MET_pt", which is not
# among the branches listed in load_columns — confirm before re-enabling
# events_bdt_dict["hh4b"] = bdt_dataframe("hh4b")
events_bdt_dict["hh4b_run2"] = bdt_dataframe_run2("hh4b_run2")

Test if inference in Run2 HH4b (skimmer) == Run2 HH4b (old skimmer)
- Tricky because kinematics don't agree

In [None]:
# "hh4b" is only present when the bdt_dataframe("hh4b") call is enabled; look it up
# without raising so the notebook still runs top to bottom on a fresh kernel
df_ev_bdt = events_bdt_dict.get("hh4b")
(
    df_ev_bdt[(df_ev_bdt.event == 877647)].bdt_prediction
    if df_ev_bdt is not None
    else "no 'hh4b' entry in events_bdt_dict"
)

In [None]:
df_ev_run2.disc_qcd_and_ttbar_Run2_enhanced_v8p2

Test if local inference on HH4b Run2 file is the same as saved in the skimmer ntuple

In [None]:
events_bdt_dict["hh4b_run2"].bdt_prediction

In [None]:
events_bdt_dict["hh4b_run2"].disc_qcd_and_ttbar_Run2_enhanced_v8p2

In [None]:
# it looks like there are very few differences
# NOTE: the comparison is an exact float (in)equality, with no tolerance
inference_bdt = events_bdt_dict["hh4b_run2"].bdt_prediction
saved_bdt = events_bdt_dict["hh4b_run2"].disc_qcd_and_ttbar_Run2_enhanced_v8p2
diff = inference_bdt != saved_bdt
same = inference_bdt == saved_bdt
# local predictions for the events where the two scores differ
inference_bdt[diff]

In [None]:
# fraction of events whose local prediction differs from the stored score
# with the exact same variables (0.08)
# with the make vector (0.09)
np.mean(diff)

Load new Run2 HH4b sample produced with updated skimmer, JECs and v9_private_hh

In [None]:
path_to_dir_run2_skim = f"{MAIN_DIR}/../data/skimmer/Feb7_v9hh/"
samples_run2_skim = {
    # "hh4b_run2_skim": ["GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8/nano_skim_0-30.root"],
    "hh4b_run2_skim": [
        "GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8/nano_skim_0-1.root"
    ],
}

# flat branches to read from the "Events" tree of the new-skimmer output
columns = [
    "run",
    "luminosityBlock",
    "event",
    "bbFatJetPt0",
    "bbFatJetEta0",
    "bbFatJetPhi0",
    "bbFatJetMass0",
    "bbFatJetMsd0",
    "bbFatJetPNetXbb0",
    "bbFatJetPNetQCDb0",
    "bbFatJetPNetQCDbb0",
    "bbFatJetPNetQCDothers0",
    "bbFatJetTau3OverTau20",
    "bbFatJetPt1",
    "bbFatJetEta1",
    "bbFatJetPhi1",
    "bbFatJetMass1",
    "bbFatJetMsd1",
    "bbFatJetPNetXbb1",
    "bbFatJetPNetQCDb1",
    "bbFatJetPNetQCDbb1",
    "bbFatJetPNetQCDothers1",
    "bbFatJetTau3OverTau21",
    "MET_pt",
]
for key, datasets in samples_run2_skim.items():
    frames = []
    for dset in datasets:
        # the context manager closes the ROOT file once the branches are in memory
        with uproot.open(f"{path_to_dir_run2_skim}/{year}/{dset}:Events") as tree:
            frames.append(tree.arrays(columns, library="pd"))
    if frames:
        # keep every dataset listed under a key instead of only the last one read
        df = pd.concat(frames, ignore_index=True)
        # rename to the "lumi" name used for the other samples in events_dict
        events_dict[key] = df.rename(columns={"luminosityBlock": "lumi"})

In [None]:
events_bdt_dict["hh4b_run2_skim"] = bdt_dataframe_nobb("hh4b_run2_skim")

Test if inference in Run 2 HH4b (new skimmer) == Run2 HH4b (old skimmer)

In [None]:
# old-skimmer frame: print the reference event's fatJet1 pT, Xbb score,
# local BDT prediction and the BDT score stored in the ntuple
df_ev_bdt = events_bdt_dict["hh4b_run2"]
for column in (
    "fatJet1Pt",
    "fatJet1PNetXbb",
    "bdt_prediction",
    "disc_qcd_and_ttbar_Run2_enhanced_v8p2",
):
    print(df_ev_bdt[(df_ev_bdt.event == 877647)][column])

In [None]:
# new-skimmer frame: print the reference event's fatJet1 pT, Xbb score and local BDT prediction
df_ev_bdt = events_bdt_dict["hh4b_run2_skim"]
for column in ("fatJet1Pt", "fatJet1PNetXbb", "bdt_prediction"):
    print(df_ev_bdt[(df_ev_bdt.event == 877647)][column])

get consistent masks:

    example of one event (447853) with 20230207_BDT; columns: variable, event, old-skimmer value, new-skimmer value
    hh_pt 447853 [75.29954] [74.99527821]
    hh_eta 447853 [3.8064182] [3.81099406]
    hh_mass 447853 [972.2381] [972.90843845]
    met 447853 [58.920475] [51.539936]
    fatJet1Tau3OverTau2 447853 [0.88720536] [0.88720536]
    fatJet2Tau3OverTau2 447853 [0.6964158] [0.69641578]
    fatJet1MassSD 447853 [117.] [117.]
    fatJet1Pt 447853 [427.17966] [427.55249023]
    fatJet1Eta 447853 [1.5109863] [1.51098633]
    fatJet1PNetXbb 447853 [0.99867326] [0.99861515]
    fatJet1PNetQCDb 447853 [6.6529974e-05] [6.6529974e-05]
    fatJet1PNetQCDbb 447853 [0.00126004] [0.00126004]
    fatJet1PNetQCDothers 447853 [5.3305876e-10] [5.33058764e-10]
    fatJet2Pt 447853 [501.32425] [501.38690186]
    fatJet1PtOverMHH 447853 [0.43986812] [0.4394581]
    fatJet2PtOverMHH 447853 [0.516215] [0.5153485]
    ptj2_over_ptj1 447853 [1.1735677] [1.17269087]
    bdt_prediction 447853 [0.8880559] [0.8857299]
    event 447853 [447853] [447853]

In [None]:
run2 = events_bdt_dict["hh4b_run2"]
run2_skim = events_bdt_dict["hh4b_run2_skim"]

# same pT requirement on both jets in both frames
run2 = run2[(run2.fatJet1Pt > 300) & (run2.fatJet2Pt > 300)]
run2_skim = run2_skim[(run2_skim.fatJet1Pt > 300) & (run2_skim.fatJet2Pt > 300)]

# for ev in run2.event:
for ev in [447853, 699941]:
    # select the event once per frame instead of once per printed column
    rows_run2 = run2[(run2.event == ev)]
    rows_run2_skim = run2_skim[(run2_skim.event == ev)]
    # printed as: column, event, old-skimmer value, new-skimmer value
    for key in run2_skim.keys():
        # for key in ["fatJet1Pt", "fatJet1PNetXbb", "bdt_prediction"]:
        print(key, ev, rows_run2[key].values, rows_run2_skim[key].values)
    print("\n")

In [None]:
import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

# mplhep style sheets applied to every figure below
hep.style.use(["CMS", "firamath"])

# NOTE(review): `formatter` is not referenced again in the cells shown here
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))
# global matplotlib overrides (font size, line width, grid appearance)
plt.rcParams.update({"font.size": 12})
plt.rcParams["lines.linewidth"] = 2
plt.rcParams["grid.color"] = "#CCCCCC"
plt.rcParams["grid.linewidth"] = 0.5
plt.rcParams["figure.edgecolor"] = "none"

# histogram axes: BDT score in [0, 1], Xbb score in [0.8, 1] (40 bins each),
# plus a growable string category used to separate the samples
bdt_axis = hist.axis.Regular(40, 0, 1, name="bdt", label=r"BDT")
xbb_axis = hist.axis.Regular(40, 0.8, 1, name="xbb", label=r"Xbb")
cat_axis = hist.axis.StrCategory([], name="cat", growth=True)

In [None]:
# one histogram per observable, with the sample name as category axis
h_bdt = hist.Hist(bdt_axis, cat_axis)
h_xbb = hist.Hist(xbb_axis, cat_axis)

keys = ["hh4b_run2_skim", "hh4b_run2"]
for key in keys:
    sample_bdt = events_bdt_dict[key]
    # require both fat jets above 300; an additional fatJet1PNetXbb > 0.8
    # requirement was tried and is left disabled
    mask = (sample_bdt.fatJet1Pt > 300) & (sample_bdt.fatJet2Pt > 300)
    selected = sample_bdt[mask]
    h_bdt.fill(bdt=selected["bdt_prediction"].to_numpy(), cat=key)
    h_xbb.fill(xbb=selected["fatJet1PNetXbb"].to_numpy(), cat=key)

In [None]:
# normalised BDT-score distributions of the two Run2 samples, overlaid on a log scale
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# (category, legend label, line colour); a third entry
# ("hh4b", "hh4b run3", "blue") was tried and is currently disabled
for cat, label, color in (
    ("hh4b_run2_skim", "hh4b v9 hh", "black"),
    ("hh4b_run2", "hh4b skimmer", "red"),
):
    hep.histplot(
        h_bdt[{"cat": cat}],
        ax=ax,
        label=label,
        histtype="step",
        linewidth=1,
        color=color,
        density=True,
    )
ax.set_yscale("log")
ax.legend()

In [None]:
# NOTE(review): this cell draws the same h_bdt overlay as the preceding plot cell
# normalised BDT-score distributions of the two Run2 samples, overlaid on a log scale
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# (category, legend label, line colour); a third entry
# ("hh4b", "hh4b run3", "blue") was tried and is currently disabled
for cat, label, color in (
    ("hh4b_run2_skim", "hh4b v9 hh", "black"),
    ("hh4b_run2", "hh4b skimmer", "red"),
):
    hep.histplot(
        h_bdt[{"cat": cat}],
        ax=ax,
        label=label,
        histtype="step",
        linewidth=1,
        color=color,
        density=True,
    )
ax.set_yscale("log")
ax.legend()

In [None]:
# normalised fatJet1 Xbb-score distributions of the two Run2 samples, overlaid on a log scale
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
# (category, legend label, line colour); a third entry
# ("hh4b", "hh4b run3", "blue") was tried and is currently disabled
for cat, label, color in (
    ("hh4b_run2_skim", "hh4b v9 hh", "black"),
    ("hh4b_run2", "hh4b skimmer", "red"),
):
    hep.histplot(
        h_xbb[{"cat": cat}],
        ax=ax,
        label=label,
        histtype="step",
        linewidth=1,
        color=color,
        density=True,
    )
ax.set_yscale("log")
ax.legend()

A couple of conclusions:
- The skimmer reproduces the Run2 ntuples reasonably well, except for a few differences:
  - mSD corrected by JMS/JMR disagrees
  - pT disagrees at the 0.1% level
  - MET disagrees at few % level
- BDT inference seems to work
- Not sure it is worth doing more checks event by event