In [5]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from argparse import Namespace
from HH4b.postprocessing.PostProcess import load_process_run3_samples

In [None]:
# fix: are we only loading 2022 despite the loop?
data_folder = "24May24_v12_private_signal"
args = Namespace(
    templates_tag="24June27",
    data_dir="/ceph/cms/store/user/cmantill/bbbb/skimmer/",
    tag="24May24_v12_private_signal",
    years=["2022"],  # TODO: this line may be redundant, overridden later
    training_years=None,
    mass="H2PNetMass",
    bdt_model="24May31_lr_0p02_md_8_AK4Away",
    bdt_config="24May31_lr_0p02_md_8_AK4Away",
    txbb_wps=[0.975, 0.92],
    bdt_wps=[0.98, 0.88, 0.03],
    method="sideband",
    vbf_txbb_wp=0.95,
    vbf_bdt_wp=0.98,
    sig_keys=["hh4b", "vbfhh4b"],
    pt_first=300,
    pt_second=250,
    bdt_roc=False,
    control_plots=False,
    fom_scan=False,
    fom_scan_bin1=True,
    fom_scan_bin2=True,
    fom_scan_vbf=False,
    templates=False,
    legacy=True,
    vbf=True,
    vbf_priority=False,
    weight_ttbar_bdt=1,
    blind=True,
)

In [None]:
bdt_training_keys = ["qcd", "vbfhh4b-k2v0", "hh4b", "ttbar"]
mass_window = np.array([105, 150])
years = ["2022", "2022EE", "2023", "2023BPix"]

ev_dicts = []
for year in years:
    ev_dict, _ = load_process_run3_samples(
        args,
        year=year,
        bdt_training_keys=bdt_training_keys,
        control_plots=False,
        plot_dir="plot_dir",
        mass_window=mass_window,
    )
    ev_dicts.append((year, ev_dict))

"""
python3 PostProcess.py --templates-tag 24June27 --tag 24May24_v12_private_signal --mass H2PNetMass --legacy --bdt-config 24May31_lr_0p02_md_8_AK4Away --bdt-model 24May31_lr_0p02_md_8_AK4Away --txbb-wps 0.975 0.92 --bdt-wps 0.98 0.88 0.03 --vbf-txbb-wp 0.95 --vbf-bdt-wp 0.98 --no-bdt-roc --no-fom-scan --no-fom-scan-bin2 --no-fom-scan-bin1 --data-dir /ceph/cms/store/user/cmantill/bbbb/skimmer/ --method abcd --no-vbf-priority --vbf --no-fom-scan-vbf --pt-second 250 --templates --years 2022 --sig-keys hh4b vbfhh4b
"""
# make array with event nr, signal category
# make mask with signal category cutoffs, apply to array

# save as root

In [None]:
import uproot
import pandas as pd

# select columns to extract for eventlist
eventlist_dict = ["event", "bdt_score", "bdt_score_vbf", "H2TXbb", "H2Msd", "run", "H2PNetMass"]

# Loop over all years and save event lists for each year in separate root files
for year, ev_dict in ev_dicts:

    hh4b_df = ev_dict["hh4b"]
    event_list = hh4b_df[eventlist_dict]
    eventlist_folder = "eventlist_files"
    array_to_save = {col: event_list[col].to_numpy() for col in event_list.columns}

    with uproot.recreate(f"{eventlist_folder}/eventlist_boostedHH4b_{year}.root") as file:
        file["tree"] = array_to_save

In [6]:
# Double-check file contents
years = ["2022", "2022EE", "2023", "2023BPix"]
dfs_from_root = {}

for year in years:
    filename = f"eventlist_files/eventlist_boostedHH4b_{year}.root"
    with uproot.open(filename) as file:
        tree = file["tree"]
        arrays = tree.arrays(library="np")
        df = pd.DataFrame(arrays)

        # Make dict
        dfs_from_root[year] = df

test_df = dfs_from_root["2022"] - dfs_from_root["2022EE"]
print(test_df)
# Display df
for year, df in dfs_from_root.items():
    print(f"DataFrame for year {year}:\n{df}\n")

           event  bdt_score  bdt_score_vbf    H2TXbb      H2Msd  run  \
0       933278.0  -0.190691      -0.287247 -0.877972  10.437500  0.0   
1       933279.0   0.387595       0.172054  0.929077 -16.750000  0.0   
2       985809.0  -0.030619      -0.176729 -0.309961 -67.968750  0.0   
3       976389.0  -0.267515       0.785077 -0.206275   8.812500  0.0   
4       976378.0  -0.890878      -0.761103 -0.188850   4.888672  0.0   
...          ...        ...            ...       ...        ...  ...   
123020       NaN        NaN            NaN       NaN        NaN  NaN   
123021       NaN        NaN            NaN       NaN        NaN  NaN   
123022       NaN        NaN            NaN       NaN        NaN  NaN   
123023       NaN        NaN            NaN       NaN        NaN  NaN   
123024       NaN        NaN            NaN       NaN        NaN  NaN   

        H2PNetMass  
0        29.488575  
1        -5.171450  
2       -39.118672  
3       -10.132099  
4       -68.518494  
...      