In [1]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from argparse import Namespace
from HH4b.postprocessing.PostProcess import load_process_run3_samples

In [2]:
# Argument bundle passed to `load_process_run3_samples`, built in code instead
# of being parsed from the command line.
# TODO(review): `years` below lists only "2022", while the loading cell loops
# over several years and passes `year=` explicitly. Confirm whether the loader
# also reads `args.years` (i.e. are we only loading 2022 despite the loop?).
data_folder = "24May24_v12_private_signal"

arg_values = {
    # --- inputs: tags, location, years ---
    "templates_tag": "24June27",
    "data_dir": "/ceph/cms/store/user/cmantill/bbbb/skimmer/",
    "tag": data_folder,
    "years": ["2022"],
    "training_years": None,
    # --- mass variable and BDT model/config names ---
    "mass": "H2PNetMass",
    "bdt_model": "24May31_lr_0p02_md_8_AK4Away",
    "bdt_config": "24May31_lr_0p02_md_8_AK4Away",
    # --- working points and method ---
    "txbb_wps": [0.975, 0.92],
    "bdt_wps": [0.98, 0.88, 0.03],
    "method": "sideband",
    "vbf_txbb_wp": 0.95,
    "vbf_bdt_wp": 0.98,
    # --- signal samples and pt thresholds ---
    "sig_keys": ["hh4b", "vbfhh4b"],
    "pt_first": 300,
    "pt_second": 250,
    # --- switches ---
    "bdt_roc": False,
    "control_plots": False,
    "fom_scan": False,
    "fom_scan_bin1": True,
    "fom_scan_bin2": True,
    "fom_scan_vbf": False,
    "templates": False,
    "legacy": True,
    "vbf": True,
    "vbf_priority": False,
    "weight_ttbar_bdt": 1,
    "blind": True,
}

# Key order above is preserved by Namespace, so repr(args) is unchanged.
args = Namespace(**arg_values)

In [None]:
# Inputs for the per-year loading step.
bdt_training_keys = ["qcd", "vbfhh4b-k2v0", "hh4b", "ttbar"]
mass_window = np.array([105, 150])  # NOTE(review): presumably mass bounds; confirm units
years = ["2022", "2022EE", "2023", "2023BPix"]

# Keyword arguments that are identical for every year.
loader_kwargs = dict(
    bdt_training_keys=bdt_training_keys,
    control_plots=False,
    plot_dir="plot_dir",
    mass_window=mass_window,
)


def _load_events(era):
    """Run the loader for one year and keep only the first of its two return values."""
    events, _unused = load_process_run3_samples(args, year=era, **loader_kwargs)
    return events


# List of (year, events-dict) pairs, in the same order as `years`.
ev_dicts = [(era, _load_events(era)) for era in years]

"""
python3 PostProcess.py --templates-tag 24June27 --tag 24May24_v12_private_signal --mass H2PNetMass --legacy --bdt-config 24May31_lr_0p02_md_8_AK4Away --bdt-model 24May31_lr_0p02_md_8_AK4Away --txbb-wps 0.975 0.92 --bdt-wps 0.98 0.88 0.03 --vbf-txbb-wp 0.95 --vbf-bdt-wp 0.98 --no-bdt-roc --no-fom-scan --no-fom-scan-bin2 --no-fom-scan-bin1 --data-dir /ceph/cms/store/user/cmantill/bbbb/skimmer/ --method abcd --no-vbf-priority --vbf --no-fom-scan-vbf --pt-second 250 --templates --years 2022 --sig-keys hh4b vbfhh4b
"""
# Make an array holding the event number and the signal category.
# Build a mask from the signal-category cutoffs and apply it to that array.

# Save the result as a ROOT file.

In [None]:
import uproot
import pandas as pd
import os

# Columns copied from each sample's DataFrame into its event-list tree.
# NOTE: despite the name this is a list, not a dict.
eventlist_dict = [
    "event",
    "bdt_score",
    "bdt_score_vbf",
    "H2TXbb",
    "H2Msd",
    "run",
    "H2PNetMass",
    "luminosityBlock",
]
# Substrings selecting which samples are written. A sample is kept when any of
# these occurs in its key, so "vbfhh4b" is already covered by "hh4b" and is
# listed only for readability.
keys_to_save = ["hh4b", "vbfhh4b", "data"]

# Ensure the eventlist folder exists
eventlist_folder = "eventlist_files_2024Oct22"
os.makedirs(eventlist_folder, exist_ok=True)

# TODO: check how data is loaded (should reference the folder names somewhere)
# TODO: print eventlist_dict to see if you can get keys "data" and vbfhh4b
# Write one ROOT file per year, holding one tree per selected sample.
for year, ev_dict in ev_dicts:
    # Collect every selected sample first so the file is opened exactly once.
    trees = {}
    for key, tree_df in ev_dict.items():
        if not any(sample in key for sample in keys_to_save):
            continue
        print(key)
        event_list = tree_df[eventlist_dict]
        trees[key] = {col: event_list[col].to_numpy() for col in event_list.columns}

    if not trees:
        # Nothing selected for this year: do not create an empty file.
        continue

    file_path = f"{eventlist_folder}/eventlist_boostedHH4b_{year}.root"

    # Always start from a fresh file. The previous exists-then-update logic
    # wrote into whatever an earlier execution had left behind, so re-running
    # the cell was not idempotent and stale trees stayed in the file.
    with uproot.recreate(file_path) as file:
        for key, array_to_save in trees.items():
            file[key] = array_to_save

In [None]:
# Sanity check: read every per-year event-list file back in and print its trees.
years = ["2022", "2022EE", "2023", "2023BPix"]
dfs_from_root = {}

for year in years:
    with uproot.open(f"{eventlist_folder}/eventlist_boostedHH4b_{year}.root") as root_file:
        # One DataFrame per tree, keyed by (year, key name as reported by the file).
        dfs_from_root.update(
            {
                (year, tree_name): pd.DataFrame(root_file[tree_name].arrays(library="np"))
                for tree_name in root_file.keys()
            }
        )

# Print each DataFrame that was read back, labelled by year and tree.
for (year, key), df in dfs_from_root.items():
    print(f"DataFrame for year {year}, tree: '{key}':\n{df}\n")