In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from HH4b import utils
import itertools
import correctionlib

In [None]:
# Eras processed one by one in this notebook.
# Full list, for reference: ["2022", "2022EE", "2023", "2023BPix", "2024", "2025"]
YEARS = ["2024", "2025"]

# Combined-period label -> list of eras merged under that label.
YEARS_COMBINED_DICT = {
    # "2022All": ["2022", "2022EE"],
    # "2023All": ["2023", "2023BPix"],
    "2024All": ["2024"],
    "2025All": ["2025"],
}

# Production tag; it names the input, cache and plot sub-directories below.
tag = "nanov15_20251202_v15_signal"

# Input ntuples and the pickle cache both live under the shared storage area.
STORAGE_PROJ_DIR = Path("/ceph/cms/store/user/zichun/bbbb")
DATA_DIR = STORAGE_PROJ_DIR / "skimmer" / tag
PROCESSED_DIR = STORAGE_PROJ_DIR / "signal_processed" / "skimmer" / tag
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Plots are written relative to the notebook's working directory.
PLOT_DIR = Path("bbbbskimmer_plots") / tag
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# if True, reprocess from the skimmed ntuples
REPROCESS: bool = False
APPLY_TRIGGER_SF: bool = True

# Analysis sample key -> list of sample names handed to utils.load_samples
# for that key (referenced from hh_vars).
SAMPLES_DICT = {
    "data": ["JetMET"],
    "hh4b": ["GluGlutoHHto4B"],
    "vbfhh4b": ["VBFHHto4B_CV_1_C2V_1_C3_1"],
    "vbfhh4b-k2v0": ["VBFHHto4B_CV_1_C2V_0_C3_1"],
    # QCD is split into HT slices; only the slice label differs between names.
    "qcd": [
        f"QCD_HT-{ht_slice}"
        for ht_slice in (
            # "200to400",
            "400to600",
            "600to800",
            "800to1000",
            "1000to1200",
            "1200to1500",
            "1500to2000",
            "2000",
        )
    ],
    "ttbar": ["TTto4Q", "TTtoLNu2Q", "TTto2L2Nu"],
    # All VH names share the same "_M-125" suffix.
    "vhtobb": [
        f"{production}_M-125"
        for production in (
            "WplusH_Hto2B_Wto2Q",
            "WminusH_Hto2B_Wto2Q",
            "ZH_Hto2B_Zto2Q",
            "ggZH_Hto2B_Zto2Q",
        )
    ],
    "novhhtobb": ["GluGluHto2B_PT-200_M-125", "VBFHto2B_M-125"],
    "tthtobb": ["ttHto2B_M-125"],
    "zz": ["ZZ"],
    "nozzdiboson": ["WW", "WZ"],
    "vjets": ["Wto2Q-2Jets_Bin-PTQQ", "Zto2Q-2Jets_PTQQ"],
}

# Every key other than "data" counts as a simulated (MC) sample.
MC_SAMPLES_LIST = [key for key in SAMPLES_DICT if key != "data"]

In [None]:
from HH4b.hh_vars import LUMI

# Integrated luminosity per era in fb-1.
lumis_fb = {
    "2022": 7.966932936,
    "2022EE": 26.640878954,
    "2023": 18.062658998,
    "2023BPix": 9.692645994,
    "2024": 107.125771309,
    "2025": (20780.0 + 14000.0 + 25290.0 + 30350.0) / 1000,  # projected lumi in fb-1
}

# A combined period's luminosity is the sum over the eras it merges.
# (Plain left-to-right accumulation, so the float result does not depend on
# how the interpreter implements sum().)
for combined_year, eras in YEARS_COMBINED_DICT.items():
    running_total = 0
    for era in eras:
        running_total = running_total + lumis_fb[era]
    lumis_fb[combined_year] = running_total

# convert fb-1 to pb-1
actual_lumis = {label: lumi_fb * 1000 for label, lumi_fb in lumis_fb.items()}

# Override the imported LUMI table with the values above (same keys, pb-1).
LUMI.update(actual_lumis)
print("Actual lumis (pb-1):", actual_lumis)

In [None]:
# Columns to load from the ntuples.
# Column requests are (name, n) tuples that are later passed through
# utils.format_columns; n is presumably the number of entries per event
# (2 for the two fat jets, 1 for scalars) -- TODO confirm against utils.
sys_vars = ["FSRPartonShower", "ISRPartonShower", "pileup"]
weight_shifts = [*sys_vars, "pdf_weights", "scale_weights"]

fatjet_vars = [
    "bbFatJetPt",
    "bbFatJetEta",
    "bbFatJetPhi",
    "bbFatJetMsd",
    "bbFatJetPNetMassLegacy",
    "bbFatJetParT3massGeneric",
    "bbFatJetParT3massX2p",
    "bbFatJetParT3TXbb",
]
mass_vars = [
    "bbFatJetParT3massGeneric",
    "bbFatJetParT3massX2p",
]

# Jet pT variations (currently disabled). To re-enable:
# pt_variations = [
#     f"bbFatJetPt_{jesr}_{ud}"
#     for jesr, ud in itertools.product(["JES", "JER"], ["up", "down"])
# ]
pt_variations = []

# Jet mass variations (currently disabled). To re-enable:
# mass_variations = [
#     f"{var}_{jmsr}_{ud}"
#     for jmsr, ud in itertools.product(["JMS", "JMR"], ["up", "down"])
#     for var in mass_vars
# ]
mass_variations = []


# Always loaded: every fat-jet variable plus the event weight.
base_columns = [*((name, 2) for name in fatjet_vars), ("weight", 1)]

load_columns_pt_var = [(name, 2) for name in pt_variations]
load_columns_mass_var = [(name, 2) for name in mass_variations]

# One Up and one Down weight column per systematic, grouped by systematic.
load_weight_shifts = [
    (f"weight_{syst}{direction}", 1) for syst in sys_vars for direction in ("Up", "Down")
]

MC_common_extra_columns = [*load_columns_mass_var, *load_columns_pt_var, *load_weight_shifts]
MC_common_extra_columns = []  # TODO: disable variations for now

# Extra column requests per sample key, on top of the base and trigger columns.
# Data gets none; every MC key maps to the one shared MC_common_extra_columns
# list object.
extra_columns_dict = {
    "data": [],
    **dict.fromkeys(MC_SAMPLES_LIST, MC_common_extra_columns),
}
extra_columns_dict

In [None]:
# Trigger names per era. Eras that use the same set share one template list;
# each era key still gets its own copy so the entries can be edited separately.
_TRIGGERS_2022_LIKE = [
    "AK8PFJet425_SoftDropMass40",
    "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
]
_TRIGGERS_2024_LIKE = [
    "AK8PFJet400_SoftDropMass30",
    "AK8PFJet425_SoftDropMass30",
    "AK8PFJet230_SoftDropMass40_PNetBB0p06",
]

triggers = {
    "2022": list(_TRIGGERS_2022_LIKE),
    "2022EE": list(_TRIGGERS_2022_LIKE),
    "2023": [
        "AK8PFJet425_SoftDropMass40",
        "AK8PFJet250_SoftDropMass40_PFAK8ParticleNetBB0p35",
        "AK8PFJet230_SoftDropMass40_PNetBB0p06",
    ],
    "2023BPix": [
        "AK8PFJet425_SoftDropMass40",
        "AK8PFJet230_SoftDropMass40_PNetBB0p06",
    ],
    "2024": list(_TRIGGERS_2024_LIKE),
    "2025": list(_TRIGGERS_2024_LIKE),
}

In [None]:
def get_era_path(year):
    """Return the pickle cache path under PROCESSED_DIR for one era's events."""
    filename = f"events_{year}.pkl"
    return PROCESSED_DIR / filename


def get_combined_path(combined_year):
    """Return the pickle cache path under PROCESSED_DIR for a combined period."""
    filename = f"events_{combined_year}.pkl"
    return PROCESSED_DIR / filename


# Build `events_combined`: {combined_year: {sample: DataFrame}}.
#
# Two layers of pickle caches live under PROCESSED_DIR:
#   * one file per era (get_era_path), written in STEP 1 from the ntuples;
#   * one file per combined period (get_combined_path), written in STEP 2.
# With REPROCESS=False the caches are reused whenever they exist. With
# REPROCESS=True both layers are rebuilt, starting from the skimmed ntuples.
#
# NOTE: unpickling can execute arbitrary code -- only load cache files that
# were written by this notebook.

# Check if all combined years exist
all_combined_exist = all(
    get_combined_path(combined_year).exists() for combined_year in YEARS_COMBINED_DICT
)

if not REPROCESS and all_combined_exist:
    # Load all combined years directly
    print("Loading all combined years...")
    events_combined = {}
    for combined_year in YEARS_COMBINED_DICT:
        combined_path = get_combined_path(combined_year)
        print(f"Loading combined year {combined_year}...")
        with combined_path.open("rb") as f:
            events_combined[combined_year] = pd.read_pickle(f)
    print("All combined years loaded!")

else:
    # ============================================================================
    # STEP 1: Process each era individually
    # ============================================================================

    for year in YEARS:
        era_path = get_era_path(year)

        # REPROCESS must also bypass the per-era cache; otherwise a stale era
        # pickle would be re-combined and the ntuples never read again.
        if REPROCESS or not era_path.exists():
            print(f"Processing era: {year}")
            events_era = {}

            # The trigger columns depend only on the era, not on the sample.
            triggers_cols = [(trigger, 1) for trigger in triggers[year]]

            # Process each sample for this era
            for sample, sample_list in SAMPLES_DICT.items():
                print(f"Loading {sample} for {year}...")

                columns = triggers_cols + base_columns + extra_columns_dict.get(sample, [])
                dataframes = utils.load_samples(
                    data_dir=str(DATA_DIR),
                    samples={sample: sample_list},
                    # 2025 MC is read from the 2024 MC files; only data is
                    # loaded from the 2025 directory.
                    year="2024" if (year == "2025" and sample != "data") else year,
                    columns=utils.format_columns(columns),
                    variations=True,
                    weight_shifts=weight_shifts,
                )

                # Process and concatenate dataframes for this sample
                sample_dfs = []
                for df in dataframes.values():
                    # Scale finalWeight for 2025 MC (which is actually 2024 MC)
                    # by the ratio of the 2025 to the 2024 luminosity.
                    if year == "2025" and sample != "data":
                        df["finalWeight"] = df["finalWeight"] * (
                            actual_lumis["2025"] / actual_lumis["2024"]
                        )

                    sample_dfs.append(df)

                # Concatenate all dataframes for this sample
                events_era[sample] = pd.concat(sample_dfs, ignore_index=True)
                print(f"  {sample}: {len(events_era[sample])} events")

                # Clear intermediate dataframes to free memory
                del dataframes, sample_dfs

            # Save this era's data
            with era_path.open("wb") as f:
                pd.to_pickle(events_era, f)
            print(f"Era {year} saved to {era_path}")

            # Clear era data to free memory
            del events_era
        else:
            print(f"Era {year} already processed at {era_path}")

    print("Individual era processing complete!")

    # ============================================================================
    # STEP 2: Combine eras into combined years
    # ============================================================================

    events_combined = {}

    for combined_year, year_list in YEARS_COMBINED_DICT.items():
        combined_path = get_combined_path(combined_year)

        if REPROCESS or not combined_path.exists():
            print(f"\nCombining eras for {combined_year}: {year_list}")

            # Load each era, remembering which ones could not be found
            era_data = {}
            missing_eras = []
            for year in year_list:
                era_path = get_era_path(year)
                if era_path.exists():
                    print(f"Loading era {year}...")
                    with era_path.open("rb") as f:
                        era_data[year] = pd.read_pickle(f)
                else:
                    print(f"Warning: Era file {era_path} not found!")
                    missing_eras.append(year)

            # Combine samples across eras (samples absent from an era are skipped)
            events_combined[combined_year] = {}
            for sample in SAMPLES_DICT:
                sample_dfs = [
                    era_data[year][sample]
                    for year in year_list
                    if year in era_data and sample in era_data[year]
                ]

                if sample_dfs:
                    events_combined[combined_year][sample] = pd.concat(
                        sample_dfs, ignore_index=True
                    )
                    total_weights = events_combined[combined_year][sample]["finalWeight"].sum()
                    total_events = len(events_combined[combined_year][sample])
                    print(f"  {sample}: {total_events} events (total weight: {total_weights})")

            if missing_eras:
                # Keep the partial result in memory for this session, but do not
                # cache it: a later run would load the file as if it were complete.
                print(
                    f"Warning: {combined_year} is missing eras {missing_eras}; "
                    f"not saving the incomplete combination to {combined_path}"
                )
            else:
                # Save combined year
                with combined_path.open("wb") as f:
                    pd.to_pickle(events_combined[combined_year], f)
                print(f"Combined year {combined_year} saved to {combined_path}")

            # Clear era data to free memory for next iteration
            del era_data
        else:
            # If combined year already exists, load it
            print(f"Combined year {combined_year} already exists, loading...")
            with combined_path.open("rb") as f:
                events_combined[combined_year] = pd.read_pickle(f)

    print("\nAll processing complete!")

In [None]:
# Apply the preselection and the trigger requirement to every sample of every
# combined year. Result: events_sel[year][sample] is a copy of the rows that
# pass both; the selected fraction of the summed finalWeight is printed.
events_sel = {}
for year, events_dict in events_combined.items():
    print(f"Year {year} selection")
    events_sel[year] = {}

    # A combined year can merge several eras, so take the ordered union of the
    # triggers of all its eras instead of guessing one era from the label.
    # An event from an era that did not load a given trigger column holds NaN
    # there after concatenation, and NaN == 1 is False, so each event is
    # effectively tested against its own era's triggers only.
    # Fallback for labels that are not in YEARS_COMBINED_DICT: strip "All".
    eras = YEARS_COMBINED_DICT.get(year, [year.replace("All", "")])
    year_triggers = list(dict.fromkeys(trigger for era in eras for trigger in triggers[era]))

    for sample, df in events_dict.items():
        # Kinematic / tagger preselection on the two fat jets (index 0 and 1).
        mask = (
            (df[("bbFatJetPt", 0)] > 250)
            & (df[("bbFatJetPt", 1)] > 250)
            & (df[("bbFatJetMsd", 0)] > 40)
            & (df[("bbFatJetParT3TXbb", 0)] > 0.3)
            & (df[("bbFatJetParT3massX2p", 0)] > 60)
            & (df[("bbFatJetParT3massX2p", 1)] > 60)
        )

        # Logical OR of all triggers
        trigger_mask = np.zeros(len(df), dtype=bool)
        for trigger in year_triggers:
            trigger_mask |= df[(trigger, 0)] == 1

        total_mask = mask & trigger_mask
        total_sel_weights = df["finalWeight"][total_mask].sum()
        total_weights = df["finalWeight"].sum()

        # Avoid dividing by zero for samples with no weight at all.
        if total_weights != 0:
            sel_percent = 100 * total_sel_weights / total_weights
        else:
            sel_percent = float("nan")

        print(
            f"    {sample}: {total_sel_weights:.2f} / {total_weights:.2f} selected ({sel_percent:.2f}%)"
        )
        events_sel[year][sample] = df[total_mask].copy()

In [None]:
from HH4b.utils import ShapeVar, singleVarHist
from HH4b import plotting
from HH4b.hh_vars import bg_keys

# Signal sample keys passed to the ratio plots.
sig_keys = ["hh4b", "vbfhh4b", "vbfhh4b-k2v0"]

# Variables drawn as control plots. `bins` is given as three numbers
# (presumably [n_bins, low, high] -- confirm against ShapeVar).
# Labels containing LaTeX are raw strings so that sequences such as "\p" and
# "\e" are not parsed as (invalid) Python escape sequences.
control_plot_vars = [
    ShapeVar(var="bbFatJetPhi0", label=r"Fatjet 0 $\phi$", bins=[50, -np.pi, np.pi]),
    ShapeVar(var="bbFatJetPhi1", label=r"Fatjet 1 $\phi$", bins=[50, -np.pi, np.pi]),
    ShapeVar(var="bbFatJetParT3TXbb0", label=r"FatJet 0 GloParT-v3 $T_{Xbb}$", bins=[50, 0.3, 1]),
    ShapeVar(var="bbFatJetParT3TXbb1", label=r"FatJet 1 GloParT-v3 $T_{Xbb}$", bins=[50, 0.0, 1]),
    ShapeVar(
        var="bbFatJetParT3massGeneric0",
        label="Fatjet 0 GloParT-v3 Regressed Mass (Generic) [GeV]",
        bins=[50, 50, 150],
    ),
    ShapeVar(
        var="bbFatJetParT3massGeneric1",
        label="Fatjet 1 GloParT-v3 Regressed Mass (Generic) [GeV]",
        bins=[50, 50, 150],
    ),
    ShapeVar(
        var="bbFatJetParT3massX2p0",
        label="Fatjet 0 GloParT-v3 Regressed Mass (X2p) [GeV]",
        bins=[50, 60, 150],
    ),
    ShapeVar(
        var="bbFatJetParT3massX2p1",
        label="Fatjet 1 GloParT-v3 Regressed Mass (X2p) [GeV]",
        bins=[50, 60, 150],
    ),
    ShapeVar(var="bbFatJetPt0", label=r"Fatjet 0 $p_{T}$ [GeV]", bins=[50, 250, 2000]),
    ShapeVar(var="bbFatJetPt1", label=r"Fatjet 1 $p_{T}$ [GeV]", bins=[50, 250, 2000]),
    ShapeVar(var="bbFatJetEta0", label=r"Fatjet 0 $\eta$", bins=[50, -2.5, 2.5]),
    ShapeVar(var="bbFatJetEta1", label=r"Fatjet 1 $\eta$", bins=[50, -2.5, 2.5]),
]


# Draw one ratio plot per control variable for every combined year.
for year in YEARS_COMBINED_DICT:
    print(f"Making control plots for {year}...")
    plot_dir = PLOT_DIR / f"control/{year}"
    plot_dir.mkdir(exist_ok=True, parents=True)

    # Starts empty for each year. After the first variable it carries the value
    # returned by ratioHistPlot as `qcd_norm`, so that all later plots of the
    # same year receive that same normalization.
    kwargs = {}

    hists = {}
    for i, shape_var in enumerate(control_plot_vars):
        # Each variable is histogrammed and plotted at most once.
        if shape_var.var in hists:
            continue

        hist = singleVarHist(
            events_sel[year],
            shape_var,
            weight_key="finalWeight",
        )
        hists[shape_var.var] = hist

        plot_options = {
            "name": str(plot_dir / f"{shape_var.var}"),
            "show": False,
            "log": True,
            "plot_significance": False,
            "significance_dir": shape_var.significance_dir,
            "ratio_ylims": [0.2, 1.8],
            "bg_err_mcstat": True,
            "reweight_qcd": True,
            # Flag variables whose axis label carries a GeV unit.
            "xbin_gev": "[GeV]" in shape_var.label,
        }
        qcd_norm = plotting.ratioHistPlot(
            hist,
            year,
            sig_keys,
            bg_keys,
            **plot_options,
            **kwargs,
        )

        # pick the normalization weight chosen for the first variable
        if i == 0:
            kwargs["qcd_norm"] = qcd_norm