In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from pathlib import Path
from tqdm import tqdm

from HH4b import utils
from HH4b.hh_vars import LUMI
import mplhep as hep

hep.style.use(hep.style.CMS)

from constants import (
    MASS_RANGE,
    MASS_BINS,
    PT_RANGE,
    PT_BINS,
    BR_Z_QQ,
    BR_Z_EE,
    BR_Z_MUMU,
    BR_Z_TAUTAU,
)

In [None]:
# Individual data-taking eras, and the grouping used when merging them.
YEARS = ["2022", "2022EE", "2023", "2023BPix"]
YEARS_COMBINED_DICT = {"2022All": ["2022", "2022EE"], "2023All": ["2023", "2023BPix"]}

# Each output sample name maps to the list of input sample names that feed it.
SAMPLES_DICT = {"DYto2L": ["DYto2L"], "Zto2Q": ["Zto2Q"]}

# Cache of the processed events; set REPROCESS to True to rebuild it from the skimmed ntuples.
PROCESSED_PATH: Path = Path("processed/corr_ZQQ_DYLL.pkl")
REPROCESS: bool = False

# Where the figures and the derived corrections are written.
PLOT_DIR = Path("plots/corr_ZQQ_DYLL")
OUTPUT_PATH = Path("corrs/ZQQ_DYLL.pkl")

# Make sure every output location exists before anything is written.
for _directory in (PROCESSED_PATH.parent, PLOT_DIR, OUTPUT_PATH.parent):
    _directory.mkdir(parents=True, exist_ok=True)

# Four-vector components stored per generator-level object.
P4 = ("Mass", "Pt", "Eta", "Phi")


def _gen_columns(daughter):
    """List the (branch, 1) pairs to load: weight, both ``daughter`` objects, then the GenZ."""
    branch_names = ["weight"]
    branch_names += [f"{daughter}{index}{component}" for index in (1, 2) for component in P4]
    branch_names += [f"GenZ{component}" for component in P4]
    return [(branch_name, 1) for branch_name in branch_names]


DYLL_columns = _gen_columns("GenLep")
ZQQ_columns = _gen_columns("GenQ")

# Columns to request for each sample.
COLUMN_DICT = {"DYto2L": DYLL_columns, "Zto2Q": ZQQ_columns}

# Branching ratio attached to each sample: the three charged-lepton modes summed for DYto2L,
# the hadronic mode for Zto2Q.
_BR_Z_LL = BR_Z_EE + BR_Z_MUMU + BR_Z_TAUTAU
SAMPLE_BR = dict(DYto2L=_BR_Z_LL, Zto2Q=BR_Z_QQ)

In [None]:
def _cartesian_p4(df: pd.DataFrame, prefix: str):
    """Return ``(px, py, pz, E)`` built from the ``(f"{prefix}Pt|Eta|Phi|Mass", 0)`` columns."""
    pt = df[(f"{prefix}Pt", 0)]
    eta = df[(f"{prefix}Eta", 0)]
    phi = df[(f"{prefix}Phi", 0)]
    mass = df[(f"{prefix}Mass", 0)]

    px = pt * np.cos(phi)
    py = pt * np.sin(phi)
    pz = pt * np.sinh(eta)
    energy = np.sqrt(px**2 + py**2 + pz**2 + mass**2)
    return px, py, pz, energy


def get_GenZ_PtEtaPhiMass(df: pd.DataFrame, particle: str) -> dict[str, pd.Series]:
    """Rebuild the Z kinematics from the four-vectors of its two decay products.

    The four-vectors of ``{particle}1`` and ``{particle}2`` are converted to Cartesian
    components, summed, and converted back to (pt, eta, phi, mass).

    Args:
        df: events with columns ``(f"{particle}{i}{X}", 0)`` for ``i`` in 1, 2 and ``X`` in
            Pt, Eta, Phi, Mass.
        particle: branch prefix of the decay products, e.g. ``"GenLep"`` or ``"GenQ"``.

    Returns:
        A dict with keys ``GenZFromDecayPt``, ``GenZFromDecayEta``, ``GenZFromDecayPhi`` and
        ``GenZFromDecayMass``, each holding one value per row of ``df``. The dict form lets the
        result be passed straight to ``df.assign(**result)``.
    """
    px1, py1, pz1, e1 = _cartesian_p4(df, f"{particle}1")
    px2, py2, pz2, e2 = _cartesian_p4(df, f"{particle}2")

    GenZ_px = px1 + px2
    GenZ_py = py1 + py2
    GenZ_pz = pz1 + pz2
    GenZ_E = e1 + e2

    GenZ_pt = np.sqrt(GenZ_px**2 + GenZ_py**2)
    GenZ_eta = np.arcsinh(GenZ_pz / GenZ_pt)
    GenZ_phi = np.arctan2(GenZ_py, GenZ_px)

    # E^2 - |p|^2 can come out slightly negative from floating-point rounding (for example for
    # massless, collinear decay products); clamp it at zero so the mass is 0 instead of NaN.
    GenZ_mass_sq = GenZ_E**2 - (GenZ_px**2 + GenZ_py**2 + GenZ_pz**2)
    GenZ_mass = np.sqrt(np.maximum(GenZ_mass_sq, 0.0))

    return {
        "GenZFromDecayPt": GenZ_pt,
        "GenZFromDecayEta": GenZ_eta,
        "GenZFromDecayPhi": GenZ_phi,
        "GenZFromDecayMass": GenZ_mass,
    }

In [None]:
# Build `events_combined` ({combined era: {sample: DataFrame}}), reusing the on-disk cache
# unless it is missing or REPROCESS is set.
if not REPROCESS and PROCESSED_PATH.exists():
    print(f"Loading processed data from {PROCESSED_PATH}")
    # The cache is a pickle: only load files produced by this notebook.
    with PROCESSED_PATH.open("rb") as f:
        events_combined = pd.read_pickle(f)
else:
    # Location of the skimmed ntuples.
    path_dir = (
        "/ceph/cms/store/user/zichun/bbbb/skimmer/ZQQtoDYLLHT25June4_v12_ZbbSFZMuMu_zbb-Zto2Q-DYLL"
    )

    # Branch prefix of the generator-level decay products for each known sample.
    decay_particle = {"DYto2L": "GenLep", "Zto2Q": "GenQ"}

    # Load every (era, sample) pair and attach the Z kinematics rebuilt from its decay products.
    events_dict = {}
    for year in tqdm(YEARS):
        per_sample = events_dict[year] = {}

        for sample, sample_list in SAMPLES_DICT.items():
            loaded_frames = utils.load_samples(
                data_dir=path_dir,
                samples={sample: sample_list},
                year=year,
                columns=utils.format_columns(COLUMN_DICT[sample]),
                variations=True,
                weight_shifts=[],
            )

            for sample_name, df in loaded_frames.items():
                if sample_name not in decay_particle:
                    raise ValueError(f"Unknown sample name: {sample_name}")

                gen_z_from_decay = get_GenZ_PtEtaPhiMass(df, decay_particle[sample_name])
                per_sample[sample_name] = df.assign(**gen_z_from_decay)

    # Merge the individual eras into the combined ones, sample by sample.
    events_combined = {
        combined_year: {
            sample: pd.concat(
                [
                    events_dict[year][sample]
                    for year in year_list
                    if year in events_dict and sample in events_dict[year]
                ]
            )
            for sample in SAMPLES_DICT
        }
        for combined_year, year_list in YEARS_COMBINED_DICT.items()
    }

    with PROCESSED_PATH.open("wb") as f:
        pd.to_pickle(events_combined, f)
    print(f"Processed data saved to {PROCESSED_PATH}")

In [None]:
# Inclusive (low, high) windows, keyed by the column they are applied to.
cuts = {
    ("GenZPt", 0): PT_RANGE,
    ("GenZMass", 0): (0, np.inf),
}


def _apply_cuts(frame):
    """Keep the rows of ``frame`` inside every window of ``cuts`` and renumber the index."""
    for column, (low, high) in cuts.items():
        inside_window = (frame[column] >= low) & (frame[column] <= high)
        frame = frame[inside_window]
    return frame.reset_index(drop=True)


# Same {era: {sample: DataFrame}} layout as `events_combined`, after selection.
events_combined_sel = {
    year: {sample: _apply_cuts(df) for sample, df in events.items()}
    for year, events in events_combined.items()
}

In [None]:
# Compare the unit-normalised shapes of the numerator and denominator samples for each
# feature and era, with a ratio panel underneath.

# Define which samples go in numerator and denominator
numerator_samples = ["DYto2L"]  # Add more samples as needed
denominator_samples = ["Zto2Q"]  # Add more samples as needed

for feature_name, feature_label, bins in zip(
    [("GenZMass", 0), ("GenZPt", 0), "GenZFromDecayMass", "GenZFromDecayPt"],
    [r"Z Mass [GeV]", r"Z $p_\mathrm{T}$ [GeV]", r"Z Mass [GeV]", r"Z $p_\mathrm{T}$ [GeV]"],
    [MASS_BINS, PT_BINS, MASS_BINS, PT_BINS],
):
    # Column keys are either (name, index) tuples or plain strings. The bare name is what the
    # pT check and the output file name need; comparing the raw key against a string would
    # never match the tuple keys.
    name = feature_name[0] if isinstance(feature_name, tuple) else feature_name
    is_pt_feature = name.endswith("Pt")

    for year in events_combined_sel:
        # sharex keeps both panels on one x range; the top panel hides its x tick labels, so
        # they are read off the ratio panel.
        fig, (ax1, ax2) = plt.subplots(
            2,
            1,
            figsize=(12, 10),
            sharex=True,
            gridspec_kw={"height_ratios": [3, 1], "hspace": 0.1},
        )
        bin_centers = (bins[:-1] + bins[1:]) / 2

        # Initialize arrays for numerator and denominator
        numerator_total = np.zeros(len(bins) - 1)
        numerator_error_sq = np.zeros(len(bins) - 1)
        denominator_total = np.zeros(len(bins) - 1)
        denominator_error_sq = np.zeros(len(bins) - 1)

        # Plot individual samples and calculate sums
        for sample in events_combined_sel[year]:
            feature = events_combined_sel[year][sample][feature_name].to_numpy()
            # No branching-ratio scaling is applied: the samples are compared as densities.
            weight = events_combined_sel[year][sample]["finalWeight"].to_numpy()

            # Weighted counts and sum of squared weights per bin (no normalisation yet)
            hist_counts, bin_edges = np.histogram(feature, bins=bins, weights=weight, density=False)
            weight_sq_hist, _ = np.histogram(feature, bins=bins, weights=weight**2, density=False)

            # Calculate normalization factors
            bin_widths = bin_edges[1:] - bin_edges[:-1]
            total_sum_weights = np.sum(weight)

            # Convert to density.
            # NOTE: this normalises to the sum of ALL selected weights, whereas the curve drawn
            # below with density=True normalises to the weights falling inside `bins`; the two
            # differ whenever events lie outside the binned range.
            hist = hist_counts / (bin_widths * total_sum_weights)
            weight_sq_hist_density = weight_sq_hist / (bin_widths * total_sum_weights) ** 2

            # Plot on main axis with density normalization
            ax1.hist(
                feature,
                bins=bins,
                weights=weight,
                label=sample,
                histtype="step",
                density=True,
            )

            # Determine which sum this sample contributes to
            if sample in numerator_samples:
                numerator_total += hist
                numerator_error_sq += weight_sq_hist_density
            elif sample in denominator_samples:
                denominator_total += hist
                denominator_error_sq += weight_sq_hist_density

        numerator_error = np.sqrt(numerator_error_sq)
        denominator_error = np.sqrt(denominator_error_sq)

        # Main plot formatting
        ax1.set_ylabel("Density")
        ax1.set_yscale("log")
        ax1.legend()
        ax1.tick_params(axis="x", labelbottom=False)  # Hide x-axis labels on top plot

        # pT plots span exactly the binned range (applies to both panels through sharex)
        if is_pt_feature:
            ax1.set_xlim(bins[0], bins[-1])

        # Ratio plot
        if len(numerator_samples) > 0 and len(denominator_samples) > 0:
            # Calculate ratio and its error
            ratio = np.divide(
                numerator_total,
                denominator_total,
                out=np.zeros_like(numerator_total),
                where=denominator_total != 0,
            )

            # Error propagation for ratio: sqrt((σ_num/denom)² + (num*σ_denom/denom²)²)
            ratio_error = np.zeros_like(ratio)
            mask = denominator_total > 0
            ratio_error[mask] = np.sqrt(
                (numerator_error[mask] / denominator_total[mask]) ** 2
                + (numerator_total[mask] * denominator_error[mask] / denominator_total[mask] ** 2)
                ** 2
            )

            # Plot ratio with error bars
            ax2.errorbar(bin_centers, ratio, yerr=ratio_error, fmt="ko", markersize=3, capsize=2)

            # Add horizontal line at y=1
            ax2.axhline(y=1, color="red", linestyle="--", alpha=0.7)

            # Create dynamic label for ratio plot
            numerator_label = " + ".join(numerator_samples)
            denominator_label = " + ".join(denominator_samples)
            ratio_label = f"{numerator_label} / {denominator_label}"

            # Ratio plot formatting
            ax2.set_xlabel(feature_label)
            ax2.set_ylabel(ratio_label)
            ax2.set_ylim(0, 2)  # Adjust as needed
            ax2.grid(True, alpha=0.3)

        plt.tight_layout()

        hep.cms.label(
            ax=ax1,
            label="Work in Progress",
            data=True,
            year=year.replace("All", ""),
            com=13.6,
            lumi=(round(LUMI[year] / 1000, 2)),
        )

        plt.savefig(PLOT_DIR / f"{name}_{year}.pdf", bbox_inches="tight")
        # Close this specific figure so figures do not pile up across loop iterations.
        plt.close(fig)

# Derive $f_\mathrm{ZQQ \to DYLL} (p_\mathrm{T})$

In [None]:
# Derive the per-era scale factor as the ratio of unit-normalised Z pT shapes
# (numerator samples / denominator samples) in PT_BINS, with statistical uncertainties.

# Define which samples go in numerator and denominator
numerator_samples = ["DYto2L"]
denominator_samples = ["Zto2Q"]

SF_dict = {}

for year in events_combined_sel:
    numerator_counts = np.zeros(len(PT_BINS) - 1)
    denominator_counts = np.zeros(len(PT_BINS) - 1)

    # Statistical error tracking
    numerator_sumw2 = np.zeros(len(PT_BINS) - 1)
    denominator_sumw2 = np.zeros(len(PT_BINS) - 1)

    # Process each sample
    for sample in events_combined_sel[year]:
        # Binned in the stored GenZPt column ("GenZFromDecayPt" is the alternative rebuilt
        # from the decay products).
        pt = events_combined_sel[year][sample][("GenZPt", 0)].to_numpy()
        weight = events_combined_sel[year][sample]["finalWeight"].to_numpy()

        # Weighted counts and sum of squared weights per pT bin
        counts, bin_edges = np.histogram(pt, bins=PT_BINS, weights=weight, density=False)
        hist_sumw2, _ = np.histogram(pt, bins=PT_BINS, weights=weight**2, density=False)

        bin_widths = np.diff(bin_edges)

        # Normalise the counts and their variance with the SAME factor: the sum of weights
        # that fall inside PT_BINS. This is the normalisation np.histogram(density=True)
        # applies, so the central value is unchanged, and the uncertainty no longer picks up
        # a different scale when some events lie outside the binned range.
        in_range_sum_weights = counts.sum()
        hist = counts / bin_widths / in_range_sum_weights
        hist_sumw2_density = hist_sumw2 / (bin_widths * in_range_sum_weights) ** 2

        # Determine which sum this sample contributes to
        if sample in numerator_samples:
            numerator_counts += hist
            numerator_sumw2 += hist_sumw2_density
        elif sample in denominator_samples:
            denominator_counts += hist
            denominator_sumw2 += hist_sumw2_density

    # Calculate scale factors (bins with a non-positive denominator are set to 0)
    scale_factor = np.divide(
        numerator_counts,
        denominator_counts,
        out=np.zeros_like(numerator_counts),
        where=denominator_counts > 0,
    )

    # Calculate statistical errors (scaled for density)
    numerator_stat_err = np.sqrt(numerator_sumw2)
    denominator_stat_err = np.sqrt(denominator_sumw2)

    # Statistical error propagation for scale factor
    # For ratio A/B: σ_ratio = |A/B| * sqrt((σ_A/A)² + (σ_B/B)²)
    numerator_rel_err = np.divide(
        numerator_stat_err,
        np.abs(numerator_counts),
        out=np.zeros_like(numerator_stat_err),
        where=np.abs(numerator_counts) > 0,
    )
    denominator_rel_err = np.divide(
        denominator_stat_err,
        denominator_counts,
        out=np.zeros_like(denominator_stat_err),
        where=denominator_counts > 0,
    )

    sf_rel_stat_err = np.sqrt(numerator_rel_err**2 + denominator_rel_err**2)
    sf_stat_err = np.abs(scale_factor) * sf_rel_stat_err

    # Scale factor with statistical uncertainties
    scale_factor_up = scale_factor + sf_stat_err
    scale_factor_down = scale_factor - sf_stat_err

    # Create dynamic labels for the scale factor
    numerator_label = "_".join(numerator_samples)
    denominator_label = "_".join(denominator_samples)

    SF_dict[year] = {
        "nominal": scale_factor,
        "up": scale_factor_up,
        "down": scale_factor_down,
        "pt": PT_BINS,
    }

    # Print summary for this year
    print(f"\n{year} Scale Factors ({numerator_label}/{denominator_label}):")
    pt_centers = (PT_BINS[:-1] + PT_BINS[1:]) / 2
    for pt_center, sf, sf_err in zip(pt_centers, scale_factor, sf_stat_err):
        print(f"  pT bin {pt_center:.0f} GeV: {sf:.3f} ± {sf_err:.3f}")

# Save to pickle file
with OUTPUT_PATH.open("wb") as f:
    pd.to_pickle(SF_dict, f)

print(f"\nScale factors saved to {OUTPUT_PATH}")

In [None]:
# One figure per era: scale factor versus Z pT, drawn at the bin centres with the bin
# half-width as horizontal bar and the statistical uncertainty as vertical bar.
bin_midpoints = (PT_BINS[:-1] + PT_BINS[1:]) / 2
bin_half_widths = (PT_BINS[1:] - PT_BINS[:-1]) / 2

for year, sf in SF_dict.items():
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.errorbar(
        bin_midpoints,
        sf["nominal"],
        xerr=bin_half_widths,
        yerr=sf["up"] - sf["nominal"],
        fmt="o",
        color="blue",
        markersize=5,
        capsize=3,
    )

    # Reference line at unity
    ax.axhline(y=1, color="red", linestyle="--", label="Nominal SF = 1")

    ax.set(
        xlabel=r"Z $p_\mathrm{T}$ [GeV]",
        ylabel="Scale Factor",
        ylim=(0.5, 1.5),
    )

    era_label = year.replace("All", "")
    lumi_label = round(LUMI[year] / 1000, 2)
    hep.cms.label(
        ax=ax,
        label="Work in Progress",
        data=True,
        year=era_label,
        com=13.6,
        lumi=lumi_label,
    )

    plt.tight_layout()
    plt.savefig(PLOT_DIR / f"SF_{year}.pdf", bbox_inches="tight")
    plt.show()