In [1]:
import os
import sys

sys.path.append("..")
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker
import HH4b.utils as utils
import HH4b.plotting as plotting
from HH4b.postprocessing.postprocessing import Region, weight_shifts
from HH4b.utils import ShapeVar, CUT_MAX_VAL
from HH4b.hh_vars import samples, data_key, bg_keys, sig_keys
from sklearn.metrics import roc_curve, auc

# Apply the CMS style, once through matplotlib and once through mplhep
plt.style.use(hep.style.CMS)
hep.style.use("CMS")

# Tick formatter using math-text exponents with power limits (-3, 3)
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

# Font size used for all figures in this notebook (set after the style so it wins)
plt.rcParams["font.size"] = 12

In [2]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

In [3]:
# Data-taking period processed by this notebook
year = "2022EE"

# Directory holding the skimmer output and the tag of the production to read
MAIN_DIR = "/ceph/cms/store/user/dprimosc/bbbb/skimmer"
tag = "24Sep3_v12_private_signal"
path_to_dir = f"{MAIN_DIR}/{tag}/"

# Keep only the samples needed here: the two backgrounds plus the signal.
# NOTE: this rebinds `sig_keys`, shadowing the name imported from HH4b.hh_vars.
sig_keys = ["hh4b"]
keep_keys = ["qcd", "ttbar"] + sig_keys
samples_year = samples[year].copy()  # copy so the imported `samples` is left untouched
for key in list(samples_year.keys()):
    if key not in keep_keys:
        del samples_year[key]

# dictionary with directories of files (this can be configured in a yaml file later in the script)
sample_dirs = {path_to_dir: samples_year}

# Plot and template directories
date = "24Sep6"  # date of plotting
plot_dir = f"{MAIN_DIR}/plots/PostProcessing/{date}/{year}"
template_dir = f"templates/{date}/"  # TODO: confirm whether this is still needed

# Create the output directories with the standard library instead of shelling
# out to `mkdir -p`; unlike the ignored os.system return code, a failure here
# raises immediately.
os.makedirs(plot_dir, exist_ok=True)
os.makedirs(os.path.join(template_dir, "cutflows", year), exist_ok=True)

In [None]:
# Checking contents of parquet files
def examine_parquet_files(dir_path):
    """Print the column layout of one parquet file found in ``dir_path``.

    The alphabetically first file whose name ends in ``.parquet`` is read. For
    every column whose second index entry equals ``0`` the function prints
    ``<name>: <count>``, where ``<count>`` is the number of sub-columns stored
    under ``<name>``.

    Args:
        dir_path: Directory to scan (only its direct entries are considered).

    Returns:
        None. All information is printed; a message is also printed when the
        directory holds no parquet file or when the file cannot be processed.
    """
    # Sort so that the inspected file does not depend on os.listdir ordering
    parquet_names = sorted(
        file_name for file_name in os.listdir(dir_path) if file_name.endswith(".parquet")
    )
    if not parquet_names:
        print(f"No parquet files found in {dir_path}")
        return

    file_path = os.path.join(dir_path, parquet_names[0])

    try:
        # Read the parquet file into a DataFrame
        pdf = pd.read_parquet(file_path)

        # Display file information
        print(f"\nContents of {file_path}:")
        print("Columns:")
        for column in pdf.columns:
            # columns are (name, index) pairs; report each name once, at index 0
            if column[1] == 0:
                # print column and number of entries
                print(f"{column[0]}: {pdf[column[0]].shape[1]}")

    except Exception as e:
        # best-effort diagnostic: report the problem instead of stopping the notebook
        print(f"Error reading {file_path}: {e}")


# Inspect one signal sample. The path is built from the notebook configuration
# (path_to_dir = MAIN_DIR/tag, year) so it follows any change to those settings
# instead of duplicating them in a hard-coded absolute path.
test_directory = os.path.join(
    path_to_dir,
    year,
    "GluGlutoHHto4B_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV",
    "parquet",
)
print(test_directory)
examine_parquet_files(test_directory)

In [5]:
# Split the sample dictionary into a background part and a signal part so that
# each can be handed to the load_samples function separately.
# Samples missing from a directory fall back to an empty list.
bkg_sample_dirs = {
    path: {bkg_name: datasets.get(bkg_name, []) for bkg_name in ("qcd", "ttbar")}
    for path, datasets in sample_dirs.items()
}
sig_sample_dirs = {
    path: {"hh4b": datasets.get("hh4b", [])} for path, datasets in sample_dirs.items()
}

In [None]:
# Selection passed as `filters` when loading the samples: both AK8 jets must
# satisfy 400 <= ak8FatJetPt <= 600 and 50 <= ak8FatJetMsd <= 250.
# Every condition is its own (column, operator, value) tuple; the commas
# between them are required (without them Python tries to call a tuple and
# raises TypeError).
# Column names are written in the same "('name', 'idx')" string form that this
# notebook uses for the columns it loads.
# NOTE(review): conditions inside the inner list are presumably AND-ed by the
# parquet reader -- confirm against utils.load_samples.
# TODO: what weights to use
filters = [
    [
        ("('ak8FatJetPt', '0')", ">=", 400),
        ("('ak8FatJetPt', '0')", "<=", 600),
        ("('ak8FatJetPt', '1')", ">=", 400),
        ("('ak8FatJetPt', '1')", "<=", 600),
        ("('ak8FatJetMsd', '0')", ">=", 50),
        ("('ak8FatJetMsd', '0')", "<=", 250),
        ("('ak8FatJetMsd', '1')", ">=", 50),
        ("('ak8FatJetMsd', '1')", "<=", 250),
    ],
]

# Columns read for the background samples, as (name, number of indexed sub-columns)
bkg_load_columns = [
    ("weight", 1),
    ("ak8FatJetPNetTXbb", 2),
    ("ak8FatJetPNetTXbbLegacy", 2),
    ("ak8FatJetPt", 2),
    ("ak8FatJetMsd", 2),
]

# Signal reads the same columns plus the generator-matching information
sig_load_columns = [
    *bkg_load_columns,
    ("ak8FatJetHiggsMatchIndex", 2),
    ("ak8FatJetNumBMatchedH1", 2),
    ("ak8FatJetNumBMatchedH2", 2),
]

# Expand every (name, n) pair into n strings of the form "('name', 'idx')",
# the format used for reading multiindex columns
bkg_columns = [
    f"('{name}', '{idx}')" for name, count in bkg_load_columns for idx in range(count)
]
sig_columns = [
    f"('{name}', '{idx}')" for name, count in sig_load_columns for idx in range(count)
]

# Cutflow table (pandas), one row per sample kept for this year
# TODO: check this is running correctly
cutflow = pd.DataFrame(index=list(samples_year.keys()))

# events_dict collects the loaded events of every sample, keyed by sample name.
# Backgrounds are loaded first, then signal; each group reads its own column list.
# Per the original author's note, load_samples reads only the selected columns,
# applies the filters and computes a per-event weight.
events_dict = {}
load_jobs = (
    (bkg_sample_dirs, bkg_columns),  # background
    (sig_sample_dirs, sig_columns),  # signal
)
for group_dirs, group_columns in load_jobs:
    for input_dir, samples_dict in group_dirs.items():
        loaded = utils.load_samples(
            input_dir,
            samples_dict,
            year,
            filters=filters,
            columns=group_columns,
            reorder_txbb=False,
            variations=False,
            # columns_mc=utils.format_columns(load_columns_mc),
        )
        events_dict.update(loaded)

# Record the yields after loading under the "Preselection" label and show the table
utils.add_to_cutflow(events_dict, "Preselection", "finalWeight", cutflow)
print("\n", cutflow)

In [None]:
hh4b_events = events_dict["hh4b"]

# Generator-matching information of the signal sample
indexak8 = hh4b_events["ak8FatJetHiggsMatchIndex"].to_numpy()  # index of higgs matched to jet
nbh1ak8 = hh4b_events["ak8FatJetNumBMatchedH1"].to_numpy()  # number of bquarks matched to H1
nbh2ak8 = hh4b_events["ak8FatJetNumBMatchedH2"].to_numpy()  # number of bquarks matched to H2

# A jet counts as matched when it is assigned to a Higgs (index 0 or 1) and
# both b quarks of that same Higgs are matched to it
matched_to_h1 = (nbh1ak8 == 2) & (indexak8 == 0)
matched_to_h2 = (nbh2ak8 == 2) & (indexak8 == 1)
matchedak8 = np.logical_or(matched_to_h1, matched_to_h2)  # columns: ak8 jet 0, ak8 jet 1

# TODO: revise how this is done, something is still not right
# Flatten to one label per jet: all jet-0 entries first, then all jet-1 entries
pdf = pd.DataFrame(matchedak8, columns=["j1", "j2"])
mask = pd.concat([pdf[jet_col] for jet_col in ("j1", "j2")], ignore_index=True)
sig_true = np.where(mask.to_numpy(), 1.0, 0.0)  # 1 = matched jet, 0 = unmatched jet
print(matchedak8)

In [8]:
# Object-level (per-jet) analysis: each of the two AK8 jets is treated as its own entry,
# so the jet-0 values and the jet-1 values are concatenated into a single array.

In [None]:
# Build the per-jet inputs of the ROC curves. Both AK8 jets of every event are
# used individually, and all arrays share the same ordering:
#   signal jet 0, signal jet 1, backgrounds jet 0, backgrounds jet 1
sig_key = "hh4b"
bg_keys = ["qcd", "ttbar"]  # NOTE: rebinds the `bg_keys` imported from HH4b.hh_vars


def _per_jet_values(column):
    """Concatenate ``column`` of jet 0 and jet 1 for signal, then for all backgrounds."""
    sig_parts = [events_dict[sig_key][column][jet] for jet in (0, 1)]
    bkg_parts = [events_dict[bg_key][column][jet] for jet in (0, 1) for bg_key in bg_keys]
    return np.concatenate(sig_parts + bkg_parts)


# Truth labels: signal jets carry their matching label, background jets are 0.
# The builtin sum is used because np.sum on a generator is deprecated in NumPy.
# factor 2 needed because both jets are used individually
n_bkg_jets = sum(2 * len(events_dict[bg_key]) for bg_key in bg_keys)
y_true = np.concatenate([sig_true, np.zeros(n_bkg_jets)])

# Event weights, repeated once per jet so they line up with the scores
weights = np.concatenate(
    [events_dict[sig_key]["finalWeight"]] * 2
    + [events_dict[bg_key]["finalWeight"] for bg_key in bg_keys] * 2,
)

# Tagger scores; the ordering needs to match the structure of weights
scores = _per_jet_values("ak8FatJetPNetTXbb")
legacy_scores = _per_jet_values("ak8FatJetPNetTXbbLegacy")

# Labels, weights and scores must describe the same jets in the same order
assert len(y_true) == len(weights) == len(scores) == len(legacy_scores)

In [10]:
# ROC curve of the v12 tagger score (weighted)
fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)
roc = dict(fpr=fpr, tpr=tpr, thresholds=thresholds, label="v12")

# ROC curve of the legacy tagger score, using the same labels and weights
legacy_fpr, legacy_tpr, legacy_thresholds = roc_curve(
    y_true, legacy_scores, sample_weight=weights
)
legacy_roc = dict(
    fpr=legacy_fpr,
    tpr=legacy_tpr,
    thresholds=legacy_thresholds,
    label="Legacy",
)

# Nested dictionary handed to the plotting function: one group per tagger version
rocs = dict()
rocs["V12 ROC"] = {"roc_1": roc}
rocs["Legacy ROC"] = {"roc_2": legacy_roc}

In [None]:
# Draw the v12 and legacy ROC curves together.
# plot_dir and name are passed through to the plotting helper -- presumably the
# location and file name of the saved figure (confirm in HH4b.plotting).
roc_plot_options = {
    "rocs": rocs,
    "sig_effs": [0.8, 0.9],  # Thresholds for signal efficiency lines
    "xlim": [0, 1],
    "ylim": [1e-5, 1],
    "plot_dir": plot_dir,
    "name": "ak8FatJet12ROC_Comparison",
    "show": True,
}
plotting.multiROCCurveGrey(**roc_plot_options)