# Ratio Plot after JSMR

In [1]:
import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import mplhep as hep
import vector

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting

In [2]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array from the kinematic columns of one object collection.

    Args:
        events: dataframe holding ``{obj}Pt``, ``{obj}Phi``, ``{obj}Eta`` and a mass
            column (``{obj}PNetMass`` when ``obj == "ak8FatJet"``, ``{obj}Mass`` otherwise).
        obj: column-name prefix of the collection, e.g. ``"ak8FatJet"``.

    Returns:
        The result of ``vector.array`` on the ``pt``/``phi``/``eta``/``M`` fields, plus a
        ``tau21`` field when ``events`` has a ``{obj}Tau2OverTau1`` column.
    """
    mstring = "PNetMass" if obj == "ak8FatJet" else "Mass"

    fields = {
        "pt": events[f"{obj}Pt"],
        "phi": events[f"{obj}Phi"],
        "eta": events[f"{obj}Eta"],
        "M": events[f"{obj}{mstring}"],
    }

    # tau21 must come from the same collection as the kinematics. It used to be read
    # from the ak8FatJet column unconditionally, which attached fat-jet values to any
    # other collection (or failed when that column was not loaded).
    tau21_column = f"{obj}Tau2OverTau1"
    if tau21_column in events.columns:
        fields["tau21"] = events[tau21_column]

    return vector.array(fields)

## Load Dataset

In [3]:
year = "2022EE"  # era label passed to utils.load_samples when reading the skims
# name of the skim production to read
dir_name = "24Apr22_v12_signal"
# NOTE(review): absolute EOS path tied to one user area — adjust when running elsewhere
path_to_dir = f"/eos/uscms/store/user/haoyang/bbbb/ttSkimmer/{dir_name}"

In [4]:
# "tt" samples; the resulting table is used as `events_mc` further down
samples = {
    "tt": ["TTto2L2Nu", "TTto4Q", "TTtoLNu2Q"],
}

dirs = {path_to_dir: samples}

filters = None

# (column name, number of sub-columns) pairs to read.
# The parquet files are large, so only the columns needed here are loaded to limit memory use.
load_columns = [
    ("weight", 1),
    ("ak8FatJetTau2OverTau1", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetPt", 2),
    ("bbFatJetTopMatch", 2),
    ("bbFatJetNumQMatchedTop1", 2),
    ("bbFatJetNumQMatchedTop2", 2),
    ("bbFatJetNumBMatchedTop1", 2),
    ("bbFatJetNumBMatchedTop2", 2),
    ("finalWeight", 0),  # zero sub-columns: adds nothing to the requested list
]
# expand into the "('column name', 'idx')" strings used to address the multiindex columns
columns = [f"('{key}', '{i}')" for key, num_columns in load_columns for i in range(num_columns)]


events_dict = {}
for input_dir, samples in dirs.items():
    # load_samples reads the files (selected columns only), applies the filters
    # and computes a weight per event
    events_dict.update(
        utils.load_samples(
            input_dir, samples, year, filters=filters, columns=columns, reorder_legacy_txbb=False
        )
    )

# list the columns available for the first loaded sample
samples_loaded = list(events_dict)
keys_loaded = list(events_dict[samples_loaded[0]].keys())
print("Keys in events_dict")
for i in keys_loaded:
    print(i)

Loaded TTto2L2Nu                                         : 808779 entries
Loaded TTto4Q                                            : 12829 entries
Loaded TTtoLNu2Q                                         : 3477992 entries
Keys in events_dict
('weight', 0)
('ak8FatJetTau2OverTau1', 0)
('ak8FatJetTau2OverTau1', 1)
('ak8FatJetMsd', 0)
('ak8FatJetMsd', 1)
('ak8FatJetPNetMass', 0)
('ak8FatJetPNetMass', 1)
('ak8FatJetEta', 0)
('ak8FatJetEta', 1)
('ak8FatJetPhi', 0)
('ak8FatJetPhi', 1)
('ak8FatJetPt', 0)
('ak8FatJetPt', 1)
('bbFatJetTopMatch', 0)
('bbFatJetTopMatch', 1)
('bbFatJetNumQMatchedTop1', 0)
('bbFatJetNumQMatchedTop1', 1)
('bbFatJetNumQMatchedTop2', 0)
('bbFatJetNumQMatchedTop2', 1)
('bbFatJetNumBMatchedTop1', 0)
('bbFatJetNumBMatchedTop1', 1)
('bbFatJetNumBMatchedTop2', 0)
('bbFatJetNumBMatchedTop2', 1)
('weight_noxsec', 0)
('weight_nonorm', '')
('finalWeight', '')


In [6]:
# Data samples, grouped under the "muon" label
samples = {
    "muon": [
        "Muon_Run2022E",
        "Muon_Run2022F",
        "Muon_Run2022G",
    ],
}

dirs = {path_to_dir: samples}

filters = None

# columns to load
# the parquet files are too big so we can only load a few columns at a time without consuming much memory
load_columns = [
    ("weight", 1),
    ("ak8FatJetTau2OverTau1", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetEta", 2),
    ("ak8FatJetPhi", 2),
    ("ak8FatJetPt", 2),
    ("finalWeight", 0),  # zero sub-columns: adds nothing to the requested list
]
# reformat into ("column name", "idx") format for reading multiindex columns
columns = []
for key, num_columns in load_columns:
    for i in range(num_columns):
        columns.append(f"('{key}', '{i}')")

print(columns)

# NOTE(review): the recorded output of this cell is an ArrowInvalid for
# ('weight_noxsec', '0'), a column that the listed data schema does not contain.
# Presumably load_samples requests it because the "muon" label is not recognised
# as data — confirm against HH4b.utils.load_samples before relying on this cell.
events_dict_data = {}
for input_dir, samples in dirs.items():
    events_dict_data = {
        # accumulate into the data dict itself; merging `events_dict` here would
        # copy the MC samples into the data container
        **events_dict_data,
        # this function will load files (only the columns selected), apply filters and compute a weight per event
        **utils.load_samples(
            input_dir, samples, year, filters=filters, columns=columns, reorder_legacy_txbb=False
        ),
    }

# report the columns of the data that was just loaded (not of the MC dict)
samples_loaded = list(events_dict_data.keys())
keys_loaded = list(events_dict_data[samples_loaded[0]].keys())
print("Keys in events_dict_data")
for i in keys_loaded:
    print(i)

["('weight', '0')", "('ak8FatJetTau2OverTau1', '0')", "('ak8FatJetTau2OverTau1', '1')", "('ak8FatJetMsd', '0')", "('ak8FatJetMsd', '1')", "('ak8FatJetPNetMass', '0')", "('ak8FatJetPNetMass', '1')", "('ak8FatJetEta', '0')", "('ak8FatJetEta', '1')", "('ak8FatJetPhi', '0')", "('ak8FatJetPhi', '1')", "('ak8FatJetPt', '0')", "('ak8FatJetPt', '1')"]


ArrowInvalid: No match for FieldRef.Name(('weight_noxsec', '0')) in ('run', '0'): uint32
('event', '0'): uint64
('luminosityBlock', '0'): uint32
('MET_pt', '0'): float
('ak8FatJetEta', '0'): double
('ak8FatJetEta', '1'): double
('ak8FatJetEta', '2'): double
('ak8FatJetPhi', '0'): double
('ak8FatJetPhi', '1'): double
('ak8FatJetPhi', '2'): double
('ak8FatJetMass', '0'): double
('ak8FatJetMass', '1'): double
('ak8FatJetMass', '2'): double
('ak8FatJetPt', '0'): double
('ak8FatJetPt', '1'): double
('ak8FatJetPt', '2'): double
('ak8FatJetMsd', '0'): double
('ak8FatJetMsd', '1'): double
('ak8FatJetMsd', '2'): double
('ak8FatJetPNetXbb', '0'): double
('ak8FatJetPNetXbb', '1'): double
('ak8FatJetPNetXbb', '2'): double
('ak8FatJetPNetXjj', '0'): double
('ak8FatJetPNetXjj', '1'): double
('ak8FatJetPNetXjj', '2'): double
('ak8FatJetPNetQCD', '0'): double
('ak8FatJetPNetQCD', '1'): double
('ak8FatJetPNetQCD', '2'): double
('ak8FatJetPNetMass', '0'): double
('ak8FatJetPNetMass', '1'): double
('ak8FatJetPNetMass', '2'): double
('ak8FatJetPNetMassRaw', '0'): double
('ak8FatJetPNetMassRaw', '1'): double
('ak8FatJetPNetMassRaw', '2'): double
('ak8FatJetTau2OverTau1', '0'): double
('ak8FatJetTau2OverTau1', '1'): double
('ak8FatJetTau2OverTau1', '2'): double
('ak8FatJetTau3OverTau2', '0'): double
('ak8FatJetTau3OverTau2', '1'): double
('ak8FatJetTau3OverTau2', '2'): double
('ak8FatJetrawFactor', '0'): double
('ak8FatJetrawFactor', '1'): double
('ak8FatJetrawFactor', '2'): double
('leptonEta', '0'): double
('leptonEta', '1'): double
('leptonPhi', '0'): double
('leptonPhi', '1'): double
('leptonMass', '0'): double
('leptonMass', '1'): double
('leptonPt', '0'): double
('leptonPt', '1'): double
('leptonId', '0'): int64
('leptonId', '1'): int64
('weight', '0'): double
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string

## Event cuts

In [None]:
# Pick the two event tables used in the rest of the notebook:
# the "muon" data samples and the "tt" simulated samples
events_data = events_dict_data["muon"]
events_mc = events_dict["tt"]

In [None]:
# AK4OutsideJet pt cut
# jets_outside_raw = make_vector(events_raw, "ak4JetOutside")
# j3_raw = jets_outside_raw[:, 0]
# j4_raw = jets_outside_raw[:, 1]
# j3j4_pt_cut = (j3_raw.pt > 20) & (j4_raw.pt > 20)

In [None]:
# number of rows in the data table
len(events_data)

In [None]:
# number of rows in the MC table
len(events_mc)

In [None]:
# combined row count of the MC and data tables
len(events_mc) + len(events_data)

## Define different matching categories

In [None]:
# Per-fatjet matching flags.
# Each flag is True when the condition holds for exactly one of the two tops (XOR).
has_2_daughter_qs = np.logical_xor(
    np.array(events_mc["bbFatJetNumQMatchedTop1"] == 2),
    np.array(events_mc["bbFatJetNumQMatchedTop2"] == 2),
)
has_1_b = np.logical_xor(
    np.array(events_mc["bbFatJetNumBMatchedTop1"] == 1),
    np.array(events_mc["bbFatJetNumBMatchedTop2"] == 1),
)

In [None]:
# two matched quarks and one matched b: labelled top-matched
top_matched = has_2_daughter_qs & has_1_b
# two matched quarks without the single-b flag: labelled W-matched
W_matched = has_2_daughter_qs & np.logical_not(has_1_b)
# every jet without the two-quark flag
unmatched = np.logical_not(has_2_daughter_qs)

## Select Leading Fatjet by pT

In [None]:
# Build the MC fatjet array and pick the highest-pT jet of each event.
fatjets_mc = make_vector(events_mc, "ak8FatJet")
# argsort is ascending; reversing the jet axis gives descending-pT order
mc_sort_by_fj_pt = np.argsort(fatjets_mc.pt, axis=1)[:, ::-1]
fj_sorted_mc = np.take_along_axis(fatjets_mc, mc_sort_by_fj_pt, axis=1)
# column 0 of the sorted array is the leading (highest-pT) jet
leading_fj_mc = fj_sorted_mc[:, 0]

In [None]:
# Build the data fatjet array and pick the highest-pT jet of each event.
fatjets_data = make_vector(events_data, "ak8FatJet")
# argsort is ascending; reversing the jet axis gives descending-pT order
data_sort_by_fj_pt = np.argsort(fatjets_data.pt, axis=1)[:, ::-1]
fj_sorted_data = np.take_along_axis(fatjets_data, data_sort_by_fj_pt, axis=1)
# column 0 of the sorted array is the leading (highest-pT) jet
leading_fj_data = fj_sorted_data[:, 0]

## Sort leading fatjets tau21 into each category

In [None]:
# Reorder the per-jet top-matched flags with the same pT ordering as the jets,
# keep the flag of the leading jet, and select the leading jets that carry it.
# NOTE(review): the flags come from bbFatJet* columns while the ordering comes from
# ak8FatJet pT — confirm both collections list the jets in the same order.
top_matched_sorted = np.take_along_axis(top_matched, mc_sort_by_fj_pt, axis=1)
leading_fj_mc_is_top_matched = top_matched_sorted[:, 0]
leading_fj_mc_top = leading_fj_mc[leading_fj_mc_is_top_matched]

In [None]:
# Write the leading jet's tau21 and mass ("M" field) onto the MC rows whose leading jet is top-matched.
# NOTE: this adds columns to events_mc in place.
events_mc.loc[leading_fj_mc_is_top_matched, "leading_fj_tau21"] = leading_fj_mc_top["tau21"]
events_mc.loc[leading_fj_mc_is_top_matched, "LeadAK8FatJetPNetMass"] = leading_fj_mc_top.M

In [None]:
# Reorder the per-jet W-matched flags with the same pT ordering as the jets,
# keep the flag of the leading jet, and select the leading jets that carry it.
W_matched_sorted = np.take_along_axis(W_matched, mc_sort_by_fj_pt, axis=1)
leading_fj_mc_is_W_matched = W_matched_sorted[:, 0]
leading_fj_mc_W = leading_fj_mc[leading_fj_mc_is_W_matched]

In [None]:
# Write the leading jet's tau21 and mass ("M" field) onto the MC rows whose leading jet is W-matched.
# NOTE: this adds/updates columns of events_mc in place.
events_mc.loc[leading_fj_mc_is_W_matched, "leading_fj_tau21"] = leading_fj_mc_W["tau21"]
events_mc.loc[leading_fj_mc_is_W_matched, "LeadAK8FatJetPNetMass"] = leading_fj_mc_W.M

In [None]:
# Reorder the per-jet unmatched flags with the same pT ordering as the jets,
# keep the flag of the leading jet, and select the leading jets that carry it.
unmatched_sorted = np.take_along_axis(unmatched, mc_sort_by_fj_pt, axis=1)
leading_fj_mc_is_unmatched = unmatched_sorted[:, 0]
leading_fj_mc_unmatched = leading_fj_mc[leading_fj_mc_is_unmatched]

In [None]:
# Write the leading jet's tau21 and mass ("M" field) onto the MC rows whose leading jet is unmatched.
# NOTE: this adds/updates columns of events_mc in place.
events_mc.loc[leading_fj_mc_is_unmatched, "leading_fj_tau21"] = leading_fj_mc_unmatched["tau21"]
events_mc.loc[leading_fj_mc_is_unmatched, "LeadAK8FatJetPNetMass"] = leading_fj_mc_unmatched.M

In [None]:
# Data has no matching categories: every row gets the leading jet's tau21 and mass.
events_data.loc[:, "leading_fj_tau21"] = leading_fj_data["tau21"]
events_data.loc[:, "LeadAK8FatJetPNetMass"] = leading_fj_data.M
# unit weight for data, so the same "finalWeight" column can be used for data and MC histograms
events_data.loc[:, "finalWeight"] = 1

## Define Events by their leading fj matching

In [None]:
# parse the events df to a way that util can accept
events_dict = {}
events_dict["data"] = events_data
events_dict["top_matched"] = events_mc[leading_fj_mc_is_top_matched]
events_dict["W_matched"] = events_mc[leading_fj_mc_is_W_matched]
events_dict["unmatched"] = events_mc[leading_fj_mc_is_unmatched]

In [None]:
# quick look at the leading-fatjet tau21 values of the top-matched events
events_dict["top_matched"]["leading_fj_tau21"]

## Select events by tau21 WP

In [None]:
# tau21 working point: the threshold the leading-fatjet tau21 is compared against
tau21_WP = 0.21

In [None]:
# Split every sample into events passing / failing the tau21 working point.
events_passed_dict = {}
events_failed_dict = {}
for key, df in events_dict.items():
    # named `passes_tau21` rather than `filter` so the builtin `filter` is not
    # shadowed for the rest of the kernel session
    passes_tau21 = df["leading_fj_tau21"] < tau21_WP
    events_passed_dict[key] = df[passes_tau21]
    # rows where the comparison is False (this includes NaN tau21) count as failed
    events_failed_dict[key] = df[~passes_tau21]

## Plot leading-fatjet PNet mass for events passing / failing the tau21 WP

In [None]:
# Variables to histogram; currently only the mass stored in "LeadAK8FatJetPNetMass"
control_plot_vars = [
    ShapeVar(
        var="LeadAK8FatJetPNetMass",
        label=r"W Jet PNet Mass (GeV)",
        bins=list(np.arange(20, 250, 5)),  # values 20, 25, ..., 245
        reg=False,  # presumably marks `bins` as explicit edges — confirm in ShapeVar
    ),
]

In [None]:
# Per-era y-axis upper limits.
# NOTE(review): no cell visible in this notebook reads `ylims`; the plotting cells
# pass literal `ylim` values instead — confirm whether this is still needed.
ylims = {
    "2022": 5e4,
    "2022EE": 4e3,
    "2023-pre-BPix": 4e5,
}

In [None]:
# Data/MC ratio plot for the events that pass the tau21 cut.
for year in ["2022EE"]:
    # histograms for this year, keyed by variable name
    hists = {}
    for shape_var in control_plot_vars:
        print(shape_var)

        # the three MC matching categories are passed as backgrounds; no signal samples
        bkgs = ["top_matched", "W_matched", "unmatched"]
        sigs = []

        # fill each variable's histogram only once
        if shape_var.var not in hists:
            hists[shape_var.var] = utils.singleVarHist(
                events_passed_dict, shape_var, weight_key="finalWeight"
            )

        plotting.ratioHistPlot(
            hists[shape_var.var],
            year,
            sigs,
            bkgs,
            name="ratioPlot_passed",
            show=True,
            log=False,
            bg_err=None,
            bg_order=list(bkgs),  # separate list with the same order as `bkgs`
            plot_data=True,
            plot_significance=False,
            significance_dir=shape_var.significance_dir,
            ylim=1.4e3,
            ylim_low=1e1,
        )

In [None]:
# Data/MC ratio plot for the events that fail the tau21 cut.
for year in ["2022EE"]:
    # histograms for this year, keyed by variable name
    hists = {}
    for shape_var in control_plot_vars:
        print(shape_var)

        # the three MC matching categories are passed as backgrounds; no signal samples
        bkgs = ["top_matched", "W_matched", "unmatched"]
        sigs = []

        # fill each variable's histogram only once
        if shape_var.var not in hists:
            hists[shape_var.var] = utils.singleVarHist(
                events_failed_dict, shape_var, weight_key="finalWeight"
            )

        plotting.ratioHistPlot(
            hists[shape_var.var],
            year,
            sigs,
            bkgs,
            name="ratioPlot_failed",
            show=True,
            log=False,
            bg_err=None,
            bg_order=list(bkgs),  # separate list with the same order as `bkgs`
            plot_data=True,
            plot_significance=False,
            significance_dir=shape_var.significance_dir,
            ylim=1e4,
            ylim_low=1e1,
        )