In [2]:
import pandas as pd
import numpy as np
import vector
import os
import uproot
from xgboost import XGBClassifier

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
import HH4b.postprocessing as postprocessing
from HH4b.postprocessing import Region

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker

formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

from HH4b.postprocessing import load_columns_legacy
from HH4b.postprocessing.PostProcess import add_bdt_scores

In [3]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

# Jet Mass Resolution

In [16]:
samples = {
    "2022EE": {
        "data": [
            "Muon_Run2022E",
            "Muon_Run2022F",
            "Muon_Run2022G",
        ],
        "diboson": [
            "WW",
            "ZZ",
            "WZ",
        ],
        "vjetslnu": [
            "WtoLNu-4Jets",
            "DYto2L-4Jets_MLL-50",
        ],
        "qcd": [
            "QCD_PT-120to170_MuEnrichedPt5",
            "QCD_PT-1000_MuEnrichedPt5",
            "QCD_PT-600to800_MuEnrichedPt5",
            "QCD_PT-300to470_MuEnrichedPt5",
            "QCD_PT-470to600_MuEnrichedPt5",
            "QCD_PT-170to300_MuEnrichedPt5",
        ],
    },
    "2022": {
        "data": [
            "Muon_Run2022C_single",
            "Muon_Run2022C",
            "Muon_Run2022D",
        ],
        "diboson": [
            "WW",
            "ZZ",
            "WZ",
        ],
        #     "vjetslnu": [
        #         "WtoLNu-2Jets",
        #     ],
    },
    "2023": {
        "data": [
            "Muon_Run2023Cv1",
            "Muon_Run2023Cv2",
        ],
        "diboson": [
            "WW",
            "ZZ",
            "WZ",
        ],
    },
    "2023BPix": {
        "data": [
            "Muon_Run2023D",
        ],
        "diboson": [
            "WW",
            "ZZ",
            "WZ",
        ],
    },
}

MAIN_DIR = "../../../"
dir_name = "24May13_v12_private_signal"
path_to_dir = f"/eos/uscms/store/user/cmantill/bbbb/ttSkimmer/{dir_name}/"
dirs = {path_to_dir: samples}

load_columns = [
    ("weight", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetTXbbLegacy", 2),
    ("ak8FatJetPNetMassLegacy", 2),
    # ("ak8FatJetTau2OverTau1", 2),
    ("ak8FatJetPNetTXqqLegacy", 2),
    ("leptonPt", 1),
    ("MET_pt", 1),
]

samples_tt = {
    "2022EE": {
        "ttbar": [
            "TTto4Q",
            "TTtoLNu2Q",
            "TTto2L2Nu",
        ],
    },
    "2022": {
        "ttbar": [
            "TTto4Q",
            "TTtoLNu2Q",
            "TTto2L2Nu",
        ],
    },
    "2023": {
        "ttbar": [
            "TTto4Q",
            "TTtoLNu2Q",
            "TTto2L2Nu",
        ],
    },
    "2023BPix": {
        "ttbar": [
            "TTto4Q",
            "TTtoLNu2Q",
            "TTto2L2Nu",
        ],
    },
}

load_columns_tt = load_columns + [
    ("GenTopW0Pt", 1),
    ("GenTopW0Mass", 1),
    ("GenTopW1Pt", 1),
    ("GenTopW1Mass", 1),
    ("bbFatJetTopMatch", 2),
    # ("bbFatJetTopMatchIndex", 2),
    ("bbFatJetNumQMatchedTop1", 2),
    ("bbFatJetNumQMatchedTop2", 2),
    ("bbFatJetNumBMatchedTop1", 2),
    ("bbFatJetNumBMatchedTop2", 2),
]

events_dict = {}
for year in ["2022"]:
    events_dict[year] = {}
    for input_dir, samples in dirs.items():
        print(samples_tt[year])
        events_dict[year] = {
            **events_dict[year],
            **utils.load_samples(
                input_dir,
                samples[year],
                year,
                filters=None,
                columns=utils.format_columns(load_columns),
                reorder_txbb=False,
                variations=False,
            ),
            **utils.load_samples(
                input_dir,
                samples_tt[year],
                year,
                filters=None,
                columns=utils.format_columns(load_columns_tt),
                reorder_txbb=False,
                variations=False,
            ),
        }

{'ttbar': ['TTto4Q', 'TTtoLNu2Q', 'TTto2L2Nu']}
Loaded Muon_Run2022C                                     : 43198 entries
Loaded Muon_Run2022C_single                              : 4876 entries
Loaded Muon_Run2022D                                     : 25562 entries




In [15]:
events_dict["2023BPix"].keys()

dict_keys(['data', 'diboson', 'ttbar'])

Get variables

In [4]:
events_dict["2022EE"]["ttbar"]["leptonPt"][0] + events_dict["2022EE"]["ttbar"]["MET_pt"][0]

0          249.148331
1          152.583153
2          201.585312
3          334.577301
4          376.238815
              ...    
5260795    196.965805
5260796    143.680584
5260797    235.148708
5260798    208.603481
5260799    209.535507
Name: 0, Length: 5637423, dtype: float64

In [5]:
events = events_dict["2022EE"]["ttbar"]
has_2_daughter_qs = np.array(events["bbFatJetNumQMatchedTop1"] == 2) != np.array(
    events["bbFatJetNumQMatchedTop2"] == 2
)
has_1_b = np.array(events["bbFatJetNumBMatchedTop1"] == 1) != np.array(
    events["bbFatJetNumBMatchedTop2"] == 1
)
mass = "PNetMassLegacy"

vpt_matched = (
    (events["ak8FatJetPt"][0] - events["GenTopW0Pt"][0]) / events["GenTopW0Pt"][0] < 0.5
) | ((events["ak8FatJetPt"][0] - events["GenTopW1Pt"][0]) / events["GenTopW1Pt"][0] < 0.5)
vmass_matched = (
    (events[f"ak8FatJet{mass}"][0] - events["GenTopW0Mass"][0]) / events["GenTopW0Mass"][0] < 0.3
) | ((events[f"ak8FatJet{mass}"][0] - events["GenTopW1Mass"][0]) / events["GenTopW1Mass"][0] < 0.3)
top_matched = ((has_2_daughter_qs) & (has_1_b))[:, 0]
W_matched = ((has_2_daughter_qs) & (~has_1_b))[:, 0]
W_matched_tight = W_matched & vpt_matched & vmass_matched
unmatched = ((~has_2_daughter_qs))[:, 0] | (W_matched & ~W_matched_tight)

Tagger 
- 1% mistag rate from 2018
- https://indico.cern.ch/event/1329821/contributions/5610204/attachments/2730340/4746252/Top_W_Calibration_Run3.pdf page 6

In [6]:
def get_ev_dataframe(events_dict):
    ev_dataframe_dict = {}
    for key in events_dict:
        events = events_dict[key]

        mass = "PNetMassLegacy"

        wlnu_pt = events["leptonPt"][0] + events["MET_pt"][0]

        # apply masks
        events = events[
            (events["ak8FatJetPt"][0] > 300)
            & (events[f"ak8FatJet{mass}"][0] >= 55)
            & (events[f"ak8FatJet{mass}"][0] <= 200)
            & (wlnu_pt >= 100)
        ]

        ev_dataframe = pd.DataFrame(
            {
                "WPNetMass": events["ak8FatJetPNetMassLegacy"][0],
                "WMsd": events["ak8FatJetMsd"][0],
                "WPt": events["ak8FatJetPt"][0],
                "weight": events["finalWeight"],
                "PNetTXqq": events["ak8FatJetPNetTXqqLegacy"][0],
            }
        )

        if key == "ttbar":
            has_2_daughter_qs = np.array(events["bbFatJetNumQMatchedTop1"] == 2) != np.array(
                events["bbFatJetNumQMatchedTop2"] == 2
            )
            has_1_b = np.array(events["bbFatJetNumBMatchedTop1"] == 1) != np.array(
                events["bbFatJetNumBMatchedTop2"] == 1
            )
            vpt_matched = (
                (events["ak8FatJetPt"][0] - events["GenTopW0Pt"][0]) / events["GenTopW0Pt"][0] < 0.5
            ) | (
                (events["ak8FatJetPt"][0] - events["GenTopW1Pt"][0]) / events["GenTopW1Pt"][0] < 0.5
            )
            vmass_matched = (
                (events[f"ak8FatJet{mass}"][0] - events["GenTopW0Mass"][0])
                / events["GenTopW0Mass"][0]
                < 0.3
            ) | (
                (events[f"ak8FatJet{mass}"][0] - events["GenTopW1Mass"][0])
                / events["GenTopW1Mass"][0]
                < 0.3
            )
            top_matched = ((has_2_daughter_qs) & (has_1_b))[:, 0]
            W_matched = ((has_2_daughter_qs) & (~has_1_b))[:, 0]
            W_matched_tight = W_matched & vpt_matched & vmass_matched
            unmatched = ((~has_2_daughter_qs))[:, 0] | (W_matched & ~W_matched_tight)

            ev_dataframe_dict = {
                **ev_dataframe_dict,
                "top_matched": ev_dataframe[top_matched],
                "W_matched": ev_dataframe[W_matched_tight],
                "unmatched": ev_dataframe[unmatched],
            }
        else:
            ev_dataframe_dict[key] = ev_dataframe

    return ev_dataframe_dict

In [7]:
ev_dataframe_dict = {}
for year in events_dict:
    ev_dataframe_dict[year] = get_ev_dataframe(events_dict[year])

In [8]:
ev_dataframe_dict["2022"].keys()

dict_keys(['data'])

In [13]:
control_plot_vars = [
    # ShapeVar(var="WPNetMass", label=r"W PNet Mass (v11) (GeV)", bins=[30, 50, 200]),
    # 3.33 gev bins
    ShapeVar(var="WPNetMass", label=r"W PNet Mass (v11) (GeV)", bins=[21, 55, 125]),
    ShapeVar(var="WMsd", label=r"W Msd (GeV)", bins=[30, 50, 200]),
    ShapeVar(var="WPt", label=r"W p$_{T}$ (GeV)", bins=[30, 300, 800]),
    ShapeVar(var="PNetTXqq", label=r"W PNet TXqq (GeV)", bins=[30, 0, 1]),
]

odir = "pnetmass"
os.system(f"mkdir -p {odir}")


ev_dataframe_dict_pass = {}
for year in ev_dataframe_dict:
    ev_dataframe_dict_pass[year] = {}
    for key in ev_dataframe_dict[year]:
        ev_dataframe_dict_pass[year][key] = ev_dataframe_dict[year][key][
            ev_dataframe_dict[year][key]["PNetTXqq"] >= 0.82
        ]

ev_dataframe_dict_fail = {}
for year in ev_dataframe_dict:
    ev_dataframe_dict_fail[year] = {}
    for key in ev_dataframe_dict[year]:
        ev_dataframe_dict_fail[year][key] = ev_dataframe_dict[year][key][
            ev_dataframe_dict[year][key]["PNetTXqq"] < 0.82
        ]

out_file = "TnPSF/run3_templates/2022EE/2022EE_topCR_May14.root"
f_out = uproot.recreate(out_file)

names_by_sample = {
    "data": "data_obs",
    "W_matched": "catp2",
}

for year in ["2022EE"]:
    hists = {}
    hists_pass = {}
    hists_fail = {}
    for shape_var in control_plot_vars:
        print(shape_var)
        if shape_var.var not in hists:
            hists[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict[year], shape_var, weight_key="weight"
            )
            hists_pass[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_pass[year], shape_var, weight_key="weight"
            )
            hists_fail[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_fail[year], shape_var, weight_key="weight"
            )

        bkgs = ["top_matched", "W_matched", "unmatched", "diboson", "qcd", "vjetslnu"]
        sigs = []

        plotting.ratioHistPlot(
            hists[shape_var.var],
            year,
            sigs,
            bkgs,
            name=f"{odir}/{shape_var.var}",
            show=False,
            log=False,
            bg_err=None,
            bg_order=["diboson", "vjetslnu", "unmatched", "top_matched", "W_matched"],
            plot_data=True,
            plot_significance=False,
            # ylim=1.2e4,
            # ylim_low=0,
        )

        plotting.ratioHistPlot(
            hists_pass[shape_var.var],
            year,
            sigs,
            bkgs,
            name=f"{odir}/{shape_var.var}_pass",
            show=False,
            log=False,
            bg_err=None,
            bg_order=["diboson", "vjetslnu", "unmatched", "top_matched", "W_matched"],
            plot_data=True,
            plot_significance=False,
            # ylim=1.2e4,
            # ylim_low=0,
        )

        plotting.ratioHistPlot(
            hists_fail[shape_var.var],
            year,
            sigs,
            bkgs,
            name=f"{odir}/{shape_var.var}_fail",
            show=False,
            log=False,
            bg_err=None,
            bg_order=["diboson", "vjetslnu", "unmatched", "top_matched", "W_matched"],
            plot_data=True,
            plot_significance=False,
            # ylim=1.2e4,
            # ylim_low=0,
        )

        if shape_var.var == "WPNetMass":
            for sample in hists_pass[shape_var.var].axes[0]:
                if sample in names_by_sample:
                    f_out[f"{names_by_sample[sample]}_pass_nominal"] = hists_pass[shape_var.var][
                        {"Sample": sample}
                    ]
                    f_out[f"{names_by_sample[sample]}_fail_nominal"] = hists_fail[shape_var.var][
                        {"Sample": sample}
                    ]
            f_out[f"catp1_pass_nominal"] = sum(
                [
                    hists_pass[shape_var.var][{"Sample": sample}]
                    for sample in hists_pass[shape_var.var].axes[0]
                    if sample not in ["data", "W_matched"]
                ]
            )
            f_out[f"catp1_fail_nominal"] = sum(
                [
                    hists_fail[shape_var.var][{"Sample": sample}]
                    for sample in hists_fail[shape_var.var].axes[0]
                    if sample not in ["data", "W_matched"]
                ]
            )

f_out.close()

ShapeVar(var='WPNetMass', label='W PNet Mass (v11) (GeV)', bins=[21, 55, 125], reg=True, blind_window=None, significance_dir='right', plot_args=None)
ShapeVar(var='WMsd', label='W Msd (GeV)', bins=[30, 50, 200], reg=True, blind_window=None, significance_dir='right', plot_args=None)
ShapeVar(var='WPt', label='W p$_{T}$ (GeV)', bins=[30, 300, 800], reg=True, blind_window=None, significance_dir='right', plot_args=None)
ShapeVar(var='PNetTXqq', label='W PNet TXqq (GeV)', bins=[30, 0, 1], reg=True, blind_window=None, significance_dir='right', plot_args=None)


In [9]:
source_file = uproot.open(out_file)

In [10]:
source_file.keys()

['data_obs_pass_nominal;1',
 'data_obs_fail_nominal;1',
 'catp2_pass_nominal;1',
 'catp2_fail_nominal;1',
 'catp1_pass_nominal;1',
 'catp1_fail_nominal;1']