In [None]:
from __future__ import annotations

import os

import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import uproot

import HH4b.plotting as plotting
import HH4b.utils as utils
from HH4b.utils import ShapeVar

# Tick formatter that renders exponents with mathtext and switches to scientific notation
# for values outside the 10^-3 .. 10^3 range.
formatter = mticker.ScalarFormatter(useMathText=True)
formatter.set_powerlimits((-3, 3))

In [None]:
# automatically reloads imported files on edits
%load_ext autoreload
%autoreload 2

# Jet Mass Resolution

In [None]:
# Sample lists that are identical for several years.  Every year receives its own copy
# (``list(...)``) so the per-year lists stay independent objects.
DIBOSON_SAMPLES = ["WW", "ZZ", "WZ"]
QCD_SAMPLES = [
    "QCD_HT-1000to1200",
    "QCD_HT-200to400",
    "QCD_HT-1200to1500",
    "QCD_HT-400to600",
    "QCD_HT-1500to2000",
    "QCD_HT-2000",
    "QCD_HT-800to1000",
    "QCD_HT-600to800",
]
SINGLETOP_SAMPLES = [
    "TbarBQ_t-channel_4FS",
    "TBbarQ_t-channel_4FS",
    "TWminustoLNu2Q",
    "TbarWplustoLNu2Q",
]
TTBAR_SAMPLES = ["TTto4Q", "TTtoLNu2Q", "TTto2L2Nu"]

# {year: {sample group: [sample names]}} for data and the non-ttbar simulation.
samples = {
    "2022EE": {
        "data": ["Muon_Run2022E", "Muon_Run2022F", "Muon_Run2022G"],
        "diboson": list(DIBOSON_SAMPLES),
        # 2022EE is the only year that also lists the DYto2L sample
        "vjetslnu": ["WtoLNu-4Jets", "DYto2L-4Jets_MLL-50"],
        "qcd": list(QCD_SAMPLES),
        "singletop": list(SINGLETOP_SAMPLES),
    },
    "2022": {
        "data": ["Muon_Run2022C_single", "Muon_Run2022C", "Muon_Run2022D"],
        "diboson": list(DIBOSON_SAMPLES),
        "vjetslnu": ["WtoLNu-4Jets"],
        "qcd": list(QCD_SAMPLES),
        "singletop": list(SINGLETOP_SAMPLES),
    },
    "2023": {
        "data": [
            "Muon_Run2023Cv1",
            "Muon_Run2023Cv2",
            "Muon_Run2023Cv3",
            "Muon_Run2023Cv4",
        ],
        "diboson": list(DIBOSON_SAMPLES),
        "vjetslnu": ["WtoLNu-4Jets"],
        "qcd": list(QCD_SAMPLES),
        # only a subset of the single-top samples is listed for 2023
        "singletop": ["TbarWplustoLNu2Q"],
    },
    "2023BPix": {
        "data": ["Muon_Run2023D"],
        "diboson": list(DIBOSON_SAMPLES),
        "vjetslnu": ["WtoLNu-4Jets"],
        # only a subset of the single-top samples is listed for 2023BPix
        "singletop": ["TWminustoLNu2Q", "TbarWplustoLNu2Q"],
        "qcd": list(QCD_SAMPLES),
    },
}

MAIN_DIR = "../../../"
dir_name = "24May13_v12_private_signal"
path_to_dir = f"/eos/uscms/store/user/cmantill/bbbb/ttSkimmer/{dir_name}/"
# {input directory: per-year sample dictionary to read from it}
dirs = {path_to_dir: samples}

# (column name, number of sub-columns) pairs, expanded by utils.format_columns below.
load_columns = [
    ("weight", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetMsd", 2),
    # ("ak8FatJetPNetTXbbLegacy", 2),
    ("ak8FatJetPNetMassLegacy", 2),
    # ("ak8FatJetTau2OverTau1", 2),
    ("ak8FatJetPNetTXqqLegacy", 2),
    ("leptonPt", 1),
    ("MET_pt", 1),
]

# ttbar is loaded separately because it needs the extra generator-matching columns.
samples_tt = {
    year: {"ttbar": list(TTBAR_SAMPLES)} for year in ["2022EE", "2022", "2023", "2023BPix"]
}

load_columns_tt = load_columns + [
    ("GenTopW0Pt", 1),
    ("GenTopW0Mass", 1),
    ("GenTopW1Pt", 1),
    ("GenTopW1Mass", 1),
    ("bbFatJetTopMatch", 2),
    # ("bbFatJetTopMatchIndex", 2),
    ("bbFatJetNumQMatchedTop1", 2),
    ("bbFatJetNumQMatchedTop2", 2),
    ("bbFatJetNumBMatchedTop1", 2),
    ("bbFatJetNumBMatchedTop2", 2),
]

# events_dict[year][sample group] -> events returned by utils.load_samples
events_dict = {}
for year in ["2022", "2022EE", "2023", "2023BPix"]:
    events_dict[year] = {}
    # The loop variable is deliberately NOT called ``samples``: that name would rebind the
    # module-level ``samples`` dictionary defined above.
    for input_dir, dir_samples in dirs.items():
        print(samples_tt[year])
        # data + non-ttbar simulation first, then ttbar (same merge order as before)
        events_dict[year].update(
            utils.load_samples(
                input_dir,
                dir_samples[year],
                year,
                filters=None,
                columns=utils.format_columns(load_columns),
                reorder_txbb=False,
                variations=False,
            )
        )
        events_dict[year].update(
            utils.load_samples(
                input_dir,
                samples_tt[year],
                year,
                filters=None,
                columns=utils.format_columns(load_columns_tt),
                reorder_txbb=False,
                variations=False,
            )
        )

In [None]:
# Inspect which sample groups were loaded for 2023BPix.
events_dict["2023BPix"].keys()

Get variables

In [None]:
# Preview of leptonPt + MET_pt for 2022EE ttbar (the quantity get_ev_dataframe cuts on as wlnu_pt).
events_dict["2022EE"]["ttbar"]["leptonPt"][0] + events_dict["2022EE"]["ttbar"]["MET_pt"][0]

In [None]:
# Exploratory evaluation of the generator-matching masks on the un-cut 2022EE ttbar events.
# get_ev_dataframe recomputes the same expressions after its own selection.
events = events_dict["2022EE"]["ttbar"]
# ``!=`` between two boolean arrays acts as XOR: True where exactly one of the two
# "== 2" flags is set.
has_2_daughter_qs = np.array(events["bbFatJetNumQMatchedTop1"] == 2) != np.array(
    events["bbFatJetNumQMatchedTop2"] == 2
)
# Same XOR construction for the "== 1" b-matching flags.
has_1_b = np.array(events["bbFatJetNumBMatchedTop1"] == 1) != np.array(
    events["bbFatJetNumBMatchedTop2"] == 1
)
mass = "PNetMassLegacy"

# NOTE(review): the relative differences below are signed (no abs()), so a jet far BELOW
# the generator value always passes — confirm this one-sided requirement is intended.
# Leading-jet pT within +50% of either generator-level W pT.
vpt_matched = (
    (events["ak8FatJetPt"][0] - events["GenTopW0Pt"][0]) / events["GenTopW0Pt"][0] < 0.5
) | ((events["ak8FatJetPt"][0] - events["GenTopW1Pt"][0]) / events["GenTopW1Pt"][0] < 0.5)
# Leading-jet mass within +30% of either generator-level W mass.
vmass_matched = (
    (events[f"ak8FatJet{mass}"][0] - events["GenTopW0Mass"][0]) / events["GenTopW0Mass"][0] < 0.3
) | ((events[f"ak8FatJet{mass}"][0] - events["GenTopW1Mass"][0]) / events["GenTopW1Mass"][0] < 0.3)
# ``[:, 0]`` selects the first of the two jet columns.
top_matched = ((has_2_daughter_qs) & (has_1_b))[:, 0]
W_matched = ((has_2_daughter_qs) & (~has_1_b))[:, 0]
W_matched_tight = W_matched & vpt_matched & vmass_matched
# Everything without the two-quark match, plus W-matched jets failing the tight requirement.
unmatched = (~has_2_daughter_qs)[:, 0] | (W_matched & ~W_matched_tight)

Tagger 
- 1% mistag rate from 2018
- https://indico.cern.ch/event/1329821/contributions/5610204/attachments/2730340/4746252/Top_W_Calibration_Run3.pdf page 6

In [None]:
def _gen_match_masks(events, mass, max_rel_pt_diff=0.5, max_rel_mass_diff=0.3):
    """Compute the generator-matching masks for the first fat jet of ttbar events.

    Args:
        events: ttbar events (already selected) exposing the ``bbFatJetNum*MatchedTop*``,
            ``GenTopW*`` and ``ak8FatJet*`` columns.
        mass: suffix of the jet-mass column, i.e. ``ak8FatJet{mass}`` is read.
        max_rel_pt_diff: upper bound on ``(jet pT - gen W pT) / gen W pT``.
        max_rel_mass_diff: upper bound on ``(jet mass - gen W mass) / gen W mass``.

    Returns:
        Tuple ``(top_matched, W_matched_tight, unmatched)`` of boolean masks, one entry
        per event.
    """
    # ``!=`` between boolean arrays acts as XOR: exactly one of the two flags is set.
    has_2_daughter_qs = np.array(events["bbFatJetNumQMatchedTop1"] == 2) != np.array(
        events["bbFatJetNumQMatchedTop2"] == 2
    )
    has_1_b = np.array(events["bbFatJetNumBMatchedTop1"] == 1) != np.array(
        events["bbFatJetNumBMatchedTop2"] == 1
    )

    jet_pt = events["ak8FatJetPt"][0]
    jet_mass = events[f"ak8FatJet{mass}"][0]

    # NOTE(review): the relative differences are signed (no abs()), so a jet far BELOW the
    # generator value always passes — confirm this one-sided requirement is intended.
    vpt_matched = (
        (jet_pt - events["GenTopW0Pt"][0]) / events["GenTopW0Pt"][0] < max_rel_pt_diff
    ) | ((jet_pt - events["GenTopW1Pt"][0]) / events["GenTopW1Pt"][0] < max_rel_pt_diff)
    vmass_matched = (
        (jet_mass - events["GenTopW0Mass"][0]) / events["GenTopW0Mass"][0] < max_rel_mass_diff
    ) | ((jet_mass - events["GenTopW1Mass"][0]) / events["GenTopW1Mass"][0] < max_rel_mass_diff)

    # ``[:, 0]`` keeps the first of the two jet columns.
    top_matched = ((has_2_daughter_qs) & (has_1_b))[:, 0]
    W_matched = ((has_2_daughter_qs) & (~has_1_b))[:, 0]
    W_matched_tight = W_matched & vpt_matched & vmass_matched
    # No two-quark match at all, or W-matched but failing the tight pT/mass requirement.
    unmatched = (~has_2_daughter_qs)[:, 0] | (W_matched & ~W_matched_tight)
    return top_matched, W_matched_tight, unmatched


def get_ev_dataframe(events_dict, pt_min=300, mass_min=55, mass_max=200, wlnu_pt_min=100):
    """Select events and flatten the first-jet variables into one DataFrame per sample.

    For every sample the selection ``pT > pt_min``, ``mass_min <= mass <= mass_max`` and
    ``leptonPt + MET_pt >= wlnu_pt_min`` is applied, using the first jet column and the
    ``ak8FatJetPNetMassLegacy`` mass.  The ``"ttbar"`` sample is not stored under its own
    key; it is split into ``"top_matched"``, ``"W_matched"`` (tight W match) and
    ``"unmatched"``.

    Args:
        events_dict: mapping ``sample name -> events`` for a single year.
        pt_min: exclusive lower bound on the jet pT.
        mass_min: inclusive lower bound on the jet mass.
        mass_max: inclusive upper bound on the jet mass.
        wlnu_pt_min: inclusive lower bound on ``leptonPt + MET_pt``.

    Returns:
        dict mapping sample name to a DataFrame with the columns ``WPNetMass``, ``WMsd``,
        ``WPt``, ``weight`` and ``PNetTXqq``.
    """
    mass = "PNetMassLegacy"
    ev_dataframe_dict = {}

    for key, events in events_dict.items():
        wlnu_pt = events["leptonPt"][0] + events["MET_pt"][0]
        jet_mass = events[f"ak8FatJet{mass}"][0]

        # apply masks
        events = events[
            (events["ak8FatJetPt"][0] > pt_min)
            & (jet_mass >= mass_min)
            & (jet_mass <= mass_max)
            & (wlnu_pt >= wlnu_pt_min)
        ]

        ev_dataframe = pd.DataFrame(
            {
                # same mass column as used in the selection above
                "WPNetMass": events[f"ak8FatJet{mass}"][0],
                "WMsd": events["ak8FatJetMsd"][0],
                "WPt": events["ak8FatJetPt"][0],
                "weight": events["finalWeight"],
                "PNetTXqq": events["ak8FatJetPNetTXqqLegacy"][0],
            }
        )

        if key == "ttbar":
            top_matched, W_matched_tight, unmatched = _gen_match_masks(events, mass)
            ev_dataframe_dict.update(
                {
                    "top_matched": ev_dataframe[top_matched],
                    "W_matched": ev_dataframe[W_matched_tight],
                    "unmatched": ev_dataframe[unmatched],
                }
            )
        else:
            ev_dataframe_dict[key] = ev_dataframe

    return ev_dataframe_dict

In [None]:
# Build the per-year {sample: DataFrame} tables from the loaded events.
ev_dataframe_dict = {
    year: get_ev_dataframe(events_by_sample) for year, events_by_sample in events_dict.items()
}

In [None]:
# Inspect the sample keys available for 2022 (ttbar is split into its matching categories).
ev_dataframe_dict["2022"].keys()

In [None]:
# Variables to histogram; ``bins`` looks like [number of bins, low edge, high edge]
# (consistent with the "3.33 gev bins" remark below: (125 - 55) / 21) — confirm against ShapeVar.
control_plot_vars = [
    # ShapeVar(var="WPNetMass", label=r"W PNet Mass (v11) (GeV)", bins=[30, 50, 200]),
    # 3.33 gev bins
    ShapeVar(var="WPNetMass", label=r"W PNet Mass (v11) (GeV)", bins=[21, 55, 125]),
    ShapeVar(var="WMsd", label=r"W Msd (GeV)", bins=[30, 50, 200]),
    ShapeVar(var="WPt", label=r"W p$_{T}$ (GeV)", bins=[30, 300, 800]),
    # TXqq is a tagger score histogrammed on [0, 1], so its axis label carries no GeV unit
    ShapeVar(var="PNetTXqq", label=r"W PNet TXqq", bins=[30, 0, 1]),
]


# Per-category containers, each laid out as {year: {sample: DataFrame}}.
ev_dataframe_dict_pass, ev_dataframe_dict_fail = {}, {}
ev_dataframe_dict_pass_pt0, ev_dataframe_dict_fail_pt0 = {}, {}
ev_dataframe_dict_pass_pt1, ev_dataframe_dict_fail_pt1 = {}, {}

for year, frames_by_sample in ev_dataframe_dict.items():
    passing, failing = {}, {}
    passing_pt0, failing_pt0 = {}, {}
    passing_pt1, failing_pt1 = {}, {}

    for key, frame in frames_by_sample.items():
        txqq = frame["PNetTXqq"]
        wpt = frame["WPt"]

        # tagger score at/above vs. below the 0.82 threshold
        pnet_mask = txqq >= 0.82
        pnet_mask_inv = txqq < 0.82

        # pT bins: pt0 = [300, 400), pt1 = [400, inf)
        pt0_mask = (wpt >= 300) & (wpt < 400)
        pt1_mask = wpt >= 400

        passing[key] = frame[pnet_mask]
        failing[key] = frame[pnet_mask_inv]

        passing_pt0[key] = frame[pnet_mask & pt0_mask]
        failing_pt0[key] = frame[pnet_mask_inv & pt0_mask]

        passing_pt1[key] = frame[pnet_mask & pt1_mask]
        failing_pt1[key] = frame[pnet_mask_inv & pt1_mask]

    ev_dataframe_dict_pass[year] = passing
    ev_dataframe_dict_fail[year] = failing

    ev_dataframe_dict_pass_pt0[year] = passing_pt0
    ev_dataframe_dict_fail_pt0[year] = failing_pt0
    ev_dataframe_dict_pass_pt1[year] = passing_pt1
    ev_dataframe_dict_fail_pt1[year] = failing_pt1

names_by_sample = {
    "data": "data_obs",
    "W_matched": "catp2",
}
tag = "May15"


def save_to_file(out_file, hists_pass, hists_fail):
    """Write the pass/fail ``WPNetMass`` templates to a new ROOT file.

    The file receives ``data_obs``, ``catp2`` (matched) and ``catp1`` (unmatched)
    histograms, each as ``*_pass_nominal`` and ``*_fail_nominal``.

    Args:
        out_file: path of the ROOT file to (re)create.
        hists_pass: mapping ``variable name -> histogram with a "Sample" axis`` for the
            passing region.
        hists_fail: same as ``hists_pass`` for the failing region.
    """
    var = "WPNetMass"

    def _sum_samples(hists, wanted):
        # add up the per-sample histograms of every sample present that is listed in ``wanted``
        return sum(
            [
                hists[var][{"Sample": sample}]
                for sample in hists[var].axes[0]
                if sample in wanted
            ]
        )

    # The context manager closes the file even when one of the writes raises.
    with uproot.recreate(out_file) as f_out:
        f_out["data_obs_pass_nominal"] = hists_pass[var][{"Sample": "data"}]
        f_out["data_obs_fail_nominal"] = hists_fail[var][{"Sample": "data"}]

        # NOTE(review): "singletop" is counted as matched (catp2) in the pass region but as
        # unmatched (catp1) in the fail region — confirm this asymmetry is intended.

        # matched
        f_out["catp2_pass_nominal"] = _sum_samples(hists_pass, ["W_matched", "singletop"])
        f_out["catp2_fail_nominal"] = _sum_samples(hists_fail, ["W_matched"])

        # unmatched
        f_out["catp1_pass_nominal"] = _sum_samples(
            hists_pass, ["top_matched", "unmatched", "diboson", "qcd", "vjetslnu"]
        )
        f_out["catp1_fail_nominal"] = _sum_samples(
            hists_fail,
            ["top_matched", "unmatched", "diboson", "qcd", "vjetslnu", "singletop"],
        )


for year in ["2022"]:
    # output directories for the inclusive and the two pT-binned template files
    os.makedirs(f"TnPSF/run3_templates/{year}/all", exist_ok=True)
    os.makedirs(f"TnPSF/run3_templates/{year}/pt0", exist_ok=True)
    os.makedirs(f"TnPSF/run3_templates/{year}/pt1", exist_ok=True)

    out_file = f"TnPSF/run3_templates/{year}/all/topCR_{tag}.root"
    out_file_pt0 = f"TnPSF/run3_templates/{year}/pt0/topCR_{tag}.root"
    out_file_pt1 = f"TnPSF/run3_templates/{year}/pt1/topCR_{tag}.root"

    # directory for the control plots
    odir = f"pnetmass/{year}"
    os.makedirs(odir, exist_ok=True)

    hists = {}
    hists_pass = {}
    hists_fail = {}

    hists_pass_pt0 = {}
    hists_fail_pt0 = {}
    hists_pass_pt1 = {}
    hists_fail_pt1 = {}

    for shape_var in control_plot_vars:
        print(shape_var)
        # fill each variable only once, for the inclusive and every pass/fail (x pT) category
        if shape_var.var not in hists:
            hists[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict[year], shape_var, weight_key="weight"
            )
            hists_pass[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_pass[year], shape_var, weight_key="weight"
            )
            hists_fail[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_fail[year], shape_var, weight_key="weight"
            )

            hists_pass_pt0[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_pass_pt0[year], shape_var, weight_key="weight"
            )
            hists_pass_pt1[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_pass_pt1[year], shape_var, weight_key="weight"
            )
            hists_fail_pt0[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_fail_pt0[year], shape_var, weight_key="weight"
            )
            hists_fail_pt1[shape_var.var] = utils.singleVarHist(
                ev_dataframe_dict_fail_pt1[year], shape_var, weight_key="weight"
            )

        bkgs = ["top_matched", "W_matched", "unmatched", "diboson", "qcd", "vjetslnu", "singletop"]
        bkg_order = [
            "qcd",
            "diboson",
            "vjetslnu",
            "unmatched",
            "top_matched",
            "singletop",
            "W_matched",
        ]

        # bkgs = ["top_matched", "W_matched", "unmatched", "diboson", "vjetslnu"]
        # bkg_order = ["diboson", "vjetslnu", "unmatched", "top_matched", "W_matched"]
        sigs = []

        # one data/MC ratio plot per category, named after the output path
        for hname, h in {
            f"{odir}/{shape_var.var}": hists[shape_var.var],
            f"{odir}/{shape_var.var}_pass": hists_pass[shape_var.var],
            f"{odir}/{shape_var.var}_fail": hists_fail[shape_var.var],
            f"{odir}/{shape_var.var}_pass_pt0": hists_pass_pt0[shape_var.var],
            f"{odir}/{shape_var.var}_fail_pt0": hists_fail_pt0[shape_var.var],
            f"{odir}/{shape_var.var}_pass_pt1": hists_pass_pt1[shape_var.var],
            f"{odir}/{shape_var.var}_fail_pt1": hists_fail_pt1[shape_var.var],
        }.items():
            plotting.ratioHistPlot(
                h,
                year,
                sigs,
                bkgs,
                name=hname,
                show=False,
                log=False,
                bg_err=None,
                bg_order=bkg_order,
                plot_data=True,
                plot_significance=False,
                bg_err_mcstat=True,
                exclude_qcd_mcstat=False,
                # ylim=1.2e4,
                # ylim_low=0,
            )

    save_to_file(out_file, hists_pass, hists_fail)
    save_to_file(out_file_pt0, hists_pass_pt0, hists_fail_pt0)
    save_to_file(out_file_pt1, hists_pass_pt1, hists_fail_pt1)

In [None]:
# Re-open the inclusive template file; ``out_file`` still holds the value assigned in the
# last iteration of the per-year loop of the previous cell.
source_file = uproot.open(out_file)

In [None]:
# List the objects stored in the template file as a quick check of what was written.
source_file.keys()