# Data/MC plots for 2018

### Post-process Data and MC samples

- Sum all MC samples that belong to the same process
- Scale the number of events by the total sum of weights

In [None]:
# import the post-processing utilities
import utils

In [None]:
# Map every process label to the list of sample names that are summed
# together to form that process.
samples = {
    "hh4b": ["GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl0": ["GluGlutoHHto4B_cHHH0_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl2p45": ["GluGlutoHHto4B_cHHH2p45_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "hh4b-kl5": ["GluGlutoHHto4B_cHHH5_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    "qcd": [
        "QCD_HT-200to300-13TeV",
        # the comma after this entry matters: without it Python silently
        # concatenates this string with the next one into a single bogus name
        "QCD_HT-300to500-13TeV",
        "QCD_HT-500to700-13TeV",
        "QCD_HT-700to1000-13TeV",
        "QCD_HT-1000to1500-13TeV",
        "QCD_HT-1500to2000-13TeV",
        "QCD_HT-2000toInf-13TeV",
    ],
    "ttbar": [
        "TTToSemiLeptonic_13TeV",
        "TTToHadronic_13TeV",
        "TTTo2L2Nu_13TeV",
    ],
    "vjets": [
        "WJetsToQQ_HT-200to400_13TeV",
        "WJetsToQQ_HT-400to600_13TeV",
        "WJetsToQQ_HT-600to800_13TeV",
        "WJetsToQQ_HT-800toInf_13TeV",
        "ZJetsToQQ_HT-200to400_13TeV",
        "ZJetsToQQ_HT-400to600_13TeV",
        "ZJetsToQQ_HT-600to800_13TeV",
        "ZJetsToQQ_HT-800toInf_13TeV",
    ],
}

# Dictionary with the directories of the files, in the form
# {input directory: samples to read from it}
# (this can be configured in a yaml file later in the script).

# directory that contains the files
path_to_dir = "/eos/uscms/store/user/cmantill/bbbb/matching/Oct9"
dirs = {path_to_dir: samples}
year = "2018"

In [None]:
# Filters are sequences of (column, operator, value) entries that can be used
# to place a selection or mask on the parquet files, e.g.
# https://github.com/rkansal47/HHbbVV/blob/main/src/HHbbVV/postprocessing/postprocessing.py#L80
# Columns are addressed with their "('name', 'index')" multi-index string.
# NOTE(review): the nested-list layout looks like the parquet reader's filter
# format (outer list = OR, inner list = AND) — confirm in utils.load_samples.
fatjet0_cuts = [
    ("('ak8FatJetPt', '0')", ">=", 300),
    ("('ak8FatJetMsd', '0')", ">=", 60),
]
filters = [fatjet0_cuts]

In [None]:
# Columns to load, given as (column name, number of indices to read) pairs.
# The parquet files are too big to be read in full, so only a few columns are
# loaded at a time to avoid consuming much memory.
load_columns = [
    ("weight", 1),
    # ("ak4JetPt", 6),
    ("ak4JetbtagDeepFlavB", 6),
    # ("ak4JetEta", 6),
    # ("ak4JetPhi", 6),
    # ("ak4JetMass", 6),
    ("ak4DijetPt0", 1),
    ("ak4DijetPt1", 1),
    ("ak4DijetDeltaR", 1),
    ("ak8FatJetPt", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetPNetXbb", 2),
]

# Expand every entry into one "('column name', 'idx')" string per index, which
# is the format used for reading multi-index columns.
columns = [
    f"('{name}', '{idx}')"
    for name, count in load_columns
    for idx in range(count)
]

In [None]:
# Dictionary that will contain all information (from all samples); the
# results of every input directory are merged into it.
events_dict = {}
for input_dir, dir_samples in dirs.items():
    # load_samples loads the files (only the selected columns), applies the
    # filters and computes a weight per event
    loaded = utils.load_samples(input_dir, dir_samples, year, filters=filters, columns=columns)
    events_dict.update(loaded)

In [None]:
# key of the per-event weight that is stored in events_dict once
# utils.load_samples is done
# NOTE(review): this is a one-element list rather than a plain string —
# confirm that this is what utils.singleVarHist expects for weight_key
weight_key = ["finalWeight"]

In [None]:
# Names of the samples that were actually loaded.
samples_loaded = list(events_dict)
# Inspect the first loaded sample to see which keys are available.
first_sample_events = events_dict[samples_loaded[0]]
keys_loaded = list(first_sample_events.keys())
print(f"Keys in events_dict {keys_loaded}")

In [None]:
# notebook display: show the list of loaded sample names
samples_loaded

In [None]:
# Processes to fill; "data" and "hbb" are currently switched off.
samples_to_fill = ["qcd", "vjets", "ttbar"]

# Variables to histogram, grouped by the object they describe. Each name must
# have an entry in var_to_shapevar. "DijetMass" is currently switched off.
vars_to_plot = (
    # AK4 dijet pT
    ["ak4DijetPt0", "ak4DijetPt1"]
    # AK8 fat-jet pT
    + ["ak8FatJetPt0", "ak8FatJetPt1"]
    # AK4 b-tagging score
    + ["ak4JetbtagDeepFlavB0", "ak4JetbtagDeepFlavB1"]
    # AK8 ParticleNet Xbb score and mass, per fat jet
    + ["ak8FatJetPNetXbb0", "ak8FatJetPNetMass0"]
    + ["ak8FatJetPNetXbb1", "ak8FatJetPNetMass1"]
)

In [None]:
# define ShapeVar (label and bins for a given variable)                                                                                                                                                                                                   
from utils import ShapeVar

# Axis label and binning for every variable that can be histogrammed.
# The dictionary is keyed by ShapeVar.var, so a key can never drift away from
# the variable it describes. The var name must match the key in the events
# dictionary (i.e. as saved in the parquet file).
# NOTE(review): bins look like [number of bins, low edge, high edge] —
# confirm against utils.ShapeVar.
var_to_shapevar = {
    shape_var.var: shape_var
    for shape_var in (
        ShapeVar(var="DijetMass", label=r"AK8 $m^{jj}$ (GeV)", bins=[30, 600, 4000]),
        # the b-tagging score is a dimensionless discriminator in [0, 1], so
        # its labels carry no unit
        ShapeVar(
            var="ak4JetbtagDeepFlavB0",
            label=r"AK4 deepFlavB $^0$",
            bins=[30, 0, 1],
            significance_dir="right",
        ),
        ShapeVar(
            var="ak4JetbtagDeepFlavB1",
            label=r"AK4 deepFlavB $^1$",
            bins=[30, 0, 1],
            significance_dir="right",
        ),
        # only the symbol is typeset in math mode; the descriptive text stays
        # outside so that its spaces and upright font are kept
        ShapeVar(
            var="ak4DijetPt0",
            label=r"AK4 jj (0) $p_T$ (GeV)",
            bins=[30, 300, 1500],
            significance_dir="right",
        ),
        ShapeVar(
            var="ak4DijetPt1",
            label=r"AK4 jj (1) $p_T$ (GeV)",
            bins=[30, 300, 1500],
            significance_dir="right",
        ),
        ShapeVar(
            var="ak8FatJetPt0",
            label=r"AK8 $p_T^0$ (GeV)",
            bins=[30, 300, 1500],
            significance_dir="right",
        ),
        ShapeVar(
            var="ak8FatJetPt1",
            label=r"AK8 $p_T^1$ (GeV)",
            bins=[30, 300, 1500],
            significance_dir="right",
        ),
        ShapeVar(var="ak8FatJetPNetMass0", label=r"AK8 $m_{reg}^{0}$ (GeV)", bins=[20, 0, 260]),
        ShapeVar(var="ak8FatJetPNetXbb0", label=r"AK8 $TX_{bb}^{0}$", bins=[50, 0.8, 1]),
        ShapeVar(var="ak8FatJetPNetMass1", label=r"AK8 $m_{reg}^{1}$ (GeV)", bins=[20, 0, 260]),
        # NOTE(review): this binning differs from ak8FatJetPNetXbb0
        # ([50, 0.8, 1]) — confirm that the wider range is intended
        ShapeVar(var="ak8FatJetPNetXbb1", label=r"AK8 $TX_{bb}^{1}$", bins=[100, -2, 1]),
    )
}

In [None]:
# Build one histogram per requested variable, keyed by the variable name.
hists = {}
for var_name in vars_to_plot:
    shape = var_to_shapevar[var_name]
    if shape.var in hists:
        # already filled for this variable, nothing to redo
        continue
    hists[shape.var] = utils.singleVarHist(
        events_dict, shape, weight_key=weight_key, selection=None
    )


In [None]:
# notebook display: show the dictionary of histograms (one entry per variable)
hists

In [None]:
# notebook display: the "qcd" entry of the Sample axis of the ak8FatJetPt0 histogram
hists["ak8FatJetPt0"][{"Sample": "qcd"}]

In [None]:
import mplhep as hep
# Quick look at the QCD ak8FatJetPt0 histogram: unpack it into numpy arrays
# (used below as bin contents and bin edges), print the contents and draw it.
qcd_counts, bin_edges = hists["ak8FatJetPt0"][{"Sample": "qcd"}].to_numpy()
print(qcd_counts)
hep.histplot(qcd_counts, bins=bin_edges, stack=True)

In [None]:
# notebook display: show the dictionary of histograms again
hists

In [None]:
# make a stacked plot of the histograms
from plotting import plot_hists
# Variables to draw, in plotting order.
# NOTE(review): this repeats the entries of vars_to_plot in a different order;
# every name listed here must have been filled into hists.
stacked_plot_vars = [
    "ak4JetbtagDeepFlavB0",
    "ak4JetbtagDeepFlavB1",
    "ak4DijetPt0",
    "ak4DijetPt1",
    "ak8FatJetPt0",
    "ak8FatJetPt1",
    # "DijetMass",
    "ak8FatJetPNetXbb0",
    "ak8FatJetPNetMass0",
    "ak8FatJetPNetXbb1",
    "ak8FatJetPNetMass1",
]
# NOTE(review): 58.6 is passed positionally — presumably the luminosity used
# for the plot; confirm the parameter name and units in plotting.plot_hists.
plot_hists(
    year,
    hists,
    stacked_plot_vars,
    58.6,
    add_data=False,
    mult_factor=100,
    logy=True,
)