# Template BDT notebook for developing an HH4b signal-vs-background classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import HH4b.utils as utils
import HH4b.plotting as plotting
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import mplhep as hep
import vector

In [None]:
def make_vector(events: pd.DataFrame, obj: str):
    """Build a ``vector`` array for ``obj`` from its kinematic columns in ``events``.

    The ``{obj}Pt``, ``{obj}Phi`` and ``{obj}Eta`` columns are read together with a
    mass column, which is ``{obj}PNetMass`` when ``obj`` is ``"ak8FatJet"`` and
    ``{obj}Mass`` for every other object name.
    """
    mass_suffix = "Mass"
    if obj == "ak8FatJet":
        mass_suffix = "PNetMass"

    # vector component name -> suffix of the dataframe column that provides it
    component_suffixes = {"pt": "Pt", "phi": "Phi", "eta": "Eta", "M": mass_suffix}
    return vector.array(
        {component: events[f"{obj}{suffix}"] for component, suffix in component_suffixes.items()}
    )

## Load Dataset

In [None]:
# Tag of the local sample production and the directory that contains it.
dir_name = "23Nov18_WSel_v9_private"
path_to_dir = f"/Users/billyli/UCSD/{dir_name}/"

# Year string forwarded to ``utils.load_samples`` when the samples are read.
year = "2018"

In [None]:
# Sample groups to load: analysis label -> list of dataset names in that group.
samples = {
    "hh4b": ["GluGlutoHHto4B_cHHH1_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    # Alternative signal samples, disabled by default:
    # "hh4b-c2p45": ["GluGlutoHHto4B_cHHH2p45_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    # "hh4b-c5": ["GluGlutoHHto4B_cHHH5_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    # "hh4b-c0": ["GluGlutoHHto4B_cHHH0_TuneCP5_PSWeights_13TeV-powheg-pythia8"],
    # QCD is split into HT bins; the dataset names differ only in the HT range.
    "qcd": [
        f"QCD_HT-{ht_range}-13TeV"
        for ht_range in (
            "200to300",
            "300to500",
            "500to700",
            "700to1000",
            "1000to1500",
            "1500to2000",
            "2000toInf",
        )
    ],
    "ttbar": [
        "TTTo2L2Nu_13TeV",
        "TTToHadronic_13TeV",
        "TTToSemiLeptonic_13TeV",
    ],
}

# Input directory -> sample groups to read from it (a single directory here).
dirs = {path_to_dir: samples}

# ParticleNet Xbb working points used in the selection below.
xbb_medium_wp = 0.9734
xbb_tight_wp = 0.9880

# Selection handed to ``utils.load_samples``: (column, operator, threshold) triplets.
# NOTE(review): the list-of-lists layout looks like parquet-style filters (inner list
# AND-ed, outer list OR-ed) — confirm against ``utils.load_samples``.
filters = [
    [
        # one good fatjet (fatjet with index 0 has the largest Xbb score)
        ("('ak8FatJetPt', '0')", ">=", 300),
        ("('ak8FatJetMsd', '0')", ">=", 60),
        ("('ak8FatJetPNetXbb', '0')", ">=", xbb_medium_wp),
        # the second fatjet must have an Xbb score below the tight working point
        ("('ak8FatJetPNetXbb', '1')", "<", xbb_tight_wp),
    ]
]

# Columns to load, as (column name, number of sub-columns) pairs.
# The parquet files are too big to read in full, so only a few columns can be loaded
# at a time without consuming much memory.
# NOTE(review): ``columns`` is built here but is not passed to ``utils.load_samples``
# in the loading loop of this notebook — confirm whether that is intentional.
load_columns = [
    ("weight", 1),
    ("ak8FatJetMsd", 2),
    ("ak8FatJetPNetMass", 2),
    ("ak8FatJetPNetXbb", 2),
]
# Reformat into the "('column name', 'idx')" strings used to address multiindex columns.
columns = [
    f"('{key}', '{i}')" for key, num_columns in load_columns for i in range(num_columns)
]


# Load every sample group from every input directory into one dict keyed by sample label.
events_dict = {}
for input_dir, dir_samples in dirs.items():
    # ``utils.load_samples`` is called with the selection in ``filters``; per the
    # original note it also computes a weight per event.
    events_dict.update(utils.load_samples(input_dir, dir_samples, year, filters=filters))

# Show which columns are available, using the first loaded sample as reference.
samples_loaded = list(events_dict)
keys_loaded = list(events_dict[samples_loaded[0]].keys())
print("Keys in events_dict")
for column_key in keys_loaded:
    print(column_key)

## Event cuts

In [None]:
# Stack the per-sample frames into a single frame; ``keys`` adds the sample label as
# the outer level of the row index.
sample_order = ["hh4b", "qcd", "ttbar"]
events_raw = pd.concat([events_dict[name] for name in sample_order], keys=sample_order)

In [None]:
# ak4JetOutside pT cut: both of the first two jets (columns 0 and 1) must exceed the threshold.
min_outside_jet_pt = 20
jets_outside_raw = make_vector(events_raw, "ak4JetOutside")
j3_raw, j4_raw = jets_outside_raw[:, 0], jets_outside_raw[:, 1]
j3j4_pt_cut = (j3_raw.pt > min_outside_jet_pt) & (j4_raw.pt > min_outside_jet_pt)

In [None]:
# Combine all event-level cuts into one boolean mask (only the ak4JetOutside pT cut so far).
combined_filter = j3j4_pt_cut
# Take an explicit copy: ``events`` has columns assigned to it later (e.g. ``target``),
# and writing into a selection of ``events_raw`` can trigger pandas'
# SettingWithCopyWarning.
events = events_raw[combined_filter].copy()

## Define Targets

In [None]:
# Label every event for training. Rows are keyed by sample label on the outer index
# level (from the ``keys=`` argument of the ``pd.concat`` that built ``events_raw``),
# so ``.loc["hh4b", ...]`` addresses all signal rows at once.
events["target"] = 0  # default is background
events.loc["hh4b", "target"] = 1  # set to 1 for 'hh4b' samples (signal)

In [None]:
# Training labels, extracted while ``events`` still carries its original row index
# (the index is reset in a later cell).
target = events["target"]

In [None]:
# Save the row index before resetting it: it is re-attached to the feature table later
# via ``pdf.set_index(multiIndex)``.
multiIndex = events.index
# Replace the row index with a default positional one; the old index levels become columns.
events = events.reset_index()

## Define Features

In [None]:
# Feature container: one dataframe column per BDT input variable.
pdf = pd.DataFrame()

# H1 candidate: the fatjet in column 0 of the ak8FatJet collection (fatjets sorted by Xbb).
fatjets = make_vector(events, "ak8FatJet")
h1 = fatjets[:, 0]
h1_features = {
    "logPtH1": np.log(h1.pt),
    "hb_eta": h1.eta,
}
for feature_name, feature_values in h1_features.items():
    pdf[feature_name] = feature_values


# H2 candidate: four-vector sum of the first two ak4JetOutside jets (sorted by b-score).
jets_outside = make_vector(events, "ak4JetOutside")
j3, j4 = jets_outside[:, 0], jets_outside[:, 1]
h2 = j3 + j4
for feature_name, feature_values in (
    ("hr_pt", h2.pt),
    ("hr_eta", h2.eta),
    ("hr_mass", h2.mass),
):
    pdf[feature_name] = feature_values

# j3 / j4 features: b-tagging scores and kinematics of the two ak4JetOutside jets.
# Column 0 of each per-jet quantity belongs to j3, column 1 to j4.
deepjet_b = events["ak4JetOutsidebtagDeepFlavB"]
j3_deepJetB = deepjet_b[0]
j4_deepJetB = deepjet_b[1]


def _pnet_prob(flavour):
    """Return the ``ak4JetOutsidebtagPNetProb<flavour>`` column(s) of ``events``."""
    return events[f"ak4JetOutsidebtagPNetProb{flavour}"]


# ParticleNet b-vs-rest ratio: (b + bb) over the summed c, cc, uds and g scores.
pn_b = _pnet_prob("b") + _pnet_prob("bb")
pn_c = _pnet_prob("c") + _pnet_prob("cc")
pn_uds = _pnet_prob("uds")
pn_g = _pnet_prob("g")
ak4JetOutsidePNetFlavB = pn_b / (pn_c + pn_uds + pn_g)
j3_PNetFlavB, j4_PNetFlavB = ak4JetOutsidePNetFlavB[0], ak4JetOutsidePNetFlavB[1]

# The ParticleNet ratio is the b-tag score used as a feature; the DeepJet scores above
# are computed but not added to the feature table.
j3_btag, j4_btag = j3_PNetFlavB, j4_PNetFlavB

for feature_name, feature_values in (
    ("logPtJ3", np.log(j3.pt)),
    ("logPtJ4", np.log(j4.pt)),
    ("j3_btag", j3_btag),
    ("j4_btag", j4_btag),
    ("dPhiJ3J4", j3.deltaphi(j4)),
    ("dRJ3J4", j3.deltaR(j4)),
    ("j3_j4_ratio", j3.pt / j4.pt),
):
    pdf[feature_name] = feature_values

# HH features: kinematics of the di-Higgs system and of H1 relative to H2.
hh = h1 + h2
# Angular separation between the two Higgs candidates. ``deltaR`` is used (as for
# j3/j4) because it wraps the azimuthal difference into [-pi, pi]; a raw
# ``h1.phi - h2.phi`` can reach 2*pi and overestimates dR for candidates that straddle
# the +-pi boundary.
drh1h2 = h1.deltaR(h2)

pdf["logPtHH"] = np.log(hh.pt)
pdf["HHMass"] = hh.M
pdf["HHEta"] = hh.eta

pdf["H1Pt_H2Pt"] = h1.pt / h2.pt
pdf["dphi"] = h1.deltaphi(h2)
pdf["deta"] = h1.deltaeta(h2)
pdf["dRH1H2"] = drh1h2

# Second fatjet (column 1 of the ak8FatJet collection): pt, eta and mass as features.
fj2 = fatjets[:, 1]
for feature_name, feature_values in (
    ("fj2_pt", fj2.pt),
    ("fj2_eta", fj2.eta),
    ("fj2_mass", fj2.M),
):
    pdf[feature_name] = feature_values

In [None]:
# Re-attach the saved row index to the feature table so that ``features`` shares its
# index with ``target`` and rows can be selected per sample with ``.loc``.
features = pdf.set_index(multiIndex)

In [None]:
# Split the dataset into training and testing sets: 20% of the events are held out,
# and ``random_state`` fixes the shuffle so the split is reproducible.
# NOTE(review): no ``stratify=`` argument is given, so the signal fraction is not
# guaranteed to match between the two sets — consider ``stratify=target``.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# BDT configuration: an XGBoost classifier with a fixed seed for reproducibility.
bdt_hyperparameters = {
    "n_estimators": 196,
    "max_depth": 17,
    "learning_rate": 0.1,
    "subsample": 0.5,
    "random_state": 42,
}
bdt_model = XGBClassifier(**bdt_hyperparameters)

In [None]:
# Train the model on the training split.
# NOTE(review): no ``sample_weight`` is passed, so the fit ignores any per-event
# weights — confirm whether the training should be weighted.
bdt_model.fit(X_train, y_train)

In [None]:
# Signal-class probability (column 1 of ``predict_proba``) for every held-out event.
y_scores = bdt_model.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

# Notes / ideas:
# - histogram for y scores signal & background
# - rank by importance
# - number of b-jets that can be identified in resolved
# - another feature: cos(theta_star)

# Plotting: false-positive rate (log scale) against true-positive rate.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(tpr, fpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
roc_ax.set_xlabel("Signal (HH)")
roc_ax.set_ylabel("Background (QCD&ttbar)")
roc_ax.set_title("ROC")
roc_ax.legend(loc="upper left")
roc_ax.set_yscale("log")
plt.show()

In [None]:
# Rank the input features by the importance score the trained model reports for them.
importances = bdt_model.feature_importances_
feature_names = pdf.columns

# Pair each feature name with its score, then order from most to least important.
feature_importance = list(zip(feature_names, importances))
feature_importance.sort(key=lambda pair: pair[1], reverse=True)

for name, score in feature_importance:
    print(f"{name}: {score:.3f}")

In [None]:
# Number of importance scores; expected to equal the number of feature columns.
print(importances.size)

In [None]:
# Signal-class probability of the held-out events, evaluated separately per sample
# (the sample label is the outer level of the ``X_test`` index).
hh4b_scores, qcd_scores, ttbar_scores = (
    bdt_model.predict_proba(X_test.loc[sample])[:, 1] for sample in ("hh4b", "qcd", "ttbar")
)

In [None]:
# Distribution of the BDT score per sample on the held-out events.
# All three histograms share the same 40 bins over [0, 1] (the range of a probability);
# without an explicit ``range`` each call would derive its own bin edges from its own
# data and the curves would not be comparable bin by bin.
score_hist_style = {"bins": 40, "range": (0, 1), "histtype": "step", "linewidth": 1.5}

plt.figure()
for sample_label, sample_scores, sample_color in (
    ("hh4b", hh4b_scores, "darkblue"),
    ("qcd", qcd_scores, "red"),
    ("ttbar", ttbar_scores, "darkgreen"),
):
    plt.hist(sample_scores, label=sample_label, color=sample_color, **score_hist_style)
plt.legend()
plt.ylabel("Frequency")
plt.xlabel("Prediction Score")
plt.title("Model Predictions Histogram")
plt.yscale("log")
plt.show()