# Fit B in the FoM TXbb and BDT space

In [None]:
# Normalisation constant for the fits below: yields are divided by B_max before
# fitting and predictions are multiplied by it afterwards.
B_max = 150435.0  # obtained from nevents_regionB at (0,0)

# Misc Checks

In [None]:
import pandas as pd
import numpy as np
import vector
import os
from xgboost import XGBClassifier
from pathlib import Path

import HH4b.utils as utils
from HH4b.utils import ShapeVar
import HH4b.plotting as plotting
from HH4b.postprocessing import PostProcess, Region
import HH4b.postprocessing as postprocessing
from HH4b.hh_vars import samples, years, samples_run3

import hist
import matplotlib.pyplot as plt
import mplhep as hep
import matplotlib.ticker as mticker
import importlib

# Tick formatter that renders exponents with math text.
# NOTE(review): `formatter` is not referenced by any cell visible in this notebook.
formatter = mticker.ScalarFormatter(useMathText=True)
# presumably switches to scientific notation outside 1e-3..1e3 — see Matplotlib docs
formatter.set_powerlimits((-3, 3))

In [None]:
# Load the scan table. Every later cell reads it as `df` (columns used below:
# "txbb_cut", "bdt_cut", "nevents_regionB"), so it must be bound to that name;
# binding only `pdf` makes the notebook fail with NameError on a fresh kernel.
df = pd.read_json("df_txbb_0pt6_bdt_0pt6.json")
# Keep the old name as an alias so cells that still refer to `pdf` keep working.
pdf = df

In [None]:
# Display the loaded scan table for a quick visual check.
pdf

# Fitting Starts

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from matplotlib import cm
from matplotlib.ticker import LinearLocator

In [None]:
# Lower edges of the (Txbb, BDT) region that is selected, fitted and plotted below.
txbb_low = 0.95
bdt_low = 0.90
# Alternative, wider region (kept for reference):
# x_low = 0.6
# y_low = 0.6

# Step sizes used to build the histogram bin edges for the scanned points.
# NOTE(review): presumably these match the spacing of the cut scan stored in the
# input file — confirm against the scan that produced it.
txbb_stepsize = 0.002
bdt_stepsize = 0.005

In [None]:
# Pull the two cut columns and the region-B yield out of the scan table as arrays.
Txbb, BDT, B = (
    np.array(df[column]) for column in ("txbb_cut", "bdt_cut", "nevents_regionB")
)

# Keep only the scan points inside the region of interest
# (Txbb >= txbb_low and BDT >= bdt_low).
Txbb_cond = Txbb >= txbb_low
BDT_cond = BDT >= bdt_low
cond = Txbb_cond & BDT_cond

Txbb, BDT, B = Txbb[cond], BDT[cond], B[cond]

### Add nevents_B = 0 at boundaries to make interpolation more physical

In [None]:
# NOTE: this cell extends Txbb / BDT / B in place, so running it twice appends the
# boundary points twice.

# Edge Txbb = 1: one zero-yield point for every distinct BDT cut.
bdt_levels = np.unique(BDT)
Txbb = np.append(Txbb, [1] * len(bdt_levels))
BDT = np.append(BDT, bdt_levels)
B = np.append(B, [0.0] * len(bdt_levels))

# Edge BDT = 1: one zero-yield point for every distinct Txbb cut. The distinct
# values are taken after the step above, so Txbb = 1 is among them.
txbb_levels = np.unique(Txbb)
Txbb = np.append(Txbb, txbb_levels)
BDT = np.append(BDT, [1] * len(txbb_levels))
B = np.append(B, [0.0] * len(txbb_levels))

In [None]:
# Show the scan points whose region-B yield is exactly zero.
df[df["nevents_regionB"] == 0]

In [None]:
# Histogram bin edges for the scanned points. The first edge sits half a step
# below the lower bound, so a value on the scan grid lies at a bin centre.
txbb_first_edge = txbb_low - txbb_stepsize / 2
bdt_first_edge = bdt_low - bdt_stepsize / 2

txbb_bins = np.arange(txbb_first_edge, 1 + txbb_stepsize, txbb_stepsize)
bdt_bins = np.arange(bdt_first_edge, 1 + bdt_stepsize, bdt_stepsize)

In [None]:
# Finer grid used to check that the fitted predictions are smooth.
test_Txbb_stepsize = 0.0002
test_BDT_stepsize = 0.0005

# Bin edges for the fine grid, built like the coarse ones: the first edge is
# half a step below the lower bound.
test_Txbb_first_edge = txbb_low - test_Txbb_stepsize / 2
test_BDT_first_edge = bdt_low - test_BDT_stepsize / 2

test_Txbb_bins = np.arange(test_Txbb_first_edge, 1 + test_Txbb_stepsize, test_Txbb_stepsize)
test_BDT_bins = np.arange(test_BDT_first_edge, 1 + test_BDT_stepsize, test_BDT_stepsize)

In [None]:
# Fine evaluation points along each axis, from the lower bound up to (not including) 1.
test_Txbb_range = np.arange(txbb_low, 1, test_Txbb_stepsize)
test_BDT_range = np.arange(bdt_low, 1, test_BDT_stepsize)

# Every (Txbb, BDT) combination of the two ranges, flattened into two parallel
# 1D arrays so entry i of each describes one grid point.
test_Txbb_grid, test_BDT_grid = np.meshgrid(test_Txbb_range, test_BDT_range)
test_Txbb, test_BDT = test_Txbb_grid.flatten(), test_BDT_grid.flatten()

## Scanned values (measurements)

In [None]:
# Bin the scanned points on the (Txbb, BDT) grid, weighting each point by its
# region-B yield, so each bin holds the summed yield of the points inside it.
heatmap, xedges, yedges = np.histogram2d(Txbb, BDT, bins=[txbb_bins, bdt_bins], weights=B)

# Draw the heatmap. histogram2d returns the array indexed [x, y]; imshow expects
# [row (y), column (x)], hence the transpose.
plt.imshow(
    heatmap.T, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], origin="lower", cmap="hot"
)
# The string previously passed to plt.show() was meant as the colour-scale label;
# plt.show() does not take a label, so attach it to the colorbar instead.
plt.colorbar(label="Number of Data in Region B")
plt.title("Scanned B Values")
plt.xlabel("txbb_cut")
plt.ylabel("bdt_cut")
plt.show()

## Fit 1D slices in the BDT cut (at fixed Txbb cut) to check if Poisson is a good assumption

In [None]:
from scipy.special import gammainc, gamma
from scipy.optimize import curve_fit

In [None]:
# Single Txbb working point used for a 1D look at B versus the BDT cut.
txbb_cut = 0.99

# Rows at this Txbb cut (np.isclose tolerates float rounding) and inside the
# BDT range of interest.
txbb_select = np.isclose(df["txbb_cut"], txbb_cut)
bdt_select = df["bdt_cut"] >= bdt_low
select = txbb_select & bdt_select

slice_1d = df[select]
BDT_1D = slice_1d["bdt_cut"]
B_1D = slice_1d["nevents_regionB"]

In [None]:
# 1D fit model
def one_minus_cdf(x, a, scale):
    """Return ``1 - gammainc(a, (x / scale) ** 2) / gamma(a)``.

    Parameters
    ----------
    x : float or array-like
        Cut value(s) at which to evaluate the model.
    a : float
        First argument passed to ``gammainc`` and the argument of ``gamma``.
    scale : float
        Divisor applied to ``x`` before squaring.

    Returns
    -------
    float or ndarray
        Model value(s), same shape as ``x``.

    Notes
    -----
    NOTE(review): SciPy documents ``gammainc`` as the *regularized* lower
    incomplete gamma function, so the extra division by ``gamma(a)`` is not
    needed to obtain a CDF. The expression is kept unchanged because the
    fitted parameters are only meaningful for this exact functional form —
    confirm whether the division is intended.
    """
    squared_ratio = (x / scale) ** 2
    survival = 1 - gammainc(a, squared_ratio) / gamma(a)
    return survival

In [None]:
# Fine BDT grid for drawing the fitted curve, from bdt_low up to (not including) 1.
test_BDT_1D = np.arange(bdt_low, 1, test_BDT_stepsize)
# Matching array holding the fixed Txbb cut at every grid point.
test_Txbb_1D = np.ones_like(test_BDT_1D) * txbb_cut

In [None]:
# Fit the 1D model to B(bdt_cut) separately at each fixed Txbb cut and overlay
# the fitted curve on the scanned points.
# NOTE(review): with a non-integer step, np.arange may or may not produce a value
# at the stop (0.998) because of floating-point rounding — confirm the slices used.
fix_txbb_cuts = np.arange(0.95, 0.998, 0.002)

# One [a, scale] row per Txbb cut, in the same order as fix_txbb_cuts.
params_list = []

# The evaluation grid does not depend on the slice, so build it once.
test_BDT_1D = np.arange(bdt_low, 1, test_BDT_stepsize)

for txbb_cut in fix_txbb_cuts:
    # Scanned points of this slice: rows at this Txbb cut (np.isclose tolerates
    # float rounding) and inside the BDT range of interest.
    slice_df = df[np.isclose(df["txbb_cut"], txbb_cut) & (df["bdt_cut"] >= bdt_low)]
    BDT_1D = slice_df["bdt_cut"]
    B_1D = slice_df["nevents_regionB"]

    # Fit the yields normalised by B_max. The result is stored under a
    # slice-specific name so the global `params` (used for the 2D fit) is not
    # overwritten by this loop.
    slice_params, _ = curve_fit(one_minus_cdf, BDT_1D, B_1D / B_max, p0=[2, 2])
    params_list.append(slice_params)

    # Fitted curve on the fine grid, scaled back to event counts.
    B_fit_1D = one_minus_cdf(test_BDT_1D, *slice_params) * B_max

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(BDT_1D, B_1D, c="y", label="measurements")
    ax.plot(test_BDT_1D, B_fit_1D, label="interpolation")
    ax.set(
        xlabel="bdt cut",
        ylabel="Number of Events in Region B",
        # Fixed precision avoids titles like "0.9520000000000001".
        title=f"Poisson Interpolation @ Txbb cut = {txbb_cut:.3f}",
    )
    ax.legend()
    plt.show()

In [None]:
# Stack the per-slice fit results into one array: one row per fit appended to
# params_list, one column per fit parameter.
params_arr = np.array(params_list)
params_arr

In [None]:
# Second fit parameter (column 1 of params_arr, the `scale` argument of the 1D
# model) as a function of the fixed Txbb cut.
plt.scatter(fix_txbb_cuts, params_arr[:, 1])
plt.xlabel("txbb cuts")
plt.ylabel("scale factor")
plt.title("Fit parameter 2 vs. txbb_cut")

## Fit 1D slices with different Txbb cuts to check if Poisson is a good assumption

## curve fit 2d

In [None]:
# 2D fit model
def one_minus_cdf_2d(x, a, b, scale1, scale2):
    """Product of two 1D factors, one per column of ``x``.

    For column ``i`` the factor is ``1 - gammainc(p, s**3 + (c * s)**2) / gamma(p)``
    with ``s = x[:, i] / scale_i``, ``p`` equal to ``a`` (column 0) or ``b``
    (column 1), and ``c`` currently fixed to 0 for both columns.

    Parameters
    ----------
    x : ndarray, shape (N, 2)
        Points to evaluate; column 0 is scaled by ``scale1``, column 1 by ``scale2``.
    a, b : float
        Arguments passed to ``gammainc`` / ``gamma`` for column 0 and column 1.
    scale1, scale2 : float
        Divisors applied to column 0 and column 1.

    Returns
    -------
    ndarray, shape (N,)
        Product of the two factors at every point.

    Notes
    -----
    NOTE(review): SciPy documents ``gammainc`` as already regularized; the
    extra division by ``gamma`` is kept because fitted parameters depend on
    this exact form — confirm whether it is intended.
    """
    # Coefficients of the quadratic term; fixed to zero, so only the cubic
    # term contributes for finite inputs.
    c1 = 0
    c2 = 0

    def _factor(column, shape, scale, quad_coeff):
        scaled = column / scale
        argument = scaled**3 + (quad_coeff * scaled) ** 2
        return 1 - gammainc(shape, argument) / gamma(shape)

    first = _factor(x[:, 0], a, scale1, c1)
    second = _factor(x[:, 1], b, scale2, c2)
    return first * second

In [None]:
# (N, 2) input for the 2D fit: column 0 = Txbb cut, column 1 = BDT cut.
txbb_column = Txbb[:, None]
bdt_column = BDT[:, None]
fit_data_2d = np.concatenate([txbb_column, bdt_column], axis=1)

In [None]:
# Starting point for (a, b, scale1, scale2); `params` is then replaced by the
# fitted values. Yields are normalised by B_max before fitting.
params = [1, 1, 1, 1]
normalised_B = B / B_max
params, params_covariance = curve_fit(
    one_minus_cdf_2d,
    fit_data_2d,
    normalised_B,
    p0=params,
    maxfev=100000,
)

In [None]:
# (N, 2) evaluation points on the fine grid: column 0 = Txbb cut, column 1 = BDT cut.
test_txbb_column = test_Txbb[:, None]
test_bdt_column = test_BDT[:, None]
test_data_2d = np.concatenate([test_txbb_column, test_bdt_column], axis=1)

In [None]:
# Evaluate the fitted 2D model on the fine grid and scale back to event counts.
normalised_pred = one_minus_cdf_2d(test_data_2d, *params)
B_pred = normalised_pred * B_max

In [None]:
# Bin the model predictions on the fine (Txbb, BDT) grid, weighting each test
# point by its predicted yield.
heatmap, Txbbedges, BDTedges = np.histogram2d(
    test_Txbb, test_BDT, bins=[test_Txbb_bins, test_BDT_bins], weights=B_pred
)

# Draw the heatmap. histogram2d returns the array indexed [x, y]; imshow expects
# [row (y), column (x)], hence the transpose.
plt.figure(figsize=(10, 6))
plt.imshow(
    heatmap.T,
    extent=[Txbbedges[0], Txbbedges[-1], BDTedges[0], BDTedges[-1]],
    origin="lower",
    cmap="hot",
)
# The string previously passed to plt.show() was meant as the colour-scale label;
# plt.show() does not take a label, so attach it to the colorbar instead.
plt.colorbar(label="Number of Data in Region B")
plt.title("2D Poisson Predictions on 10x Finer Grid")
plt.xlabel("txbb_cut")
plt.ylabel("bdt_cut")
plt.show()

In [None]:
# Compare the 2D fit with the scanned points along Txbb, one figure per fixed BDT cut.
# Expects `params` to hold the four 2D-fit parameters (a, b, scale1, scale2).
# NOTE(review): with a non-integer step, np.arange may or may not produce a value
# at the stop (0.995) because of floating-point rounding — confirm the slices used.
# (No `params_list = []` here: nothing in this cell fills it, and resetting it
# would discard the 1D fit results collected earlier.)
fix_bdt_cuts = np.arange(0.90, 0.995, 0.005)

# The Txbb evaluation grid does not depend on the slice, so build it once.
test_Txbb_1D = np.arange(txbb_low, 1, test_Txbb_stepsize)

for bdt_cut in fix_bdt_cuts:
    # Scanned points of this slice: rows at this BDT cut (np.isclose tolerates
    # float rounding) and inside the Txbb range of interest.
    slice_df = df[(df["txbb_cut"] >= txbb_low) & np.isclose(df["bdt_cut"], bdt_cut)]
    Txbb_1D = slice_df["txbb_cut"]
    B_1D = slice_df["nevents_regionB"]

    # (N, 2) model input: column 0 = Txbb grid, column 1 = the fixed BDT cut.
    test_BDT_1D = np.ones_like(test_Txbb_1D) * bdt_cut
    test_data_1D = np.concatenate([test_Txbb_1D.reshape(-1, 1), test_BDT_1D.reshape(-1, 1)], axis=1)

    # 2D model evaluated along the slice, scaled back to event counts.
    B_Pred_1D = one_minus_cdf_2d(test_data_1D, *params) * B_max

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(Txbb_1D, B_1D, c="y", label="measurements")
    ax.plot(test_Txbb_1D, B_Pred_1D, label="interpolation")
    ax.set(
        xlabel="txbb cut",
        ylabel="Number of Events in Region B",
        # Fixed precision avoids titles like "0.9050000000000001".
        title=f"2D Poisson Interpolation @ BDT cut = {bdt_cut:.3f}",
    )
    ax.legend()
    plt.show()

In [None]:
# Compare the 2D fit with the scanned points along BDT, one figure per fixed Txbb cut.
# Expects `params` to hold the four 2D-fit parameters (a, b, scale1, scale2).
# NOTE(review): with a non-integer step, np.arange may or may not produce a value
# at the stop (0.998) because of floating-point rounding — confirm the slices used.
# (No `params_list = []` here: nothing in this cell fills it, and resetting it
# would discard the 1D fit results collected earlier.)
fix_txbb_cuts = np.arange(0.95, 0.998, 0.002)

# The BDT evaluation grid does not depend on the slice, so build it once.
test_BDT_1D = np.arange(bdt_low, 1, test_BDT_stepsize)

for txbb_cut in fix_txbb_cuts:
    # Scanned points of this slice: rows at this Txbb cut (np.isclose tolerates
    # float rounding) and inside the BDT range of interest.
    slice_df = df[np.isclose(df["txbb_cut"], txbb_cut) & (df["bdt_cut"] >= bdt_low)]
    BDT_1D = slice_df["bdt_cut"]
    B_1D = slice_df["nevents_regionB"]

    # (N, 2) model input: column 0 = the fixed Txbb cut, column 1 = BDT grid.
    test_Txbb_1D = np.ones_like(test_BDT_1D) * txbb_cut
    test_data_1D = np.concatenate([test_Txbb_1D.reshape(-1, 1), test_BDT_1D.reshape(-1, 1)], axis=1)

    # 2D model evaluated along the slice, scaled back to event counts.
    B_Pred_1D = one_minus_cdf_2d(test_data_1D, *params) * B_max

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(BDT_1D, B_1D, c="y", label="measurements")
    ax.plot(test_BDT_1D, B_Pred_1D, label="interpolation")
    ax.set(
        xlabel="bdt cut",
        ylabel="Number of Events in Region B",
        # Fixed precision avoids titles like "0.9520000000000001".
        title=f"2D Poisson Interpolation @ Txbb cut = {txbb_cut:.3f}",
    )
    ax.legend()
    plt.show()

## Print the 2d fit params and copy to Optimize_WP_with_smoothB.ipynb

In [None]:
# Fitted 2D parameters in the order taken by one_minus_cdf_2d: a, b, scale1, scale2.
print(params)