## Mass-balanced pNN

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, average_precision_score, roc_auc_score

from script import utils, cms
from script.utils import free_mem
from script.models import FocalLoss
from script.cms.data import MassBalancedSequence

from script.datasets import Dataset

# switch matplotlib to seaborn's default theme for every figure drawn below
sns.set()

In [None]:
# Debugging aid: with mode 2, IPython re-imports modified modules before each
# cell executes. Comment out both lines when not needed.
%load_ext autoreload
%autoreload 2

In [None]:
# fix the seed through the project helper so repeated runs start from the same
# state (which RNGs it covers is defined in script.utils, not shown here)
utils.set_random_seed(42)

In [None]:
# (low, high) window associated with each labelled mass point, in the order the
# mass points are listed (units are not stated here; presumably GeV).
# The three heaviest mass points share the same window.
_WINDOW_BY_MASS = {
    130: (115, 180),
    150: (115, 200),
    170: (120, 220),
    200: (150, 250),
    250: (200, 300),
    300: (225, 375),
    350: (275, 425),
    400: (300, 500),
    450: (350, 550),
    500: (350, 650),
    600: (400, 800),
    700: (500, 900),
    800: (600, 1000),
    1000: (700, 1800),
    1200: (700, 1800),
    1500: (700, 1800),
}

# flat list of windows, one per mass point, preserving the order above
INTERVALS = list(_WINDOW_BY_MASS.values())

## Category 1

In [None]:
# Split the category-1 background sample in two: rows whose index is even go to
# the "train" file, rows whose index is odd go to the "test" file.
b = pd.read_csv('data/new2/background_cat1.csv')

# boolean mask over the row index, True where the index is even
even_rows = (b.index % 2 == 0)

# write both halves to disk without the row index
b[even_rows].to_csv('data/new2/background_cat1_train.csv', index=False)
b[~even_rows].to_csv('data/new2/background_cat1_test.csv', index=False)

In [None]:
# Split the category-1 signal sample into "train" and "test" files according to
# the flag stored in its "training" column.
s = pd.read_csv('data/new2/signal_bassociated_cat1.csv')

# rename "dimuon_mass" to "dimuon_M"
s = s.rename(columns={'dimuon_mass': 'dimuon_M'})

# Keep every column except "training". The column is excluded by name rather
# than by position, so the right one is removed even if the CSV layout changes.
cols = [c for c in s.columns if c != 'training']

# save to disk: rows flagged 3 form the training file, rows flagged 2 the test file
s[s['training'] == 3][cols].to_csv('data/new2/signal_bassociated_cat1_train.csv', index=False)
s[s['training'] == 2][cols].to_csv('data/new2/signal_bassociated_cat1_test.csv', index=False)

---

In [None]:
# Column names used as feature columns for the category-1 datasets.
# The order is significant: keep it unchanged.
VAR_CAT1 = [
    'dimuon_deltar',
    'dimuon_deltaphi',
    'dimuon_deltaeta',
    'met_pt',
    'deltar_bjet1_dimuon',
    'deltapt_bjet1_dimuon',
    'deltaeta_bjet1_dimuon',
    'bjet_1_pt',
    'bjet_1_eta',
    'deltaphi_bjet1_dimuon',
    'ljet_1_pt',
    'ljet_1_eta',
    'bjet_n',
    'ljet_n',
]

In [None]:
# Build the category-1 training dataset from the "train" CSV files, passing
# VAR_CAT1 as the feature columns.
data = Dataset()
data.load(
    signal='data/new2/signal_bassociated_cat1_train.csv',
    bkg='data/new2/background_cat1_train.csv',
    feature_columns=VAR_CAT1,
)

# attach the per-mass (low, high) windows to the dataset
data.mass_intervals = INTERVALS

In [None]:
# Add a "sample_weights" column, meant for training only, to both the signal and
# the background frame: |PU_Weight| scaled by 0.99.
# NOTE(review): the origin of the 0.99 factor is not visible here - confirm.
for _frame in (data.signal, data.background):
    _frame['sample_weights'] = _frame['PU_Weight'].abs() * 0.99

In [None]:
# Build the training and validation inputs from `data`, naming
# "sample_weights" as the weight column.
train, valid = MassBalancedSequence.get_data(
    data,
    num_splits=2,
    case=2,
    train_batch=1024,
    weight_column='sample_weights',
)

In [None]:
# Build the compiled model together with the `checkpoint` object that is handed
# to `fit` as a callback; `save` is presumably where the checkpoint is written.
model, checkpoint = utils.get_compiled_pnn(data, save='tmp/pnn-mass_bal-cat_1-case_2')

In [None]:
# train for up to 100 epochs, validating on `valid`; `checkpoint` is the only callback
model.fit(x=train, epochs=100, validation_data=valid, verbose=2, callbacks=[checkpoint])

In [None]:
# restore the model from the same path that was given to `get_compiled_pnn(save=...)`
utils.load_from_checkpoint(model, path='tmp/pnn-mass_bal-cat_1-case_2')

Evaluation:

In [None]:
# Build the category-1 evaluation dataset from the "test" CSV files.
test = Dataset()
test.load(
    signal='data/new2/signal_bassociated_cat1_test.csv',
    bkg='data/new2/background_cat1_test.csv',
    feature_columns=VAR_CAT1,
)

# Weights used for testing only, stored in a "test_weights" column.
_sig, _bkg = test.signal, test.background

# signal: |PU_Weight| scaled by 0.99
_sig['test_weights'] = _sig['PU_Weight'].abs() * 0.99

# background: same as the signal, additionally multiplied by the "weight" column
_bkg['test_weights'] = _bkg['PU_Weight'].abs() * 0.99 * _bkg['weight']

# attach the per-mass (low, high) windows to the dataset
test.mass_intervals = INTERVALS

In [None]:
# Values shared by the evaluation and plotting cells that follow.
test_mass = test.unique_signal_mass
test_intervals = test.mass_intervals

# keyword arguments forwarded to the plotting calls: selects the weight column
targs = {'weight_column': 'test_weights'}

# label -> (model, dataset) mapping passed to the multi-model plotting helpers
model_and_data = {'case-2': (model, test)}

In [None]:
# Wrap the test frames in a MassBalancedSequence (balancing and mass sampling
# switched off, "test_weights" as weight column) and convert it straight to a
# TensorFlow dataset.
test_ds = MassBalancedSequence(
    signal=test.signal,
    background=test.background,
    batch_size=1024,
    features=VAR_CAT1,
    balance=False,
    weight_column='test_weights',
    sample_mass=False,
    intervals=INTERVALS,
).to_tf_dataset()

In [None]:
# evaluate on the test dataset; the return value is bound to `_` so the cell
# does not echo it (verbose=2 still prints the metrics)
_ = model.evaluate(x=test_ds, verbose=2)

In [None]:
# plot for every (model, dataset) pair in `model_and_data`, weighted by the
# column named in `targs` (presumably the significance ratio per mass point)
cms.plot.significance_ratio_vs_mass(model_and_data, title='Category-1, bbA+bbH', **targs)

In [None]:
# `cuts` is iterated in lockstep with the mass points in the per-mass plots below.
# NOTE(review): this calls a private helper (leading underscore) of cms.plot;
# consider exposing a public entry point.
ams, cuts = cms.plot._compute_significance(model, test, **targs)

In [None]:
# ROC variant of the curve-vs-mass plot (curve='ROC'), with auc=False
cms.plot.curve_vs_mass(model_and_data, bins=20, title='pNN [category-1, bbA+bbH]', auc=False, 
                       curve='ROC', **targs)

In [None]:
# PR variant of the curve-vs-mass plot (curve='PR'), with auc=False
cms.plot.curve_vs_mass(model_and_data, bins=20, title='pNN [category-1, bbA+bbH]', auc=False, 
                       curve='PR', **targs)

In [None]:
# One significance plot per mass point. Masses and intervals are paired by
# position; zip() stops silently at the shorter sequence, so both are assumed
# to have the same length and ordering - TODO confirm.
for mass, interval in zip(test_mass, test_intervals):
    cms.plot.significance(model, test, mass=mass, interval=interval, **targs)

In [None]:
# One ROC comparison per mass point, pairing each mass with its interval and
# its entry in `cuts` by position (zip() truncates to the shortest sequence).
for mass, interval, cut in zip(test_mass, test_intervals, cuts):
    cms.plot.compare_roc(test, {'case-2': (model, cut)}, mass=mass, interval=interval, **targs)

In [None]:
# One PR comparison per mass point, pairing each mass with its interval and
# its entry in `cuts` by position (zip() truncates to the shortest sequence).
for mass, interval, cut in zip(test_mass, test_intervals, cuts):
    cms.plot.compare_pr(test, {'case-2': (model, cut)}, mass=mass, interval=interval, **targs)

In [None]:
# One variables plot per mass point: the VAR_CAT1 columns plus 'dimuon_pt' and
# 'dimuon_M', using that mass point's entry in `cuts`.
# `interval` is unpacked only to keep the three sequences aligned; it is unused.
for mass, interval, cut in zip(test_mass, test_intervals, cuts):
    cms.plot.variables(
        test,
        model,
        mass=mass,
        cut=cut,
        share_y=True,
        variables=[*VAR_CAT1, 'dimuon_pt', 'dimuon_M'],
        **targs
    )

---
## Category 2