In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import backend as K
from tensorflow import keras
from tensorflow.errors import InvalidArgumentError
import tensorflow as tf

# TensorFlow GPU settings: ask TensorFlow to grow its GPU memory allocation on
# demand rather than reserving it up front.
# Every visible GPU is configured (TensorFlow expects the memory-growth setting
# to be the same on all GPUs), and the loop is simply skipped on a CPU-only
# machine, where indexing physical_devices[0] would raise IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle

from scipy import interpolate

from atlasify import atlasify
import pickle

In [None]:
# Signal sample: read the feather file and tag every row with is_signal=True.
# (A pre-selection `sig = sig[sig.SR == 2]` used to follow here and is disabled.)
sig = pd.read_feather(
    '/home/grabanal/WVZ/gabriel_ML_data/20220117_iso_e4m1_VVZ_RD.arrow'
).assign(is_signal=True)

# Background sample: same treatment, tagged with is_signal=False.
# (A pre-selection `bg = bg[bg.SR == 2]` used to follow here and is disabled.)
bg = pd.read_feather(
    '/home/grabanal/WVZ/gabriel_ML_data/20220117_iso_e4m1_FULLBG_RD.arrow'
).assign(is_signal=False)

In [None]:
# Columns of the signal frame that are never offered to the classifiers.
non_training_columns = ['index', 'wgt', 'is_signal', 'Zcand_mass', 'chisq']

# Full feature list: every remaining column, in sorted order.
train_feats_raw = sorted(col for col in sig.columns if col not in non_training_columns)

# Groups of features removed one group at a time to build the reduced sets.
dropped_feature_groups = [
    ['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
    ['MET', 'METSig'],
    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
    ['Njet', 'Nlep'],
]

# Index 0 is the full list itself; indices 1..4 each omit one group above.
train_feat_sets = [train_feats_raw] + [
    [feat for feat in train_feats_raw if feat not in dropped]
    for dropped in dropped_feature_groups
]

In [None]:
# Normalize training features.
# A single MinMaxScaler is fitted on signal and background stacked together,
# so both samples share the same per-column scaling.
X = pd.concat([sig[train_feats_raw], bg[train_feats_raw]], ignore_index=True)
min_max_scaler = preprocessing.MinMaxScaler().fit(X)

# The raw feature columns are overwritten with their scaled values, so this
# cell is not idempotent: running it again rescales already-scaled data.
for df in (sig, bg):
    scaled_values = min_max_scaler.transform(df[train_feats_raw])
    df[train_feats_raw] = scaled_values

In [None]:
# Load per-background models
models_dir = 'models/background_id_models/'

# Background name -> index into train_feat_sets. The index selects both the
# saved model's file name and the feature columns fed to that model.
background_classifiers = {'ZZ': 1, 'Zjets': 2, 'WZ': 1, 'ttZ': 0, 'other': 1}

for bc_name, bc_index in background_classifiers.items():
    model_path = f'{models_dir}classifier_{bc_name}_train_feat_test_{bc_index}'
    score_column = f'classifier_{bc_name}_score'
    input_feats = train_feat_sets[bc_index]

    classifier = keras.models.load_model(model_path)

    # Store the model output for every signal and background event as a new column.
    sig[score_column] = classifier.predict(sig[input_feats], batch_size=10000)
    bg[score_column] = classifier.predict(bg[input_feats], batch_size=10000)

In [None]:
# ZZ-classifier score: weighted, unit-normalised histograms of background and signal.
# The background histogram defines the 100 bin edges; signal reuses them so the
# two distributions are drawn on identical bins.
_, bin_edges, _ = plt.hist(bg.classifier_ZZ_score, bins=100, weights=bg.wgt, density=True,
                           alpha=0.5, label='Background')
plt.hist(sig.classifier_ZZ_score, bins=bin_edges, weights=sig.wgt, density=True,
         alpha=0.5, label='Signal')

# Label the figure so it can be read without the code next to it.
plt.xlabel('ZZ classifier score')
plt.ylabel('Normalized weighted events')
plt.legend()
plt.show()

In [None]:
# Zjets-classifier score: weighted, unit-normalised histograms of background and signal.
# The background histogram defines the 100 bin edges; signal reuses them so the
# two distributions are drawn on identical bins.
_, bin_edges, _ = plt.hist(bg.classifier_Zjets_score, bins=100, weights=bg.wgt, density=True,
                           alpha=0.5, label='Background')
plt.hist(sig.classifier_Zjets_score, bins=bin_edges, weights=sig.wgt, density=True,
         alpha=0.5, label='Signal')

# Label the figure so it can be read without the code next to it.
plt.xlabel('Zjets classifier score')
plt.ylabel('Normalized weighted events')
plt.legend()
plt.show()

In [None]:
# WZ-classifier score: weighted, unit-normalised histograms of background and signal.
# The background histogram defines the 100 bin edges; signal reuses them so the
# two distributions are drawn on identical bins.
_, bin_edges, _ = plt.hist(bg.classifier_WZ_score, bins=100, weights=bg.wgt, density=True,
                           alpha=0.5, label='Background')
plt.hist(sig.classifier_WZ_score, bins=bin_edges, weights=sig.wgt, density=True,
         alpha=0.5, label='Signal')

# Label the figure so it can be read without the code next to it.
plt.xlabel('WZ classifier score')
plt.ylabel('Normalized weighted events')
plt.legend()
plt.show()

In [None]:
# ttZ-classifier score: weighted, unit-normalised histograms of background and signal.
# The background histogram defines the 100 bin edges; signal reuses them so the
# two distributions are drawn on identical bins.
_, bin_edges, _ = plt.hist(bg.classifier_ttZ_score, bins=100, weights=bg.wgt, density=True,
                           alpha=0.5, label='Background')
plt.hist(sig.classifier_ttZ_score, bins=bin_edges, weights=sig.wgt, density=True,
         alpha=0.5, label='Signal')

# Label the figure so it can be read without the code next to it.
plt.xlabel('ttZ classifier score')
plt.ylabel('Normalized weighted events')
plt.legend()
plt.show()

In [None]:
# "other"-classifier score: weighted, unit-normalised histograms of background and signal.
# The background histogram defines the 100 bin edges; signal reuses them so the
# two distributions are drawn on identical bins.
_, bin_edges, _ = plt.hist(bg.classifier_other_score, bins=100, weights=bg.wgt, density=True,
                           alpha=0.5, label='Background')
plt.hist(sig.classifier_other_score, bins=bin_edges, weights=sig.wgt, density=True,
         alpha=0.5, label='Signal')

# Label the figure so it can be read without the code next to it.
plt.xlabel('other classifier score')
plt.ylabel('Normalized weighted events')
plt.legend()
plt.show()

In [None]:
# Cut to 4l-DF signal region
# .copy() makes each selection an independent frame, so columns added to bg/sig
# afterwards (e.g. abs_wgt) are not assignments on a filtered slice and do not
# trigger pandas' SettingWithCopyWarning.
# NOTE(review): SR is not in the exclusion list used to build train_feats_raw,
# so it appears to have been min-max scaled before this cut. `SR == 1` would
# then select the largest original SR value rather than original SR == 1 -
# confirm this is the intended region.
bg = bg[bg.SR == 1].copy()
sig = sig[sig.SR == 1].copy()

# Train model

First, we determine the signal-to-background ratio (S/B) and the starting significance in the selected region.

In [None]:
def region_sig(s, b):
    """Return the significance sqrt(2 * ((s + b) * ln(1 + s / b) - s)).

    Parameters
    ----------
    s : number
        Signal yield.
    b : number
        Background yield.

    Returns
    -------
    number
        0 when ``s`` equals 0 (the formula is not evaluated in that case),
        otherwise the value of the expression above.
    """
    if s != 0:
        total_yield = s + b
        log_term = np.log(1 + s / b)
        return np.sqrt(2 * (total_yield * log_term - s))
    return 0

# Yields are sums of event weights, not row counts.
n_sig = sum(sig.wgt)
n_bg = sum(bg.wgt)

print('There are', n_bg, 'background events')
print('There are', n_sig, 'signal events')
print()

# Evaluate the significance once and reuse it for both printed figures.
signal_to_background = n_sig / n_bg
starting_significance = region_sig(n_sig, n_bg)

print('S/B =', signal_to_background)
print('Starting significance is', starting_significance, 'sigma')
print('Corresponds to', np.sqrt(2.0) * starting_significance, 'sigma')

In [None]:
# Store the magnitude of each event weight in a new abs_wgt column,
# leaving the signed wgt column untouched.
for frame in (bg, sig):
    frame['abs_wgt'] = frame['wgt'].abs()

In [None]:
# Display the column labels currently present on the signal frame.
sig.columns

In [None]:
# One score column per background classifier, named as it was stored on sig/bg.
bg_classifier_score_feats = [f'classifier_{bc}_score' for bc in background_classifiers]

# Full combined list: the raw kinematic features followed by the classifier scores.
combined_train_feats_raw = train_feats_raw + bg_classifier_score_feats

# Groups of features removed one group at a time to build the reduced sets.
excluded_feature_groups = [
    bg_classifier_score_feats,
    ['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
    ['MET', 'METSig'],
    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
    ['Njet', 'Nlep'],
]

# Index 0 is the full combined list itself; indices 1..5 each omit one group above.
combined_train_feat_sets = [combined_train_feats_raw] + [
    [feat for feat in combined_train_feats_raw if feat not in excluded]
    for excluded in excluded_feature_groups
]

In [None]:
# Training hyperparameters shared by every feature-set variant.
EPOCHS = 10000      # upper bound on epochs; EarlyStopping may end training sooner
patience = 500      # epochs without val_loss improvement before stopping
batch_size = 256
num_nodes = 64      # units in each of the three hidden Dense layers
dropout = 0.1       # dropout rate applied after each hidden layer
learn_rate = 1e-5   # Adam learning rate

# All outputs of this cell go to one directory. Create it (and any missing
# parents) up front so the open()/save() calls below do not fail on a fresh
# checkout; this succeeds silently if the directory already exists.
model_dir = 'models/SR_4l_DF_models/'
tf.io.gfile.makedirs(model_dir)

# Train one classifier per feature set; i is the feature-set index and is
# embedded in every output file name.
for i, train_feats in enumerate(combined_train_feat_sets):
    model_name = 'classifier_train_feat_test_' + str(i)
    
    print('Running with training features:', train_feats)
    # Save training setup
    with open(model_dir + model_name + '_setup.txt', 'w') as file:
        file.write('Epochs: ' + str(EPOCHS) + '\n')
        file.write('Patience: ' + str(patience) + '\n')
        file.write('Learning rate: ' + str(learn_rate) + '\n')
        file.write('Batch size: ' + str(batch_size) + '\n\n')
        file.write('Training features:\n' + '\n'.join(train_feats))
    
    # Generate train and test samples (50/50 split, fixed seed so every
    # feature set sees the same partition of events)
    sig_train, sig_test = train_test_split(sig[train_feats + ['wgt']], train_size=0.5, random_state=314)
    bg_train, bg_test = train_test_split(bg[train_feats + ['wgt']], train_size=0.5, random_state=314)

    # NOTE(review): these overwrite the notebook-level n_sig / n_bg computed
    # earlier from the full (unsplit) samples.
    n_sig = sum(sig_train.wgt)
    n_bg = sum(bg_train.wgt)

    x_train_sig = sig_train[train_feats]
    x_train_bg = bg_train[train_feats]

    # Signal rows first, then background rows; labels and weights follow the same order.
    x_train = pd.concat([x_train_sig, x_train_bg])
    y_train = np.concatenate([np.ones(len(sig_train)), np.zeros(len(bg_train))])
    # Rescale each class so its weights sum to (n_sig + n_bg), i.e. signal and
    # background carry equal total weight in the loss.
    # NOTE(review): the signed `wgt` column is used here; the abs_wgt column
    # computed earlier is not used - confirm this is intended.
    w_train = pd.Series(np.concatenate([(n_sig + n_bg) / n_sig * sig_train['wgt'], 
                                        (n_sig + n_bg) / n_bg * bg_train['wgt']]))

    n_sig_test = sum(sig_test.wgt)
    n_bg_test = sum(bg_test.wgt)

    # Same construction for the held-out half.
    x_test = pd.concat([sig_test[train_feats], bg_test[train_feats]])
    y_test = np.concatenate([np.ones(len(sig_test)), np.zeros(len(bg_test))])
    w_test = pd.Series(np.concatenate([(n_sig_test + n_bg_test) / n_sig_test * sig_test['wgt'], 
                                       (n_sig_test + n_bg_test) / n_bg_test * bg_test['wgt']]))
    
    # Generate and fit model
    # Reset Keras' global state before building the next model in this loop.
    K.clear_session()
    classifier = Sequential()
    classifier.add(Dense(num_nodes, input_dim=x_train.shape[1], activation='relu')) 
    classifier.add(Dropout(dropout))
    classifier.add(Dense(num_nodes, activation='relu'))
    classifier.add(Dropout(dropout))
    classifier.add(Dense(num_nodes, activation='relu'))
    classifier.add(Dropout(dropout))
    classifier.add(Dense(1, activation='sigmoid'))

    opt = keras.optimizers.Adam(learning_rate=learn_rate)
    classifier.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    # Early stopping on the weighted validation loss; the best weights seen are restored.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    # NOTE(review): the test half doubles as the validation set that drives
    # early stopping, so it is not an independent sample for final evaluation.
    history = classifier.fit(x_train, y_train, epochs=EPOCHS, batch_size=batch_size,
                             validation_data=(x_test, y_test, w_test), sample_weight=w_train, 
                             verbose=1, callbacks=[callback], shuffle=True)
    
    # Save model and history
    classifier.save(model_dir + model_name)
    with open(model_dir + model_name + '_history.pkl', 'wb') as file_pi:
        pickle.dump(history.history, file_pi)