In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import backend as K
from tensorflow import keras
from tensorflow.errors import InvalidArgumentError
import tensorflow as tf

# Tensorflow GPU settings
# Ask TensorFlow to allocate GPU memory on demand on every visible GPU.
# Looping over the device list also covers machines without a GPU (empty list, nothing to
# configure) and machines with several GPUs (the setting is applied to all of them alike).
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import shuffle

from scipy import interpolate

from atlasify import atlasify
import pickle
import os

In [None]:
# Earlier signal sample; its column names are what the feature list further down is built from
sig_old = pd.read_feather('/home/grabanal/WVZ/gabriel_ML_data/20220117_iso_e4m1_VVZ_RD.arrow')

# Current signal and full-background samples, each tagged with a boolean truth label
sig = pd.read_feather(
    '/home/grabanal/WVZ/gabriel_ML_data/20220301_ELReLMIs54_MUReLMIs31_btag77_VVZ_RD.arrow'
).assign(is_signal=True)
bg_full = pd.read_feather(
    '/home/grabanal/WVZ/gabriel_ML_data/'
    '20220301_ELReLMIs54_MUReLMIs31_btag77_FULLBG_RD.arrow'
).assign(is_signal=False)

In [None]:
# All per-process background files share one naming scheme; only the process tag differs
_bg_file = '/home/grabanal/WVZ/gabriel_ML_data/20220301_ELReLMIs54_MUReLMIs31_btag77_{}_RD.arrow'.format

bg_ZZ = pd.read_feather(_bg_file('ZZ'))
# NOTE(review): despite the "_old" suffix this reads the same file as bg_Zjets -- confirm it is intended
bg_Zjets_old = pd.read_feather(_bg_file('Zjets'))
bg_Zjets = pd.read_feather(_bg_file('Zjets'))
bg_Zgamma = pd.read_feather(_bg_file('Zgamma'))
bg_WZ = pd.read_feather(_bg_file('WZ'))
bg_tZ = pd.read_feather(_bg_file('tZ'))
bg_tWZ = pd.read_feather(_bg_file('tWZ'))
bg_ttZ = pd.read_feather(_bg_file('ttZ'))
bg_other = pd.read_feather(_bg_file('others'))

# Individual background processes, in the order they are reported below
bg_sources = [bg_ZZ, bg_Zjets, bg_Zgamma, bg_WZ, bg_tZ, bg_tWZ, bg_ttZ, bg_other]

In [None]:
# Share of the total background weight carried by each individual process.
# The denominator is the same for every process, so it is computed once.
total_bg_wgt = sum(bg_full['wgt'])
for df in bg_sources:
    print(sum(df['wgt']) / total_bg_wgt)

In [None]:
# Training features: every column of the older signal sample except bookkeeping / excluded columns,
# in alphabetical order
train_feats_raw = sorted(
    f for f in sig_old.columns
    if f not in ('index', 'wgt', 'is_signal', 'Zcand_mass', 'chisq')
)

# Signal rows followed by full-background rows, restricted to the training features
X = pd.concat([frame[train_feats_raw] for frame in (sig, bg_full)], ignore_index=True)

In [None]:
# Normalize inputs for NN training
min_max_scaler = preprocessing.MinMaxScaler()

# The scaler is fitted on the combined signal + full-background feature table
min_max_scaler.fit(X)

# The frames are rescaled in place, so re-running this cell on its own would scale already-scaled
# values a second time. Each frame is tagged once it has been transformed and is skipped afterwards;
# a freshly re-loaded frame carries no tag and is therefore scaled again as expected.
for df in [sig, bg_full] + bg_sources:
    if df.attrs.get('min_max_scaled', False):
        continue
    df[train_feats_raw] = min_max_scaler.transform(df[train_feats_raw])
    df.attrs['min_max_scaled'] = True

# Signal vs ZZ

ZZ events account for

In [None]:
# Background studied in this section, and its share of the total background weight in percent
current_bg = bg_ZZ
100 * (sum(current_bg['wgt']) / sum(bg_full['wgt']))

percent of the total background. The amount relative to signal is

In [None]:
# Size of this background relative to signal: raw event counts first, then summed event weights
print('Number:', 100 * (len(current_bg) / len(sig)), '%')
print('Weight:', 100 * (sum(current_bg['wgt']) / sum(sig['wgt'])), '%')

In [None]:
# Feature sets to compare: the full list first, then one variant per group of dropped inputs
train_feat_sets = [train_feats_raw] + [
    [f for f in train_feats_raw if f not in dropped]
    for dropped in (['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
                    ['MET', 'METSig'],
                    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
                    ['Njet', 'Nlep'])
]

In [None]:
# Regenerate test samples
# Same 50/50 split and seed for signal and background; only the second half returned by the split is kept
_, sig_test = train_test_split(sig[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)
_, bg_test = train_test_split(current_bg[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)

# Summed event weights of each class in the test sample
n_sig_test, n_bg_test = sum(sig_test['wgt']), sum(bg_test['wgt'])

x_test = pd.concat([frame[train_feats_raw] for frame in (sig_test, bg_test)])
# Labels follow the row order of x_test: signal rows (1) first, then background rows (0)
y_test = np.repeat([1.0, 0.0], [len(sig_test), len(bg_test)])
# Each class is rescaled so that its weights add up to the combined signal + background total,
# i.e. both classes carry the same total weight
w_test = pd.Series(np.concatenate([
    (n_sig_test + n_bg_test) / class_total * frame['wgt']
    for class_total, frame in ((n_sig_test, sig_test), (n_bg_test, bg_test))
]))

Now we scan over tests

In [None]:
scan_test_str = '20220301_classifier_ZZ'
plot_str = scan_test_str

# A test counts as completed once its '<name>_history.pkl' file exists; the model path is that
# file name without the suffix.
# os.listdir returns entries in arbitrary order, but the position in this list is used later to
# pick train_feat_sets[i] and to name the plots, so the list is sorted to make it reproducible.
# NOTE(review): assumes the sorted model names line up with the order of train_feat_sets -- confirm.
completed_tests = sorted(
    ('models/background_id_models/' + t).replace('_history.pkl', '')
    for t in os.listdir('models/background_id_models/')
    if t.startswith(scan_test_str) and t.endswith('_history.pkl')
)
completed_tests

In [None]:
# Load every completed model and score the test frames with it.
# Model i is evaluated on train_feat_sets[i], so a feature set must exist for every model found on
# disk; check that before loading anything instead of failing part-way through the loop.
if len(completed_tests) > len(train_feat_sets):
    raise IndexError('Found ' + str(len(completed_tests)) + ' completed tests but only '
                     + str(len(train_feat_sets)) + ' feature sets are defined')

models = []

for i, test in enumerate(completed_tests):
    models.append(keras.models.load_model(test))

    # The same score column is added to the combined frame (used for the ROC curves) and to the
    # separate signal / background frames (used for the output histograms)
    for frame in (x_test, sig_test, bg_test):
        frame['output_test_' + str(i)] = models[i].predict(frame[train_feat_sets[i]], batch_size=10000)

In [None]:
# Generate ROC curves

# savefig does not create missing folders, and no earlier cell creates this one
os.makedirs('plots/background_id_models/', exist_ok=True)

plt.figure(figsize=(8, 6))

# Diagonal = classifier with no separation power
plt.plot([0, 1], [0, 1], ls='--', color='grey', label='Random')

# One weighted ROC curve per completed test, all on the same axes
for i, test in enumerate(completed_tests):
    fpr, tpr, _ = roc_curve(y_test, x_test['output_test_' + str(i)], sample_weight=w_test)

    plt.plot(fpr, tpr, label='Test ' + str(i))

plt.legend(fontsize=14)
plt.xlabel('False positive rate', fontsize=14)
plt.ylabel('True positive rate', fontsize=14)
# This section compares signal against the ZZ background only
plt.title('Signal vs ZZ', fontsize=16, loc='right')

atlasify('Internal Simulation', outside=True)

for ext in ('.png', '.pdf'):
    plt.savefig('plots/background_id_models/' + plot_str + '_train_feat_test_roc' + ext,
                pad_inches=0.05, bbox_inches='tight')

In [None]:
# Generate training history plots

# The output folder is the same for every test, so it is created once up front
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    # NOTE(review): presumably the pickled Keras History.history dict (one value per epoch) -- confirm
    with open(test + '_history.pkl', 'rb') as f:
        history = pickle.load(f)

    # (key in the history dict, short tag used in legend and file name, y-axis label)
    for metric, tag, ylabel in (('loss', 'loss', 'Loss'), ('accuracy', 'acc', 'Accuracy')):
        # A fresh figure per curve: after plt.show() the previous figure is no longer the current one
        plt.figure(figsize=(6, 4))

        plt.plot(history[metric], label=tag)
        plt.plot(history['val_' + metric], label='val ' + tag)

        plt.legend(fontsize=12)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        atlasify('Internal simulation', outside=True)

        plt.title('ZZ Test ' + str(i), fontsize=14, loc='right')

        for ext in ('.png', '.pdf'):
            plt.savefig(out_dir + 'test_' + str(i) + '_' + tag + ext,
                        pad_inches=0.05, bbox_inches='tight')

        plt.show()

In [None]:
# Generate signal vs background histograms

# Make sure the output folder exists so this cell does not depend on the training-history cell
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    plt.figure(figsize=(6, 4))

    # Background is drawn first; its bin edges (b) are reused so both distributions share one binning
    _, b, _ = plt.hist(bg_test['output_test_' + str(i)], bins=30, weights=bg_test.wgt, label='ZZ',
                       density=True, alpha=0.5)
    plt.hist(sig_test['output_test_' + str(i)], bins=b, weights=sig_test.wgt, label='Sig',
             density=True, alpha=0.5)

    plt.legend(fontsize=12)
    plt.xlabel('NN output', fontsize=12)
    plt.ylabel('Density', fontsize=12)

    atlasify('Internal simulation', outside=True)

    plt.title('Test ' + str(i), fontsize=14, loc='right')

    for ext in ('.png', '.pdf'):
        plt.savefig(out_dir + 'test_' + str(i) + '_output' + ext,
                    pad_inches=0.05, bbox_inches='tight')

# Signal vs Z + jets

Z + jets events account for

In [None]:
# Background studied in this section, and its share of the total background weight in percent
current_bg = bg_Zjets
100 * (sum(current_bg['wgt']) / sum(bg_full['wgt']))

percent of the total background. The amount relative to signal is

In [None]:
# Size of this background relative to signal: raw event counts first, then summed event weights
print('Number:', 100 * (len(current_bg) / len(sig)), '%')
print('Weight:', 100 * (sum(current_bg['wgt']) / sum(sig['wgt'])), '%')

In [None]:
# Feature sets to compare: the full list first, then one variant per group of dropped inputs
train_feat_sets = [train_feats_raw] + [
    [f for f in train_feats_raw if f not in dropped]
    for dropped in (['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
                    ['MET', 'METSig'],
                    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
                    ['Njet', 'Nlep'])
]

In [None]:
# Regenerate test samples
# Same 50/50 split and seed for signal and background; only the second half returned by the split is kept
_, sig_test = train_test_split(sig[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)
_, bg_test = train_test_split(current_bg[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)

# Summed event weights of each class in the test sample
n_sig_test, n_bg_test = sum(sig_test['wgt']), sum(bg_test['wgt'])

x_test = pd.concat([frame[train_feats_raw] for frame in (sig_test, bg_test)])
# Labels follow the row order of x_test: signal rows (1) first, then background rows (0)
y_test = np.repeat([1.0, 0.0], [len(sig_test), len(bg_test)])
# Each class is rescaled so that its weights add up to the combined signal + background total,
# i.e. both classes carry the same total weight
w_test = pd.Series(np.concatenate([
    (n_sig_test + n_bg_test) / class_total * frame['wgt']
    for class_total, frame in ((n_sig_test, sig_test), (n_bg_test, bg_test))
]))

Now we scan over tests

In [None]:
scan_test_str = '20220301_classifier_Zjets'
plot_str = scan_test_str

# A test counts as completed once its '<name>_history.pkl' file exists; the model path is that
# file name without the suffix.
# os.listdir returns entries in arbitrary order, but the position in this list is used later to
# pick train_feat_sets[i] and to name the plots, so the list is sorted to make it reproducible.
# NOTE(review): assumes the sorted model names line up with the order of train_feat_sets -- confirm.
completed_tests = sorted(
    ('models/background_id_models/' + t).replace('_history.pkl', '')
    for t in os.listdir('models/background_id_models/')
    if t.startswith(scan_test_str) and t.endswith('_history.pkl')
)
completed_tests

In [None]:
# Load every completed model and score the test frames with it.
# Model i is evaluated on train_feat_sets[i], so a feature set must exist for every model found on
# disk; check that before loading anything instead of failing part-way through the loop.
if len(completed_tests) > len(train_feat_sets):
    raise IndexError('Found ' + str(len(completed_tests)) + ' completed tests but only '
                     + str(len(train_feat_sets)) + ' feature sets are defined')

models = []

for i, test in enumerate(completed_tests):
    models.append(keras.models.load_model(test))

    # The same score column is added to the combined frame (used for the ROC curves) and to the
    # separate signal / background frames (used for the output histograms)
    for frame in (x_test, sig_test, bg_test):
        frame['output_test_' + str(i)] = models[i].predict(frame[train_feat_sets[i]], batch_size=10000)

In [None]:
# Generate ROC curves

# savefig does not create missing folders, and no earlier cell creates this one
os.makedirs('plots/background_id_models/', exist_ok=True)

plt.figure(figsize=(8, 6))

# Diagonal = classifier with no separation power
plt.plot([0, 1], [0, 1], ls='--', color='grey', label='Random')

# One weighted ROC curve per completed test, all on the same axes
for i, test in enumerate(completed_tests):
    fpr, tpr, _ = roc_curve(y_test, x_test['output_test_' + str(i)], sample_weight=w_test)

    plt.plot(fpr, tpr, label='Test ' + str(i))

plt.legend(fontsize=14)
plt.xlabel('False positive rate', fontsize=14)
plt.ylabel('True positive rate', fontsize=14)
# This section compares signal against the Z + jets background only
plt.title('Signal vs Z + jets', fontsize=16, loc='right')

atlasify('Internal Simulation', outside=True)

for ext in ('.png', '.pdf'):
    plt.savefig('plots/background_id_models/' + plot_str + '_train_feat_test_roc' + ext,
                pad_inches=0.05, bbox_inches='tight')

In [None]:
# Generate training history plots

# The output folder is the same for every test, so it is created once up front
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    # NOTE(review): presumably the pickled Keras History.history dict (one value per epoch) -- confirm
    with open(test + '_history.pkl', 'rb') as f:
        history = pickle.load(f)

    # (key in the history dict, short tag used in legend and file name, y-axis label)
    for metric, tag, ylabel in (('loss', 'loss', 'Loss'), ('accuracy', 'acc', 'Accuracy')):
        # A fresh figure per curve: after plt.show() the previous figure is no longer the current one
        plt.figure(figsize=(6, 4))

        plt.plot(history[metric], label=tag)
        plt.plot(history['val_' + metric], label='val ' + tag)

        plt.legend(fontsize=12)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        atlasify('Internal simulation', outside=True)

        plt.title('Z + jets Test ' + str(i), fontsize=14, loc='right')

        for ext in ('.png', '.pdf'):
            plt.savefig(out_dir + 'test_' + str(i) + '_' + tag + ext,
                        pad_inches=0.05, bbox_inches='tight')

        plt.show()

In [None]:
# Generate signal vs background histograms

# Make sure the output folder exists so this cell does not depend on the training-history cell
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    plt.figure(figsize=(6, 4))

    # Background is drawn first; its bin edges (b) are reused so both distributions share one binning
    _, b, _ = plt.hist(bg_test['output_test_' + str(i)], bins=30, weights=bg_test.wgt, label='Z + jets',
                       density=True, alpha=0.5)
    plt.hist(sig_test['output_test_' + str(i)], bins=b, weights=sig_test.wgt, label='Sig',
             density=True, alpha=0.5)

    plt.legend(fontsize=12)
    plt.xlabel('NN output', fontsize=12)
    plt.ylabel('Density', fontsize=12)

    atlasify('Internal simulation', outside=True)

    plt.title('Test ' + str(i), fontsize=14, loc='right')

    for ext in ('.png', '.pdf'):
        plt.savefig(out_dir + 'test_' + str(i) + '_output' + ext,
                    pad_inches=0.05, bbox_inches='tight')

# Signal vs ttZ

ttZ events account for

In [None]:
# Background studied in this section, and its share of the total background weight in percent
current_bg = bg_ttZ
100 * (sum(current_bg['wgt']) / sum(bg_full['wgt']))

percent of the total background. The amount relative to signal is

In [None]:
# Size of this background relative to signal: raw event counts first, then summed event weights
print('Number:', 100 * (len(current_bg) / len(sig)), '%')
print('Weight:', 100 * (sum(current_bg['wgt']) / sum(sig['wgt'])), '%')

In [None]:
# Feature sets to compare: the full list first, then one variant per group of dropped inputs
train_feat_sets = [train_feats_raw] + [
    [f for f in train_feats_raw if f not in dropped]
    for dropped in (['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
                    ['MET', 'METSig'],
                    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
                    ['Njet', 'Nlep'])
]

In [None]:
# Regenerate test samples
# Same 50/50 split and seed for signal and background; only the second half returned by the split is kept
_, sig_test = train_test_split(sig[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)
_, bg_test = train_test_split(current_bg[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)

# Summed event weights of each class in the test sample
n_sig_test, n_bg_test = sum(sig_test['wgt']), sum(bg_test['wgt'])

x_test = pd.concat([frame[train_feats_raw] for frame in (sig_test, bg_test)])
# Labels follow the row order of x_test: signal rows (1) first, then background rows (0)
y_test = np.repeat([1.0, 0.0], [len(sig_test), len(bg_test)])
# Each class is rescaled so that its weights add up to the combined signal + background total,
# i.e. both classes carry the same total weight
w_test = pd.Series(np.concatenate([
    (n_sig_test + n_bg_test) / class_total * frame['wgt']
    for class_total, frame in ((n_sig_test, sig_test), (n_bg_test, bg_test))
]))

Now we scan over tests

In [None]:
scan_test_str = '20220301_classifier_ttZ'
plot_str = scan_test_str

# A test counts as completed once its '<name>_history.pkl' file exists; the model path is that
# file name without the suffix.
# os.listdir returns entries in arbitrary order, but the position in this list is used later to
# pick train_feat_sets[i] and to name the plots, so the list is sorted to make it reproducible.
# NOTE(review): assumes the sorted model names line up with the order of train_feat_sets -- confirm.
completed_tests = sorted(
    ('models/background_id_models/' + t).replace('_history.pkl', '')
    for t in os.listdir('models/background_id_models/')
    if t.startswith(scan_test_str) and t.endswith('_history.pkl')
)
completed_tests

In [None]:
# Load every completed model and score the test frames with it.
# Model i is evaluated on train_feat_sets[i], so a feature set must exist for every model found on
# disk; check that before loading anything instead of failing part-way through the loop.
if len(completed_tests) > len(train_feat_sets):
    raise IndexError('Found ' + str(len(completed_tests)) + ' completed tests but only '
                     + str(len(train_feat_sets)) + ' feature sets are defined')

models = []

for i, test in enumerate(completed_tests):
    models.append(keras.models.load_model(test))

    # The same score column is added to the combined frame (used for the ROC curves) and to the
    # separate signal / background frames (used for the output histograms)
    for frame in (x_test, sig_test, bg_test):
        frame['output_test_' + str(i)] = models[i].predict(frame[train_feat_sets[i]], batch_size=10000)

In [None]:
# Generate ROC curves

# savefig does not create missing folders, and no earlier cell creates this one
os.makedirs('plots/background_id_models/', exist_ok=True)

plt.figure(figsize=(8, 6))

# Diagonal = classifier with no separation power
plt.plot([0, 1], [0, 1], ls='--', color='grey', label='Random')

# One weighted ROC curve per completed test, all on the same axes
for i, test in enumerate(completed_tests):
    fpr, tpr, _ = roc_curve(y_test, x_test['output_test_' + str(i)], sample_weight=w_test)

    plt.plot(fpr, tpr, label='Test ' + str(i))

plt.legend(fontsize=14)
plt.xlabel('False positive rate', fontsize=14)
plt.ylabel('True positive rate', fontsize=14)
# This section compares signal against the ttZ background only
plt.title('Signal vs ttZ', fontsize=16, loc='right')

atlasify('Internal Simulation', outside=True)

for ext in ('.png', '.pdf'):
    plt.savefig('plots/background_id_models/' + plot_str + '_train_feat_test_roc' + ext,
                pad_inches=0.05, bbox_inches='tight')

In [None]:
# Generate training history plots

# The output folder is the same for every test, so it is created once up front
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    # NOTE(review): presumably the pickled Keras History.history dict (one value per epoch) -- confirm
    with open(test + '_history.pkl', 'rb') as f:
        history = pickle.load(f)

    # (key in the history dict, short tag used in legend and file name, y-axis label)
    for metric, tag, ylabel in (('loss', 'loss', 'Loss'), ('accuracy', 'acc', 'Accuracy')):
        # A fresh figure per curve: after plt.show() the previous figure is no longer the current one
        plt.figure(figsize=(6, 4))

        plt.plot(history[metric], label=tag)
        plt.plot(history['val_' + metric], label='val ' + tag)

        plt.legend(fontsize=12)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        atlasify('Internal simulation', outside=True)

        plt.title('ttZ Test ' + str(i), fontsize=14, loc='right')

        for ext in ('.png', '.pdf'):
            plt.savefig(out_dir + 'test_' + str(i) + '_' + tag + ext,
                        pad_inches=0.05, bbox_inches='tight')

        plt.show()

In [None]:
# Generate signal vs background histograms

# Make sure the output folder exists so this cell does not depend on the training-history cell
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    plt.figure(figsize=(6, 4))

    # Background is drawn first; its bin edges (b) are reused so both distributions share one binning
    _, b, _ = plt.hist(bg_test['output_test_' + str(i)], bins=30, weights=bg_test.wgt, label='ttZ',
                       density=True, alpha=0.5)
    plt.hist(sig_test['output_test_' + str(i)], bins=b, weights=sig_test.wgt, label='Sig',
             density=True, alpha=0.5)

    plt.legend(fontsize=12)
    plt.xlabel('NN output', fontsize=12)
    plt.ylabel('Density', fontsize=12)

    atlasify('Internal simulation', outside=True)

    plt.title('Test ' + str(i), fontsize=14, loc='right')

    for ext in ('.png', '.pdf'):
        plt.savefig(out_dir + 'test_' + str(i) + '_output' + ext,
                    pad_inches=0.05, bbox_inches='tight')

# Signal vs WZ

WZ events account for

In [None]:
# Background studied in this section, and its share of the total background weight in percent
current_bg = bg_WZ
100 * (sum(current_bg['wgt']) / sum(bg_full['wgt']))

percent of the total background. The amount relative to signal is

In [None]:
# Size of this background relative to signal: raw event counts first, then summed event weights
print('Number:', 100 * (len(current_bg) / len(sig)), '%')
print('Weight:', 100 * (sum(current_bg['wgt']) / sum(sig['wgt'])), '%')

In [None]:
# Feature sets to compare: the full list first, then one variant per group of dropped inputs
train_feat_sets = [train_feats_raw] + [
    [f for f in train_feats_raw if f not in dropped]
    for dropped in (['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
                    ['MET', 'METSig'],
                    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
                    ['Njet', 'Nlep'])
]

In [None]:
# Regenerate test samples
# Same 50/50 split and seed for signal and background; only the second half returned by the split is kept
_, sig_test = train_test_split(sig[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)
_, bg_test = train_test_split(current_bg[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)

# Summed event weights of each class in the test sample
n_sig_test, n_bg_test = sum(sig_test['wgt']), sum(bg_test['wgt'])

x_test = pd.concat([frame[train_feats_raw] for frame in (sig_test, bg_test)])
# Labels follow the row order of x_test: signal rows (1) first, then background rows (0)
y_test = np.repeat([1.0, 0.0], [len(sig_test), len(bg_test)])
# Each class is rescaled so that its weights add up to the combined signal + background total,
# i.e. both classes carry the same total weight
w_test = pd.Series(np.concatenate([
    (n_sig_test + n_bg_test) / class_total * frame['wgt']
    for class_total, frame in ((n_sig_test, sig_test), (n_bg_test, bg_test))
]))

Now we scan over tests

In [None]:
scan_test_str = '20220301_classifier_WZ'
plot_str = scan_test_str

# A test counts as completed once its '<name>_history.pkl' file exists; the model path is that
# file name without the suffix.
# os.listdir returns entries in arbitrary order, but the position in this list is used later to
# pick train_feat_sets[i] and to name the plots, so the list is sorted to make it reproducible.
# NOTE(review): assumes the sorted model names line up with the order of train_feat_sets -- confirm.
completed_tests = sorted(
    ('models/background_id_models/' + t).replace('_history.pkl', '')
    for t in os.listdir('models/background_id_models/')
    if t.startswith(scan_test_str) and t.endswith('_history.pkl')
)
completed_tests

In [None]:
# Load every completed model and score the test frames with it.
# Model i is evaluated on train_feat_sets[i], so a feature set must exist for every model found on
# disk; check that before loading anything instead of failing part-way through the loop.
if len(completed_tests) > len(train_feat_sets):
    raise IndexError('Found ' + str(len(completed_tests)) + ' completed tests but only '
                     + str(len(train_feat_sets)) + ' feature sets are defined')

models = []

for i, test in enumerate(completed_tests):
    models.append(keras.models.load_model(test))

    # The same score column is added to the combined frame (used for the ROC curves) and to the
    # separate signal / background frames (used for the output histograms)
    for frame in (x_test, sig_test, bg_test):
        frame['output_test_' + str(i)] = models[i].predict(frame[train_feat_sets[i]], batch_size=10000)

In [None]:
# Generate ROC curves

# savefig does not create missing folders, and no earlier cell creates this one
os.makedirs('plots/background_id_models/', exist_ok=True)

plt.figure(figsize=(8, 6))

# Diagonal = classifier with no separation power
plt.plot([0, 1], [0, 1], ls='--', color='grey', label='Random')

# One weighted ROC curve per completed test, all on the same axes
for i, test in enumerate(completed_tests):
    fpr, tpr, _ = roc_curve(y_test, x_test['output_test_' + str(i)], sample_weight=w_test)

    plt.plot(fpr, tpr, label='Test ' + str(i))

plt.legend(fontsize=14)
plt.xlabel('False positive rate', fontsize=14)
plt.ylabel('True positive rate', fontsize=14)
# This section compares signal against the WZ background only
plt.title('Signal vs WZ', fontsize=16, loc='right')

atlasify('Internal Simulation', outside=True)

for ext in ('.png', '.pdf'):
    plt.savefig('plots/background_id_models/' + plot_str + '_train_feat_test_roc' + ext,
                pad_inches=0.05, bbox_inches='tight')

In [None]:
# Generate training history plots

# The output folder is the same for every test, so it is created once up front
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    # NOTE(review): presumably the pickled Keras History.history dict (one value per epoch) -- confirm
    with open(test + '_history.pkl', 'rb') as f:
        history = pickle.load(f)

    # (key in the history dict, short tag used in legend and file name, y-axis label)
    for metric, tag, ylabel in (('loss', 'loss', 'Loss'), ('accuracy', 'acc', 'Accuracy')):
        # A fresh figure per curve: after plt.show() the previous figure is no longer the current one
        plt.figure(figsize=(6, 4))

        plt.plot(history[metric], label=tag)
        plt.plot(history['val_' + metric], label='val ' + tag)

        plt.legend(fontsize=12)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        atlasify('Internal simulation', outside=True)

        plt.title('WZ Test ' + str(i), fontsize=14, loc='right')

        for ext in ('.png', '.pdf'):
            plt.savefig(out_dir + 'test_' + str(i) + '_' + tag + ext,
                        pad_inches=0.05, bbox_inches='tight')

        plt.show()

In [None]:
# Generate signal vs background histograms

# Make sure the output folder exists so this cell does not depend on the training-history cell
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    plt.figure(figsize=(6, 4))

    # Background is drawn first; its bin edges (b) are reused so both distributions share one binning
    _, b, _ = plt.hist(bg_test['output_test_' + str(i)], bins=30, weights=bg_test.wgt, label='WZ',
                       density=True, alpha=0.5)
    plt.hist(sig_test['output_test_' + str(i)], bins=b, weights=sig_test.wgt, label='Sig',
             density=True, alpha=0.5)

    plt.legend(fontsize=12)
    plt.xlabel('NN output', fontsize=12)
    plt.ylabel('Density', fontsize=12)

    atlasify('Internal simulation', outside=True)

    plt.title('Test ' + str(i), fontsize=14, loc='right')

    for ext in ('.png', '.pdf'):
        plt.savefig(out_dir + 'test_' + str(i) + '_output' + ext,
                    pad_inches=0.05, bbox_inches='tight')

# Signal vs other

Other events account for

In [None]:
# "Other" background = the processes without a dedicated section, stacked into one frame;
# the last line gives its share of the total background weight in percent
current_bg = pd.concat([bg_Zgamma, bg_tZ, bg_tWZ, bg_other])
100 * (sum(current_bg['wgt']) / sum(bg_full['wgt']))

percent of the total background. The amount relative to signal is

In [None]:
# Size of this background relative to signal: raw event counts first, then summed event weights
print('Number:', 100 * (len(current_bg) / len(sig)), '%')
print('Weight:', 100 * (sum(current_bg['wgt']) / sum(sig['wgt'])), '%')

In [None]:
# Feature sets to compare: the full list first, then one variant per group of dropped inputs
train_feat_sets = [train_feats_raw] + [
    [f for f in train_feats_raw if f not in dropped]
    for dropped in (['Wlep1_phi', 'Wlep2_phi', 'Zlep1_phi', 'Zlep2_phi'],
                    ['MET', 'METSig'],
                    ['pt_1', 'pt_2', 'pt_3', 'pt_4'],
                    ['Njet', 'Nlep'])
]

In [None]:
# Regenerate test samples
# Same 50/50 split and seed for signal and background; only the second half returned by the split is kept
_, sig_test = train_test_split(sig[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)
_, bg_test = train_test_split(current_bg[train_feats_raw + ['wgt']], random_state=314, train_size=0.5)

# Summed event weights of each class in the test sample
n_sig_test, n_bg_test = sum(sig_test['wgt']), sum(bg_test['wgt'])

x_test = pd.concat([frame[train_feats_raw] for frame in (sig_test, bg_test)])
# Labels follow the row order of x_test: signal rows (1) first, then background rows (0)
y_test = np.repeat([1.0, 0.0], [len(sig_test), len(bg_test)])
# Each class is rescaled so that its weights add up to the combined signal + background total,
# i.e. both classes carry the same total weight
w_test = pd.Series(np.concatenate([
    (n_sig_test + n_bg_test) / class_total * frame['wgt']
    for class_total, frame in ((n_sig_test, sig_test), (n_bg_test, bg_test))
]))

Now we scan over tests

In [None]:
scan_test_str = '20220301_classifier_other'
plot_str = scan_test_str

# A test counts as completed once its '<name>_history.pkl' file exists; the model path is that
# file name without the suffix.
# os.listdir returns entries in arbitrary order, but the position in this list is used later to
# pick train_feat_sets[i] and to name the plots, so the list is sorted to make it reproducible.
# NOTE(review): assumes the sorted model names line up with the order of train_feat_sets -- confirm.
completed_tests = sorted(
    ('models/background_id_models/' + t).replace('_history.pkl', '')
    for t in os.listdir('models/background_id_models/')
    if t.startswith(scan_test_str) and t.endswith('_history.pkl')
)
completed_tests

In [None]:
# Load every completed model and score the test frames with it.
# Model i is evaluated on train_feat_sets[i], so a feature set must exist for every model found on
# disk; check that before loading anything instead of failing part-way through the loop.
if len(completed_tests) > len(train_feat_sets):
    raise IndexError('Found ' + str(len(completed_tests)) + ' completed tests but only '
                     + str(len(train_feat_sets)) + ' feature sets are defined')

models = []

for i, test in enumerate(completed_tests):
    models.append(keras.models.load_model(test))

    # The same score column is added to the combined frame (used for the ROC curves) and to the
    # separate signal / background frames (used for the output histograms)
    for frame in (x_test, sig_test, bg_test):
        frame['output_test_' + str(i)] = models[i].predict(frame[train_feat_sets[i]], batch_size=10000)

In [None]:
# Generate ROC curves

# savefig does not create missing folders, and no earlier cell creates this one
os.makedirs('plots/background_id_models/', exist_ok=True)

plt.figure(figsize=(8, 6))

# Diagonal = classifier with no separation power
plt.plot([0, 1], [0, 1], ls='--', color='grey', label='Random')

# One weighted ROC curve per completed test, all on the same axes
for i, test in enumerate(completed_tests):
    fpr, tpr, _ = roc_curve(y_test, x_test['output_test_' + str(i)], sample_weight=w_test)

    plt.plot(fpr, tpr, label='Test ' + str(i))

plt.legend(fontsize=14)
plt.xlabel('False positive rate', fontsize=14)
plt.ylabel('True positive rate', fontsize=14)
plt.title('Signal vs other', fontsize=16, loc='right')

atlasify('Internal Simulation', outside=True)

for ext in ('.png', '.pdf'):
    plt.savefig('plots/background_id_models/' + plot_str + '_train_feat_test_roc' + ext,
                pad_inches=0.05, bbox_inches='tight')

In [None]:
# Generate training history plots

# The output folder is the same for every test, so it is created once up front
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    # NOTE(review): presumably the pickled Keras History.history dict (one value per epoch) -- confirm
    with open(test + '_history.pkl', 'rb') as f:
        history = pickle.load(f)

    # (key in the history dict, short tag used in legend and file name, y-axis label)
    for metric, tag, ylabel in (('loss', 'loss', 'Loss'), ('accuracy', 'acc', 'Accuracy')):
        # A fresh figure per curve: after plt.show() the previous figure is no longer the current one
        plt.figure(figsize=(6, 4))

        plt.plot(history[metric], label=tag)
        plt.plot(history['val_' + metric], label='val ' + tag)

        plt.legend(fontsize=12)
        plt.xlabel('Epoch', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)

        atlasify('Internal simulation', outside=True)

        plt.title('other Test ' + str(i), fontsize=14, loc='right')

        for ext in ('.png', '.pdf'):
            plt.savefig(out_dir + 'test_' + str(i) + '_' + tag + ext,
                        pad_inches=0.05, bbox_inches='tight')

        plt.show()

In [None]:
# Generate signal vs background histograms

# Make sure the output folder exists so this cell does not depend on the training-history cell
out_dir = 'plots/background_id_models/' + plot_str + '/train_feat_test/'
os.makedirs(out_dir, exist_ok=True)

for i, test in enumerate(completed_tests):
    plt.figure(figsize=(6, 4))

    # Background is drawn first; its bin edges (b) are reused so both distributions share one binning
    _, b, _ = plt.hist(bg_test['output_test_' + str(i)], bins=30, weights=bg_test.wgt, label='Other',
                       density=True, alpha=0.5)
    plt.hist(sig_test['output_test_' + str(i)], bins=b, weights=sig_test.wgt, label='Sig',
             density=True, alpha=0.5)

    plt.legend(fontsize=12)
    plt.xlabel('NN output', fontsize=12)
    plt.ylabel('Density', fontsize=12)

    atlasify('Internal simulation', outside=True)

    plt.title('Test ' + str(i), fontsize=14, loc='right')

    for ext in ('.png', '.pdf'):
        plt.savefig(out_dir + 'test_' + str(i) + '_output' + ext,
                    pad_inches=0.05, bbox_inches='tight')