In [57]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential

%matplotlib inline

In [58]:
# Load the event table; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer='full_data.csv')

In [59]:
# Remove the leading column of the loaded frame, rebinding `data` instead of
# mutating it in place.
# NOTE(review): each re-run of this cell without reloading `data` strips one
# more column. Presumably the leading column is a saved row index - confirm
# against the CSV.
data = data.drop(columns=[data.columns[0]])

data.head()

Unnamed: 0,jj_eta_1,j_eta_2,jj_theta_1,jj_theta_2,reco_q_1,reco_q_2,reco_s_1,reco_s_2,reco_g_1,reco_g_2,...,jet_nchad_1,jet_nchad_2,jet_nconst_1,jet_nconst_2,jj_m,jj_pt_1,jj_pt_2,jet_m_1,jet_m_2,label
0,-0.19937,0.178718,1.76886,1.39302,0.002115,0.000134,0.000893,0.000191,0.049876,0.009023,...,10,10,20,16,123.909,34.3344,31.9266,52.3694,17.2263,0
1,-1.31431,1.29481,2.61667,0.534777,2.6e-05,4.5e-05,2.1e-05,3.7e-05,0.011074,0.004671,...,9,14,18,32,126.411,38.6439,36.557,15.0486,10.7552,0
2,0.200933,0.036554,1.3712,1.53425,6.2e-05,9e-06,0.000122,1.7e-05,0.002114,0.00344,...,7,2,23,8,66.6327,26.958,30.2782,15.7986,8.8268,0
3,0.060725,0.060067,1.51011,1.51077,7.9e-05,5e-05,3.1e-05,4.6e-05,0.004614,0.004679,...,14,10,24,19,105.505,41.0236,36.772,22.2242,14.2766,0
4,-0.10022,0.088021,1.67085,1.48289,3.2e-05,2.1e-05,2.7e-05,2.9e-05,0.005912,0.013,...,9,13,32,21,128.141,31.374,28.5213,7.17776,10.3575,0


In [60]:
# Shuffle all rows before the positional train/test split further down.
# A fixed seed keeps the split (and therefore every reported number and plot)
# reproducible across kernel restarts. Note this only pins the shuffle; the
# network's weight initialisation is not seeded here.
shuf_data = data.sample(frac=1, random_state=42)

In [61]:
# Total number of events, and how many of them (80%, truncated to an integer)
# go into the training split.
no_events = len(shuf_data)
no_training = int(no_events * 0.8)

In [62]:
# Convert the shuffled frame into a plain 2-D NumPy array (rows = events).
dataset = shuf_data.to_numpy(copy=False)

In [63]:
# Columns 0..22 are the network inputs; column 23 is used as the target.
X, y = dataset[:, :23], dataset[:, 23]

In [64]:
# Positional split: rows before `no_training` are used for training,
# the remaining rows are held out for testing.
X_train, X_test = X[:no_training], X[no_training:]
y_train, y_test = y[:no_training], y[no_training:]

In [65]:
# Fully connected binary classifier:
# 23 inputs -> 12 (relu) -> 23 (relu) -> 23 (relu) -> 1 (sigmoid).
model = Sequential([
    Dense(12, input_shape=(23,), activation='relu'),
    Dense(23, activation='relu'),
    Dense(23, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [66]:
# Binary cross-entropy loss with the Adam optimiser; accuracy and AUC are
# tracked during fit/evaluate.
# The AUC metric is passed as an object rather than the bare string 'auc':
# the string alias is not resolved by every Keras release, while the class is.
# Naming it 'auc' keeps the reported metric key unchanged.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', AUC(name='auc')],
)

In [67]:
# Train for two passes over the training split in mini-batches of 10 events.
model.fit(x=X_train, y=y_train, batch_size=10, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fcc8c175450>

In [68]:
# Evaluate the trained model on the held-out split.
# `evaluate` returns the loss followed by one value per compiled metric, so a
# fixed two-name unpack raises ValueError as soon as more than one metric is
# compiled (this model tracks accuracy and AUC). Request a dict and read the
# accuracy by name instead.
eval_results = model.evaluate(X_test, y_test, return_dict=True)
accuracy = eval_results['accuracy']
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 95.79


In [69]:
# Raw network outputs for the test split (the output layer has a single unit,
# so each row holds one score).
predictions = model.predict(X_test)
# Round every score to the nearest integer with the built-in round().
rounded = [round(score) for score in predictions[:, 0]]



In [70]:
# Hard class labels: 1 where the network output is above 0.5, otherwise 0.
# This overwrites `predictions` with integer labels.
predictions = np.greater(model.predict(X_test), 0.5).astype(int)



In [None]:
# Print up to five test events: input features, predicted class, true label.
# `predictions` has one single-element row per event, so the scalar is taken
# with [i, 0]; formatting a whole 1-element array with %d relies on NumPy's
# deprecated array-to-scalar conversion. The range is capped so a test split
# with fewer than five events does not raise IndexError.
for i in range(min(5, len(X_test))):
    print('%s => %d (expected %d)' % (X_test[i].tolist(), predictions[i, 0], y_test[i]))

In [None]:
# Histogram of `predictions` for every test event (signal and background together).
fig, ax = plt.subplots()
ax.hist(predictions, bins=np.linspace(0, 1, 50), histtype='step',
        color='darkgreen', label='All events')
# Axis labels and a frameless legend keep the figure readable on its own.
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Overlay the `predictions` histograms of true-signal and true-background events.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)

fig, ax = plt.subplots()
ax.hist(predictions[is_signal], bins=score_bins,
        histtype='step', color='midnightblue', label='signal')
ax.hist(predictions[~is_signal], bins=score_bins,
        histtype='step', color='firebrick', label='background')
# Axis labels and a frameless legend keep the figure readable on its own.
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Bin `predictions` separately for true-signal and true-background events.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)
signal, sig_bins = np.histogram(predictions[is_signal], bins=score_bins)
bkgrnd, back_bins = np.histogram(predictions[~is_signal], bins=score_bins)

# Scale factors applied to the raw bin counts of each sample.
# NOTE(review): the origin of these constants is not visible here - presumably
# a per-sample normalisation; confirm and name them in a config cell.
sig_sf = 10 * 7.38400e-05
back_sf = (10 * 363)

# Re-draw the pre-binned counts: one entry per bin (its left edge), weighted
# by the scaled count of that bin.
plt.hist(x=sig_bins[:-1], bins=sig_bins, weights=sig_sf*signal,
         histtype='step', color='midnightblue', label='signal')
plt.hist(x=back_bins[:-1], bins=back_bins, weights=back_sf*bkgrnd,
         histtype='step', color='firebrick', label='background')

# Log scale, because the two scaled samples differ by orders of magnitude.
plt.yscale('log')
plt.xlabel('Prediction from NN', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)
plt.show()

In [75]:
# Continuous network output for the test split, stored as 64-bit floats
# (unlike `predictions`, this is not thresholded).
prediction_score = model.predict(X_test).astype(np.float64)



In [None]:
# Histogram of the continuous network score for every test event
# (signal and background together).
fig, ax = plt.subplots()
ax.hist(prediction_score, bins=np.linspace(0, 1, 50), histtype='step',
        color='darkgreen', label='All events')
# Axis labels and a frameless legend keep the figure readable on its own.
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Overlay the network-score histograms of true-signal and true-background events.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)

fig, ax = plt.subplots()
ax.hist(prediction_score[is_signal], bins=score_bins,
        histtype='step', color='midnightblue', label='signal')
ax.hist(prediction_score[~is_signal], bins=score_bins,
        histtype='step', color='firebrick', label='background')
# Axis labels and a frameless legend keep the figure readable on its own.
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Bin the continuous network score separately for true-signal and
# true-background events.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)
signal, sig_bins = np.histogram(prediction_score[is_signal], bins=score_bins)
bkgrnd, back_bins = np.histogram(prediction_score[~is_signal], bins=score_bins)

# Scale factors applied to the raw bin counts of each sample.
# NOTE(review): the origin of these constants is not visible here - presumably
# a per-sample normalisation; confirm and name them in a config cell.
sig_sf = 10 * 7.38400e-05
back_sf = (10 * 363)

# Re-draw the pre-binned counts: one entry per bin (its left edge), weighted
# by the scaled count of that bin.
plt.hist(x=sig_bins[:-1], bins=sig_bins, weights=sig_sf*signal,
         histtype='step', color='midnightblue', label='signal')
plt.hist(x=back_bins[:-1], bins=back_bins, weights=back_sf*bkgrnd,
         histtype='step', color='firebrick', label='background')

# Log scale, because the two scaled samples differ by orders of magnitude.
plt.yscale('log')
plt.xlabel('Prediction from NN', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)
plt.show()

In [None]:
# Scan 500 evenly spaced thresholds ("cuts") on the network score and count,
# for each one, how many true-signal and true-background test events score
# strictly above it.
cuts = np.linspace(0, 1, 500)

# The signal/background selection does not depend on the cut, so it is done
# once up front instead of inside the loop.
is_signal = y_test.astype(bool)
signal_scores = prediction_score[is_signal]
background_scores = prediction_score[~is_signal]

nsignal = np.array([np.count_nonzero(signal_scores > cut) for cut in cuts], dtype=float)
nbackground = np.array([np.count_nonzero(background_scores > cut) for cut in cuts], dtype=float)

# Efficiency = selected signal / all signal; purity = selected signal / all selected.
# Cuts that select no events at all (e.g. cut = 1.0) would be 0/0; those points
# are set to NaN explicitly so no RuntimeWarning is raised and matplotlib
# simply skips them. The same applies if the test split contains no signal.
n_true_signal = np.count_nonzero(is_signal)
n_selected = nsignal + nbackground

efficiency = np.full_like(nsignal, np.nan)
if n_true_signal > 0:
    efficiency = nsignal / n_true_signal

purity = np.full_like(nsignal, np.nan)
has_events = n_selected > 0
purity[has_events] = nsignal[has_events] / n_selected[has_events]

# Purity vs. efficiency (a precision-recall style curve, not a ROC curve:
# the background rejection rate is not plotted).
plt.figure()
plt.plot(efficiency, purity, 'o-', color='blueviolet')
# make the plot readable
plt.xlabel('Efficiency', fontsize=12)
plt.ylabel('Purity', fontsize=12)

In [None]:
# Zoomed view of the high-efficiency, high-purity (top-right) corner of the
# purity-vs-efficiency curve, built from the per-cut counts `nsignal` and
# `nbackground`.
n_true_signal = np.count_nonzero(y_test.astype(bool))
n_selected = nsignal + nbackground

# Guard both ratios: cuts that select nothing (0/0) and a signal-free test
# split become NaN points, which matplotlib skips, instead of raising
# RuntimeWarnings.
efficiency = np.full_like(nsignal, np.nan)
if n_true_signal > 0:
    efficiency = nsignal / n_true_signal

purity = np.full_like(nsignal, np.nan)
has_events = n_selected > 0
purity[has_events] = nsignal[has_events] / n_selected[has_events]

plt.figure()
plt.xlim(0.85, 1.0)
plt.ylim(0.85, 1.0)
plt.plot(efficiency, purity, 'o-', color='blueviolet', markersize=1)
plt.xlabel('Efficiency', fontsize=12)
plt.ylabel('Purity', fontsize=12)
# The axes are purity and efficiency, so the figure is titled as such rather
# than as a ROC curve.
plt.title('Purity vs. efficiency (zoomed in at top right)')
plt.show()