In [None]:
%pip install pydot
%pip install graphviz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.utils.vis_utils import plot_model
import keras_tuner

from sklearn.model_selection import RandomizedSearchCV, train_test_split

%matplotlib inline

## Preprocess Data

In [None]:
# Read the event table from the CSV file located in the working directory.
csv_path = 'full_data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Build the column order "all features first, 'label' last".
# The label column is located by name instead of by a fixed position, so
# running this cell again after `data` has already been reordered leaves the
# order unchanged instead of moving whichever feature sits at that position.
label_column = 'label'
if label_column not in data.columns:
    raise KeyError(f"data has no '{label_column}' column")

all_columns = data.columns.to_list()
print(len(all_columns))

# every column name except the label, in the original order
headings_without_label = [name for name in all_columns if name != label_column]
print(len(headings_without_label))

headings = headings_without_label + [label_column]
print(headings)

In [None]:
# Rearrange the columns of the frame into the order given by `headings`.
data = data.loc[:, headings]

In [None]:
# Show the frame as the cell output for a visual check.
data

In [None]:
# data.drop(columns=data.columns[0], axis=1,  inplace=True)

# data

In [None]:
# Keep only the events whose scoresum_g value is above 1.7.
# NOTE(review): this subset is not referenced by any later cell visible here.
high_glu_mask = data['scoresum_g'] > 1.7
data_high_glu = data[high_glu_mask]

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# The fixed random_state makes the shuffle -- and therefore the
# train / validation / test split cut from it -- identical on every run.
shuf_data = data.sample(frac=1, random_state=42)

In [None]:
# Row indices that cut the shuffled events into train / validation / test:
# the first 70 % train, the next 15 % validate, the remainder is the test set.
# Note that no_val is the row index where the validation slice ENDS
# (training rows + validation rows), not the number of validation rows.
no_events = len(shuf_data)
no_training = int(no_events * 0.7)
no_val = int(no_training + no_events * 0.15)

In [None]:
# Convert the shuffled frame to a NumPy array so it can be sliced by position.
dataset = shuf_data.to_numpy()

In [None]:
# Features are all columns except the last one; the target is the last column
# (the column reordering earlier in the notebook puts 'label' last).
# Slicing relative to the end avoids hard-coding the number of feature columns.
X = dataset[:, :-1]
y = dataset[:, -1]

In [None]:
# Cut features and targets with the same three row ranges so that the
# train / validation / test subsets stay aligned row by row.
train_rows = slice(None, no_training)
val_rows = slice(no_training, no_val)
test_rows = slice(no_val, None)

X_train, X_val, X_test = (X[rows] for rows in (train_rows, val_rows, test_rows))
y_train, y_val, y_test = (y[rows] for rows in (train_rows, val_rows, test_rows))

## Evaluating Hyperparameters

In [None]:
def build_model(hp):
    """Build and compile a binary classifier from tunable hyperparameters.

    Values requested from ``hp``:

    * ``num_layers`` -- number of hidden Dense layers, between 1 and 3
    * ``units{i}``   -- width of hidden layer ``i``, 10 to 80 in steps of 10
    * ``activation`` -- 'relu' or 'tanh'; the same name is requested for every
      hidden layer
    * ``dropout``    -- whether a Dropout layer (rate 0.2) follows the hidden
      layers

    Args:
        hp: object providing ``Int``, ``Choice`` and ``Boolean``
            (a ``keras_tuner.HyperParameters`` in this notebook).

    Returns:
        The Sequential model, ending in a single sigmoid unit and compiled with
        binary cross-entropy loss, the 'adam' optimizer and an accuracy metric.
    """
    model = Sequential()

    # tuning number of layers:
    depth = hp.Int('num_layers', 1, 3)
    for layer_idx in range(depth):
        width = hp.Int(f'units{layer_idx}', min_value=10, max_value=80, step=10)
        nonlinearity = hp.Choice('activation', ['relu', 'tanh'])
        hidden = Dense(units=width, activation=nonlinearity)
        model.add(hidden)

    # optional dropout after the last hidden layer
    use_dropout = hp.Boolean('dropout')
    if use_dropout:
        model.add(Dropout(rate=0.2))

    # single-unit output layer
    model.add(Dense(1, activation='sigmoid'))
    # Add learning rate?

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model
        

In [None]:
# Smoke test: build one model from a fresh HyperParameters object;
# the returned model is shown as the cell output.
build_model(keras_tuner.HyperParameters())

In [None]:
# Random search over the build_model search space, scored on validation
# accuracy: five trials with one execution each. Results are written under
# directory/project_name.
search_settings = dict(
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    overwrite=True,
    directory='./NN_traing_res',
    project_name='res_1',
)
tuner = keras_tuner.RandomSearch(hypermodel=build_model, **search_settings)

In [None]:
# Print the tuner's summary of the hyperparameter search space.
tuner.search_space_summary()

In [None]:
# Run the hyperparameter search: every trial is fitted on the training split
# and the validation split is passed along as validation_data.
tuner.search(
    X_train,
    y_train,
    epochs=100,
    batch_size=500,
    validation_data=(X_val, y_val),
)

In [None]:
# Print the tuner's summary of the completed trials.
tuner.results_summary()

## Define Model With Best Hyperparams

In [None]:
# define the keras model: three ReLU hidden layers (40, 10 and 10 units)
# followed by a single sigmoid output unit.
# NOTE(review): presumably these sizes come from the tuner results -- confirm.
model = Sequential([
    Dense(40, activation='relu', name='ReLu-1'),
    Dense(10, activation='relu', name='ReLu-2'),
    Dense(10, activation='relu', name='ReLu-3'),
    Dense(1, activation='sigmoid', name='Sigmoid'),
])

In [None]:
# Same loss / optimizer / metric settings as the models built in build_model.
compile_settings = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)

In [None]:
# plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

## Train and Evaluate Model

In [None]:
# Train the final model. The held-out validation split is passed as
# validation_data so that Keras reports val_loss / val_accuracy after every
# epoch and over-fitting during the 100 epochs becomes visible; the validation
# rows are only evaluated, never trained on.
# The returned History object is kept so the curves can be inspected later.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=500,
    validation_data=(X_val, y_val),
)

In [None]:
# evaluate the keras model on the test split; the model was compiled with a
# single metric, so the result is [loss, accuracy]
test_metrics = model.evaluate(X_test, y_test)
accuracy = test_metrics[1]
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Network outputs for the test split (one row per event).
predictions = model.predict(X_test)
# round predictions: first (only) output value of every row, rounded to an integer
rounded = list(map(lambda row: round(row[0]), predictions))

In [None]:
# make class predictions with the model: 1 where the output exceeds 0.5, else 0.
# This overwrites `predictions` with integer class labels.
test_outputs = model.predict(X_test)
predictions = (test_outputs > 0.5).astype(int)

In [None]:
# Print the first few test events: input features => predicted class (true label).
# `predictions` holds one column per event (the model ends in a single output
# unit), so the scalar is taken with [i, 0]; handing the 1-element row to %d
# relies on NumPy's deprecated implicit array-to-scalar conversion.
# min() keeps the loop valid when the test split has fewer than five events.
for i in range(min(5, len(X_test))):
    print('%s => %d (expected %d)' % (X_test[i].tolist(), predictions[i, 0], y_test[i]))

## Plot Predictions

In [None]:
# plot all predictions (both signal and background)
# NOTE: if the thresholding cell above has run, `predictions` holds 0/1 class
# labels, so only the outermost bins are populated.
score_bins = np.linspace(0, 1, 50)
fig, ax = plt.subplots()
ax.hist(predictions, bins=score_bins, histtype='step', color='darkgreen', label='All events')
# make the plot readable
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# plot signal and background separately
is_signal = y_test.astype(bool)   # True for events whose label is non-zero
score_bins = np.linspace(0, 1, 50)

fig, ax = plt.subplots()
ax.hist(predictions[is_signal], bins=score_bins,
        histtype='step', color='midnightblue', label='signal')
ax.hist(predictions[~is_signal], bins=score_bins,
        histtype='step', color='firebrick', label='background')
# make the plot readable
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Bin the signal and background predictions, then draw both histograms with
# their per-bin counts multiplied by a per-sample scale factor, on a log y-axis.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)
signal, sig_bins = np.histogram(predictions[is_signal], bins=score_bins)
bkgrnd, back_bins = np.histogram(predictions[~is_signal], bins=score_bins)

# NOTE(review): presumably per-sample normalisation weights -- confirm their origin.
sig_sf = 10 * 7.38400e-05
back_sf = 10 * 363

# The left bin edges serve as x values and the scaled counts as weights, which
# redraws the pre-binned histograms.
step_style = dict(histtype='step')
plt.hist(sig_bins[:-1], sig_bins, weights=sig_sf*signal,
         color='midnightblue', label='signal', **step_style)
plt.hist(back_bins[:-1], back_bins, weights=back_sf*bkgrnd,
         color='firebrick', label='background', **step_style)

plt.yscale('log')
plt.xlabel('Prediction from NN', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)
plt.show()

In [None]:
# Continuous (un-thresholded) network outputs for the test split, cast to float.
raw_outputs = model.predict(X_test)
prediction_score = raw_outputs.astype(float)

In [None]:
# plot all predictions (both signal and background), using the continuous scores
score_bins = np.linspace(0, 1, 50)
fig, ax = plt.subplots()
ax.hist(prediction_score, bins=score_bins, histtype='step', color='darkgreen', label='All events')
# make the plot readable
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# plot signal and background separately, using the continuous scores
is_signal = y_test.astype(bool)   # True for events whose label is non-zero
score_bins = np.linspace(0, 1, 50)

fig, ax = plt.subplots()
ax.hist(prediction_score[is_signal], bins=score_bins,
        histtype='step', color='midnightblue', label='signal')
ax.hist(prediction_score[~is_signal], bins=score_bins,
        histtype='step', color='firebrick', label='background')
# make the plot readable
ax.set_xlabel('Prediction from NN', fontsize=12)
ax.set_ylabel('Events', fontsize=12)
ax.legend(frameon=False)

In [None]:
# Bin the signal and background scores, then draw both histograms with their
# per-bin counts multiplied by a per-sample scale factor, on a log y-axis.
is_signal = y_test.astype(bool)
score_bins = np.linspace(0, 1, 50)
signal, sig_bins = np.histogram(prediction_score[is_signal], bins=score_bins)
bkgrnd, back_bins = np.histogram(prediction_score[~is_signal], bins=score_bins)

# NOTE(review): presumably per-sample normalisation weights -- confirm their origin.
sig_sf = 10 * 7.38400e-05
back_sf = (10 * 363)

# The left bin edges serve as x values and the scaled counts as weights, which
# redraws the pre-binned histograms.
step_style = dict(histtype='step')
plt.hist(sig_bins[:-1], sig_bins, weights=sig_sf*signal,
         color='midnightblue', label='signal', **step_style)
plt.hist(back_bins[:-1], back_bins, weights=back_sf*bkgrnd,
         color='firebrick', label='background', **step_style)

plt.yscale('log')
plt.xlabel('Prediction from NN', fontsize=12)
plt.ylabel('Events', fontsize=12)
plt.legend(frameon=False)
plt.show()

In [None]:
# choose score cuts:
cuts = np.linspace(0, 1, 500)

# The label mask and the two score populations do not depend on the cut, so
# they are built once instead of once per loop iteration.
is_signal = y_test.astype(bool)
signal_scores = prediction_score[is_signal]
background_scores = prediction_score[~is_signal]

# number of signal / background test events with a score above each cut
nsignal = np.array([(signal_scores > cut).sum() for cut in cuts], dtype=float)
nbackground = np.array([(background_scores > cut).sum() for cut in cuts], dtype=float)

# efficiency: fraction of all signal events that pass the cut
# (left as NaN when the test split contains no signal at all)
n_signal_total = len(signal_scores)
if n_signal_total:
    efficiency = nsignal / n_signal_total
else:
    efficiency = np.full(len(cuts), np.nan)

# purity: signal fraction among the events that pass the cut. It is undefined
# where nothing passes (always the case for the last cut, 1.0), so those
# entries stay NaN instead of triggering a 0/0 RuntimeWarning; matplotlib
# skips NaN points.
n_selected = nsignal + nbackground
has_events = n_selected > 0
purity = np.full(len(cuts), np.nan)
purity[has_events] = nsignal[has_events] / n_selected[has_events]

# plot purity vs. efficiency (a precision-recall style curve, not a ROC curve)
plt.figure()
plt.plot(efficiency, purity, 'o-', color='blueviolet')
# make the plot readable
plt.xlabel('Efficiency', fontsize=12)
plt.ylabel('Purity', fontsize=12)

In [None]:
# Zoomed-in view of the high-efficiency / high-purity (top-right) corner of
# the purity vs. efficiency curve.

# efficiency: fraction of all signal test events that pass each cut
# (left as NaN when the test split contains no signal at all)
n_signal_total = int(y_test.astype(bool).sum())
if n_signal_total:
    efficiency = nsignal / n_signal_total
else:
    efficiency = np.full(len(nsignal), np.nan)

# purity: signal fraction among the selected events; NaN where no event passes
# the cut, which avoids the 0/0 RuntimeWarning (matplotlib skips NaN points).
n_selected = nsignal + nbackground
has_events = n_selected > 0
purity = np.full(len(nsignal), np.nan)
purity[has_events] = nsignal[has_events] / n_selected[has_events]

plt.figure()
plt.xlim(0.85, 1.0)
plt.ylim(0.85, 1.0)
# plt.plot([0, 1], [0, 1], 'k--')
plt.plot(efficiency, purity, 'o-', color='blueviolet', markersize=1)
plt.xlabel('Efficiency', fontsize=12)
plt.ylabel('Purity', fontsize=12)
# The axes are purity and efficiency, so the title names them rather than
# calling this a ROC curve.
plt.title('Purity vs. efficiency (zoomed in at top right)')
plt.show()