Neural Network

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import EarlyStopping

Load the dataset and split it into training, test and validation data

In [None]:
# Load the dataset
# NOTE(review): unpickling can execute arbitrary code - only load 'dataset.pkl'
# from a trusted source.
data = pd.read_pickle('dataset.pkl')

# Extract the validation data from the dataset: rows with run >= 97 are held out,
# rows with run <= 96 remain for training/testing. The 'run' column is only used
# for this split and is dropped from both parts.
val_data = data.loc[data.run >= 97].drop('run', axis=1)
data = data.loc[data.run <= 96].drop('run', axis=1)

# Split the remaining dataset into training and test data.
# The split is positional (no shuffling): the first (1 - test_size) share of the
# rows becomes training data, the rest test data.
test_size = 0.25
split_index = int(len(data) * (1 - test_size))
train_complete = data[:split_index]
test_complete = data[split_index:]

# Method to drop the less important features
def drop_less_important(data_temp):
    """Return `data_temp` without the columns listed as less important.

    Columns from the list that are not present in `data_temp` are silently
    skipped (errors='ignore'). The input frame itself is not modified.
    """
    less_important = [
        'clusterAbsZernikeMoment40_1', 'clusterAbsZernikeMoment40_2',
        'clusterAbsZernikeMoment51_1', 'clusterAbsZernikeMoment51_2',
        'clusterLAT_1', 'clusterLAT_2',
        'phi_1', 'phi_2',
        'b2bPhi_1', 'b2bPhi_2',
        'cosAngleBetweenMomentumAndVertexVectorInXYPlane_1',
        'cosAngleBetweenMomentumAndVertexVectorInXYPlane_2',
        'pRecoilPhi_1', 'pRecoilPhi_2',
    ]
    reduced = data_temp.drop(less_important, axis=1, errors='ignore')
    return reduced

# Method to get data. The dataset can be reduced by a factor and the less important features can be dropped.
def get_data(factor=1, important_drop=False):
    """Return the (train, test, validation) frames, optionally reduced.

    factor:         each frame is cut to its first len(frame) // factor rows.
    important_drop: if True, drop_less_important() is applied to every frame.

    The resulting lengths are printed before the three frames are returned
    as a tuple (train, test, val).
    """
    # Take the leading 1/factor share of each module-level frame
    subsets = [
        frame[:(len(frame) // factor)]
        for frame in (train_complete, test_complete, val_data)
    ]

    if important_drop:
        subsets = [drop_less_important(frame) for frame in subsets]

    train, test, val = subsets
    print('Length trainings data:', len(train), 'Length test data:', len(test), 'Length validation data:', len(val), '\n')
    return train, test, val

def get_columns():
    """Return the column index of the module-level validation frame `val_data`."""
    columns = val_data.columns
    return columns

# Preview the first 10 rows of the remaining data (runs <= 96, 'run' column dropped)
data.head(10)

Create the model and train it

In [None]:
# Get the training, test and validation data
factor = 1
train, test, val = get_data(factor=factor, important_drop=False)

# Dropout rate intended for a 'Dropout' layer
# NOTE(review): the model below contains no Dropout layer, so this value is
# currently unused - confirm whether a Dropout layer was meant to be added.
rate = 0.2

# Seed TensorFlow's random generator for reproducible weight initialisation.
# TF 2.x exposes this as tf.random.set_seed, TF 1.x as tf.random.set_random_seed.
set_tf_seed = getattr(tf.random, 'set_seed', None) or tf.random.set_random_seed
set_tf_seed(42)

# Every column except the label 'y' is fed to the network as an input feature
n_features = len(val.columns) - 1

# Create the model: three tanh hidden layers and one sigmoid output unit
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(n_features,)),
    keras.layers.Dense(512, activation='tanh'),
    keras.layers.Dense(128, activation='tanh'),
    keras.layers.Dense(32, activation='tanh'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


# Set the parameters for the training
num_of_epochs = 100
batch_size = 1024

# Use early stopping to interrupt the training process when the validation loss
# has not improved for `patience` consecutive epochs.
# The callback is taken from tensorflow.keras (the same package the model is built
# with) instead of the standalone `keras` package, since mixing the two can fail.
# NOTE(review): the weights of the last epoch are kept, not those of the best
# epoch - consider restore_best_weights=True if the installed version supports it.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# Train the network; the held-out validation runs are used to monitor val_loss
train_history = model.fit(
    train.drop('y', axis=1),
    train.y,
    epochs=num_of_epochs,
    verbose=1,
    batch_size=batch_size,
    validation_data=(val.drop('y', axis=1), val.y),
    callbacks=[es],
)

Plot the accuracy and loss during training, evaluate the model, and print the F1-score and the best accuracy on the validation dataset over the whole training session

In [None]:
def plot_training(loss, acc):
    """Plot a loss history and an accuracy history in two stacked panels.

    loss: sequence of loss values, drawn in the upper panel.
    acc:  sequence of accuracy values, drawn in the lower panel.

    Both panels share the x axis, which is labelled 'Epoch Index'.
    The figure is shown immediately.
    """
    _, axes = plt.subplots(2, sharex=True, figsize=(12,8))

    panels = (('Loss', loss), ('Accuracy', acc))
    for axis, (label, series) in zip(axes, panels):
        axis.set_ylabel(label)
        axis.plot(series, '-')

    # Only the bottom panel carries the shared x label
    axes[-1].set_xlabel('Epoch Index')

    plt.tight_layout()
    plt.show()

def _history_series(history, *keys):
    """Return the first series stored in `history` under any of `keys`.

    Keras records the accuracy metric as 'acc' in older versions and as
    'accuracy' in newer ones, so both spellings are tried.
    """
    for key in keys:
        if key in history:
            return history[key]
    raise KeyError('None of the keys {} found in the training history'.format(keys))


def _report_metrics(name, frame):
    """Print accuracy, F1-score and precision of `model` on `frame`.

    name:  label used in the printed lines (e.g. 'test').
    frame: DataFrame holding the feature columns and the label column 'y'.

    Returns the predicted 0/1 labels as a flat integer array.
    """
    probabilities = model.predict(frame.drop('y', axis=1))
    # Threshold the sigmoid outputs at 0.5 in one vectorised step and flatten the
    # (n, 1) prediction array so it matches the shape of the label column.
    labels = (probabilities > 0.5).astype(int).ravel()
    print('\nAccuracy on {} data:'.format(name), accuracy_score(frame.y, labels))
    print('F1-score on {} data:'.format(name), f1_score(frame.y, labels))
    print('Precision on {} data:'.format(name), precision_score(frame.y, labels))
    return labels


history = train_history.history
val_acc_history = _history_series(history, 'val_acc', 'val_accuracy')

# Training curves first, validation curves second
plot_training(history['loss'], _history_series(history, 'acc', 'accuracy'))
plot_training(history['val_loss'], val_acc_history)

# Print accuracy, F1-score and precision for the test and validation data
y_pred = _report_metrics('test', test)
y_pred = _report_metrics('validation', val)

print('\nHighest accuracy on the validation dataset during the trainings process:')
print(max(val_acc_history))

Plot the Precision-Recall curve and compute AUC score

In [None]:
# Sigmoid outputs of the network for the validation samples
y_pred = model.predict(val.drop('y', axis=1))

# Precision/recall pairs over the decision thresholds and the area under that curve
precision, recall, thresholds = precision_recall_curve(val.y, y_pred)
auc_score = auc(recall, precision)
print('Area under curve:', auc_score)

# Draw the precision-recall curve on axes fixed to the unit square
fig, ax = plt.subplots()
ax.axis([0, 1, 0, 1])
ax.plot(recall, precision, linestyle='-', label='Neural Network')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.show()

Save and load the training history and the predictions during testing and validation

In [None]:
filename = 'models/model_9999_data.npy'

# Create the target directory if needed so the save cannot fail after training
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Pack [test predictions, validation predictions, training history] into an
# explicit 1-D object array. Passing the ragged list straight to np.save relies
# on implicit object-array creation, which is deprecated and raises a ValueError
# on recent NumPy versions.
# NOTE: this rebinds `data`, which earlier held the DataFrame; the name is kept so
# that code indexing data[0], data[1], data[2] keeps working.
data = np.empty(3, dtype=object)
data[0] = model.predict(test.drop('y', axis=1))
data[1] = model.predict(val.drop('y', axis=1))
data[2] = train_history.history

# The object array is stored via pickle; load it with np.load(filename, allow_pickle=True)
np.save(filename, data)