# Discover the Higgs with Deep Neural Networks
# Chapter 6: Cross-Validation

In this chapter the concept of cross-validation is introduced to evaluate the network performance.

In [None]:
# Necessary imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.random import seed
import os

# Import the tensorflow module to create a neural network
import tensorflow as tf
from tensorflow.data import Dataset

# Import function to split data into train and test data
from sklearn.model_selection import train_test_split

# Import some common functions created for this notebook
import common

# Random state
# Seed the Python, NumPy and TensorFlow global random generators in one call.
# Merely constructing a RandomState object and discarding it seeds nothing, so
# weight initialisation would differ from run to run.
random_state = 21
tf.keras.utils.set_random_seed(random_state)

## Data Preparation

### Load the Data

In [None]:
# Names of the input samples, split into signal and background
sample_list_signal = [
    'ggH125_ZZ4lep',
    'VBFH125_ZZ4lep',
    'WH125_ZZ4lep',
    'ZH125_ZZ4lep',
]
sample_list_background = [
    'llll',
    'Zee',
    'Zmumu',
    'ttbar_lep',
]

In [None]:
# Directory holding one CSV file per sample
sample_path = 'input'
# Load every signal and background sample into a dict keyed by the sample name
no_selection_data_frames = {}
for sample in [*sample_list_signal, *sample_list_background]:
    csv_file = os.path.join(sample_path, sample + '.csv')
    no_selection_data_frames[sample] = pd.read_csv(csv_file)

### Event Pre-Selection

Import the pre-selection functions saved during the first chapter. If the modules are not found, solve and execute the notebook of the first chapter.

In [None]:
from functions.selection_lepton_charge import selection_lepton_charge
from functions.selection_lepton_type import selection_lepton_type

In [None]:
# Create a copy of the original data frame to investigate later.
# This is a shallow dict copy: the loop below rebinds the entries of
# data_frames to new filtered frames, so no_selection_data_frames keeps
# referring to the unfiltered ones.
data_frames = no_selection_data_frames.copy()

# Element-wise versions of the selection functions. The output type is given
# explicitly because np.vectorize cannot infer it from zero events and would
# raise a ValueError for an empty sample (e.g. when the first selection has
# removed every event). The results are used as boolean row masks.
vectorized_type_selection = np.vectorize(selection_lepton_type, otypes=[bool])
vectorized_charge_selection = np.vectorize(selection_lepton_charge, otypes=[bool])

# Apply the chosen selection criteria
for sample in sample_list_signal + sample_list_background:
    # Selection on lepton type
    type_selection = vectorized_type_selection(
        data_frames[sample].lep1_pdgId,
        data_frames[sample].lep2_pdgId,
        data_frames[sample].lep3_pdgId,
        data_frames[sample].lep4_pdgId)
    data_frames[sample] = data_frames[sample][type_selection]

    # Selection on lepton charge, applied to the events that passed the
    # lepton type selection
    charge_selection = vectorized_charge_selection(
        data_frames[sample].lep1_charge,
        data_frames[sample].lep2_charge,
        data_frames[sample].lep3_charge,
        data_frames[sample].lep4_charge)
    data_frames[sample] = data_frames[sample][charge_selection]

### Get Training and Test Data

In [None]:
# Hand 0.6 to the splitting helper so that 40% of the data is kept for testing
split_result = common.split_data_frames(data_frames, 0.6)
train_data_frames, test_data_frames = split_result

Import the reweighting function to train with event weights. If the module is not found, solve and execute the notebook of chapter 5.

In [None]:
from functions.reweight_weights import reweight_weights

## Cross-Validation

If you have a look at the training history, you see fluctuations in the validation loss. Furthermore, the validation dataset has a limited size, so it is potentially not fully representative.

So how should one evaluate the performance of a model and compare two models?

A commonly used method to evaluate the model performance is k-fold cross-validation.
The training data is split several times with non-overlapping validation sets. On each split a model is trained and validated on the corresponding validation data. This results in several independently trained models of the same size and setup, each validated on a different dataset. Thus, one is able to calculate the mean performance of the resulting models.

<div>
<img src='figures/kFold.png' width='400'/>
</div>

In [None]:
# The training input variables: the pt column of each of the four leptons
training_variables = [f'lep{lepton_idx}_pt' for lepton_idx in range(1, 5)]

In [None]:
# Build the DNN input from the training data frames and unpack it into the
# values, the weights, and the classification of the data
dnn_input = common.get_dnn_input(
    train_data_frames,
    training_variables,
    sample_list_signal,
    sample_list_background,
)
values, weights, classification = dnn_input

Use KFold to split the randomly shuffled data into 3 folds, so that each split uses 2/3 of the data for training and 1/3 for validation.

In [None]:
# Import the kFold module for cross-validation
from sklearn.model_selection import KFold

In [None]:
# K-fold cross validator: three shuffled splits, seeded for reproducibility
kfold = KFold(
    n_splits=3,
    shuffle=True,
    random_state=random_state,
)

Now use KFold to train several models in a for loop. In each iteration you have to create a new model and train it. This results in three models trained on overlapping training sets and validated on non-overlapping validation sets.

<font color='blue'>
Task:

Fill the missing parts in the for loop:
- Reweight the weights
- Convert the values and classification into tensorflow datasets
- Create a model with normalization layer and 2 hidden layers with 60 nodes each
- Compile the model
- Train the model with early stopping
- Evaluate the model on the training and validation data
</font><br>

In [None]:
# Loss function: binary cross-entropy evaluated on probabilities, not logits
loss_fn = tf.keras.losses.BinaryCrossentropy(
    from_logits=False,
)
# Optimizer: legacy Adam with the hyperparameters collected in one place
adam_settings = {'learning_rate': 0.0002, 'beta_1': 0.9}
adam_optimizer = tf.keras.optimizers.legacy.Adam(**adam_settings)

In [None]:
# Early stopping: monitor val_loss with a patience of 5 epochs and restore the
# best weights once the training is stopped
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train one model per cross-validation fold and record its training history
# and its loss/accuracy on the fold's training and validation data.

# Store the models and their training history
kfold_history = []
kfold_model = []
# Store the evaluation on training and validation data
kfold_train_eval_loss = []
kfold_train_eval_acc = []
kfold_val_eval_loss = []
kfold_val_eval_acc = []
# Number of events per batch, shared by the training and validation datasets
batch_size = 124
for split_idx, (train_indices, val_indices) in enumerate(kfold.split(values), start=1):
    print(f'Use fold {split_idx}')
    # Get train and validation data
    train_values = values[train_indices]
    train_classification = classification[train_indices]
    train_weights = weights[train_indices]
    val_values = values[val_indices]
    val_classification = classification[val_indices]
    val_weights = weights[val_indices]
    # Get reweighted weights (training and validation part reweighted separately)
    train_weights_reweighted = reweight_weights(train_weights, train_classification)
    val_weights_reweighted = reweight_weights(val_weights, val_classification)
    # Get train and validation datasets; the third tuple element is passed to
    # Keras as per-event sample weight
    train_data = Dataset.from_tensor_slices((train_values, train_classification, train_weights_reweighted))
    train_data = train_data.shuffle(len(train_data), seed=random_state)
    train_data = train_data.batch(batch_size)
    val_data = Dataset.from_tensor_slices((val_values, val_classification, val_weights_reweighted))
    val_data = val_data.shuffle(len(val_data), seed=random_state)
    val_data = val_data.batch(batch_size)

    # Normalization layer, adapted to the training part of this fold only
    normalization_layer = tf.keras.layers.Normalization()
    normalization_layer.adapt(train_values)
    # Create a simple NN
    model_layers = [
        normalization_layer,
        tf.keras.layers.Dense(60, activation='relu'),
        tf.keras.layers.Dense(60, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.models.Sequential(model_layers)
    # Compile model with a fresh optimizer per fold. An optimizer instance
    # carries state (step counter, Adam moment estimates), so sharing one
    # instance would let the folds influence each other. The clone reuses the
    # configuration of adam_optimizer, which stays the single place where the
    # hyperparameters are defined.
    fold_optimizer = adam_optimizer.__class__.from_config(adam_optimizer.get_config())
    model.compile(optimizer=fold_optimizer, loss=loss_fn, weighted_metrics=['binary_accuracy'])

    # Train model
    history = model.fit(train_data, validation_data=val_data, callbacks=[early_stopping], epochs=1000)

    # Append to list
    kfold_history.append(history)
    kfold_model.append(model)

    # Evaluate model on training and validation data; the returned list holds
    # the loss first and the weighted binary accuracy second
    model_train_evaluation = model.evaluate(train_data)
    model_val_evaluation = model.evaluate(val_data)
    kfold_train_eval_loss.append(model_train_evaluation[0])
    kfold_train_eval_acc.append(model_train_evaluation[1])
    kfold_val_eval_loss.append(model_val_evaluation[0])
    kfold_val_eval_acc.append(model_val_evaluation[1])

Let's plot the training history of the three models.

In [None]:
# Plot the training history: one color per fold, solid line for the training
# loss and dashed line for the validation loss
fig, ax = plt.subplots(figsize=(7, 6))
color_list = ['r', 'g', 'b']
# Number the folds from 1, like the printout during the training, and reuse
# the colors cyclically so that no fold is silently dropped if more folds
# than colors are present
for k_fold_idx, history in enumerate(kfold_history, start=1):
    color = color_list[(k_fold_idx - 1) % len(color_list)]
    ax.plot(history.history['loss'], color=color, label=f'{k_fold_idx} training')
    ax.plot(history.history['val_loss'], color=color, ls='--', label=f'{k_fold_idx} val')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend()
_ = plt.show()

<font color='blue'>
Task:

Calculate the mean and std of the validation loss.
</font>

In [None]:
# Mean and standard deviation of the validation loss over all folds
fold_val_losses = np.asarray(kfold_val_eval_loss)
val_loss_mean = fold_val_losses.mean()
val_loss_std = fold_val_losses.std()
print(f'The val loss of the model is {round(val_loss_mean, 3)} +- {round(val_loss_std, 3)}')

## Save and Load a Model

Let's save the three models of the cross-validation.

In [None]:
# Save every cross-validation model under its own numbered path
for idx, model in enumerate(kfold_model):
    model_path = f'models/chapter6_model_crossval{idx}'
    model.save(model_path)