# Discover the Higgs with Deep Neural Networks

## Data Preparation

In [45]:
# Necessary imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.random import seed
import common

The goal of this lab course is to train a deep neural network to separate Higgs boson signal from background events. The most important signal sample ggH125_ZZ4lep corresponds to the process gg->H->ZZ. The dominant background sample is llll resulting from Z and ZZ decays.
After training the DNN model will be used to classify the events of the data samples.

Higgs signal samples:
- ggH125_ZZ4lep
- VBFH125_ZZ4lep
- WH125_ZZ4lep
- ZH125_ZZ4lep

Background samples:
- llll
- Zee
- Zmumu
- ttbar_lep

Data samples:
- data_A
- data_B
- data_C
- data_D

In [49]:
# Define the input samples
sample_list_signal = ['ggH125_ZZ4lep']
sample_list_background = ['llll']

In [50]:
# Read all the samples
data_frames = {}
for sample in sample_list_signal + sample_list_background:
    data_frames[sample] = pd.read_csv('input/' + sample + ".csv")

Although the final selection of the data is to be performed on the basis of a DNN, a rough pre-selection of the data is still useful.
Suitable criteria for this are basic selections that must be clearly fulfilled by H->ZZ->llll processes.

Hint: What lepton types and charges are expected in the final state?

In [51]:
def cut_lep_type(lep_type_0, lep_type_1, lep_type_2, lep_type_3):
    # Only keep events like eeee, mumumumu or eemumu
    sum_lep_type = lep_type_0 + lep_type_1 + lep_type_2 + lep_type_3
    return sum_lep_type == 44 or sum_lep_type == 48 or sum_lep_type == 52


def cut_lep_charge(lep_charge_0, lep_charge_1, lep_charge_2, lep_charge_3):
    # Only keep events where the sum of all lepton charges is zero
    sum_lep_charge = lep_charge_0 + lep_charge_1 + lep_charge_2 + lep_charge_3
    return sum_lep_charge == 0

In [52]:
# Apply the 
for sample in sample_list_signal + sample_list_background:
    # Cut on lepton type
    data_frames[sample] = data_frames[sample][np.vectorize(cut_lep_type)(
        data_frames[sample].lep1_pdgId,
        data_frames[sample].lep2_pdgId,
        data_frames[sample].lep3_pdgId,
        data_frames[sample].lep4_pdgId)]

    # Cut on lepton charge
    data_frames[sample] = data_frames[sample][np.vectorize(cut_lep_charge)(
        data_frames[sample].lep1_charge,
        data_frames[sample].lep2_charge,
        data_frames[sample].lep3_charge,
        data_frames[sample].lep4_charge)]

In [53]:
# Merge the signal and background data frames
def merge_data_frames(sample_list, data_frames_dic):
    for sample in sample_list:
        if sample == sample_list[0]:
            output_data_frame = data_frames_dic[sample]
        else:
            output_data_frame = pd.contact(output_data_frame, data_frames_dic[sample])
    return output_data_frame

data_frame_signal = merge_data_frames(sample_list_signal, data_frames)
data_frame_background = merge_data_frames(sample_list_background, data_frames)

AttributeError: module 'pandas' has no attribute 'contact'

In [None]:
variables = []
variables.append({'variable': 'lep1_phi', 'bins': np.linspace(-3.4, 3.4, 100), 'xlabel': '$lep_{phi}$[1] [GeV]'})
variables.append({'variable': 'lep1_eta', 'bins': np.linspace(-3.4, 3.4, 100), 'xlabel': '$lep_{eta}$[1] [GeV]'})
variables.append({'variable': 'lep1_pt', 'bins': np.linspace(0, 200, 100), 'xlabel': '$lep_{pt}$[1] [GeV]'})
variables.append({'variable': 'lep2_pt', 'bins': np.linspace(0, 150, 75), 'xlabel': '$lep_{pt}$[2] [GeV]'})
variables.append({'variable': 'lep3_pt', 'bins': np.linspace(0, 150, 75), 'xlabel': '$lep_{pt}$[3] [GeV]'})
variables.append({'variable': 'lep4_pt', 'bins': np.linspace(0, 100, 50), 'xlabel': '$lep_{pt}$[4] [GeV]'})
variables.append({'variable': 'lep_m_llll', 'bins': np.linspace(0, 400, 100), 'xlabel': '$m_{llll}$ [GeV]'})
variables.append({'variable': 'lep_m_ll_12', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[1,2] [GeV]'})
variables.append({'variable': 'lep_m_ll_13', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[1,3] [GeV]'})
variables.append({'variable': 'lep_m_ll_14', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[1,4] [GeV]'})
variables.append({'variable': 'lep_m_ll_23', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[2,3] [GeV]'})
variables.append({'variable': 'lep_m_ll_24', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[2,4] [GeV]'})
variables.append({'variable': 'lep_m_ll_34', 'bins': np.linspace(0, 200, 100), 'xlabel': '$m_{ll}$[3,4] [GeV]'})
for var in variables:
    common.plot_hist(var, data_frame_signal, data_frame_background)

In [54]:
# This is not correct!!!!!
for sample in sample_list_signal + sample_list_background:
    print(sample, np.mean(data_frames[sample].weight), len(data_frames[sample].weight))

ggH125_ZZ4lep 24.769128960170892 161451
data_A 0.0 27
data_B 0.0 86
data_C 0.0 146
data_D 0.0 248
llll 0.3029653353227194 523957


In [24]:
# The training input variables
input_variable_list = ['lep1_pt', 'lep2_pt', 'lep3_pt', 'lep4_pt']
#input_variable_list = ['lep_m_ll_12', 'lep_m_ll_13', 'lep_m_ll_14', 'lep_m_ll_23', 'lep_m_ll_24', 'lep_m_ll_34']

In [25]:
# Create the training input
input_values = []
input_classification = []
for sample in sample_list_signal + sample_list_background:
    # Classify signal and background (and skip if data)
    if sample in sample_list_signal:
        # 1 if signal
        input_classification.append(np.ones(len(data_frames[sample])))
    elif sample in sample_list_background:
        # 0 if background
        input_classification.append(np.zeros(len(data_frames[sample])))
    else:
        continue
    input_values.append(data_frames[sample][input_variable_list])

# Merge the input
input_values = np.concatenate(input_values)
input_classification = np.concatenate(input_classification)

In [26]:
np.random.seed(seed=420)

def split_indices(df, train_frac, test_frac, val_frac):
    #assert train_frac + val_frac + test_frac <= 1
    random_values = np.random.rand(len(df))
    range1 = random_values <= train_frac
    range2 = random_values <= train_frac + test_frac
    range3 = random_values <= train_frac + test_frac + val_frac
    train_index = range1
    test_index = ~range1 & range2
    val_index = ~range2 & range3
    return train_index, test_index, val_index

train_index, test_index, val_index = split_indices(input_values, 0.8, 0.1, 0.1)

In [27]:
train_input_values = input_values[train_index]
train_input_classification = input_classification[train_index]
test_input_values = input_values[test_index]
test_input_classification = input_classification[test_index]
val_input_values = input_values[val_index]
val_input_classification = input_classification[val_index]

In [28]:
import tensorflow as tf

In [36]:
model = tf.keras.models.Sequential([
  #tf.keras.layers.Flatten(input_shape=[len(input_variable_list)]),
  tf.keras.layers.Dense(50, activation='relu'),
  tf.keras.layers.Dense(50, activation='relu'),
  #tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

In [37]:
predictions = model(train_input_values).numpy()

In [38]:
#tf.nn.softmax(predictions).numpy()

In [39]:
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [40]:
#loss_fn(train_input_classification, predictions).numpy()

In [41]:
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

In [42]:
model.fit(train_input_values, train_input_classification, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14b3e2b50>

In [43]:
model.evaluate(test_input_values,  test_input_classification, verbose=2)

2129/2129 - 4s - loss: 0.3464 - accuracy: 0.8371 - 4s/epoch - 2ms/step


[0.3464040756225586, 0.8371140360832214]

In [44]:
test_prediction = model.predict(test_input_values)
print(test_prediction)

[[0.1651726 ]
 [0.00873801]
 [0.7357539 ]
 ...
 [0.47237718]
 [0.03232962]
 [0.34834814]]
