# Clean DNN version

### Load data

In [1]:
%pip install imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import ROOT;
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.utils import shuffle;
from sklearn.model_selection import train_test_split;


# Names of the feature columns handed to the network, in the order in which
# they appear in the input arrays.
input_columns = [
    # derived (DER_*) quantities
    'DER_deltaeta_jet_jet',
    'DER_deltar_tau_lep',
    'DER_lep_eta_centrality',
    'DER_mass_MMC',
    'DER_mass_jet_jet',
    'DER_mass_transverse_met_lep',
    'DER_mass_vis',
    'DER_met_phi_centrality',
    'DER_prodeta_jet_jet',
    'DER_pt_h',
    'DER_pt_ratio_lep_tau',
    'DER_pt_tot',
    'DER_sum_pt',
    # primitive (PRI_*) quantities
    'PRI_jet_all_pt',
    'PRI_jet_leading_eta',
    'PRI_jet_leading_phi',
    'PRI_jet_leading_pt',
    'PRI_jet_num',
    'PRI_jet_subleading_eta',
    'PRI_jet_subleading_phi',
    'PRI_jet_subleading_pt',
    'PRI_lep_eta',
    'PRI_lep_phi',
    'PRI_lep_pt',
    'PRI_met',
    'PRI_met_sumet',
    'PRI_tau_eta',
    'PRI_tau_pt',
    # extra feature computed in this notebook
    'transverse_lepton_jet_mass',
]



# Shorthand for ROOT's RDataFrame class.
RDF = ROOT.ROOT.RDataFrame

# Input file and the names of the three trees that are read from it.
file_name = 'atlas-higgs-challenge-2014-v2_part.root'
signal_tree_name, background_tree_name, test_tree_name = 'signal', 'background', 'validation'

# One data frame per tree, all backed by the same file.
rdf_signal, rdf_bkg, rdf_test = (
    RDF(tree_name, file_name)
    for tree_name in (signal_tree_name, background_tree_name, test_tree_name)
)

# C++ snippet handed to RDataFrame.Define below. Per event it builds one extra
# feature from the lepton and leading-jet transverse momenta and azimuthal angles:
#   cos_theta = (p_lep . p_jet) / (pT_lep * pT_jet), using the x/y components only
#   result    = pT_lep * pT_jet * (1 - cos_theta)
# NOTE(review): the result has units of pT^2 (half the squared transverse mass
# of two massless objects), not of a mass - confirm that omitting sqrt(2 * ...)
# is intended.
# NOTE(review): events without a leading jet presumably carry a placeholder in
# PRI_jet_leading_pt / PRI_jet_leading_phi; the expression is evaluated for them
# as well - verify this is acceptable.
reconstruct_transverse_lepton_jet_mass = '''

float lep_px = PRI_lep_pt * TMath::Cos(PRI_lep_phi);
float lep_py = PRI_lep_pt * TMath::Sin(PRI_lep_phi);
float jet_px = PRI_jet_leading_pt * TMath::Cos(PRI_jet_leading_phi);
float jet_py = PRI_jet_leading_pt * TMath::Sin(PRI_jet_leading_phi);

//calculate angle between jet and lepton
float cos_theta = (lep_px*jet_px + lep_py*jet_py) / PRI_lep_pt / PRI_jet_leading_pt;

return PRI_lep_pt * PRI_jet_leading_pt * (1 - cos_theta);
'''

# Add the new column, under the same name, to all three data frames so that
# every sample exposes the full set of input features.
rdf_signal = rdf_signal.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_bkg = rdf_bkg.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_test = rdf_test.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)

# Encode the label of the validation tree as an integer column 'IntLabel':
# 1 when the first character of Label is 's', 0 otherwise.
# Only rdf_test gets this column; the signal and background trees are labelled
# later by construction (ones / zeros).
rdf_test = rdf_test.Define('IntLabel', '''
const char ch = Label[0];
const char s = 's';
if(ch == s){
    return 1;
}
else{
    return 0;
}
''')


# Pull the columns of each RDataFrame into numpy arrays and wrap them in a
# pandas DataFrame (signal, background, validation - in that order).
df_signal, df_bg, df_test = (
    pd.DataFrame(rdf.AsNumpy()) for rdf in (rdf_signal, rdf_bkg, rdf_test)
)

# Replace the lepton and tau pseudorapidities by their absolute values.
for tdf in (df_signal, df_bg, df_test):
    for eta_column in ('PRI_lep_eta', 'PRI_tau_eta'):
        tdf[eta_column] = tdf[eta_column].abs()



# Seed for the shuffle and the train/validation split, so that
# Restart Kernel -> Run All reproduces the same partition (and hence comparable scores).
SPLIT_SEED = 42

# input feature arrays (columns ordered as in input_columns)
vars_signal = df_signal[input_columns].to_numpy()
vars_bg = df_bg[input_columns].to_numpy()
vars_test = df_test[input_columns].to_numpy()
inputs = np.concatenate([vars_signal, vars_bg])


# event weights
weight_signal = df_signal['Weight'].to_numpy()
weight_bg = df_bg['Weight'].to_numpy()
weights_test = df_test['Weight'].to_numpy()
weights = np.concatenate([weight_signal, weight_bg])
weights = weights.reshape((weights.shape[0],))  # force a flat 1-D vector


# target classification (1: signal / 0: background)
y_signal = np.ones((vars_signal.shape[0], ))
y_bg = np.zeros((vars_bg.shape[0], ))
y_test = df_test.IntLabel.to_numpy()
targets = np.concatenate([y_signal, y_bg])

# features, targets and weights must stay aligned row by row
assert inputs.shape[0] == targets.shape[0] == weights.shape[0]

# shuffle (signal and background rows were concatenated block-wise)
inputs, targets, weights = shuffle(inputs, targets, weights, random_state=SPLIT_SEED)

# training and validation split (80 % / 20 %)
x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(
    inputs, targets, weights, test_size=0.2, random_state=SPLIT_SEED
)

Note: you may need to restart the kernel to use updated packages.
Welcome to JupyROOT 6.28/10


## preparing data

In [2]:
from sklearn.preprocessing import StandardScaler;
from imblearn.over_sampling import SMOTE
 
# NOTE: this cell overwrites x_train / x_val / y_train in place, so it must be
# run exactly once after the split cell; re-running it scales and resamples
# already processed data.
scaler = StandardScaler()
scaler.fit(x_train)  # scaling statistics come from the training split only

# the same transformation is applied to every split
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(vars_test)


# Balance signal and background in the training split.
# random_state makes the synthetic rows reproducible across kernel restarts.
# fit_resample returns only features and targets, so after this call w_train no
# longer lines up row by row with x_train / y_train.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)



### AMS and functions

In [3]:
def ams_score(x, y, w, cut):
    """AMS (approximate median significance) in its simplified, regularised form.

    Events whose classifier output exceeds ``cut`` are selected; the weighted
    signal sum ``s`` and background sum ``b`` of the selection are combined as
    ``s / sqrt(b + 10)``.

    x:   array of truth values (1 for signal, 0 for background)
    y:   array of classifier outputs
    w:   array of event weights
    cut: threshold on ``y``; only events with ``y > cut`` are counted
    """
    passed = y > cut
    truth_passed = x[passed]
    weight_passed = w[passed]

    signal_sum = np.sum((truth_passed == 1) * weight_passed)
    background_sum = np.sum((truth_passed == 0) * weight_passed)

    # the constant 10 regularises the denominator for small background sums
    return signal_sum / np.sqrt(background_sum + 10.0)

def find_best_ams_score(x, y, w, nprobe=200):
    """Find the best AMS value by scanning cut values on the classifier output.

    x:      array of truth values (1 if signal)
    y:      array of classifier results
    w:      array of event weights
    nprobe: number of equally spaced cut values probed between min(y) and max(y)

    Returns a 2-tuple:
      - ``(ams, cut)`` of the scan point with the highest AMS; when several
        points share the highest AMS, the one with the largest cut is returned
      - the full list of ``(ams, cut)`` pairs, ordered by increasing cut

    Raises ValueError when ``y`` is empty.
    """
    # np.min / np.max reduce inside NumPy; the Python built-ins would iterate
    # over the array element by element.
    ymin = np.min(y)  # classifier outputs need not lie in [0, 1]
    ymax = np.max(y)

    amsvec = [(ams_score(x, y, w, cut), cut) for cut in np.linspace(ymin, ymax, nprobe)]

    # Only the maximum is needed, so no full sort. Walking the list from the
    # highest cut downwards keeps the previous tie-breaking (stable sort + [-1]):
    # among equal AMS values the entry with the largest cut wins.
    maxams = max(reversed(amsvec), key=lambda pair: pair[0])
    return maxams, amsvec


def _predict_scores(model, features):
    """Return a flat array with one signal score per row of ``features``.

    Models that offer ``predict_proba`` are asked for the probability of
    class 1; all others are evaluated with ``predict`` and the result is
    flattened to one dimension.
    """
    # Explicit capability check instead of a bare ``except``: genuine errors
    # raised while predicting are no longer silently swallowed.
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(features)[:, 1]
    scores = model.predict(features)
    return scores.reshape((scores.shape[0],))


def _print_scores(model, features, truth, event_weights):
    """Print weighted AUC, best AMS and the rescaled AMS of ``model`` on one sample."""
    scores = _predict_scores(model, features)

    auc_value = roc_auc_score(truth, scores, sample_weight=event_weights)
    print('AUC:', auc_value)

    best_point, _ = find_best_ams_score(truth, scores, event_weights)
    best_ams = best_point[0]
    print('AMS:', best_ams)
    # NOTE(review): the factor sqrt(50) presumably rescales the AMS from this
    # partial sample to the full data set - confirm it is right for both samples.
    print('AMS total:', best_ams*np.sqrt(50))


def printScoreTest(model):
    """Print AUC and AMS of ``model`` on the test sample (x_test, y_test, weights_test)."""
    _print_scores(model, x_test, y_test, weights_test)


def printScore(model):
    """Print AUC and AMS of ``model`` on the validation split (x_val, y_val, w_val)."""
    _print_scores(model, x_val, y_val, w_val)


def plotLossAccuracy(history, metric=None):
    """Plot training and validation curves for the loss and one further metric.

    history: object with a ``history`` dict of per-epoch values, as returned
             by ``model.fit``
    metric:  key of the metric shown in the right-hand panel. With the default
             ``None``, 'accuracy' is used when it was recorded; otherwise the
             first recorded metric that also has a ``'val_' + name`` entry is
             taken, so histories recorded with other metrics can be plotted.

    Raises KeyError when the loss entries or a usable metric are missing.
    """
    records = history.history

    if metric is None:
        if 'accuracy' in records:
            metric = 'accuracy'
        else:
            # any training metric other than the loss that has a validation twin
            candidates = [
                key for key in records
                if key != 'loss' and not key.startswith('val_') and 'val_' + key in records
            ]
            if not candidates:
                raise KeyError('history contains no metric with a matching val_ entry')
            metric = candidates[0]

    loss_training = records['loss']
    loss_validation = records['val_loss']
    metric_training = records[metric]
    metric_validation = records['val_' + metric]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    ax1.plot(loss_training, '-', label='training')
    ax1.plot(loss_validation, '-', label='validation')
    ax1.legend(title='loss')
    ax1.set_xlabel('epoch')

    ax2.plot(metric_training, '-', label='training')
    ax2.plot(metric_validation, '-', label='validation')
    ax2.legend(title=metric)
    ax2.set_xlabel('epoch')

    plt.show()

### model and training function

In [4]:

# Training function
def train(model, epochs=10, batch_size=128, learning_rate=1e-3, verbose="auto", callbacks=None):
    """Compile ``model`` for binary classification and fit it on the training split.

    The model is compiled with binary cross-entropy, an Adam optimizer and an
    AUC metric, its summary is printed, and it is then fitted on
    (x_train, y_train) with (x_val, y_val) as validation data.

    model:         model to compile and train (modified in place)
    epochs:        number of training epochs
    batch_size:    mini-batch size
    learning_rate: learning rate passed to Adam
    verbose:       verbosity forwarded to ``model.fit``
    callbacks:     optional list of callbacks forwarded to ``model.fit``

    Returns the history object produced by ``model.fit``.
    """
    # NOTE(review): weight_decay is fixed at 0.5 and not exposed as a parameter.
    optimizer = Adam(learning_rate=learning_rate, weight_decay=0.5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[AUC()])

    model.summary()

    fit_options = dict(
        validation_data=(x_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
    )
    return model.fit(x_train, y_train, **fit_options)


In [5]:
import tensorflow as tf;
from tensorflow.keras import layers, models, optimizers;
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
from tensorflow.keras.metrics import AUC

# Define the DNN model
def buildModel():
    """Build the (uncompiled) fully connected network.

    Layout: input sized to the number of columns of x_train, then
    Dense(256, relu) - Dropout(0.4) - Dense(128, relu) - Dropout(0.4) -
    Dense(32, relu), and a single sigmoid output unit.
    """
    n_features = x_train.shape[1]

    model = models.Sequential()
    stack = (
        # input
        layers.Input(shape=(n_features,)),
        # hidden
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(32, activation='relu'),
        # output
        layers.Dense(1, activation='sigmoid'),
    )
    for layer in stack:
        model.add(layer)

    return model


2024-07-30 21:49:44.326412: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 21:49:44.327101: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-30 21:49:44.330812: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-30 21:49:44.376467: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
import os

name = 'v1'
output_dir = 'new'

# Make sure the output folder exists before training starts, so that neither
# the checkpoint nor the CSV log fails on a missing directory.
os.makedirs(output_dir, exist_ok=True)

model = buildModel()

callbacks = [
    ModelCheckpoint(filepath=f'{output_dir}/{name}.keras', save_best_only=True),
    CSVLogger(f'{output_dir}/{name}_training.log'),
]

history = train(model, epochs=100, batch_size=2*1024, learning_rate=3e-4, callbacks=callbacks)

# Scores the in-memory model, i.e. the weights after the last epoch; the best
# checkpoint is only written to disk.
printScore(model)
#plotLossAccuracy(history)


Epoch 1/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - auc: 0.5982 - loss: 0.6777 - val_auc: 0.7790 - val_loss: 0.5881
Epoch 2/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - auc: 0.7575 - loss: 0.5961 - val_auc: 0.8186 - val_loss: 0.5296
Epoch 3/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - auc: 0.7941 - loss: 0.5543 - val_auc: 0.8334 - val_loss: 0.5079
Epoch 4/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - auc: 0.8078 - loss: 0.5385 - val_auc: 0.8446 - val_loss: 0.4967
Epoch 5/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - auc: 0.8249 - loss: 0.5142 - val_auc: 0.8538 - val_loss: 0.4833
Epoch 6/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - auc: 0.8337 - loss: 0.5035 - val_auc: 0.8622 - val_loss: 0.4753
Epoch 7/100
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - 

In [7]:
def ams_wrapper(prediction, true_values, weights):
    """Return only the best AMS value found by the cut scan.

    Note the argument order: predictions first, then truth values and weights.
    """
    best_point, _scan = find_best_ams_score(true_values, prediction, weights)
    best_ams, _best_cut = best_point
    return best_ams


In [9]:
# Best AMS on the validation split: flatten the (n, 1) network output to one
# score per event, then scan the cut.
prediction = model.predict(x_val).reshape(-1)

ams_wrapper(prediction, y_val, w_val)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 719us/step


0.3690697062972268

In [11]:
# Best AMS on the test sample: flatten the (n, 1) network output to one score
# per event, then scan the cut.
prediction = model.predict(x_test).reshape(-1)

ams_wrapper(prediction, y_test, weights_test)

[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 700us/step


0.743785820496553