# Analyzing the Data

# Data preparation

In [2]:
#import uproot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


import ROOT;
#import lumiere as lm
#lm.loadstyle(True);

from sklearn.metrics import roc_auc_score, roc_curve

def ams_score(x, y, w, cut):
# Calculate Average Mean Significane as defined in ATLAS paper
#    -  approximative formula for large statistics with regularisation
# x: array of truth values (1 if signal)
# y: array of classifier result
# w: array of event weights
# cut
    t = y > cut 
    s = np.sum((x[t] == 1)*w[t])
    b = np.sum((x[t] == 0)*w[t])
    return s/np.sqrt(b+10.0)

def find_best_ams_score(x, y, w):
# find best value of AMS by scanning cut values; 
# x: array of truth values (1 if signal)
# y: array of classifier results
# w: array of event weights
#  returns 
#   ntuple of best value of AMS and the corresponding cut value
#   list with corresponding pairs (ams, cut) 
# ----------------------------------------------------------
    ymin=min(y) # classifiers may not be in range [0.,1.]
    ymax=max(y)
    nprobe=200    # number of (equally spaced) scan points to probe classifier 
    amsvec= [(ams_score(x, y, w, cut), cut) for cut in np.linspace(ymin, ymax, nprobe)] 
    maxams=sorted(amsvec, key=lambda lst: lst[0] )[-1]
    return maxams, amsvec




def printScore(model):

    try:
        pred_clf = model.predict_proba(x_val)[:, 1]
    except:
        pred_clf = model.predict(x_val)
        pred_clf = pred_clf.reshape((pred_clf.shape[0],))

    auc = roc_auc_score(y_val, pred_clf, sample_weight=w_val)
    print('AUC:', auc)
    bs = find_best_ams_score(y_val, pred_clf, w_val)
    print('AMS:', bs[0][0])
    print('AMS total:', bs[0][0]*np.sqrt(50))



def plotLossAccuracy(history):
    # Get training and validation loss/accuracy values from history
    loss_training = history.history['loss']
    loss_validation = history.history['val_loss']
    accuracy_training = history.history['accuracy']
    accuracy_validation = history.history['val_accuracy']


    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20, 8))
    
    ax1.plot(loss_training, 'o--', label='training')
    ax1.plot(loss_validation, 'o--', label='validation')
    ax1.legend(title='loss')
    ax1.set_xlabel('epoch')
    
    ax2.plot(accuracy_training, 'o--', label='training')
    ax2.plot(accuracy_validation, 'o--', label='validation')
    ax2.legend(title='accuracy')
    ax2.set_xlabel('epoch')

    plt.show()


def plotAMS(history):
    # Get training and validation loss/accuracy values from history
    ams_training = history.history['ams_metric']
    ams_validation = history.history['val_ams_metric']
    

    fig, ax1 = plt.subplots(1,1, figsize=(10, 8))
    
    #ax1.plot(ams_training, 'o--', label='training')
    ax1.plot(ams_validation, '-', label='validation')
    ax1.legend(title='AMS')
    ax1.set_xlabel('epoch')
    
    plt.show()

Welcome to JupyROOT 6.28/10


## Read-in & to Pandas

In [3]:

input_columns = ['DER_deltaeta_jet_jet', 'DER_deltar_tau_lep', 'DER_lep_eta_centrality', 'DER_mass_MMC', 'DER_mass_jet_jet', 
                 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_met_phi_centrality', 'DER_prodeta_jet_jet', 'DER_pt_h', 
                 'DER_pt_ratio_lep_tau', 'DER_pt_tot', 'DER_sum_pt', 'PRI_jet_all_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 
                 'PRI_jet_leading_pt', 'PRI_jet_num', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_subleading_pt', 
                 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_lep_pt', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_tau_eta', 'PRI_tau_phi', 
                 'PRI_tau_pt', 'transverse_lepton_jet_mass']


'''

input_columns = [ 'transverse_lepton_jet_mass',
                'DER_mass_MMC', 
                'DER_mass_vis', 
                'DER_mass_transverse_met_lep',
                'DER_pt_ratio_lep_tau', 
                'DER_sum_pt',
                'DER_deltar_tau_lep', 
                'DER_pt_h',
                'DER_sum_pt',  
                'PRI_met_sumet', 
                'PRI_tau_pt',
                ]

'''
print(len(input_columns))

31


In [4]:
RDF = ROOT.ROOT.RDataFrame

signal_tree_name = 'signal'
background_tree_name = 'background'
test_tree_name = 'validation'
file_name = 'atlas-higgs-challenge-2014-v2_part.root'

rdf_signal = RDF(signal_tree_name, file_name)
rdf_bkg = RDF(background_tree_name, file_name)
rdf_test = RDF(test_tree_name, file_name)

reconstruct_transverse_lepton_jet_mass = '''

float lep_px = PRI_lep_pt * TMath::Cos(PRI_lep_phi);
float lep_py = PRI_lep_pt * TMath::Sin(PRI_lep_phi);
float jet_px = PRI_jet_leading_pt * TMath::Cos(PRI_jet_leading_phi);
float jet_py = PRI_jet_leading_pt * TMath::Sin(PRI_jet_leading_phi);

//calculate angle between jet and lepton
float cos_theta = (lep_px*jet_px + lep_py*jet_py) / PRI_lep_pt / PRI_jet_leading_pt;

return PRI_lep_pt * PRI_jet_leading_pt * (1 - cos_theta);
'''

#insertion
rdf_signal = rdf_signal.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_bkg = rdf_bkg.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)
rdf_test = rdf_test.Define('transverse_lepton_jet_mass', reconstruct_transverse_lepton_jet_mass)

# label classification to int values
rdf_test = rdf_test.Define('IntLabel', '''
const char ch = Label[0];
const char s = 's';
if(ch == s){
    return 1;
}
else{
    return 0;
}
''')


df_signal = pd.DataFrame(rdf_signal.AsNumpy())
df_bg = pd.DataFrame(rdf_bkg.AsNumpy())
df_test = pd.DataFrame(rdf_test.AsNumpy())


## concatination, shuffle and split

In [5]:
from sklearn.utils import shuffle;
from sklearn.model_selection import train_test_split;

#input feature arrays
vars_signal = df_signal[input_columns].to_numpy()
vars_bg = df_bg[input_columns].to_numpy()
vars_test = df_test[input_columns].to_numpy()

inputs = np.concatenate([vars_signal, vars_bg])

#weights
weight_signal = df_signal['Weight'].to_numpy()
weight_bg = df_bg['Weight'].to_numpy()
weights = np.concatenate([weight_signal, weight_bg])
weights = weights.reshape((weights.shape[0],))

weights_test = df_test['Weight'].to_numpy()


# target classifictionation (1:signal / 0: background)
y_signal = np.ones((vars_signal.shape[0], ))
y_bg = np.zeros((vars_bg.shape[0], ))

targets = np.concatenate([y_signal, y_bg])

# for test dataset there is already a classification; convert to int
truths_test = df_test.IntLabel.to_numpy()


# shuffle 
inputs, targets, weights = shuffle(inputs, targets, weights)


# not for gridcv

# training and validation split  (80, 20)
x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(inputs, targets, weights, test_size=0.2)
#x_train, y_train = inputs, targets

## StandardScaling 

In [6]:
from sklearn.preprocessing import StandardScaler;
 
scaler = StandardScaler()
scaler.fit(x_train) #set up only on train data
 
# tranformation applied to all
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(vars_test)

### SMOTE

In [7]:
#%pip install imbalanced-learn

In [8]:
from imblearn.over_sampling import SMOTE


# Apply SMOTE to the training data
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)


## PCA

In [9]:
from sklearn.decomposition import PCA

#x_train_pre = x_train

#pca = PCA(n_components=22)
#pca.fit(x_train)

#x_train = pca.transform(x_train)
#x_val = pca.transform(x_val)
#x_test = pca.transform(x_test)

## DNN

$$
AMS = \sqrt{2((s+b+b_r) \log(1+\frac{s}{s+b_r})-s)}
$$

Here:
- $s, b$: unnormalised true positive and false positive rates, respectively
- $b_r = 10$ is the constant regularisation term


In [10]:
from tensorflow.keras.metrics import Metric
from tensorflow.keras.metrics import AUC


def ams_metric(y_true, y_pred):
    
    s = tf.reduce_sum(y_true * tf.round(y_pred))
    b = tf.reduce_sum((1 - y_true) * tf.round(y_pred))
    b_r = 10.0
    ams = tf.math.sqrt(2 * ((s + b + b_r) * tf.math.log(1 + s / (b + b_r)) - s))
    return ams.numpy()

2024-07-26 11:11:38.614373: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-26 11:11:38.615022: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-26 11:11:38.618786: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-26 11:11:38.662002: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### training function

In [11]:

# Training function
def train(model, epochs=10, batch_size=128, learning_rate=1e-3, verbose="auto", callbacks=None):
    '''
    - x: features
    - y: labels
    - w: sample weights
    '''
    
    # Define loss function, optimizer algorithm and validation metrics
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy']
    )
        #metrics=['accuracy', AUC]
    
    # Print summary of the model
    model.summary()
    
    # Train model
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                         verbose=verbose,  validation_split=0.2, callbacks=callbacks)
    #history = model.fit(x_train, y_train, sample_weight=w_train, batch_size=batch_size, epochs=epochs,
    #                     verbose=verbose,  validation_split=0.2)
        
    
    return history


### model

In [12]:
import tensorflow as tf;
from tensorflow.keras import layers, models, optimizers;
from tensorflow.keras.optimizers import Adam

# Define the DNN model
def buildModel():
    model = models.Sequential()
    
    # input
    model.add(layers.Input(shape=(x_train.shape[1],)))
    
    # hidden
    model.add(layers.Dense(2*128, activation='relu', kernel_initializer='glorot_normal'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(2*128, activation='relu', kernel_initializer='glorot_normal'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(2*128, activation='relu', kernel_initializer='glorot_normal'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(128, activation='relu', kernel_initializer='glorot_normal'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(128, activation='relu', kernel_initializer='glorot_normal'))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(32, activation='relu', kernel_initializer='glorot_normal'))
    
    # output
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'))
    
    return model


## execution

In [13]:
name = 'model_a'

model = buildModel()

callbacks = [ModelCheckpoint(filepath=f'storgage/{name}.keras', save_best_only=True), 
             CSVLogger('training.log'),
            EarlyStopping(patience=10)] 

history = train(model, epochs=800, batch_size=4*1024, learning_rate=2e-4, callbacks=callbacks)
#history = train(model, epochs=400, batch_size=2*1024, learning_rate=2e-3)

printScore(model)
plotLossAccuracy(history)


NameError: name 'ModelCheckpoint' is not defined

In [None]:
model.save_weights('600epoch_weights.weights.h5')

In [None]:
plotAMS(history)

In [None]:
yhat = model.predict(x_val)
yhat = yhat.reshape((yhat.shape[0],))

ergebnis = find_best_ams_score(y_val, yhat, w_val)

In [None]:
ergebnis[0][0]

In [None]:
ams_metric(y_val, yhat)

In [None]:
ops.sum