In [4]:
import h5py
import pandas
import numpy as np

def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        filename = name.split('/')[1]
        return pandas.DataFrame(f[filename][:], dtype=np.float64)

train = load_data('data/train')
test  = load_data('data/test')

# Variable list from https://www.nbi.dk/~petersen/Teaching/ML2023/InitialProject/VariableList.html
all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]
subset_variables = ['p_e233', 'p_ehad1', 'p_E5x7_Lr2', 'p_f3core', 'p_deltaPhi0', 'p_f3', 'p_eAccCluster', 'p_dPOverP']
subset_variables = ['p_sigmad0', 'p_deltaEta1', 'p_d0', ]

X_train = train[all_variables][:130000]
#X_train = train[subset_variables][:130000]
y_train = train['Truth'][:130000]
X_validate = train[all_variables][130000:]
#X_validate = train[subset_variables][130000:]
y_validate = train['Truth'][130000:]

X_test = test[all_variables]

with open('Classification_JuliusFoverskov_XGBoost_VariableList.txt','r') as f:
    sorted_variables = [item.strip() for item in f]


In [2]:
from sklearn.metrics import accuracy_score

def plot_boundary_results(X, y, clf):
    # sample the whole X space with a meshgrid of probabilities
    # predicted by out trained classifier
    xx, yy = np.meshgrid(np.linspace(X[:,0].min(), X[:,0].max(), 300),
                         np.linspace(X[:,1].min(), X[:,1].max(), 300))
    X_bkg = np.c_[xx.ravel(), yy.ravel()]
    y_bkg = clf.predict_proba(X_bkg)[:,0].reshape(xx.shape)
    y_pred = clf.predict(X)

    # plot the classification results
    fig, ax = plt.subplots(1, figsize=(8, 5))
    ax.contourf(xx, yy, y_bkg, cmap='brg', alpha=0.2) # coloured by class
    ax.contour(xx, yy, y_bkg, cmap='brg', alpha=0.2) # coloured by class
    ax.scatter(X[:,0][y_pred!=y], X[:,1][y_pred!=y], s=100, c='pink')
    ax.scatter(X[:,0][y==0], X[:,1][y==0], s=20, c='g')
    ax.scatter(X[:,0][y==1], X[:,1][y==1], s=20, c='b')
    ax.set_xlabel("$x_1$")
    ax.set_ylabel("$x_2$")
    plt.tight_layout()
    print(f"Train accuracy: {accuracy_score(y_pred, y)*100.0:.2f}%")

In [46]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.initializers import VarianceScaling
from tensorflow.keras.losses import MeanSquaredError
from kerastuner.tuners import RandomSearch

learning_rate = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


model = Sequential([
    Dense(15,activation='relu',name='input_layer'),
    Dense(10,activation='relu',name='hidden_layer1'),
    Dense(10,activation='relu',name='hidden_layer2'),
    Dense(1, activation='sigmoid', name='output')])

model.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.losses.BinaryCrossentropy()])

history = model.fit(x = X_train[sorted_variables], y = y_train, validation_data=(X_validate[sorted_variables], y_validate), epochs = 7)  


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [47]:
y_pred = model.predict(X_train[sorted_variables])
y_validate_pred = model.predict(X_validate[sorted_variables])
y_pred = np.where(y_pred > 0.5, 1, 0)
y_validate_pred = np.where(y_validate_pred > 0.5, 1, 0)


print(f"Train accuracy: {accuracy_score(y_pred, y_train)*100.0:.2f}%")
print(f"Validation accuracy: {accuracy_score(y_validate_pred, y_validate)*100.0:.2f}%")

Train accuracy: 74.80%
Validation accuracy: 74.64%


In [48]:
y_test_pred = model.predict(X_test[sorted_variables])



In [56]:
# Write results to a file

# Probabilities of particle being an electron
with open('Classification_JuliusFoverskov_TF-NN.txt','w') as f:
    # y_test_pred col 1 = electron probability
    for index, prediction in zip(X_test.index, y_test_pred[:, 0]):
        f.write(f"{index}, {prediction}\n")

# List of most important features from permutation importance for electron classification
with open('Classification_JuliusFoverskov_TF-NN_VariableList.txt','w') as f:
    f.write("\n".join(map(str, sorted_variables)))

array([0.7515697, 0.7515697, 0.7515697, ..., 0.7515697, 0.7515697,
       0.7515697], dtype=float32)