In [4]:
import pandas as pd
import numpy as np
import os
import csv

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import precision_score, confusion_matrix, accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, tpe, fmin, Trials, space_eval

import shap

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Data loading, transforming and splitting

In [6]:
scaler = StandardScaler()

Data_train = pd.read_csv('AppML_InitialProject_train.csv')

X = Data_train.drop(['p_Truth_isElectron', 'p_Truth_Energy'], axis=1)
y = Data_train['p_Truth_isElectron']

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# First run of the model

In [7]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'binary_crossentropy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

y_pred_proba = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred_proba.round())
conf_matrix = confusion_matrix(y_val, y_pred_proba.round())
logloss = log_loss(y_val, y_pred_proba)

print("Accuracy:", accuracy)
print("Confusion matrix:", conf_matrix)
print("LogLoss:", logloss)

Epoch 1/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.9433 - binary_crossentropy: 0.1533 - loss: 0.1533 - val_accuracy: 0.9570 - val_binary_crossentropy: 0.1149 - val_loss: 0.1149
Epoch 2/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.9584 - binary_crossentropy: 0.1090 - loss: 0.1090 - val_accuracy: 0.9609 - val_binary_crossentropy: 0.1050 - val_loss: 0.1050
Epoch 3/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.9622 - binary_crossentropy: 0.0994 - loss: 0.0994 - val_accuracy: 0.9615 - val_binary_crossentropy: 0.1054 - val_loss: 0.1054
Epoch 4/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.9645 - binary_crossentropy: 0.0944 - loss: 0.0944 - val_accuracy: 0.9611 - val_binary_crossentropy: 0.1035 - val_loss: 0.1035
Epoch 5/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step 

# Most important features using SHAP

In [8]:
explainer = shap.Explainer(model, X_train, algorithm='auto', n_jobs=-1)
shap_values = explainer.shap_values(X_train)

shap.summary_plot(shap_values, X_train, plot_type="violin")

PermutationExplainer explainer:   1%|          | 866/135000 [02:13<6:00:58,  6.19it/s]


KeyboardInterrupt: 

# Second run with the most important features

In [None]:
average_shap_values = np.abs(shap_values).mean(axis=0)
sorted_indices = np.argsort(average_shap_values)[::-1]
top_20_indices = sorted_indices[:20]

X_train_20 = X_train.iloc[:, top_20_indices]
X_val_20 = X_val.iloc[:, top_20_indices]

model_reduced = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_20.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_reduced.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy', 'binary_crossentropy'])

history_reduced = model_reduced.fit(X_train_20, y_train, epochs=10, batch_size=32, validation_data=(X_val_20, y_val))

y_pred_proba = model_reduced.predict(X_val_20)

accuracy = accuracy_score(y_val, y_pred_proba.round())
conf_matrix = confusion_matrix(y_val, y_pred_proba.round())
logloss = log_loss(y_val, y_pred_proba)

print("Accuracy:", accuracy)
print("Confusion matrix:", conf_matrix)
print("LogLoss:", logloss)

Epoch 1/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 842us/step - accuracy: 0.9420 - binary_crossentropy: 0.1554 - loss: 0.1554 - val_accuracy: 0.9529 - val_binary_crossentropy: 0.1235 - val_loss: 0.1237
Epoch 2/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 804us/step - accuracy: 0.9562 - binary_crossentropy: 0.1124 - loss: 0.1124 - val_accuracy: 0.9573 - val_binary_crossentropy: 0.1106 - val_loss: 0.1106
Epoch 3/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 799us/step - accuracy: 0.9590 - binary_crossentropy: 0.1055 - loss: 0.1055 - val_accuracy: 0.9593 - val_binary_crossentropy: 0.1037 - val_loss: 0.1038
Epoch 4/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 799us/step - accuracy: 0.9599 - binary_crossentropy: 0.1004 - loss: 0.1004 - val_accuracy: 0.9598 - val_binary_crossentropy: 0.1015 - val_loss: 0.1016
Epoch 5/10
[1m4219/4219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7

# This run using **cross validation** and **hyperparameter optimization (bayesian)**

In [None]:
space = {
    'num_hidden_layers': hp.choice('num_hidden_layers', [1, 2, 3]),
    'units': hp.choice('units', [16, 32, 64]),
    'dropout': hp.uniform('dropout', 0.0, 0.2),
    'lr': hp.uniform('lr', 0.01, 0.1)
}

scores = []
X_20 = X.iloc[:, top_20_indices]
kfold = KFold(n_splits=3, shuffle=True)

def objective(params):
    model = Sequential()

    # Add input layer
    model.add(Dense(params['units'], activation='relu'))

    # Add hidden layers
    for _ in range(params['num_hidden_layers']):
        model.add(Dense(params['units'], activation='relu'))
        model.add(Dropout(params['dropout']))

    model.add(Dense(1, activation='sigmoid'))


    model.compile(optimizer=Adam(learning_rate=params['lr']),
                  loss='binary_crossentropy',
                  metrics=['binary_crossentropy'])

    for train_index, test_index in kfold.split(X_20, y):
        X_train_cv, X_val_cv = X_20.iloc[train_index], X_20.iloc[test_index]
        y_train_cv, y_val_cv = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train_cv, y_train_cv, epochs=10, batch_size=32, verbose=0, validation_data=(X_val_cv, y_val_cv))

        _, logloss = model.evaluate(X_val_cv, y_val_cv, verbose=0)

        scores.append(logloss)
    return np.mean(scores)

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)

best_params = space_eval(space, best)

print("Best Hyperparameters:", best_params)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 10/10 [17:13<00:00, 103.38s/trial, best loss: 0.12928964528772566]
Best Hyperparameters: {'dropout': 0.1767806339326483, 'lr': 0.01576289899456416, 'num_hidden_layers': 1, 'units': 64}


# Third run with the best hyperparameters and **cross validation**

In [None]:
val_logloss_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X_20):
    X_train_20, X_val_20 = X_20.iloc[train_index], X_20.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    best_model = Sequential()
    
    best_model.add(Dense(best_params['units'], activation='relu', input_shape=(X_train_20.shape[1],)))
    
    for _ in range(best_params['num_hidden_layers']):
        best_model.add(Dense(best_params['units'], activation='relu'))
        best_model.add(Dropout(best_params['dropout']))
    
    best_model.add(Dense(1, activation='sigmoid'))
    
    best_model.compile(optimizer=Adam(learning_rate=best_params['lr']),
                       loss='binary_crossentropy',
                       metrics=['binary_crossentropy'])
    
    best_model.fit(X_train_20, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val_20, y_val))

    val_logloss = best_model.evaluate(X_val_20, y_val, verbose=0)[1]
    val_logloss_scores.append(val_logloss)

mean_logloss = np.mean(val_logloss_scores)
print("Mean LogLoss from Cross-Validation:", mean_logloss)

In [None]:
y_pred_prob = best_model.predict(X_val_20)
print(log_loss(y_val, y_pred_prob))
print(accuracy_score(y_val, y_pred_prob.round()))

# Testing the best model on the test set

In [None]:
Data_test = pd.read_csv('AppML_InitialProject_test_classification.csv')

X_test = pd.DataFrame(scaler.fit_transform(Data_test), columns=Data_test.columns)
X_test_20 = X_test.iloc[:, top_20_indices]
y_pred_prob = best_model.predict(X_test_20)

y_pred_prob = [float(pred[0]) for pred in y_pred_prob]

print(y_pred_prob)

[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 573us/step
[0.9999756813049316, 0.8547666072845459, 0.07312063127756119, 0.9999892115592957, 0.9951916337013245, 0.9999933242797852, 0.9999995231628418, 0.006528960075229406, 0.997765839099884, 1.0, 0.9999997615814209, 0.9999951124191284, 0.9999985694885254, 1.0, 0.9999967217445374, 0.999998927116394, 0.9999999403953552, 0.9999895691871643, 0.9999995231628418, 0.9998019337654114, 0.9999997019767761, 0.7515758872032166, 0.9999995827674866, 0.999761700630188, 0.9999972581863403, 0.2872881591320038, 1.0, 0.28244444727897644, 0.9999976754188538, 0.7886356115341187, 0.5034874677658081, 0.9999997615814209, 0.9999964833259583, 0.9998915195465088, 0.9999992251396179, 0.99998539686203, 0.9973448514938354, 0.28244444727897644, 0.9891201257705688, 0.989342212677002, 0.9315837621688843, 0.999974250793457, 1.0, 0.999978244304657, 0.9327735304832458, 1.0, 1.0, 0.9999995827674866, 0.9999271631240845, 0.6850562691688538, 0.99999952316

# Saving (set to False)

In [None]:
top_20_features = X.columns[top_20_indices]

folder_name = 'solutions'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

Write = False
if Write:

    top_20_features_list = top_20_features.tolist()
    variables = top_20_features

    csv_file_path = os.path.join(folder_name, 'Classification_Tensorflow_VariableList.csv')

    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)

        for variable in variables:
            writer.writerow([variable])
    
    data = y_pred_prob

    csv_file_path = os.path.join(folder_name, 'Classification_Tensorflow.csv')
    
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        for index, item in enumerate(data, start=0):
            writer.writerow([index, item])