In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

## Pre-processing

In [2]:
# Column -> dtype mapping handed to pd.read_csv.
# Three columns are read as pandas categoricals; all remaining columns are
# read as int8.
category_columns = ['ResponseID', 'AttributeLevel', 'ScenarioTypeStrict']

int8_columns = [
    # scenario / response descriptors
    'UserID', 'Intervention', 'PedPed', 'Barrier', 'CrossingSignal',
    'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved',
    # per-character-type columns
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
]

dtype = {
    **{column: 'category' for column in category_columns},
    **{column: 'int8' for column in int8_columns},
}


In [3]:
# Load the dataset, applying the explicit per-column dtypes defined above
df50 = pd.read_csv(
    'total_50_dataset.csv',
    dtype=dtype,
)

In [4]:
# Sanity check: (rows, columns) of the frame as loaded from the CSV
df50.shape

(2500000, 31)

In [5]:
# Pre-processing: replace the two categorical scenario columns with one-hot
# indicator columns (one new column per category level).
# Note: this reassigns df50, so re-running the cell on the already-encoded
# frame will fail because the source columns no longer exist.
one_hot_columns = ['AttributeLevel', 'ScenarioTypeStrict']
df50 = pd.get_dummies(df50, columns=one_hot_columns)

print(df50.columns)


Index(['ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
       'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters',
       'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
       'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
       'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
       'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_High',
       'AttributeLevel_Hoomans', 'AttributeLevel_Less', 'AttributeLevel_Low',
       'AttributeLevel_Male', 'AttributeLevel_More', 'AttributeLevel_Old',
       'AttributeLevel_Pets', 'AttributeLevel_Young', 'ScenarioTypeStrict_Age',
       'ScenarioTypeStrict_Fitness', 'ScenarioTypeStrict_Gender',
       'ScenarioTypeStrict_Social Status', 'ScenarioTypeStrict_Species',
       'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [6]:
# Sanity check: row count should be unchanged; the column count grows by the
# number of one-hot indicator columns minus the two encoded source columns
df50.shape

(2500000, 47)

In [7]:
# Number of rows that are exact duplicates of an earlier row (all columns equal)
df50.duplicated().sum()

0

In [8]:
# ResponseID is dropped from the features in the next step.
# It was kept through pre-processing only to verify that every session in the dataset is complete (i.e. each ResponseID is present twice).

#### Splitting

In [9]:
# Features: every column except the target (UserID) and the ResponseID identifier
X = df50.drop(columns=['UserID', 'ResponseID'])

# Target variable
y = df50['UserID']

In [10]:
# Hold out 15% of the rows as the test set; the remaining 85% ("trainval") is
# what the cross-validation folds below are drawn from.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [11]:
# K-fold cross-validation setup: k shuffled folds with a fixed seed, shared by
# every model's evaluation loop below.

from sklearn.model_selection import KFold

k = 5
kf = KFold(
    n_splits=k,
    shuffle=True,
    random_state=42,
)

## Modelling

### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Logistic regression evaluated with k-fold cross-validation on the trainval
# split. The same estimator object is fitted once per fold.
lr_model = LogisticRegression(max_iter=1000)

# Per-fold results
conf_matrices_lr = []
classification_reports_lr = []
count = 0

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    # Positional row selection for this fold's training / validation parts
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the training part, predict the held-out part
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_val)

    # Keep the confusion matrix and the report (as a dict) for later aggregation
    conf_matrices_lr.append(confusion_matrix(y_val, y_pred_lr))
    classification_reports_lr.append(
        classification_report(y_val, y_pred_lr, output_dict=True)
    )

    print("Done processing fold " + str(count))

Done processing fold 1
Done processing fold 2
Done processing fold 3
Done processing fold 4
Done processing fold 5


In [25]:
# Print the stored confusion matrix and classification report of every fold
for fold_idx in range(k):
    fold_number = fold_idx + 1
    print(f"\nFold {fold_number}")
    print("Confusion Matrix:")
    print(conf_matrices_lr[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_lr[fold_idx]
    for label, metrics in fold_report.items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[158939  53145]
 [117745  95171]]

Classification Report:
0: {'precision': 0.5744423240953579, 'recall': 0.7494153260029045, 'f1-score': 0.6503658177294749, 'support': 212084.0}
1: {'precision': 0.6416772296987513, 'recall': 0.44698848372127975, 'f1-score': 0.5269245249590291, 'support': 212916.0}
accuracy: 0.5979058823529412
macro avg: {'precision': 0.6080597768970546, 'recall': 0.5982019048620921, 'f1-score': 0.588645171344252, 'support': 425000.0}
weighted avg: {'precision': 0.608125588004657, 'recall': 0.5979058823529412, 'f1-score': 0.5885243441023873, 'support': 425000.0}

Fold 2
Confusion Matrix:
[[158238  54301]
 [116947  95514]]

Classification Report:
0: {'precision': 0.5750240747133747, 'recall': 0.7445127717736509, 'f1-score': 0.6488833848652107, 'support': 212539.0}
1: {'precision': 0.6375463071121049, 'recall': 0.4495601545695445, 'f1-score': 0.5272996279080039, 'support': 212461.0}
accuracy: 0.5970635294117647
macro avg: {'precision': 0.6062851

In [32]:
# Accuracy per fold, then its mean and standard deviation across folds
accuracies_lr = [fold_report['accuracy'] for fold_report in classification_reports_lr]
average_accuracy_lr = np.mean(accuracies_lr)
std_dev_lr = np.std(accuracies_lr)
print(f"\nAverage Accuracy LR: {average_accuracy_lr}")
print(f"Standard Deviation: {std_dev_lr}")

# F1 per fold = unweighted mean of the class-0 and class-1 F1 scores,
# then averaged across folds
f1s_lr_0 = [fold_report['0']['f1-score'] for fold_report in classification_reports_lr]
f1s_lr_1 = [fold_report['1']['f1-score'] for fold_report in classification_reports_lr]
total_f1s_lr = [(f1s_lr_0[i] + f1s_lr_1[i]) / 2 for i in range(k)]
average_f1_lr = np.mean(total_f1s_lr)
print(f"\nAverage f1-score LR: {average_f1_lr}")


Average Accuracy LR: 0.598672
Standard Deviation: 0.0011900790354958243

Average f1-score LR: 0.5894487032293704


In [33]:
# Recall of class '1' in each fold, then the mean over all folds
recall_class_1_lr = [
    fold_report['1']['recall']
    for fold_report in classification_reports_lr
]
average_recall_class_1_lr = np.mean(recall_class_1_lr)

print(f"\nAverage Recall for class 1: {average_recall_class_1_lr}")


Average Recall for class 1: 0.4486251488725931


### Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Random forest (100 trees, fixed seed) evaluated with the same k-fold splits
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold results
conf_matrices_rf = []
classification_reports_rf = []
count = 0

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    print("Start processing fold " + str(count))

    # Positional row selection for this fold's training / validation parts
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the training part, predict the held-out part
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_val)

    # Keep the confusion matrix and the report (as a dict) for later aggregation
    conf_matrices_rf.append(confusion_matrix(y_val, y_pred_rf))
    classification_reports_rf.append(
        classification_report(y_val, y_pred_rf, output_dict=True)
    )

    print("Done processing fold " + str(count))

Start processing fold 1
Done processing fold 1
Start processing fold 2
Done processing fold 2
Start processing fold 3
Done processing fold 3
Start processing fold 4
Done processing fold 4
Start processing fold 5
Done processing fold 5


In [38]:
# Print the stored confusion matrix and classification report of every fold
for fold_idx in range(k):
    fold_number = fold_idx + 1
    print(f"\nFold {fold_number}")
    print("Confusion Matrix:")
    print(conf_matrices_rf[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_rf[fold_idx]
    for label, metrics in fold_report.items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[164858  47226]
 [ 60135 152781]]

Classification Report:
0: {'precision': 0.7327250181116746, 'recall': 0.7773240791384546, 'f1-score': 0.7543659355216586, 'support': 212084.0}
1: {'precision': 0.7638782642607509, 'recall': 0.7175646733923238, 'f1-score': 0.7399975298057991, 'support': 212916.0}
accuracy: 0.7473858823529411
macro avg: {'precision': 0.7483016411862127, 'recall': 0.7474443762653892, 'f1-score': 0.7471817326637289, 'support': 425000.0}
weighted avg: {'precision': 0.748332134716561, 'recall': 0.7473858823529411, 'f1-score': 0.747167668530134, 'support': 425000.0}

Fold 2
Confusion Matrix:
[[165012  47527]
 [ 59992 152469]]

Classification Report:
0: {'precision': 0.7333736289132637, 'recall': 0.7763845694202005, 'f1-score': 0.7542664378129692, 'support': 212539.0}
1: {'precision': 0.7623602472049441, 'recall': 0.7176328832115071, 'f1-score': 0.7393207049462126, 'support': 212461.0}
accuracy: 0.7470141176470588
macro avg: {'precision': 0.74786693

In [40]:
# Accuracy per fold, then its mean across folds
accuracies_rf = [report['accuracy'] for report in classification_reports_rf]
average_accuracy_rf = np.mean(accuracies_rf)
print(f"\nAverage Accuracy RF: {average_accuracy_rf}")

# Standard deviation of the per-fold accuracies
std_dev_rf = np.std(accuracies_rf)
print(f"Standard Deviation: {std_dev_rf}")

# F1 per fold = unweighted mean of the class-0 and class-1 F1 scores,
# then averaged across folds
f1s_rf_0 = [report['0']['f1-score'] for report in classification_reports_rf]
f1s_rf_1 = [report['1']['f1-score'] for report in classification_reports_rf]
total_f1s_rf = [(f1_0 + f1_1) / 2 for f1_0, f1_1 in zip(f1s_rf_0, f1s_rf_1)]
average_f1_rf = np.mean(total_f1s_rf)
# Label fixed: this is the Random Forest score (was printed as "LR")
print(f"\nAverage f1-score RF: {average_f1_rf}")


Average Accuracy RF: 0.7479524705882353
Standard Deviation: 0.0007023084427720712

Average f1-score LR: 0.7477566217681477


In [41]:
# Recall of class '1' in each fold, then the mean over all folds
recall_class_1_rf = [
    fold_report['1']['recall']
    for fold_report in classification_reports_rf
]
average_recall_class_1_rf = np.mean(recall_class_1_rf)

print(f"\nAverage Recall for class 1: {average_recall_class_1_rf}")


Average Recall for class 1: 0.7198414314147527


### Support Vector Machine

In [33]:
from sklearn.svm import SVC

In [35]:
# Initialize the support vector classifier.
# The RBF kernel is chosen because the data is not expected to be linearly
# separable.
svc = SVC(
    kernel='rbf',
    random_state=45,
)

In [None]:
# SVM evaluated with the same k-fold splits; per-fold results are collected
# in these lists.
conf_matrices_svm = []
classification_reports_svm = []
count = 0

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    print("Starting fold " + str(count))

    # Positional row selection for this fold's training / validation parts
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the training part
    svc.fit(X_train, y_train)
    print("Model fitted")

    # Predict the held-out part
    y_pred_svm = svc.predict(X_val)

    # Keep the confusion matrix and the report (as a dict) for later aggregation
    conf_matrices_svm.append(confusion_matrix(y_val, y_pred_svm))
    classification_reports_svm.append(
        classification_report(y_val, y_pred_svm, output_dict=True)
    )

    print("Done processing fold " + str(count))

Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold


In [None]:
# Print the stored confusion matrix and classification report of every fold
for fold_idx in range(k):
    fold_number = fold_idx + 1
    print(f"\nFold {fold_number}")
    print("Confusion Matrix:")
    print(conf_matrices_svm[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_svm[fold_idx]
    for label, metrics in fold_report.items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[33869  7839]
 [19892 21734]]

Classification Report:
0: {'precision': 0.6299920016368743, 'recall': 0.81205044595761, 'f1-score': 0.7095287475515613, 'support': 41708.0}
1: {'precision': 0.7349271294762114, 'recall': 0.5221255945803104, 'f1-score': 0.6105141926150648, 'support': 41626.0}
accuracy: 0.6672306621547027
macro avg: {'precision': 0.6824595655565429, 'recall': 0.6670880202689602, 'f1-score': 0.660021470083313, 'support': 83334.0}
weighted avg: {'precision': 0.6824079378866673, 'recall': 0.6672306621547027, 'f1-score': 0.6600701848546235, 'support': 83334.0}

Fold 2
Confusion Matrix:
[[33671  8098]
 [19928 21636]]

Classification Report:
0: {'precision': 0.6282020186943786, 'recall': 0.8061241590653355, 'f1-score': 0.7061278416240249, 'support': 41769.0}
1: {'precision': 0.7276518463711577, 'recall': 0.5205466268886536, 'f1-score': 0.6069174450896239, 'support': 41564.0}
accuracy: 0.663686654746619
macro avg: {'precision': 0.6779269325327681, 'recal

In [None]:
# Accuracy per fold, then its mean and standard deviation across folds
accuracies_svm = [fold_report['accuracy'] for fold_report in classification_reports_svm]
average_accuracy_svm = np.mean(accuracies_svm)
std_dev_svm = np.std(accuracies_svm)

print(f"\nAverage Accuracy SVM: {average_accuracy_svm}")
print(f"Standard Deviation: {std_dev_svm}")


Average Accuracy RF: 0.6657119939253271
Standard Deviation: 0.0014905331742503187


In [None]:
# Recall of class '1' in each fold, then the mean over all folds
recall_class_1_svm = [
    fold_report['1']['recall']
    for fold_report in classification_reports_svm
]
average_recall_class_1_svm = np.mean(recall_class_1_svm)

print(f"\nAverage Recall for class 1: {average_recall_class_1_svm}")

### MLP

In [12]:
from tensorflow.keras import layers, optimizers, models, callbacks
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras

In [13]:
# Define parameters
initial_learning_rate = 5e-4
decay_rate = 0.1  # Multiplicative factor applied once per `decay_steps` optimizer steps

# Learning-rate schedule:
#   lr(step) = initial_learning_rate * decay_rate ** (step / decay_steps)
# NOTE(review): the training log reports 53125 steps per epoch, so with
# decay_steps=10000 the learning rate is multiplied by about 0.1 ** 5.3
# (~5e-6) every epoch. That is consistent with val_accuracy barely moving
# after epoch 1 - consider a much larger decay_steps or a gentler decay_rate.
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,
    decay_rate=decay_rate)

# Define early stopping
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',             # Watch validation accuracy
    patience=5,                         # Stop after 5 epochs with no improvement
    restore_best_weights=True,          # Restore weights from the best epoch
    min_delta=0.0005                    # Minimum change in accuracy to qualify as an improvement
)


def build_model(input_dim=None):
    """Build and compile a fresh MLP for binary classification.

    Architecture: two Dense(64, relu) layers, each followed by batch
    normalization, and a single sigmoid output unit. Compiled with Adam
    (using the shared `lr_schedule`), binary cross-entropy loss, and
    accuracy + recall metrics.

    Args:
        input_dim: Number of input features. Defaults to ``X.shape[1]``
            (the notebook's feature frame) when omitted, which matches the
            previous behaviour.

    Returns:
        A compiled, untrained Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X.shape[1]

    # An explicit Input layer replaces `input_shape=` on the first Dense layer,
    # which is what raised the warning shown in this notebook's training output.
    model_mlp = models.Sequential()
    model_mlp.add(layers.Input(shape=(input_dim,)))
    model_mlp.add(layers.Dense(64, activation='relu'))
    model_mlp.add(layers.BatchNormalization())
    model_mlp.add(layers.Dense(64, activation='relu'))
    model_mlp.add(layers.BatchNormalization())
    model_mlp.add(layers.Dense(1, activation='sigmoid'))
    model_mlp.compile(optimizer=optimizers.Adam(learning_rate=lr_schedule), loss='binary_crossentropy', metrics=['accuracy', keras.metrics.Recall()])
    return model_mlp

In [14]:
# Convert data to float32 (required by TensorFlow).
# The arrays get their own names so that X_trainval / y_trainval keep their
# pandas types: the other models' cross-validation cells index them with
# `.iloc`, which would fail on NumPy arrays if this cell had run first.
X_trainval_np = np.array(X_trainval, dtype=np.float32)
y_trainval_np = np.array(y_trainval, dtype=np.float32)

# Lists to store results
conf_matrices_mlp = []
classification_reports_mlp = []
count = 0

# Manually loop over each fold
for train_index, test_index in kf.split(X_trainval_np):
    count += 1
    print(f"\nFold {count}")

    # Split data for this fold
    X_train, X_val = X_trainval_np[train_index], X_trainval_np[test_index]
    y_train, y_val = y_trainval_np[train_index], y_trainval_np[test_index]

    # Build a fresh model for every fold so no weights carry over, then train it
    model_mlp = build_model()
    model_mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, verbose=1, callbacks=[early_stopping])

    # Make predictions
    y_pred_mlp = model_mlp.predict(X_val)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Store confusion matrix and classification report for each fold.
    # y_val is float32 here, so the report's class keys are '0.0' and '1.0'.
    conf_matrices_mlp.append(confusion_matrix(y_val, y_pred_mlp))
    classification_reports_mlp.append(classification_report(y_val, y_pred_mlp, output_dict=True))


Fold 1
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 2ms/step - accuracy: 0.6318 - loss: 0.6137 - recall: 0.5688 - val_accuracy: 0.6546 - val_loss: 0.5923 - val_recall: 0.5556
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 5ms/step - accuracy: 0.6440 - loss: 0.6024 - recall: 0.5718 - val_accuracy: 0.6544 - val_loss: 0.5924 - val_recall: 0.5444
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 964us/step - accuracy: 0.6443 - loss: 0.6021 - recall: 0.5714 - val_accuracy: 0.6548 - val_loss: 0.5925 - val_recall: 0.5511
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 952us/step - accuracy: 0.6437 - loss: 0.6026 - recall: 0.5706 - val_accuracy: 0.6545 - val_loss: 0.5923 - val_recall: 0.5570
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 990us/step - accuracy: 0.6442 - loss: 0.6022 - recall: 0.5709 - val_accuracy: 0.6546 - val_loss: 0.5923

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6315 - loss: 0.6165 - recall_1: 0.5711 - val_accuracy: 0.6539 - val_loss: 0.5936 - val_recall_1: 0.5707
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1ms/step - accuracy: 0.6436 - loss: 0.6030 - recall_1: 0.5754 - val_accuracy: 0.6540 - val_loss: 0.5936 - val_recall_1: 0.5726
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 1ms/step - accuracy: 0.6431 - loss: 0.6034 - recall_1: 0.5758 - val_accuracy: 0.6540 - val_loss: 0.5936 - val_recall_1: 0.5595
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 1ms/step - accuracy: 0.6432 - loss: 0.6032 - recall_1: 0.5758 - val_accuracy: 0.6542 - val_loss: 0.5936 - val_recall_1: 0.5617
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2ms/step - accuracy: 0.6429 - loss: 0.6033 - recall_1: 0.5758 - val_accuracy: 0.6543 - val_lo

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 874us/step - accuracy: 0.6309 - loss: 0.6174 - recall_2: 0.5770 - val_accuracy: 0.6563 - val_loss: 0.5917 - val_recall_2: 0.5705
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 3ms/step - accuracy: 0.6423 - loss: 0.6043 - recall_2: 0.5786 - val_accuracy: 0.6566 - val_loss: 0.5918 - val_recall_2: 0.5592
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 3ms/step - accuracy: 0.6420 - loss: 0.6041 - recall_2: 0.5780 - val_accuracy: 0.6564 - val_loss: 0.5919 - val_recall_2: 0.5581
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 3ms/step - accuracy: 0.6417 - loss: 0.6043 - recall_2: 0.5784 - val_accuracy: 0.6567 - val_loss: 0.5917 - val_recall_2: 0.5835
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 2ms/step - accuracy: 0.6417 - loss: 0.6041 - recall_2: 0.5778 - val_accuracy: 0.6557 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 4ms/step - accuracy: 0.6304 - loss: 0.6136 - recall_3: 0.5628 - val_accuracy: 0.6557 - val_loss: 0.5904 - val_recall_3: 0.5562
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 4ms/step - accuracy: 0.6431 - loss: 0.6013 - recall_3: 0.5670 - val_accuracy: 0.6557 - val_loss: 0.5907 - val_recall_3: 0.5507
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 5ms/step - accuracy: 0.6432 - loss: 0.6011 - recall_3: 0.5680 - val_accuracy: 0.6561 - val_loss: 0.5905 - val_recall_3: 0.5447
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 5ms/step - accuracy: 0.6431 - loss: 0.6013 - recall_3: 0.5672 - val_accuracy: 0.6563 - val_loss: 0.5906 - val_recall_3: 0.5410
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 4ms/step - accuracy: 0.6434 - loss: 0.6013 - recall_3: 0.5677 - val_accuracy:

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 3ms/step - accuracy: 0.6312 - loss: 0.6149 - recall_4: 0.5645 - val_accuracy: 0.6564 - val_loss: 0.5902 - val_recall_4: 0.5528
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 917us/step - accuracy: 0.6427 - loss: 0.6019 - recall_4: 0.5705 - val_accuracy: 0.6559 - val_loss: 0.5900 - val_recall_4: 0.5681
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1ms/step - accuracy: 0.6430 - loss: 0.6023 - recall_4: 0.5703 - val_accuracy: 0.6562 - val_loss: 0.5901 - val_recall_4: 0.5601
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 5ms/step - accuracy: 0.6427 - loss: 0.6016 - recall_4: 0.5710 - val_accuracy: 0.6561 - val_loss: 0.5903 - val_recall_4: 0.5555
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 5ms/step - accuracy: 0.6426 - loss: 0.6019 - recall_4: 0.5707 - val_accuracy: 0.6565 - v

In [17]:
# Accuracy per fold, then its mean across folds
accuracies_mlp = [report['accuracy'] for report in classification_reports_mlp]
average_accuracy_mlp = np.mean(accuracies_mlp)
print(f"\nAverage Accuracy MLP: {average_accuracy_mlp}")

# Standard deviation of the per-fold accuracies
std_dev_mlp = np.std(accuracies_mlp)
print(f"Standard Deviation: {std_dev_mlp}")

# F1 per fold = unweighted mean of the two per-class F1 scores, then averaged
# across folds. The MLP targets are floats, hence the '0.0' / '1.0' keys.
f1s_mlp_0 = [report['0.0']['f1-score'] for report in classification_reports_mlp]
f1s_mlp_1 = [report['1.0']['f1-score'] for report in classification_reports_mlp]
total_f1s_mlp = [(f1_0 + f1_1) / 2 for f1_0, f1_1 in zip(f1s_mlp_0, f1s_mlp_1)]
average_f1_mlp = np.mean(total_f1s_mlp)
# Label fixed: this is the MLP score (was printed as "LR")
print(f"\nAverage f1-score MLP: {average_f1_mlp}")


Average Accuracy MLP: 0.6553778823529411
Standard Deviation: 0.0009673130909467366

Average f1-score LR: 0.6522851098830483


In [18]:
import numpy as np

# Recall of class '1.0' in each fold (float keys because the MLP targets are
# floats), then the mean over all folds
recall_class_1_mlp = [
    fold_report['1.0']['recall']
    for fold_report in classification_reports_mlp
]
average_recall_class_1_mlp = np.mean(recall_class_1_mlp)

print(f"\nAverage Recall for class 1.0: {average_recall_class_1_mlp}")



Average Recall for class 1.0: 0.5611755634601688


## Dataframe with results

In [19]:

# Summary table: one row per model with its mean cross-validated accuracy and
# mean class-1 recall. Only these three models are included here.
model_names = ['Logistic Regression', 'Random Forest', 'MLP']
accuracies = [average_accuracy_lr, average_accuracy_rf, average_accuracy_mlp]
recalls = [average_recall_class_1_lr, average_recall_class_1_rf, average_recall_class_1_mlp]

# Column name -> column values
data = dict(
    Model=model_names,
    Accuracy=accuracies,
    Recall=recalls,
)

df_results = pd.DataFrame(data)

NameError: name 'average_accuracy_lr' is not defined

In [88]:
# Show the per-model summary table
df_results

Unnamed: 0,Model,Accuracy,Recall
0,Logistic Regression,0.59867,0.448558
1,Random Forest,0.745421,0.716566
2,MLP,0.654485,0.563293


## Random search on best model

Looking at the results DataFrame above, the Random Forest is the best-performing model. I will therefore perform a random search over its hyperparameters.

In [20]:
# Split trainval once more so hyperparameters can be tuned against a separate
# validation set. trainval is 85% of the data, so taking 15/85 of it yields
# 70% train / 15% validation / 15% test overall (the test set is unchanged).
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=(15/85),
    random_state=42,
)

In [21]:
# Confirm the size of the training split and its share of the full dataset
print(X_train.shape)
train_share_pct = X_train.shape[0]/df50.shape[0]*100
print(f"The percentage of the original dataset that is used for training is: {train_share_pct} %")

(1750000, 45)
The percentage of the original dataset that is used for training is: 70.0 %


In [22]:
from scipy.stats import randint  # For defining distributions for random search

# Search space for the randomized search.
# scipy's randint(low, high) draws integers from low up to, but NOT including, high.
param_dist = {
    'n_estimators': randint(50, 300),                       # Number of trees in the forest (50..299)
    # None (no depth limit) plus 10 depths drawn once from 1..29. The draw is
    # seeded so the candidate list is identical on every run, and the values
    # are converted to plain Python ints.
    'max_depth': [None] + [int(depth) for depth in randint(1, 30).rvs(10, random_state=42)],
    'min_samples_split': randint(2, 10),                    # Minimum samples required to split an internal node (2..9)
    'min_samples_leaf': randint(1, 4)                       # Minimum samples required to be at a leaf node (1..3)
}


# Fixed grid alternative; the randomized search in this notebook uses
# param_dist, not this grid.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer accepted by recent scikit-learn releases; for
    # classifiers it was an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2']
}

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the hyperparameter search; only the seed is fixed here,
# all other hyperparameters keep their defaults unless the search sets them.
rf_model_search = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_model_search,
                           param_distributions=param_dist,
                           n_iter=20,          # Number of parameter settings that are sampled
                           # 'recall' is the binary recall of the positive class,
                           # i.e. label 1 (recall_score's default pos_label).
                           scoring='recall',
                           cv=k,               # k-fold cross-validation inside the search
                           verbose=2,
                           random_state=42,    # Seed the sampling so the 20 candidates are reproducible
                           n_jobs=-1)          # Use all available cores

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters and best score (mean cross-validated recall of the
# best candidate)
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
from sklearn.metrics import recall_score

# Evaluate the best estimator found by the search on the held-out validation set
y_pred_rf_random = random_search.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred_rf_random)
print("Accuracy on Validation Set:", val_accuracy)
print(confusion_matrix(y_val, y_pred_rf_random))
print(classification_report(y_val, y_pred_rf_random))

val_recall = recall_score(y_val, y_pred_rf_random)
print("Recall on Validation Set:", val_recall)

Accuracy on Validation Set: 0.7395093333333334
[[147891  39282]
 [ 58402 129425]]
              precision    recall  f1-score   support

           0       0.72      0.79      0.75    187173
           1       0.77      0.69      0.73    187827

    accuracy                           0.74    375000
   macro avg       0.74      0.74      0.74    375000
weighted avg       0.74      0.74      0.74    375000

