In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column -> dtype mapping passed to pd.read_csv: the three label-like columns
# are read as pandas categoricals, every other column as int8.
_CATEGORICAL_COLUMNS = ('ResponseID', 'AttributeLevel', 'ScenarioTypeStrict')
_CSV_COLUMNS = (
    'ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
    'CrossingSignal', 'AttributeLevel', 'ScenarioTypeStrict',
    'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved',
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
)
dtype = {
    column: 'category' if column in _CATEGORICAL_COLUMNS else 'int8'
    for column in _CSV_COLUMNS
}


In [3]:
# Load the dataset, applying the per-column dtypes declared in `dtype`
df_nw = pd.read_csv('total_nw_50_dataset.csv', dtype=dtype)

In [4]:
df_nw.shape

(2500000, 31)

In [5]:
# Preprocessing: expand the two categorical scenario descriptors
# (AttributeLevel, ScenarioTypeStrict) into one-hot indicator columns.
# NOTE: this rebinds df_nw, so the cell cannot be re-run without reloading the CSV.
one_hot_columns = ['AttributeLevel', 'ScenarioTypeStrict']
df_nw = pd.get_dummies(df_nw, columns=one_hot_columns)

print(df_nw.columns)


Index(['ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
       'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters',
       'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
       'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
       'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
       'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_High',
       'AttributeLevel_Hoomans', 'AttributeLevel_Less', 'AttributeLevel_Low',
       'AttributeLevel_Male', 'AttributeLevel_More', 'AttributeLevel_Old',
       'AttributeLevel_Pets', 'AttributeLevel_Young', 'ScenarioTypeStrict_Age',
       'ScenarioTypeStrict_Fitness', 'ScenarioTypeStrict_Gender',
       'ScenarioTypeStrict_Social Status', 'ScenarioTypeStrict_Species',
       'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [6]:
df_nw.shape

(2500000, 47)

In [7]:
df_nw.duplicated().sum()

0

In [8]:
# ResponseID is dropped from the features in the next step.
# It was kept through preprocessing only to check that complete sessions are in the dataset (each ResponseID has to be present twice).

#### Splitting

In [8]:
# Target variable: UserID. Features: every other column except ResponseID.
y = df_nw['UserID']
X = df_nw.drop(columns=['UserID', 'ResponseID'])

In [9]:
# Hold out 15% of the rows as the test set. The remaining 85% ("trainval") is
# only divided into training and validation parts later (K-fold CV / tuning split).
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [10]:
X_trainval.shape

(2125000, 45)

In [18]:
# K-fold cross-validation setup: k shuffled folds with a fixed seed,
# shared by all model-comparison loops below.
from sklearn.model_selection import KFold

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

## Modelling

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Logistic regression, evaluated with the shared K-fold splitter
lr_model = LogisticRegression(max_iter=1000)

# Per-fold evaluation results, in fold order
conf_matrices_lr = []
classification_reports_lr = []

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    # Rows used for fitting vs. rows held out for validation in this fold
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the fold's training rows, predict its validation rows
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_val)

    # Keep the confusion matrix and the dict-form classification report
    fold_matrix = confusion_matrix(y_val, y_pred_lr)
    fold_report = classification_report(y_val, y_pred_lr, output_dict=True)
    conf_matrices_lr.append(fold_matrix)
    classification_reports_lr.append(fold_report)

    print(f"Done processing fold {count}")

Done processing fold 1
Done processing fold 2
Done processing fold 3
Done processing fold 4
Done processing fold 5


In [21]:
# Print each fold's confusion matrix followed by its classification report entries
for fold_idx in range(k):
    print(f"\nFold {fold_idx + 1}")
    print("Confusion Matrix:")
    print(conf_matrices_lr[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_lr[fold_idx]
    for label in fold_report:
        print(f"{label}: {fold_report[label]}")


Fold 1
Confusion Matrix:
[[160310  51774]
 [118510  94406]]

Classification Report:
0: {'precision': 0.5749587547521698, 'recall': 0.7558797457611135, 'f1-score': 0.6531215879275785, 'support': 212084.0}
1: {'precision': 0.6458202216445478, 'recall': 0.44339551748107237, 'f1-score': 0.5257981152672266, 'support': 212916.0}
accuracy: 0.5993317647058823
macro avg: {'precision': 0.6103894881983588, 'recall': 0.599637631621093, 'f1-score': 0.5894598515974026, 'support': 425000.0}
weighted avg: {'precision': 0.6104588490694818, 'recall': 0.5993317647058823, 'f1-score': 0.5893352243865162, 'support': 425000.0}

Fold 2
Confusion Matrix:
[[159592  52947]
 [117944  94517]]

Classification Report:
0: {'precision': 0.5750317075982936, 'recall': 0.750883367287886, 'f1-score': 0.6512962301688517, 'support': 212539.0}
1: {'precision': 0.6409496555091412, 'recall': 0.44486752862878365, 'f1-score': 0.5252038619156769, 'support': 212461.0}
accuracy: 0.5979035294117647
macro avg: {'precision': 0.607990

In [22]:
# Accuracy across folds: mean ...
accuracies_lr = [fold_report['accuracy'] for fold_report in classification_reports_lr]
average_accuracy_lr = np.mean(accuracies_lr)
print(f"\nAverage Accuracy LR: {average_accuracy_lr}")

# ... and standard deviation
std_dev_lr = np.std(accuracies_lr)
print(f"Standard Deviation: {std_dev_lr}")

# Per-fold F1 taken as the unweighted mean of the class-0 and class-1 F1 scores,
# then averaged over the folds
f1s_lr_0 = [fold_report['0']['f1-score'] for fold_report in classification_reports_lr]
f1s_lr_1 = [fold_report['1']['f1-score'] for fold_report in classification_reports_lr]
total_f1s_lr = [(f1s_lr_0[i] + f1s_lr_1[i]) / 2 for i in range(k)]
average_f1_lr = np.mean(total_f1s_lr)
print(f"\nAverage f1-score LR: {average_f1_lr}")


Average Accuracy LR: 0.598945411764706
Standard Deviation: 0.0007657085191717521

Average f1-score LR: 0.5893495332546128


In [23]:
# Class-1 recall of every fold, then its mean across folds
recall_class_1_lr = [fold_report['1']['recall'] for fold_report in classification_reports_lr]
average_recall_class_1_lr = np.mean(recall_class_1_lr)
print(f"\nAverage Recall for class 1: {average_recall_class_1_lr}")


Average Recall for class 1: 0.44592373883958664


### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random Forest (100 trees, fixed seed), evaluated with the shared K-fold splitter
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold evaluation results, in fold order
conf_matrices_rf = []
classification_reports_rf = []

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    print(f"Processing fold {count}")

    # Rows used for fitting vs. rows held out for validation in this fold
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the fold's training rows, predict its validation rows
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_val)

    # Keep the confusion matrix and the dict-form classification report
    fold_matrix = confusion_matrix(y_val, y_pred_rf)
    fold_report = classification_report(y_val, y_pred_rf, output_dict=True)
    conf_matrices_rf.append(fold_matrix)
    classification_reports_rf.append(fold_report)

    print(f"Done processing fold {count}")

Processing fold 1
Done processing fold 1
Processing fold 2
Done processing fold 2
Processing fold 3
Done processing fold 3
Processing fold 4
Done processing fold 4
Processing fold 5
Done processing fold 5


In [26]:
# Print each fold's Random Forest confusion matrix and classification report entries
for fold_idx in range(k):
    print(f"\nFold {fold_idx + 1}")
    print("Confusion Matrix:")
    print(conf_matrices_rf[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_rf[fold_idx]
    for label in fold_report:
        print(f"{label}: {fold_report[label]}")


Fold 1
Confusion Matrix:
[[163499  48585]
 [ 59334 153582]]

Classification Report:
0: {'precision': 0.7337288462660378, 'recall': 0.7709162407348031, 'f1-score': 0.7518629991469636, 'support': 212084.0}
1: {'precision': 0.7596788793423259, 'recall': 0.7213267203967761, 'f1-score': 0.740006215624345, 'support': 212916.0}
accuracy: 0.7460729411764706
macro avg: {'precision': 0.7467038628041819, 'recall': 0.7461214805657896, 'f1-score': 0.7459346073856543, 'support': 425000.0}
weighted avg: {'precision': 0.746729263307146, 'recall': 0.7460729411764706, 'f1-score': 0.7459230016869591, 'support': 425000.0}

Fold 2
Confusion Matrix:
[[164125  48414]
 [ 59378 153083]]

Classification Report:
0: {'precision': 0.7343301879616828, 'recall': 0.7722112177059269, 'f1-score': 0.752794455579967, 'support': 212539.0}
1: {'precision': 0.7597284326813799, 'recall': 0.7205228253655965, 'f1-score': 0.7396064335029158, 'support': 212461.0}
accuracy: 0.7463717647058824
macro avg: {'precision': 0.747029310

In [27]:
# Calculate average accuracy across the folds
accuracies_rf = [report['accuracy'] for report in classification_reports_rf]
average_accuracy_rf = np.mean(accuracies_rf)
print(f"\nAverage Accuracy RF: {average_accuracy_rf}")

# Calculate the standard deviation of the fold accuracies
std_dev_rf = np.std(accuracies_rf)
print(f"Standard Deviation: {std_dev_rf}")

# Calculate average f1-score: per fold, the unweighted mean of the class-0 and
# class-1 F1 scores; then the mean over folds
f1s_rf_0 = [report['0']['f1-score'] for report in classification_reports_rf]
f1s_rf_1 = [report['1']['f1-score'] for report in classification_reports_rf]
total_f1s_rf = [(f1_0 + f1_1) / 2 for f1_0, f1_1 in zip(f1s_rf_0, f1s_rf_1)]
average_f1_rf = np.mean(total_f1s_rf)
# Label fixed: this is the Random Forest score (was printed as "LR")
print(f"\nAverage f1-score RF: {average_f1_rf}")


Average Accuracy RF: 0.7467919999999999
Standard Deviation: 0.000580928930373869

Average f1-score LR: 0.746641150363843


In [28]:
# Class-1 recall of every fold, then its mean across folds
recall_class_1_rf = [fold_report['1']['recall'] for fold_report in classification_reports_rf]
average_recall_class_1_rf = np.mean(recall_class_1_rf)
print(f"\nAverage Recall for class 1: {average_recall_class_1_rf}")


Average Recall for class 1: 0.7221938908475314


### Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Initialize SVC
# NOTE(review): random_state is 45 here while the other models use 42 — confirm this is intentional.
svc = SVC(kernel='rbf', random_state=45)

# The RBF kernel was chosen because the data is not linearly separable

In [None]:
# SVM evaluated with the shared K-fold splitter.
# NOTE(review): the saved outputs of this section report 83,334 validation rows
# per fold and print "Starting a fold", whereas this code on the 2,125,000-row
# X_trainval would give 425,000 rows per fold and print the fold number — the
# outputs appear to come from an earlier run on a smaller subset; confirm.

# Per-fold evaluation results, in fold order
conf_matrices_svm = []
classification_reports_svm = []

for count, (train_index, test_index) in enumerate(kf.split(X_trainval), start=1):
    print(f"Starting fold {count}")

    # Rows used for fitting vs. rows held out for validation in this fold
    X_train = X_trainval.iloc[train_index]
    X_val = X_trainval.iloc[test_index]
    y_train = y_trainval.iloc[train_index]
    y_val = y_trainval.iloc[test_index]

    # Fit on the fold's training rows
    svc.fit(X_train, y_train)
    print("Model fitted")

    # Predict the fold's validation rows
    y_pred_svm = svc.predict(X_val)

    # Keep the confusion matrix and the dict-form classification report
    fold_matrix = confusion_matrix(y_val, y_pred_svm)
    fold_report = classification_report(y_val, y_pred_svm, output_dict=True)
    conf_matrices_svm.append(fold_matrix)
    classification_reports_svm.append(fold_report)

    print(f"Done processing fold {count}")

Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold


In [None]:
# Print each fold's SVM confusion matrix and classification report entries
for fold_idx in range(k):
    print(f"\nFold {fold_idx + 1}")
    print("Confusion Matrix:")
    print(conf_matrices_svm[fold_idx])
    print("\nClassification Report:")
    fold_report = classification_reports_svm[fold_idx]
    for label in fold_report:
        print(f"{label}: {fold_report[label]}")


Fold 1
Confusion Matrix:
[[33869  7839]
 [19892 21734]]

Classification Report:
0: {'precision': 0.6299920016368743, 'recall': 0.81205044595761, 'f1-score': 0.7095287475515613, 'support': 41708.0}
1: {'precision': 0.7349271294762114, 'recall': 0.5221255945803104, 'f1-score': 0.6105141926150648, 'support': 41626.0}
accuracy: 0.6672306621547027
macro avg: {'precision': 0.6824595655565429, 'recall': 0.6670880202689602, 'f1-score': 0.660021470083313, 'support': 83334.0}
weighted avg: {'precision': 0.6824079378866673, 'recall': 0.6672306621547027, 'f1-score': 0.6600701848546235, 'support': 83334.0}

Fold 2
Confusion Matrix:
[[33671  8098]
 [19928 21636]]

Classification Report:
0: {'precision': 0.6282020186943786, 'recall': 0.8061241590653355, 'f1-score': 0.7061278416240249, 'support': 41769.0}
1: {'precision': 0.7276518463711577, 'recall': 0.5205466268886536, 'f1-score': 0.6069174450896239, 'support': 41564.0}
accuracy: 0.663686654746619
macro avg: {'precision': 0.6779269325327681, 'recal

In [None]:
# Accuracy across folds: mean ...
accuracies_svm = [fold_report['accuracy'] for fold_report in classification_reports_svm]
average_accuracy_svm = np.mean(accuracies_svm)
print(f"\nAverage Accuracy SVM: {average_accuracy_svm}")

# ... and standard deviation
std_dev_svm = np.std(accuracies_svm)
print(f"Standard Deviation: {std_dev_svm}")


Average Accuracy SVM: 0.6657119939253271
Standard Deviation: 0.0014905331742503187


In [None]:
# Class-1 recall of every fold, then its mean across folds
recall_class_1_svm = [fold_report['1']['recall'] for fold_report in classification_reports_svm]
average_recall_class_1_svm = np.mean(recall_class_1_svm)
print(f"\nAverage Recall for class 1: {average_recall_class_1_svm}")

### MLP

In [29]:
from tensorflow.keras import layers, optimizers, models, callbacks
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras

In [30]:
# Learning-rate schedule parameters
initial_learning_rate = 5e-4
decay_rate = 0.1  # Multiplicative decay applied per `decay_steps` optimizer steps (not per step)

# Exponentially decaying learning rate handed to the optimizer.
# NOTE(review): the training log shows 53125 steps per epoch, so with
# decay_steps=10000 the rate is multiplied by 0.1 more than five times within
# the first epoch — confirm that this aggressive decay is intended.
lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=10000,
    decay_rate=decay_rate,
)

# Early stopping on validation accuracy
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',             # Watch validation accuracy
    min_delta=0.0005,                   # Minimum change in accuracy to qualify as an improvement
    patience=5,                         # Stop after 5 epochs with no improvement
    restore_best_weights=True,          # Restore weights from the best epoch
)

# Function to build the model
def build_model(n_features=None):
    """Build and compile a fresh MLP for binary classification.

    Architecture: Dense(64, relu) -> BatchNormalization -> Dense(64, relu)
    -> BatchNormalization -> Dense(1, sigmoid). Compiled with Adam driven by
    the notebook-level ``lr_schedule``, binary cross-entropy loss, and the
    accuracy and recall metrics.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. When omitted, the column count of the
        notebook-level feature matrix ``X`` is used, as before.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if n_features is None:
        n_features = X.shape[1]

    model_mlp = models.Sequential([
        # Explicit Input layer instead of `input_shape=` on the first Dense
        # layer, which Keras warns about for Sequential models.
        layers.Input(shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1, activation='sigmoid'),
    ])
    model_mlp.compile(
        optimizer=optimizers.Adam(learning_rate=lr_schedule),
        loss='binary_crossentropy',
        # Fixed metric name so every fold logs `recall` / `val_recall`
        # instead of auto-numbered recall_1, recall_2, ...
        metrics=['accuracy', keras.metrics.Recall(name='recall')],
    )
    return model_mlp

In [31]:
# TensorFlow is fed float32 NumPy arrays. The converted copies get their own
# names so that X_trainval / y_trainval keep their original types: rebinding
# them here would break a re-run of the earlier `.iloc`-based cross-validation
# cells and silently change what the later cells receive.
X_trainval_mlp = np.asarray(X_trainval, dtype=np.float32)
# The target is kept as float32 so that the classification report keys are
# '0.0' / '1.0', which the MLP summary cells look up.
y_trainval_mlp = np.asarray(y_trainval, dtype=np.float32)

# Lists to store results
conf_matrices_mlp = []
classification_reports_mlp = []

# Manually loop over each fold
for count, (train_index, test_index) in enumerate(kf.split(X_trainval_mlp), start=1):
    print(f"\nFold {count}")

    # Split data for this fold
    X_train, X_val = X_trainval_mlp[train_index], X_trainval_mlp[test_index]
    y_train, y_val = y_trainval_mlp[train_index], y_trainval_mlp[test_index]

    # Build a fresh model for every fold and train it with early stopping
    model_mlp = build_model()
    model_mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, verbose=1, callbacks=[early_stopping])

    # Make predictions: threshold the sigmoid output at 0.5 and flatten the
    # (n, 1) column to a 1-D label vector
    y_pred_mlp = model_mlp.predict(X_val)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int).ravel()

    # Store confusion matrix and classification report for each fold
    conf_matrices_mlp.append(confusion_matrix(y_val, y_pred_mlp))
    classification_reports_mlp.append(classification_report(y_val, y_pred_mlp, output_dict=True))


Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 5ms/step - accuracy: 0.6282 - loss: 0.6167 - recall: 0.5616 - val_accuracy: 0.6512 - val_loss: 0.5938 - val_recall: 0.5622
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 5ms/step - accuracy: 0.6398 - loss: 0.6042 - recall: 0.5639 - val_accuracy: 0.6519 - val_loss: 0.5938 - val_recall: 0.5366
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 1ms/step - accuracy: 0.6407 - loss: 0.6036 - recall: 0.5650 - val_accuracy: 0.6520 - val_loss: 0.5941 - val_recall: 0.5276
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 885us/step - accuracy: 0.6404 - loss: 0.6038 - recall: 0.5638 - val_accuracy: 0.6515 - val_loss: 0.5936 - val_recall: 0.5725
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 906us/step - accuracy: 0.6397 - loss: 0.6042 - recall: 0.5638 - val_accuracy: 0.6513 - val_los

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 885us/step - accuracy: 0.6298 - loss: 0.6160 - recall_1: 0.5676 - val_accuracy: 0.6549 - val_loss: 0.5933 - val_recall_1: 0.5605
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 959us/step - accuracy: 0.6421 - loss: 0.6026 - recall_1: 0.5705 - val_accuracy: 0.6546 - val_loss: 0.5932 - val_recall_1: 0.5608
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1ms/step - accuracy: 0.6410 - loss: 0.6037 - recall_1: 0.5708 - val_accuracy: 0.6542 - val_loss: 0.5932 - val_recall_1: 0.5714
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 937us/step - accuracy: 0.6417 - loss: 0.6031 - recall_1: 0.5709 - val_accuracy: 0.6543 - val_loss: 0.5933 - val_recall_1: 0.5563
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 1ms/step - accuracy: 0.6401 - loss: 0.6036 - recall_1: 0.5691 - val_accuracy: 0.6543 - 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 909us/step - accuracy: 0.6275 - loss: 0.6180 - recall_2: 0.5658 - val_accuracy: 0.6512 - val_loss: 0.5949 - val_recall_2: 0.5589
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 832us/step - accuracy: 0.6398 - loss: 0.6046 - recall_2: 0.5691 - val_accuracy: 0.6520 - val_loss: 0.5952 - val_recall_2: 0.5422
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 801us/step - accuracy: 0.6401 - loss: 0.6050 - recall_2: 0.5695 - val_accuracy: 0.6518 - val_loss: 0.5949 - val_recall_2: 0.5483
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 795us/step - accuracy: 0.6398 - loss: 0.6046 - recall_2: 0.5686 - val_accuracy: 0.6521 - val_loss: 0.5949 - val_recall_2: 0.5547
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 804us/step - accuracy: 0.6401 - loss: 0.6042 - recall_2: 0.5692 - val_accuracy: 0.651

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 804us/step - accuracy: 0.6261 - loss: 0.6183 - recall_3: 0.5641 - val_accuracy: 0.6525 - val_loss: 0.5945 - val_recall_3: 0.5550
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 801us/step - accuracy: 0.6373 - loss: 0.6058 - recall_3: 0.5660 - val_accuracy: 0.6526 - val_loss: 0.5945 - val_recall_3: 0.5417
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 802us/step - accuracy: 0.6374 - loss: 0.6060 - recall_3: 0.5661 - val_accuracy: 0.6524 - val_loss: 0.5945 - val_recall_3: 0.5521
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 797us/step - accuracy: 0.6386 - loss: 0.6055 - recall_3: 0.5673 - val_accuracy: 0.6527 - val_loss: 0.5946 - val_recall_3: 0.5417
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 795us/step - accuracy: 0.6386 - loss: 0.6054 - recall_3: 0.5668 - val_accuracy: 0.653

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 799us/step - accuracy: 0.6295 - loss: 0.6156 - recall_4: 0.5597 - val_accuracy: 0.6513 - val_loss: 0.5938 - val_recall_4: 0.5242
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 797us/step - accuracy: 0.6390 - loss: 0.6039 - recall_4: 0.5620 - val_accuracy: 0.6512 - val_loss: 0.5932 - val_recall_4: 0.5509
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 804us/step - accuracy: 0.6396 - loss: 0.6035 - recall_4: 0.5627 - val_accuracy: 0.6511 - val_loss: 0.5931 - val_recall_4: 0.5412
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 806us/step - accuracy: 0.6405 - loss: 0.6034 - recall_4: 0.5638 - val_accuracy: 0.6512 - val_loss: 0.5929 - val_recall_4: 0.5481
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 807us/step - accuracy: 0.6395 - loss: 0.6039 - recall_4: 0.5634 - val_accuracy: 0.651

In [32]:
# Calculate average accuracy across the folds
accuracies_mlp = [report['accuracy'] for report in classification_reports_mlp]
average_accuracy_mlp = np.mean(accuracies_mlp)
print(f"\nAverage Accuracy MLP: {average_accuracy_mlp}")

# Calculate the standard deviation of the fold accuracies
std_dev_mlp = np.std(accuracies_mlp)
print(f"Standard Deviation: {std_dev_mlp}")

# Calculate average f1-score: per fold, the unweighted mean of the class-0.0 and
# class-1.0 F1 scores; then the mean over folds
f1s_mlp_0 = [report['0.0']['f1-score'] for report in classification_reports_mlp]
f1s_mlp_1 = [report['1.0']['f1-score'] for report in classification_reports_mlp]
total_f1s_mlp = [(f1_0 + f1_1) / 2 for f1_0, f1_1 in zip(f1s_mlp_0, f1s_mlp_1)]
average_f1_mlp = np.mean(total_f1s_mlp)
# Label fixed: this is the MLP score (was printed as "LR")
print(f"\nAverage f1-score MLP: {average_f1_mlp}")


Average Accuracy MLP: 0.6523350588235294
Standard Deviation: 0.0013461321035241718

Average f1-score LR: 0.6483899183028696


In [33]:
# Collect the recall for the '1.0' class from each fold
# (numpy is already imported as np in the notebook's first cell, so the
# duplicate import that used to sit here was removed)
recall_class_1_mlp = [report_mlp['1.0']['recall'] for report_mlp in classification_reports_mlp]

# Calculate the average recall for the '1.0' class across all folds
average_recall_class_1_mlp = np.mean(recall_class_1_mlp)
print(f"\nAverage Recall for class 1.0: {average_recall_class_1_mlp}")



Average Recall for class 1.0: 0.547043728529927


## Dataframe with results

In [34]:

# Cross-validated summary, one row per model.
# NOTE(review): the SVM results are not part of this table — confirm this is intentional.
model_names = ['Logistic Regression', 'Random Forest', 'MLP']

# Mean accuracy and mean class-1 recall, in the same order as model_names
accuracies = [average_accuracy_lr, average_accuracy_rf, average_accuracy_mlp]
recalls = [average_recall_class_1_lr, average_recall_class_1_rf, average_recall_class_1_mlp]

# Column name -> values, then the results frame itself
data = dict(Model=model_names, Accuracy=accuracies, Recall=recalls)
df_results = pd.DataFrame(data)

In [35]:
df_results

Unnamed: 0,Model,Accuracy,Recall
0,Logistic Regression,0.598945,0.445924
1,Random Forest,0.746792,0.722194
2,MLP,0.652335,0.547044


## Random search on best model

The results table above shows that the Random Forest is the best model, with the highest accuracy and recall. I will therefore perform a random search over its hyperparameters.

In [36]:
# Carve a validation set out of trainval so hyperparameters can be tuned on it.
# Overall proportions: 70% training, 15% validation, 15% test (test set unchanged).
validation_share = 15 / 85  # 15% of all rows, expressed as a fraction of the 85% trainval part
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=validation_share, random_state=42
)

In [37]:
# Sanity check: training-set shape and its share of the full dataset
print(X_train.shape)
train_percentage = X_train.shape[0] / df_nw.shape[0] * 100
print(f"The percentage of the original dataset that is used for training is: {train_percentage} %")

(1750000, 45)
The percentage of the original dataset that is used for training is: 70.0 %


In [38]:
from scipy.stats import randint  # For defining distributions for random search

# Search space for the random search over Random Forest hyperparameters.
# scipy's randint(low, high) draws integers from [low, high): `high` is exclusive.
param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest (50..199)
    # None (unlimited depth) plus ten candidate depths in 1..29. The draw is
    # seeded so the candidate list is identical on every run, and converted to
    # plain Python ints.
    'max_depth': [None] + [int(depth) for depth in randint(1, 30).rvs(10, random_state=42)],
    'min_samples_split': randint(2, 10),  # Minimum samples required to split an internal node (2..9)
    'min_samples_leaf': randint(1, 4)  # Minimum samples required to be at a leaf node (1..3)
}

In [39]:
# Initialize the Random Forest Classifier that serves as the base estimator for the random search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_model_search = RandomForestClassifier(random_state=42)  # Random state for reproducibility

In [40]:
# Set up RandomizedSearchCV.
#
# Memory: with n_jobs=-1 on the search itself, this cell previously died with a
# MemoryError while allocating a (1400000, 45) training fold. The candidates
# are therefore fitted one after another (search-level n_jobs=1), and the
# parallelism is moved into the forest (n_jobs=-1 on the estimator), so the
# cores are still used while only one fold is materialised at a time.
#
# Reproducibility: random_state fixes which parameter settings get sampled.
random_search = RandomizedSearchCV(
    estimator=rf_model_search.set_params(n_jobs=-1),  # set_params returns the estimator itself
    param_distributions=param_dist,
    n_iter=20,          # Number of parameter settings that are sampled
    scoring='recall',   # Use recall as the evaluation metric
    cv=k,
    verbose=2,
    n_jobs=1,           # Sequential search; the forest parallelises internally
    random_state=42,    # Reproducible sampling of the candidates
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


MemoryError: Unable to allocate 240. MiB for an array with shape (1400000, 45) and data type float32

In [None]:
from sklearn.metrics import recall_score

# Predict the validation set with the fitted random search
y_pred_rf_random = random_search.predict(X_val)

# Accuracy, confusion matrix and full classification report on the validation set
validation_accuracy = accuracy_score(y_val, y_pred_rf_random)
print("Accuracy on Validation Set:", validation_accuracy)
print(confusion_matrix(y_val, y_pred_rf_random))
print(classification_report(y_val, y_pred_rf_random))

# Recall, the metric the search was scored on
validation_recall = recall_score(y_val, y_pred_rf_random)
print("Recall on Validation Set:", validation_recall)

Accuracy on Validation Set: 0.7413466666666667
[[144404  42769]
 [ 54226 133601]]
              precision    recall  f1-score   support

           0       0.73      0.77      0.75    187173
           1       0.76      0.71      0.73    187827

    accuracy                           0.74    375000
   macro avg       0.74      0.74      0.74    375000
weighted avg       0.74      0.74      0.74    375000

