In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
dtype = {
    'ResponseID': 'category',
    'UserID': 'int8',
    'Intervention': 'int8',
    'PedPed': 'int8',
    'Barrier': 'int8',
    'CrossingSignal': 'int8',
    'AttributeLevel': 'category',
    'ScenarioTypeStrict': 'category',
    'NumberOfCharacters': 'int8',
    'DiffNumberOFCharacters': 'int8',
    'Saved': 'int8',
    'Man': 'int8',
    'Woman': 'int8',
    'Pregnant': 'int8',
    'Stroller': 'int8',
    'OldMan': 'int8',+
    'OldWoman': 'int8',
    'Boy': 'int8',
    'Girl': 'int8',
    'Homeless': 'int8',
    'LargeWoman': 'int8',
    'LargeMan': 'int8',
    'Criminal': 'int8',
    'MaleExecutive': 'int8',
    'FemaleExecutive': 'int8',
    'FemaleAthlete': 'int8',
    'MaleAthlete': 'int8',
    'FemaleDoctor': 'int8',
    'MaleDoctor': 'int8',
    'Dog': 'int8',
    'Cat': 'int8'
}


In [3]:
df_del = pd.read_csv('total_deleted_50_dataset.csv', dtype=dtype)

In [4]:
df_del.shape

(2500000, 31)

In [5]:
# preprocessing

# one-hot encode the AttributeLevel and ScenarioTypeStrict
df_del = pd.get_dummies(df_del, columns=['AttributeLevel', 'ScenarioTypeStrict'])

print(df_del.columns)


Index(['ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
       'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters',
       'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
       'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
       'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
       'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_Hoomans',
       'AttributeLevel_Less', 'AttributeLevel_Male', 'AttributeLevel_More',
       'AttributeLevel_Old', 'AttributeLevel_Pets', 'AttributeLevel_Young',
       'ScenarioTypeStrict_Age', 'ScenarioTypeStrict_Fitness',
       'ScenarioTypeStrict_Gender', 'ScenarioTypeStrict_Species',
       'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [6]:
df_del.shape

(2500000, 44)

In [7]:
df_del.duplicated().sum()

0

In [8]:
# now also dropping ResponseID
# kept this on and pre-processed it thus far to keep track of if everything went right with the complete sessions being in the dataset (so each ResponseID has to be present twice)

#### Splitting

In [9]:
# Prepare features and target variable
X = df_del.drop(['UserID', 'ResponseID'], axis=1)     # Features
y = df_del['UserID']                                  # Target variable

In [10]:
# Split the data into train, validation and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [11]:
X_trainval.shape

(2125000, 42)

In [13]:
# Setting K for K-fold cross validation

from sklearn.model_selection import KFold

k = 5
kf = KFold(n_splits=k, random_state=42, shuffle=True)

## Modelling

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Initialize logistic regression model
lr_model = LogisticRegression(max_iter=1000)

# Lists to store results
conf_matrices_lr = []
classification_reports_lr = []
count = 0

# Loop through each fold
for train_index, test_index in kf.split(X_trainval):
    count += 1

    # Split data into train and test for this fold
    X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[test_index]
    y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[test_index]
    
    # Train the model
    lr_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_lr = lr_model.predict(X_val)
    
    # Generate confusion matrix and classification report
    conf_matrices_lr.append(confusion_matrix(y_val, y_pred_lr))
    classification_reports_lr.append(classification_report(y_val, y_pred_lr, output_dict=True))

    print("Done processing fold " + str(count))

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# Display results
for i in range(k):
    print(f"\nFold {i+1}")
    print("Confusion Matrix:")
    print(conf_matrices_lr[i])
    print("\nClassification Report:")
    for label, metrics in classification_reports_lr[i].items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[185800 168022]
 [141240 213272]]

Classification Report:
0: {'precision': 0.5681262230919765, 'recall': 0.5251228018608227, 'f1-score': 0.5457787334290943, 'support': 353822.0}
1: {'precision': 0.5593374141738396, 'recall': 0.6015931759714763, 'f1-score': 0.5796962786386629, 'support': 354512.0}
accuracy: 0.563395234451544
macro avg: {'precision': 0.5637318186329081, 'recall': 0.5633579889161495, 'f1-score': 0.5627375060338786, 'support': 708334.0}
weighted avg: {'precision': 0.563727537970005, 'recall': 0.563395234451544, 'f1-score': 0.5627540258579973, 'support': 708334.0}

Fold 2
Confusion Matrix:
[[185544 168907]
 [141047 212835]]

Classification Report:
0: {'precision': 0.5681234326726701, 'recall': 0.523468688196676, 'f1-score': 0.5448826944593725, 'support': 354451.0}
1: {'precision': 0.5575362417549026, 'recall': 0.6014292899893184, 'f1-score': 0.5786515937489805, 'support': 353882.0}
accuracy: 0.5624176764318477
macro avg: {'precision': 0.5628298372

In [None]:
# Calculate average accuracy
accuracies_lr = [report['accuracy'] for report in classification_reports_lr]
average_accuracy_lr = np.mean(accuracies_lr)
print(f"\nAverage Accuracy LR: {average_accuracy_lr}")

# Caluculate average standard deviation
std_dev_lr = np.std(accuracies_lr)
print(f"Standard Deviation: {std_dev_lr}")

# Calculate average f1-score
f1s_lr_0 = [report['0']['f1-score'] for report in classification_reports_lr]
f1s_lr_1 = [report['1']['f1-score'] for report in classification_reports_lr]
total_f1s_lr = [f1s_lr_0[i] + f1s_lr_1[i] for i in range(k)]
total_f1s_lr = [f1 / 2 for f1 in total_f1s_lr]
average_f1_lr = np.mean(total_f1s_lr)
print(f"\nAverage f1-score LR: {average_f1_lr}")


Average Accuracy LR: 0.563181176369855
Standard Deviation: 0.0005569677941461436


In [None]:
# Collect the recall for the '1' class from each fold
recall_class_1_lr = [report['1']['recall'] for report in classification_reports_lr]

# Calculate the average recall for the '1' class across all folds
average_recall_class_1_lr = np.mean(recall_class_1_lr)
print(f"\nAverage Recall for class 1: {average_recall_class_1_lr}")


Average Recall for class 1: 0.6018635155379575


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Initialize Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Lists to store results
conf_matrices_rf = []
classification_reports_rf = []
count = 0

# Manually loop through each fold
for train_index, test_index in kf.split(X_trainval):
    count += 1
    # Split the data for this fold
    X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[test_index]
    y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[test_index]
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_rf = rf_model.predict(X_val)
    
    # Store confusion matrix and classification report for each fold
    conf_matrices_rf.append(confusion_matrix(y_val, y_pred_rf))
    classification_reports_rf.append(classification_report(y_val, y_pred_rf, output_dict=True))

    print("Done processing fold" + str(count))

Done processing fold1
Done processing fold2
Done processing fold3


In [None]:
# Display results for each fold
for i in range(k):
    print(f"\nFold {i+1}")
    print("Confusion Matrix:")
    print(conf_matrices_rf[i])
    print("\nClassification Report:")
    for label, metrics in classification_reports_rf[i].items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[256226  97596]
 [ 93809 260703]]

Classification Report:
0: {'precision': 0.7320010856057252, 'recall': 0.7241663887491451, 'f1-score': 0.7280626604551776, 'support': 353822.0}
1: {'precision': 0.7276129712893421, 'recall': 0.7353855440718509, 'f1-score': 0.7314786107397333, 'support': 354512.0}
accuracy: 0.7297814307939475
macro avg: {'precision': 0.7298070284475336, 'recall': 0.7297759664104979, 'f1-score': 0.7297706355974555, 'support': 708334.0}
weighted avg: {'precision': 0.7298048911797488, 'recall': 0.7297814307939475, 'f1-score': 0.7297722993646165, 'support': 708334.0}

Fold 2
Confusion Matrix:
[[254981  99470]
 [ 92165 261717]]

Classification Report:
0: {'precision': 0.7345065188710226, 'recall': 0.7193688267207597, 'f1-score': 0.7268588662722332, 'support': 354451.0}
1: {'precision': 0.7246024912303045, 'recall': 0.7395600793484834, 'f1-score': 0.7320048834448144, 'support': 353882.0}
accuracy: 0.7294563432735733
macro avg: {'precision': 0.729554

In [None]:
# Calculate average accuracy
accuracies_rf = [report['accuracy'] for report in classification_reports_rf]
average_accuracy_rf = np.mean(accuracies_rf)
print(f"\nAverage Accuracy RF: {average_accuracy_rf}")

# Caluculate average standard deviation
std_dev_rf = np.std(accuracies_rf)
print(f"Standard Deviation: {std_dev_rf}")

# Calculate average f1-score
f1s_rf_0 = [report['0']['f1-score'] for report in classification_reports_rf]
f1s_rf_1 = [report['1']['f1-score'] for report in classification_reports_rf]
total_f1s_rf = [f1s_rf_0[i] + f1s_rf_1[i] for i in range(k)]
total_f1s_rf = [f1 / 2 for f1 in total_f1s_rf]
average_f1_rf = np.mean(total_f1s_rf)
print(f"\nAverage f1-score LR: {average_f1_rf}")


Average Accuracy RF: 0.7299971765721157
Standard Deviation: 0.000551198329982249


In [None]:
# Collect the recall for the '1' class from each fold
recall_class_1_rf = [report['1']['recall'] for report in classification_reports_rf]

# Calculate the average recall for the '1' class across all folds
average_recall_class_1_rf = np.mean(recall_class_1_rf)
print(f"\nAverage Recall for class 1: {average_recall_class_1_rf}")


Average Recall for class 1: 0.7400420803185739


### Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Initialize SVC

svc = SVC(kernel='rbf', random_state=45)

# choosing rbf cause not linearly separable

In [None]:
# Lists to store results
conf_matrices_svm = []
classification_reports_svm = []
count = 0

# Manually loop through each fold
for train_index, test_index in kf.split(X_trainval):
    count += 1

    print("Starting fold " + str(count))
    # Split the data for this fold
    X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[test_index]
    y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[test_index]
    
    # Train the model
    svc.fit(X_train, y_train)
    print("Model fitted")
    
    # Make predictions
    y_pred_svm = svc.predict(X_val)
    
    # Store confusion matrix and classification report for each fold
    conf_matrices_svm.append(confusion_matrix(y_val, y_pred_svm))
    classification_reports_svm.append(classification_report(y_val, y_pred_svm, output_dict=True))

    print("Done processing fold " + str(count))

Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold
Starting a fold
Model fitted
Done processing a fold


In [None]:
# Display results for each fold
for i in range(k):
    print(f"\nFold {i+1}")
    print("Confusion Matrix:")
    print(conf_matrices_svm[i])
    print("\nClassification Report:")
    for label, metrics in classification_reports_svm[i].items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[33869  7839]
 [19892 21734]]

Classification Report:
0: {'precision': 0.6299920016368743, 'recall': 0.81205044595761, 'f1-score': 0.7095287475515613, 'support': 41708.0}
1: {'precision': 0.7349271294762114, 'recall': 0.5221255945803104, 'f1-score': 0.6105141926150648, 'support': 41626.0}
accuracy: 0.6672306621547027
macro avg: {'precision': 0.6824595655565429, 'recall': 0.6670880202689602, 'f1-score': 0.660021470083313, 'support': 83334.0}
weighted avg: {'precision': 0.6824079378866673, 'recall': 0.6672306621547027, 'f1-score': 0.6600701848546235, 'support': 83334.0}

Fold 2
Confusion Matrix:
[[33671  8098]
 [19928 21636]]

Classification Report:
0: {'precision': 0.6282020186943786, 'recall': 0.8061241590653355, 'f1-score': 0.7061278416240249, 'support': 41769.0}
1: {'precision': 0.7276518463711577, 'recall': 0.5205466268886536, 'f1-score': 0.6069174450896239, 'support': 41564.0}
accuracy: 0.663686654746619
macro avg: {'precision': 0.6779269325327681, 'recal

In [None]:
# Calculate average accuracy
accuracies_svm = [report['accuracy'] for report in classification_reports_svm]
average_accuracy_svm = np.mean(accuracies_svm)
print(f"\nAverage Accuracy SVM: {average_accuracy_svm}")

# Caluculate average standard deviation
std_dev_svm = np.std(accuracies_svm)
print(f"Standard Deviation: {std_dev_svm}")


Average Accuracy RF: 0.6657119939253271
Standard Deviation: 0.0014905331742503187


In [None]:
# Collect the recall for the '1' class from each fold
recall_class_1_svm = [report['1']['recall'] for report in classification_reports_svm]

# Calculate the average recall for the '1' class across all folds
average_recall_class_1_svm = np.mean(recall_class_1_svm)
print(f"\nAverage Recall for class 1: {average_recall_class_1_svm}")

### MLP

In [14]:
from tensorflow.keras import layers, optimizers, models, callbacks
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow import keras

In [15]:
# Define parameters
initial_learning_rate = 5e-4
decay_rate = 0.1  # Decay rate per step

# Define the learning rate schedule
lr_schedule = ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,
    decay_rate=decay_rate)

# Define early stopping
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',             # Watch accuracy
    patience=5,                         # Stop after 3 epochs with no improvement
    restore_best_weights=True,          # Restore weights from the best epoch
    min_delta=0.0005                    # Minimum change in accuracy to qualify as an improvement
)

# Function to build the model
def build_model():
    model_mlp = models.Sequential()
    model_mlp.add(layers.Dense(64, activation='relu', input_shape=(X.shape[1],)))
    model_mlp.add(layers.BatchNormalization())
    model_mlp.add(layers.Dense(64, activation='relu'))
    model_mlp.add(layers.BatchNormalization())
    model_mlp.add(layers.Dense(1, activation='sigmoid'))
    model_mlp.compile(optimizer=optimizers.Adam(learning_rate=lr_schedule), loss='binary_crossentropy', metrics=['accuracy', keras.metrics.Recall()])
    return model_mlp

In [16]:
# Convert data to float32 (required by TensorFlow)
X_trainval = np.array(X_trainval, dtype=np.float32)
y_trainval = np.array(y_trainval, dtype=np.float32)

# Lists to store results
conf_matrices_mlp = []
classification_reports_mlp = []
count = 0

# Manually loop over each fold
for train_index, test_index in kf.split(X_trainval):
    count += 1
    print(f"\nFold {count}")
    
    # Split data for this fold
    X_train, X_val = X_trainval[train_index], X_trainval[test_index]
    y_train, y_val = y_trainval[train_index], y_trainval[test_index]
    
    # Build and train the model
    model_mlp = build_model()
    model_mlp.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32, verbose=1, callbacks=[early_stopping])
    
    # Make predictions
    y_pred_mlp = model_mlp.predict(X_val)
    y_pred_mlp = (y_pred_mlp > 0.5).astype(int)  # Convert probabilities to binary class predictions
    
    # Store confusion matrix and classification report for each fold
    conf_matrices_mlp.append(confusion_matrix(y_val, y_pred_mlp))
    classification_reports_mlp.append(classification_report(y_val, y_pred_mlp, output_dict=True))


Fold 1
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 910us/step - accuracy: 0.6012 - loss: 0.6512 - recall: 0.5915 - val_accuracy: 0.6229 - val_loss: 0.6326 - val_recall: 0.6022
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 877us/step - accuracy: 0.6130 - loss: 0.6402 - recall: 0.5936 - val_accuracy: 0.6230 - val_loss: 0.6325 - val_recall: 0.5980
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 984us/step - accuracy: 0.6127 - loss: 0.6404 - recall: 0.5930 - val_accuracy: 0.6229 - val_loss: 0.6326 - val_recall: 0.5983
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 1ms/step - accuracy: 0.6142 - loss: 0.6401 - recall: 0.5948 - val_accuracy: 0.6233 - val_loss: 0.6326 - val_recall: 0.5960
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6120 - loss: 0.6406 - recall: 0.5926 - val_accuracy: 0.6235 - val_loss: 0.6326 -

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 5ms/step - accuracy: 0.6011 - loss: 0.6514 - recall_1: 0.5878 - val_accuracy: 0.6244 - val_loss: 0.6317 - val_recall_1: 0.6040
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 5ms/step - accuracy: 0.6152 - loss: 0.6391 - recall_1: 0.5945 - val_accuracy: 0.6245 - val_loss: 0.6317 - val_recall_1: 0.6110
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 5ms/step - accuracy: 0.6152 - loss: 0.6392 - recall_1: 0.5945 - val_accuracy: 0.6254 - val_loss: 0.6314 - val_recall_1: 0.6046
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 5ms/step - accuracy: 0.6141 - loss: 0.6395 - recall_1: 0.5933 - val_accuracy: 0.6249 - val_loss: 0.6315 - val_recall_1: 0.6104
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 5ms/step - accuracy: 0.6147 - loss: 0.6394 - recall_1: 0.5934 - val_accuracy:

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 5ms/step - accuracy: 0.6025 - loss: 0.6506 - recall_2: 0.5905 - val_accuracy: 0.6236 - val_loss: 0.6324 - val_recall_2: 0.5847
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 5ms/step - accuracy: 0.6153 - loss: 0.6384 - recall_2: 0.5987 - val_accuracy: 0.6233 - val_loss: 0.6323 - val_recall_2: 0.5994
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 5ms/step - accuracy: 0.6153 - loss: 0.6386 - recall_2: 0.5990 - val_accuracy: 0.6232 - val_loss: 0.6323 - val_recall_2: 0.5926
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 5ms/step - accuracy: 0.6149 - loss: 0.6390 - recall_2: 0.5983 - val_accuracy: 0.6236 - val_loss: 0.6323 - val_recall_2: 0.5936
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 2ms/step - accuracy: 0.6159 - loss: 0.6385 - recall_2: 0.6000 - val_accuracy: 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 893us/step - accuracy: 0.5996 - loss: 0.6510 - recall_3: 0.5911 - val_accuracy: 0.6221 - val_loss: 0.6314 - val_recall_3: 0.6161
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 912us/step - accuracy: 0.6118 - loss: 0.6397 - recall_3: 0.5953 - val_accuracy: 0.6224 - val_loss: 0.6314 - val_recall_3: 0.6045
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 1ms/step - accuracy: 0.6128 - loss: 0.6397 - recall_3: 0.5966 - val_accuracy: 0.6218 - val_loss: 0.6314 - val_recall_3: 0.6012
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 885us/step - accuracy: 0.6131 - loss: 0.6394 - recall_3: 0.5968 - val_accuracy: 0.6226 - val_loss: 0.6314 - val_recall_3: 0.5985
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 903us/step - accuracy: 0.6121 - loss: 0.6398 - recall_3: 0.5963 - val_accuracy: 0.6223 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 964us/step - accuracy: 0.5998 - loss: 0.6517 - recall_4: 0.5870 - val_accuracy: 0.6251 - val_loss: 0.6311 - val_recall_4: 0.5938
Epoch 2/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 1ms/step - accuracy: 0.6124 - loss: 0.6402 - recall_4: 0.5906 - val_accuracy: 0.6250 - val_loss: 0.6311 - val_recall_4: 0.5955
Epoch 3/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 933us/step - accuracy: 0.6135 - loss: 0.6400 - recall_4: 0.5928 - val_accuracy: 0.6247 - val_loss: 0.6310 - val_recall_4: 0.6031
Epoch 4/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 1ms/step - accuracy: 0.6124 - loss: 0.6400 - recall_4: 0.5908 - val_accuracy: 0.6246 - val_loss: 0.6310 - val_recall_4: 0.5993
Epoch 5/20
[1m53125/53125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 951us/step - accuracy: 0.6147 - loss: 0.6396 - recall_4: 0.5935 - val_accuracy: 0.6247 - 

In [17]:
# Calculate average accuracy
accuracies_mlp = [report['accuracy'] for report in classification_reports_mlp]
average_accuracy_mlp = np.mean(accuracies_mlp)
print(f"\nAverage Accuracy MLP: {average_accuracy_mlp}")

# Caluculate average standard deviation of accuracies
std_dev_mlp = np.std(accuracies_mlp)
print(f"Standard Deviation: {std_dev_mlp}")

# Calculate average f1-score
f1s_mlp_0 = [report['0.0']['f1-score'] for report in classification_reports_mlp]
f1s_mlp_1 = [report['1.0']['f1-score'] for report in classification_reports_mlp]
total_f1s_mlp = [f1s_mlp_0[i] + f1s_mlp_1[i] for i in range(k)]
total_f1s_mlp = [f1 / 2 for f1 in total_f1s_mlp]
average_f1_mlp = np.mean(total_f1s_mlp)
print(f"\nAverage f1-score LR: {average_f1_mlp}")


Average Accuracy MLP: 0.6239199999999999
Standard Deviation: 0.0012098062986597368

Average f1-score LR: 0.6236411383236677


In [18]:
import numpy as np

# Collect the recall for the '1.0' class from each fold
recall_class_1_mlp = [report_mlp['1.0']['recall'] for report_mlp in classification_reports_mlp]

# Calculate the average recall for the '1.0' class across all folds
average_recall_class_1_mlp = np.mean(recall_class_1_mlp)
print(f"\nAverage Recall for class 1.0: {average_recall_class_1_mlp}")



Average Recall for class 1.0: 0.5986750405409876


## Dataframe with results

In [21]:

# List of model names
model_names = ['Logistic Regression', 'Random Forest', 'MLP']

# Collect the accuracy and recall values by calling the variables
accuracies = [average_accuracy_lr, average_accuracy_rf, average_accuracy_mlp]
recalls = [average_recall_class_1_lr, average_recall_class_1_rf, average_recall_class_1_mlp]

# Create a dictionary for the DataFrame
data = {
    'Model': model_names,
    'Accuracy': accuracies,
    'Recall': recalls
}

# Create the DataFrame
df_results = pd.DataFrame(data)

NameError: name 'average_accuracy_lr' is not defined

In [31]:
df_results

Unnamed: 0,Model,Accuracy,Recall
0,Logistic Regression,0.563181,0.601864
1,Random Forest,0.729997,0.740042
2,MLP,0.624808,0.613513


## Random search on best model

After looking at the df of above, it can be seen that the best model is the Random Forest model. I therefore will perform random search on this model

In [22]:
# Further splitting trainval into train and validation set to tune hyperparameters on the validation set
# Training set: 70%, Validation set: 15%, Test set: 15% (still the same test set)

X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=(15/85), random_state=42)

In [23]:
print(X_train.shape)
print(f"The percentage of the original dataset that is used for training is: {X_train.shape[0]/df_del.shape[0]*100} %")

(1750000, 42)
The percentage of the original dataset that is used for training is: 70.0 %


In [24]:
from scipy.stats import randint  # For defining distributions for random search

param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest (uniform distribution between 50 and 200)
    'max_depth': [None] + list(randint(1, 30).rvs(10)),  # Random depth values including None
    'min_samples_split': randint(2, 10),  # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 4)  # Minimum samples required to be at a leaf node
}

In [25]:
# Initialize the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_model_search = RandomForestClassifier(random_state=42)  # Random state for reproducibility

In [26]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_model_search,
                           param_distributions=param_dist,
                           n_iter=20,          # Number of parameter settings that are sampled
                           scoring='recall',   # Use recall as the evaluation metric
                           cv=k,
                           verbose=2,
                           n_jobs=-1)          # Use all available cores

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


MemoryError: Unable to allocate 224. MiB for an array with shape (1400000, 42) and data type float32

In [None]:
y_pred_rf_random = random_search.predict(X_val)

print("Accuracy on Validation Set:", accuracy_score(y_val, y_pred_rf_random))
print(confusion_matrix(y_val, y_pred_rf_random))
print(classification_report(y_val, y_pred_rf_random))

from sklearn.metrics import recall_score
print("Recall on Validation Set:", recall_score(y_val, y_pred_rf_random))

Accuracy on Validation Set: 0.5484586666666666
[[ 65772 121401]
 [ 47927 139900]]
              precision    recall  f1-score   support

           0       0.58      0.35      0.44    187173
           1       0.54      0.74      0.62    187827

    accuracy                           0.55    375000
   macro avg       0.56      0.55      0.53    375000
weighted avg       0.56      0.55      0.53    375000

Recall on Validation Set: 0.744834342240466
