In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, fbeta_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [5]:
dtype = {
    'ResponseID': 'category',
    'UserID': 'int8',
    'Intervention': 'int8',
    'PedPed': 'int8',
    'Barrier': 'int8',
    'CrossingSignal': 'int8',
    'AttributeLevel': 'category',
    'ScenarioTypeStrict': 'category',
    'NumberOfCharacters': 'int8',
    'DiffNumberOFCharacters': 'int8',
    'Saved': 'int8',
    'Man': 'int8',
    'Woman': 'int8',
    'Pregnant': 'int8',
    'Stroller': 'int8',
    'OldMan': 'int8',
    'OldWoman': 'int8',
    'Boy': 'int8',
    'Girl': 'int8',
    'Homeless': 'int8',
    'LargeWoman': 'int8',
    'LargeMan': 'int8',
    'Criminal': 'int8',
    'MaleExecutive': 'int8',
    'FemaleExecutive': 'int8',
    'FemaleAthlete': 'int8',
    'MaleAthlete': 'int8',
    'FemaleDoctor': 'int8',
    'MaleDoctor': 'int8',
    'Dog': 'int8',
    'Cat': 'int8'
}


In [6]:
df_deleted = pd.read_csv('total_deleted_50_dataset.csv', dtype=dtype)

In [7]:
df_deleted.shape

(2500000, 31)

In [8]:
# preprocessing

# one-hot encode the AttributeLevel and ScenarioTypeStrict
df_deleted = pd.get_dummies(df_deleted, columns=['AttributeLevel', 'ScenarioTypeStrict'])

print(df_deleted.columns)


Index(['ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
       'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters',
       'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
       'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
       'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
       'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_Hoomans',
       'AttributeLevel_Less', 'AttributeLevel_Male', 'AttributeLevel_More',
       'AttributeLevel_Old', 'AttributeLevel_Pets', 'AttributeLevel_Young',
       'ScenarioTypeStrict_Age', 'ScenarioTypeStrict_Fitness',
       'ScenarioTypeStrict_Gender', 'ScenarioTypeStrict_Species',
       'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [9]:
df_deleted.shape

(2500000, 44)

In [10]:
df_deleted.duplicated().sum()

0

In [11]:
# now also dropping ResponseID
# kept this on and pre-processed it thus far to keep track of if everything went right with the complete sessions being in the dataset (so each ResponseID has to be present twice)

#### Splitting

In [12]:
# Prepare features and target variable
X = df_deleted.drop(['UserID', 'ResponseID'], axis=1)   # Features
y = df_deleted['UserID']                # Target variable

In [13]:
# Split the data into train, validation and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.111111111111, random_state=42)

### Logistic regression

In [11]:
# Create and train the logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [12]:
# Make predictions
y_pred_lr = lr_model.predict(X_val)

In [13]:
# Evaluate the model
print('Accuracy:', accuracy_score(y_val, y_pred_lr))

Accuracy: 0.563668


In [14]:
cm = confusion_matrix(y_val, y_pred_lr)
print('Confusion Matrix:')
print(cm)

print('Classification Report:')
print(classification_report(y_val, y_pred_lr))

Confusion Matrix:
[[65742 59209]
 [49874 75175]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.53      0.55    124951
           1       0.56      0.60      0.58    125049

    accuracy                           0.56    250000
   macro avg       0.56      0.56      0.56    250000
weighted avg       0.56      0.56      0.56    250000



### Logistic regression with K-fold

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Set up K-Fold Cross Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Lists to store results
conf_matrices = []
classification_reports = []

# Loop through each fold
for train_index, test_index in kf.split(X):
    # Split data into train and test for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    log_reg.fit(X_train, y_train)
    
    # Make predictions
    y_pred = log_reg.predict(X_test)
    
    # Generate confusion matrix and classification report
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    report = classification_report(y_test, y_pred, output_dict=True)
    classification_reports.append(report)

In [16]:
# Display results
for i in range(k):
    print(f"\nFold {i+1}")
    print("Confusion Matrix:")
    print(conf_matrices[i])
    print("\nClassification Report:")
    for label, metrics in classification_reports[i].items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[130966 119158]
 [ 99357 150519]]

Classification Report:
0: {'precision': 0.5686188526547501, 'recall': 0.5236042922710336, 'f1-score': 0.5451839641000984, 'support': 250124.0}
1: {'precision': 0.5581454851544626, 'recall': 0.6023747778898334, 'f1-score': 0.5794173068002687, 'support': 249876.0}
accuracy: 0.56297
macro avg: {'precision': 0.5633821689046064, 'recall': 0.5629895350804335, 'f1-score': 0.5623006354501836, 'support': 500000.0}
weighted avg: {'precision': 0.5633847662997464, 'recall': 0.56297, 'f1-score': 0.562292145581194, 'support': 500000.0}

Fold 2
Confusion Matrix:
[[131173 118819]
 [ 99942 150066]]

Classification Report:
0: {'precision': 0.5675659303809791, 'recall': 0.5247087906813018, 'f1-score': 0.5452965764372583, 'support': 249992.0}
1: {'precision': 0.5581047659780204, 'recall': 0.6002447921666507, 'f1-score': 0.578408265287834, 'support': 250008.0}
accuracy: 0.562478
macro avg: {'precision': 0.5628353481794998, 'recall': 0.5624767914

In [17]:
# Calculate average accuracy
accuracies = [report['accuracy'] for report in classification_reports]
average_accuracy = np.mean(accuracies)
print(f"\nAverage Accuracy: {average_accuracy}")

# Caluculate average standard deviation
std_dev = np.std(accuracies)
print(f"Standard Deviation: {std_dev}")


Average Accuracy: 0.563332
Standard Deviation: 0.0006120862684295331


### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

In [21]:
# Make predictions on the test data
y_pred_rf = rf_model.predict(X_val)

In [22]:
# Evaluate the model
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print(f'Accuracy: {accuracy_rf:}')

Accuracy: 0.743944


In [23]:
# Confusion matrix
conf_matrix_rf = confusion_matrix(y_val, y_pred_rf)
print("Confusion Matrix:\n", conf_matrix_rf)

# Classification report
class_report_rf = classification_report(y_val, y_pred_rf)
print("Classification Report:\n", class_report_rf)

Confusion Matrix:
 [[91940 33011]
 [31003 94046]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.74      0.74    124951
           1       0.74      0.75      0.75    125049

    accuracy                           0.74    250000
   macro avg       0.74      0.74      0.74    250000
weighted avg       0.74      0.74      0.74    250000



### Random Forest with K-fold

In [24]:
### Random Forest with K-fold
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Initialize Random Forest model
rf_model_k = RandomForestClassifier(n_estimators=100, random_state=42)

# Set up K-Fold Cross Validation (e.g., K=5)
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Lists to store results
conf_matrices_rf = []
classification_reports_rf = []

# Manually loop through each fold
for train_index, test_index in kf.split(X):
    # Split the data for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    rf_model_k.fit(X_train, y_train)
    
    # Make predictions
    y_pred_rf = rf_model_k.predict(X_test)
    
    # Store confusion matrix and classification report for each fold
    conf_matrices_rf.append(confusion_matrix(y_test, y_pred_rf))
    classification_reports_rf.append(classification_report(y_test, y_pred_rf, output_dict=True))

    print("Done processing a fold")

Done processing a fold
Done processing a fold
Done processing a fold
Done processing a fold
Done processing a fold


In [25]:
# Display results for each fold
for i in range(k):
    print(f"\nFold {i+1}")
    print("Confusion Matrix:")
    print(conf_matrices_rf[i])
    print("\nClassification Report:")
    for label, metrics in classification_reports_rf[i].items():
        print(f"{label}: {metrics}")


Fold 1
Confusion Matrix:
[[181180  68944]
 [ 63595 186281]]

Classification Report:
0: {'precision': 0.7401899703809621, 'recall': 0.7243607170843261, 'f1-score': 0.7321898003431003, 'support': 250124.0}
1: {'precision': 0.7298697227936135, 'recall': 0.745493764907394, 'f1-score': 0.7375990148504953, 'support': 249876.0}
accuracy: 0.734922
macro avg: {'precision': 0.7350298465872878, 'recall': 0.73492724099586, 'f1-score': 0.7348944075967978, 'support': 500000.0}
weighted avg: {'precision': 0.7350324060086895, 'recall': 0.734922, 'f1-score': 0.7348930661116, 'support': 500000.0}

Fold 2
Confusion Matrix:
[[180516  69476]
 [ 62972 187036]]

Classification Report:
0: {'precision': 0.7413753449862005, 'recall': 0.7220871067874172, 'f1-score': 0.7316041176947394, 'support': 249992.0}
1: {'precision': 0.7291510728542914, 'recall': 0.748120060158075, 'f1-score': 0.7385137803048251, 'support': 250008.0}
accuracy: 0.735104
macro avg: {'precision': 0.735263208920246, 'recall': 0.73510358347274

In [26]:
# Calculate average accuracy
accuracies_rf = [report['accuracy'] for report in classification_reports_rf]
average_accuracy_rf = np.mean(accuracies_rf)
print(f"\nAverage Accuracy RF: {average_accuracy_rf}")

# Caluculate average standard deviation
std_dev_rf = np.std(accuracies_rf)
print(f"Standard Deviation: {std_dev_rf}")


Average Accuracy RF: 0.7344679999999999
Standard Deviation: 0.00046382237979640013


### Random Forest Random Search

In [14]:
from scipy.stats import randint, uniform  # For defining distributions for random search

param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest (uniform distribution between 50 and 200)
    'max_depth': [None] + list(randint(1, 30).rvs(10)),  # Random depth values including None
    'min_samples_split': randint(2, 10),  # Minimum samples required to split an internal node
    'min_samples_leaf': randint(1, 4)  # Minimum samples required to be at a leaf node
}

In [15]:
# Initialize the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rf_model_search = RandomForestClassifier(random_state=42)  # Random state for reproducibility

In [16]:
# Set up K-Fold Cross Validation
k = 3  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Set up GridSearchCV
random_search = RandomizedSearchCV(estimator=rf_model_search,
                           param_distributions=param_dist,
                           n_iter=20,          # Number of parameter settings that are sampled
                           scoring='accuracy',
                           cv=k,
                           verbose=2,
                           n_jobs=-1)          # Use all available cores

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'max_depth': 28, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 162}
Best Cross-Validation Score: 0.7154754997756076


In [19]:
y_pred_rf_random = random_search.predict(X_val)

print("Accuracy on Validation Set:", accuracy_score(y_val, y_pred_rf_random))
print("F2 score on Validation Set:", fbeta_score(y_val, y_pred_rf_random, beta=2))
print(confusion_matrix(y_val, y_pred_rf_random))
print(classification_report(y_val, y_pred_rf_random))

Accuracy on Validation Set: 0.722188
F2 score on Validation Set: 0.7238507363469012
[[89951 35000]
 [34453 90596]]
              precision    recall  f1-score   support

           0       0.72      0.72      0.72    124951
           1       0.72      0.72      0.72    125049

    accuracy                           0.72    250000
   macro avg       0.72      0.72      0.72    250000
weighted avg       0.72      0.72      0.72    250000



### SVM

### MLP with K-fold

In [2]:
import numpy as np
from tensorflow.keras import layers, optimizers, models
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report

# Define parameters
learning_rate = 5e-4
n_splits = 3  # Number of folds

# Function to build the model
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(X.shape[1],)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [14]:
# Convert data to float32 (required by TensorFlow)
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# Set up K-Fold cross-validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold = 1

# Lists to store results
conf_matrices_mlp = []
classification_reports_mlp = []

# Loop over each fold
for train_index, test_index in kf.split(X):
    print(f"\nFold {fold}")
    
    # Split data into training and testing for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Build and train the model
    model = build_model()
    model.fit(X_train, y_train, epochs=7, batch_size=32, verbose=1)
    
    # Predict on the test data
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions
    
    # Calculate confusion matrix and classification report
    cm_mlp = confusion_matrix(y_test, y_pred)
    report_mlp = classification_report(y_test, y_pred, output_dict=True)
    
    # Store results
    conf_matrices_mlp.append(cm_mlp)
    classification_reports_mlp.append(report_mlp)
    
    # Print results for the current fold
    print("Confusion Matrix:")
    print(cm_mlp)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    fold += 1

# Optional: Calculate and print average metrics across all folds if needed



Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1ms/step - accuracy: 0.6081 - loss: 0.6437
Epoch 2/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1ms/step - accuracy: 0.6327 - loss: 0.6188
Epoch 3/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1ms/step - accuracy: 0.6358 - loss: 0.6158
Epoch 4/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6371 - loss: 0.6146
Epoch 5/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1ms/step - accuracy: 0.6377 - loss: 0.6140
Epoch 6/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6390 - loss: 0.6128
Epoch 7/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 1ms/step - accuracy: 0.6389 - loss: 0.6132
[1m26042/26042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 834us/step
Confusion Matrix:
[[310102 106536]
 [190993 225703

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 1ms/step - accuracy: 0.6074 - loss: 0.6447
Epoch 2/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6323 - loss: 0.6194
Epoch 3/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 1ms/step - accuracy: 0.6351 - loss: 0.6165
Epoch 4/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1ms/step - accuracy: 0.6381 - loss: 0.6146
Epoch 5/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 1ms/step - accuracy: 0.6377 - loss: 0.6142
Epoch 6/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 1ms/step - accuracy: 0.6393 - loss: 0.6134
Epoch 7/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1ms/step - accuracy: 0.6402 - loss: 0.6126
[1m26042/26042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 855us/step
Confusion Matrix:
[[295811 120977]
 [173976 242569]]

Classi

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 1ms/step - accuracy: 0.6073 - loss: 0.6443
Epoch 2/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 1ms/step - accuracy: 0.6331 - loss: 0.6176
Epoch 3/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 1ms/step - accuracy: 0.6349 - loss: 0.6157
Epoch 4/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 1ms/step - accuracy: 0.6381 - loss: 0.6139
Epoch 5/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1ms/step - accuracy: 0.6384 - loss: 0.6132
Epoch 6/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 1ms/step - accuracy: 0.6396 - loss: 0.6128
Epoch 7/7
[1m52084/52084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1ms/step - accuracy: 0.6403 - loss: 0.6119
[1m26042/26042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 819us/step
Confusion Matrix:
[[297337 119237]
 [176706 240053]]

Classi

In [15]:
# Calculate average accuracy
accuracies_mlp = [report_mlp['accuracy'] for report_mlp in classification_reports_mlp]
average_accuracy_mlp = np.mean(accuracies_mlp)
print(f"\nAverage Accuracy: {average_accuracy_mlp}")

# Caluculate average standard deviation
std_dev_mlp= np.std(accuracies_mlp)
print(f"Standard Deviation: {std_dev_mlp}")


Average Accuracy: 0.644630000665806
Standard Deviation: 0.0012730001352581183


In [16]:
import numpy as np

# Collect the recall for the '1.0' class from each fold
recall_class_1 = [report_mlp['1.0']['recall'] for report_mlp in classification_reports_mlp]

# Calculate the average recall for the '1.0' class across all folds
average_recall_class_1 = np.mean(recall_class_1)
print(f"\nAverage Recall for class 1.0: {average_recall_class_1}")



Average Recall for class 1.0: 0.5666614228036408
