In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [2]:
# Column dtypes handed to pd.read_csv. The three string-valued columns are
# read as pandas categoricals; every other column is read as int8.
_CATEGORICAL_COLUMNS = {'ResponseID', 'AttributeLevel', 'ScenarioTypeStrict'}

# Listed in the same order as the original hand-written mapping.
_DATASET_COLUMNS = [
    'ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
    'CrossingSignal', 'AttributeLevel', 'ScenarioTypeStrict',
    'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved',
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
]

dtype = {
    column: ('category' if column in _CATEGORICAL_COLUMNS else 'int8')
    for column in _DATASET_COLUMNS
}


In [3]:
# Load the full dataset from a CSV in the working directory, applying the
# explicit per-column dtypes declared in `dtype`.

df_total = pd.read_csv('total_dataset.csv', dtype=dtype)

In [4]:
# (rows, columns) of the loaded frame
df_total.shape

(14678400, 31)

In [5]:
# Preview the first five rows of the raw data
df_total.head()

Unnamed: 0,ResponseID,UserID,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters,DiffNumberOFCharacters,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
0,res_04906808,0,1,0,0,1,Less,Utilitarian,3,2,...,0,0,0,0,0,0,0,1,0,0
1,res_01167621,0,0,1,0,2,Pets,Species,2,0,...,0,0,0,0,0,0,0,0,1,1
2,res_03198848,0,1,0,0,2,Old,Age,2,0,...,0,0,0,0,0,0,0,0,0,0
3,res_00035908,1,0,0,1,0,Male,Gender,5,0,...,0,0,1,0,0,1,0,0,0,0
4,res_03263521,0,1,0,1,0,Fat,Fitness,5,0,...,2,0,0,0,0,0,0,0,0,0


In [6]:
# Class balance of the prediction target (UserID)
df_total['UserID'].value_counts()

UserID
0    14384832
1      293568
Name: count, dtype: int64

In [8]:
# The LLM rows use 'Social Value' where the rest of the data uses
# 'Social Status' in ScenarioTypeStrict, so merge both labels into
# 'Social Status'.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_total['ScenarioTypeStrict']: that form is a
# chained in-place update on a column selection, which warns in recent pandas
# and does not modify df_total once Copy-on-Write is active.
# The values are replaced on an object-dtype copy and converted back to
# 'category' afterwards, because replace() on a categorical column is
# deprecated and merging into an already existing category cannot be done
# with rename_categories.
df_total['ScenarioTypeStrict'] = (
    df_total['ScenarioTypeStrict']
    .astype(object)
    .replace('Social Value', 'Social Status')
    .astype('category')
)
print(df_total['ScenarioTypeStrict'].value_counts())


ScenarioTypeStrict
Utilitarian      2879300
Age              2835642
Species          2835568
Fitness          2827378
Gender           2824958
Social Status     475554
Name: count, dtype: int64
ScenarioTypeStrict
Utilitarian      287326
Age              284529
Fitness          283253
Species          283240
Gender           281875
Social Status     47617
Name: count, dtype: int64


In [10]:
# Report, one number per line, how many distinct entries value_counts()
# yields for each of the two categorical columns.
for column in ('AttributeLevel', 'ScenarioTypeStrict'):
    print(len(df_total[column].value_counts()))


13
6


In [30]:
# with one-hot encoding, this adds 19 (!) dummy columns (13 + 6) in place of the two original categorical columns

In [11]:
# Preprocessing: expand the two categorical columns into one indicator
# column per category; all other columns are carried over unchanged.
df_total_encoded = pd.get_dummies(
    data=df_total,
    columns=['AttributeLevel', 'ScenarioTypeStrict'],
)

# Show the resulting column names
print(df_total_encoded.columns)


Index(['ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier',
       'CrossingSignal', 'NumberOfCharacters', 'DiffNumberOFCharacters',
       'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
       'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
       'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
       'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_High',
       'AttributeLevel_Hoomans', 'AttributeLevel_Less', 'AttributeLevel_Low',
       'AttributeLevel_Male', 'AttributeLevel_More', 'AttributeLevel_Old',
       'AttributeLevel_Pets', 'AttributeLevel_Rand', 'AttributeLevel_Young',
       'ScenarioTypeStrict_Age', 'ScenarioTypeStrict_Fitness',
       'ScenarioTypeStrict_Gender', 'ScenarioTypeStrict_Social Status',
       'ScenarioTypeStrict_Species', 'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [12]:
# Inspect the dtype of every column after one-hot encoding
df_total_encoded.dtypes

ResponseID                          category
UserID                                  int8
Intervention                            int8
PedPed                                  int8
Barrier                                 int8
CrossingSignal                          int8
NumberOfCharacters                      int8
DiffNumberOFCharacters                  int8
Saved                                   int8
Man                                     int8
Woman                                   int8
Pregnant                                int8
Stroller                                int8
OldMan                                  int8
OldWoman                                int8
Boy                                     int8
Girl                                    int8
Homeless                                int8
LargeWoman                              int8
LargeMan                                int8
Criminal                                int8
MaleExecutive                           int8
FemaleExec

In [13]:
# Preview the first five rows of the encoded frame
df_total_encoded.head()

Unnamed: 0,ResponseID,UserID,Intervention,PedPed,Barrier,CrossingSignal,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,...,AttributeLevel_Old,AttributeLevel_Pets,AttributeLevel_Rand,AttributeLevel_Young,ScenarioTypeStrict_Age,ScenarioTypeStrict_Fitness,ScenarioTypeStrict_Gender,ScenarioTypeStrict_Social Status,ScenarioTypeStrict_Species,ScenarioTypeStrict_Utilitarian
0,res_04906808,0,1,0,0,1,3,2,0,0,...,False,False,False,False,False,False,False,False,False,True
1,res_01167621,0,0,1,0,2,2,0,0,0,...,False,True,False,False,False,False,False,False,True,False
2,res_03198848,0,1,0,0,2,2,0,0,0,...,True,False,False,False,True,False,False,False,False,False
3,res_00035908,1,0,0,1,0,5,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,res_03263521,0,1,0,1,0,5,0,0,1,...,False,False,False,False,False,True,False,False,False,False


In [12]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# it would store the integer category codes of ResponseID in a new
# 'response_ID_mapped' column and then drop the original ResponseID column.
"""
# Convert response_ID to a numeric sequential range
df_total_encoded['response_ID_mapped'] = df_total_encoded['ResponseID'].astype('category').cat.codes

# Optionally, drop the original response_ID column
df_total_encoded = df_total_encoded.drop('ResponseID', axis=1)
"""

In [14]:
# Preview the encoded frame again (ResponseID is still present, since the
# mapping experiment in the previous cell is disabled)
df_total_encoded.head()

Unnamed: 0,ResponseID,UserID,Intervention,PedPed,Barrier,CrossingSignal,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,...,AttributeLevel_Old,AttributeLevel_Pets,AttributeLevel_Rand,AttributeLevel_Young,ScenarioTypeStrict_Age,ScenarioTypeStrict_Fitness,ScenarioTypeStrict_Gender,ScenarioTypeStrict_Social Status,ScenarioTypeStrict_Species,ScenarioTypeStrict_Utilitarian
0,res_04906808,0,1,0,0,1,3,2,0,0,...,False,False,False,False,False,False,False,False,False,True
1,res_01167621,0,0,1,0,2,2,0,0,0,...,False,True,False,False,False,False,False,False,True,False
2,res_03198848,0,1,0,0,2,2,0,0,0,...,True,False,False,False,True,False,False,False,False,False
3,res_00035908,1,0,0,1,0,5,0,0,0,...,False,False,False,False,False,False,True,False,False,False
4,res_03263521,0,1,0,1,0,5,0,0,1,...,False,False,False,False,False,True,False,False,False,False


In [15]:
# Random 10% sample of the encoded rows, drawn with a fixed seed so the same
# subset is selected on every run.
df_total_sub = df_total_encoded.sample(frac=0.1, random_state=12)

In [17]:
# Features: every encoded column except UserID and ResponseID
X = df_total_encoded.drop(columns=['UserID', 'ResponseID'])
# Target variable: UserID
y = df_total_encoded['UserID']

In [18]:
# Split the data into train, validation and test sets.
# Step 1 holds out 10% of all rows as the test set.
# Step 2 takes 0.111111111111 (about 1/9) of the remaining 90% as the
# validation set, i.e. about 10% of all rows, leaving roughly 80% for training.
# Both calls use a fixed random_state so the splits are reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.111111111111, random_state=42)

In [17]:
# Create a logistic regression model with scikit-learn's default settings and
# fit it on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [18]:
# Make predictions on the validation set
y_pred = model.predict(X_val)

In [19]:
# Evaluate the model: accuracy of the logistic regression on the validation set
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:}')

Accuracy: 0.9800182581207761


In [20]:
# Confusion matrix of the logistic regression on the validation set,
# printed below its heading.
cm = confusion_matrix(y_val, y_pred)
print('Confusion Matrix:', cm, sep='\n')

# Per-class precision / recall / F1, printed below its heading.
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')

Confusion Matrix:
[[1438431       0]
 [  29330      79]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1438431
           1       1.00      0.00      0.01     29409

    accuracy                           0.98   1467840
   macro avg       0.99      0.50      0.50   1467840
weighted avg       0.98      0.98      0.97   1467840



In [None]:
# without ResponseID it does better than with it
# with it: 97999, so worse than always guessing 0
# without it: 98296, which is somewhat better than always guessing 0

# both on the validation set!

# on the test set it did slightly better still, I think

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed for reproducibility
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
rf_model.fit(X_train, y_train)

In [22]:
# Make predictions on the validation data
y_pred_rf = rf_model.predict(X_val)

In [25]:
# Evaluate the model: accuracy of the random forest on the validation set
accuracy_rf = accuracy_score(y_val, y_pred_rf)
print(f'Accuracy: {accuracy_rf:}')

Accuracy: 0.9842905221277524


In [24]:
# Confusion matrix of the random forest on the validation set
conf_matrix_rf = confusion_matrix(y_val, y_pred_rf)
print("Confusion Matrix:\n", conf_matrix_rf)

# Classification report (per-class precision / recall / F1) on the validation set
class_report_rf = classification_report(y_val, y_pred_rf)
print("Classification Report:\n", class_report_rf)

Confusion Matrix:
 [[1437559     872]
 [  22187    7222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99   1438431
           1       0.89      0.25      0.39     29409

    accuracy                           0.98   1467840
   macro avg       0.94      0.62      0.69   1467840
weighted avg       0.98      0.98      0.98   1467840



### SVM on a subset

In [19]:
# Features (everything except UserID and ResponseID) and target (UserID),
# taken from the 10% subsample.
X_sub = df_total_sub.drop(columns=['UserID', 'ResponseID'])
y_sub = df_total_sub['UserID']

# Split the subsample into train, validation and test sets: first hold out
# 10% as the test set, then 0.111111111111 of the remainder as validation.
X_trainval_sub, X_test_sub, y_trainval_sub, y_test_sub = train_test_split(
    X_sub, y_sub, test_size=0.1, random_state=42
)
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_trainval_sub, y_trainval_sub, test_size=0.111111111111, random_state=42
)

In [20]:
from sklearn.svm import SVC

# Initialize the SVM model with an RBF kernel
svm_model = SVC(kernel='rbf', random_state=42)  # Other kernels such as 'linear' or 'poly' could be tried instead.

# RBF was chosen because the classes are assumed not to be linearly separable

In [None]:
# Train the model on the training split of the 10% subsample
# NOTE(review): kernel SVMs scale poorly with sample count, so this fit is
# expected to be very slow at this size; confirm it is feasible or subsample further.
svm_model.fit(X_train_sub, y_train_sub)

In [None]:
# Make predictions on the test split of the subsample
y_pred_svm = svm_model.predict(X_test_sub)

In [None]:
# Evaluate the model. The predictions were made on X_test_sub, so they are
# scored against y_test_sub; y_test belongs to the split of the full dataset
# and has a different number of rows.
accuracy_svm = accuracy_score(y_test_sub, y_pred_svm)
print(f'Accuracy: {accuracy_svm}')

In [None]:
# Confusion matrix of the SVM. The predictions come from X_test_sub, so the
# matching ground truth is y_test_sub (not y_test, which belongs to the split
# of the full dataset).
conf_matrix = confusion_matrix(y_test_sub, y_pred_svm)
print("Confusion Matrix:\n", conf_matrix)

# Classification report (per-class precision / recall / F1) on the same split
class_report = classification_report(y_test_sub, y_pred_svm)
print("Classification Report:\n", class_report)


### MLP

In [None]:
# paper by Agrawal 2019
# ReLU, no dropout, three 32-unit hidden layers

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers

# Stand-alone MLP demo trained on synthetic data.
# Everything in this cell uses its own names (X_mlp, y_mlp, mlp_model) so that
# running it does not overwrite the real feature matrix `X`, the target `y` or
# the fitted logistic regression `model` defined earlier in the notebook.

# Assuming you have a DataFrame `df` with your data
# X_mlp = df.iloc[:, :45].values  # Input features (replace with your feature selection)
# y_mlp = df['target_column'].values  # Target variable (replace with your target column)

# Example random data, drawn from a seeded generator so that re-running the
# cell reproduces the same inputs.
mlp_rng = np.random.default_rng(42)
X_mlp = mlp_rng.random((1000, 45))         # 1000 samples, 45 features in [0, 1)
y_mlp = mlp_rng.integers(0, 2, size=1000)  # Binary target (0 or 1)

# Define the model: two 64-unit ReLU hidden layers, each followed by batch
# normalization, and a single sigmoid output unit for binary classification.
# The input width is read from the data instead of being hard-coded, and is
# declared with keras.Input rather than the `input_shape=` layer argument.
# NOTE(review): this is not the "three 32-unit hidden layers" setup mentioned
# in the note on the paper; confirm which architecture is intended.
mlp_model = keras.Sequential([
    keras.Input(shape=(X_mlp.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
learning_rate = 5e-4

mlp_model.compile(optimizer=optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Fit the model (replace with your actual data)
mlp_model.fit(X_mlp, y_mlp, epochs=20, batch_size=32)  # Adjust epochs and batch size as needed

# Evaluate the model
# mlp_model.evaluate(X_test, y_test)  # Replace with your test data
