In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [2]:
# Column dtypes passed to pd.read_csv: the three label-like columns are read as
# pandas categoricals, every other listed column as an 8-bit integer.
dtype = {
    column: 'category' if column in ('ResponseID', 'AttributeLevel', 'ScenarioTypeStrict') else 'int8'
    for column in (
        'ResponseID', 'UserID', 'Intervention', 'PedPed', 'Barrier', 'CrossingSignal',
        'AttributeLevel', 'ScenarioTypeStrict', 'NumberOfCharacters', 'DiffNumberOFCharacters',
        'Saved', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl',
        'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive',
        'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
    )
}


In [3]:
# Read the CSV into a DataFrame, applying the per-column dtypes from the `dtype` mapping
df50 = pd.read_csv(filepath_or_buffer='total_50_dataset.csv', dtype=dtype)

In [4]:
# Preprocessing: one-hot encode the two categorical scenario columns.
#
# The result is assigned back to `df50`, so after the first run the source columns
# no longer exist. Encoding only the columns that are still present keeps this cell
# safe to re-run instead of raising a KeyError on the second execution.
categorical_columns = [
    column
    for column in ('AttributeLevel', 'ScenarioTypeStrict')
    if column in df50.columns
]
if categorical_columns:
    df50 = pd.get_dummies(df50, columns=categorical_columns)

print(df50.columns)


Index(['Intervention', 'PedPed', 'Barrier', 'CrossingSignal',
       'NumberOfCharacters', 'DiffNumberOFCharacters', 'Saved', 'Man', 'Woman',
       'Pregnant', 'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless',
       'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive',
       'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor',
       'MaleDoctor', 'Dog', 'Cat', 'UserID', 'AttributeLevel_Fat',
       'AttributeLevel_Female', 'AttributeLevel_Fit', 'AttributeLevel_High',
       'AttributeLevel_Hoomans', 'AttributeLevel_Less', 'AttributeLevel_Low',
       'AttributeLevel_Male', 'AttributeLevel_More', 'AttributeLevel_Old',
       'AttributeLevel_Pets', 'AttributeLevel_Young', 'ScenarioTypeStrict_Age',
       'ScenarioTypeStrict_Fitness', 'ScenarioTypeStrict_Gender',
       'ScenarioTypeStrict_Social Status', 'ScenarioTypeStrict_Species',
       'ScenarioTypeStrict_Utilitarian'],
      dtype='object')


In [5]:
# Dimensions of the encoded frame as (rows, columns)
(len(df50.index), len(df50.columns))

(5000000, 46)

In [8]:
# Number of rows that exactly repeat an earlier row (all columns are compared)
duplicate_mask = df50.duplicated(subset=None, keep='first')
duplicate_mask.sum()

4443264

In [10]:
# Count duplicated rows separately for each value of the target column

user_id = df50['UserID']
sub_1 = df50.loc[user_id == 1]
sub_0 = df50.loc[user_id == 0]

# Printed in the order the subsets are defined: class 1 first, then class 0
for subset in (sub_1, sub_0):
    print(subset.duplicated().sum())

2431416
2011848


In [10]:
# Separate the target column from the feature matrix
y = df50['UserID']                 # target variable
X = df50.drop(columns=['UserID'])  # features: every remaining column

In [11]:
# Split the data into train, validation and test sets (80% / 10% / 10%).
# NOTE(review): most rows are exact duplicates (see the duplicated() count earlier in the
# notebook), so identical rows can end up in several splits; confirm that is acceptable
# for the evaluation below.

# First hold out 10% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Then carve the validation set out of the remainder. Passing the test-set row count as an
# absolute `test_size` makes the validation set exactly as large as the test set, instead of
# relying on a hand-typed decimal approximation of 1/9.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=len(X_test), random_state=42
)

In [9]:
# Baseline classifier: logistic regression with the library's default settings,
# fitted on the training split
lr_model = LogisticRegression()
lr_model.fit(X=X_train, y=y_train)

In [10]:
# Predict class labels for the validation split
y_pred_lr = lr_model.predict(X=X_val)

In [11]:
# Validation accuracy of the logistic-regression baseline
accuracy_lr = accuracy_score(y_true=y_val, y_pred=y_pred_lr)
print('Accuracy:', accuracy_lr)

Accuracy: 0.595692


In [12]:
# Validation confusion matrix and per-class metrics for the logistic-regression baseline
cm = confusion_matrix(y_true=y_val, y_pred=y_pred_lr)
lr_report = classification_report(y_true=y_val, y_pred=y_pred_lr)

print('Confusion Matrix:', cm, sep='\n')

print('Classification Report:', lr_report, sep='\n')

Confusion Matrix:
[[180222  70287]
 [131867 117624]]
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.72      0.64    250509
           1       0.63      0.47      0.54    249491

    accuracy                           0.60    500000
   macro avg       0.60      0.60      0.59    500000
weighted avg       0.60      0.60      0.59    500000



In [None]:
# Without ResponseID the model does better than with it:
# with it: 97999, so worse than always guessing 0
# without it: 98296, which is slightly better than always guessing 0

# both measured on the validation set!

# on the test set it did slightly better still, I think

Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Forest hyperparameters: 100 trees, fixed seed
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)

# Fit the forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [20]:
# Predict class labels for the validation split (X_val), not the held-out test set
y_pred_rf = rf_model.predict(X=X_val)

In [21]:
# Validation accuracy of the random forest
accuracy_rf = accuracy_score(y_true=y_val, y_pred=y_pred_rf)
print(f'Accuracy: {accuracy_rf}')

Accuracy: 0.76391


In [22]:
# Compute both validation summaries for the random forest first, then print them
conf_matrix_rf = confusion_matrix(y_true=y_val, y_pred=y_pred_rf)
class_report_rf = classification_report(y_true=y_val, y_pred=y_pred_rf)

for heading, summary in (
    ("Confusion Matrix:\n", conf_matrix_rf),
    ("Classification Report:\n", class_report_rf),
):
    print(heading, summary)

Confusion Matrix:
 [[193537  56972]
 [ 61073 188418]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.77      0.77    250509
         1.0       0.77      0.76      0.76    249491

    accuracy                           0.76    500000
   macro avg       0.76      0.76      0.76    500000
weighted avg       0.76      0.76      0.76    500000



SVM

In [None]:
# Class balance of the target column in `sub_10000_encoded`.
# NOTE(review): `sub_10000_encoded` is not defined in any cell visible in this notebook;
# this cell relies on state created elsewhere (presumably a 10,000-row encoded sample,
# since the recorded counts sum to 10,000). Confirm where it comes from.
sub_10000_encoded['UserID'].value_counts()

UserID
0    9800
1     200
Name: count, dtype: int64

### MLP

In [15]:
# Import every Keras component through the same package: mixing `tensorflow.keras`
# with the standalone `keras` package can pair layers and models that come from
# different Keras installations.
from tensorflow.keras import layers, models, optimizers

learning_rate = 5e-4


def build_model(input_dim=None, lr=None):
    """Build and compile a small fully connected binary classifier.

    Architecture: two 64-unit ReLU layers, each followed by batch normalization,
    then a single sigmoid output unit. The model is compiled with Adam, binary
    cross-entropy loss, and accuracy as the reported metric.

    Args:
        input_dim: Number of input features. When omitted, ``X_train.shape[1]`` is
            read from the notebook namespace at call time, which is what the
            original zero-argument version always did.
        lr: Adam learning rate. When omitted, the notebook-level ``learning_rate``
            is used.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    if lr is None:
        lr = learning_rate

    # An explicit Input layer replaces the `input_shape=` argument on the first Dense
    # layer; recent Keras versions emit a warning for that argument in Sequential models.
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [13]:
# Convert the training and validation features and labels to float32 NumPy arrays.
# The right-hand side is fully evaluated before any name is rebound.
X_train, y_train, X_val, y_val = (
    np.array(split, dtype=np.float32)
    for split in (X_train, y_train, X_val, y_val)
)

In [16]:
# Build a fresh network and train it for 20 epochs, scoring the validation split each epoch
model = build_model()
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    verbose=1,
    validation_data=(X_val, y_val),
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 807us/step - accuracy: 0.6505 - loss: 0.5941 - val_accuracy: 0.6784 - val_loss: 0.5632
Epoch 2/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 849us/step - accuracy: 0.6707 - loss: 0.5709 - val_accuracy: 0.6788 - val_loss: 0.5616
Epoch 3/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 858us/step - accuracy: 0.6728 - loss: 0.5688 - val_accuracy: 0.6798 - val_loss: 0.5609
Epoch 4/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 820us/step - accuracy: 0.6739 - loss: 0.5679 - val_accuracy: 0.6796 - val_loss: 0.5609
Epoch 5/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 807us/step - accuracy: 0.6743 - loss: 0.5672 - val_accuracy: 0.6799 - val_loss: 0.5603
Epoch 6/20
[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 808us/step - accuracy: 0.6750 - loss: 0.5667 - val_accu

In [17]:
# Loss and accuracy of the trained network on the validation split
val_loss, val_accuracy = model.evaluate(x=X_val, y=y_val)

[1m15625/15625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 566us/step - accuracy: 0.6842 - loss: 0.5590


In [18]:
# Confusion matrix and classification report for the MLP on the validation split

# Keep the raw model outputs under their own name instead of overwriting them with labels
y_prob_mlp = model.predict(X_val)
# Threshold the outputs at 0.5 to obtain hard class predictions
y_pred_mlp = (y_prob_mlp > 0.5)

# The former np.concatenate of predictions and labels was removed: its result was
# neither assigned nor displayed, so it only allocated a throwaway array.

cm = confusion_matrix(y_val, y_pred_mlp)
print(cm)

# classification report

print(classification_report(y_val, y_pred_mlp))

[1m15625/15625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 522us/step
[[192173  58336]
 [100123 149368]]
              precision    recall  f1-score   support

         0.0       0.66      0.77      0.71    250509
         1.0       0.72      0.60      0.65    249491

    accuracy                           0.68    500000
   macro avg       0.69      0.68      0.68    500000
weighted avg       0.69      0.68      0.68    500000

