# Multilayer Perceptron

## Loading the dataset

In [1]:
import logging

import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from keras import models, layers, Input
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

2024-10-20 17:47:45.314601: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Column groups, listed in the order they should appear in `dtype_dict`.
_str_columns = (
    'ResponseID', 'ExtendedSessionID', 'UserID',
    'AttributeLevel', 'ScenarioTypeStrict',
)
_int8_columns = (
    'PedPed', 'Barrier', 'CrossingSignal',
    'NumberOfCharacters', 'DiffNumberOFCharacters',
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'Saved',
)
_float32_columns = (
    'Finance_access', 'ICT', 'Industry_activity', 'Overall_index',
    'Research_and_development', 'Skills', 'Total', 'Males', 'Females',
    'Passengers', 'Pedestrians',
)

# Mapping column name -> dtype, passed to `read_csv` in the loading cells.
dtype_dict = {
    **dict.fromkeys(_str_columns, 'str'),
    **dict.fromkeys(_int8_columns, 'int8'),
    **dict.fromkeys(_float32_columns, 'float32'),
}

In [3]:
# Read the Utilitarian subset lazily with the explicit dtypes and keep the
# resulting partitions in memory for the operations that follow.
# (A dask `ProgressBar` can be registered beforehand to show progress.)
# NOTE(review): the next cell rebinds `df` to 'Data/cleaned_data.csv', so the
# frame persisted here is discarded - confirm which file is the intended input.
df = dd.read_csv('Data/Utilitarian_subset.csv', dtype=dtype_dict).persist()

In [4]:
# Read the cleaned dataset lazily, using the dtypes declared in `dtype_dict`.
# This rebinds `df`, replacing the frame that the previous cell loaded and
# persisted; unlike that cell, the result here is not persisted.
df = dd.read_csv('Data/cleaned_data.csv', dtype=dtype_dict)

In [4]:
# Lift the column limit so that every column of the wide frame is displayed.
pd.options.display.max_columns = None
df.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters_x,DiffNumberOFCharacters,Saved,Country,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,Finance_access,ICT,Industry_activity,Overall_index,Research_and_development,Skills,Total,Males,Females,Passengers,Pedestrians,NumberOfLivesSaved
0,225e6AAuRbMsFLmPw,800690618_8381025834549560.0,8381025834549560.0,0,0,0,Less,Utilitarian,2,3,1,DEU,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.8,0.8,0.9,0.9,0.8,0.75,4.2,6.35,2.15,1.912,0.612,-3
1,229AS4ANJzqkECeWr,1306199838_2127450984180964.0,2127450984180960.0,0,1,0,Less,Utilitarian,3,2,1,GBR,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0.9,0.8,0.8,0.9,0.75,0.85,2.95,4.6,1.45,1.155,0.711,-2
2,22EnPHDydkozvrsmN,959039305_2158440549669984.0,2158440549669980.0,1,0,2,Less,Utilitarian,2,3,0,BRA,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.7,0.45,0.7,0.7,0.6,0.55,20.35,33.900002,7.15,4.6168,3.6019,-3
3,22Kp459kE4XnpkBhC,1828530624_3085943025398171.0,3085943025398170.0,0,0,1,Less,Utilitarian,2,3,0,NLD,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.8,0.8,0.9,0.95,0.7,0.85,3.8,5.65,1.95,1.444,0.3496,-3
4,22PbQ7njqmSP7Jwnr,-1145927433_4193653037.0,4193653037.0,0,1,0,Less,Utilitarian,1,4,0,FRA,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.8,0.7,0.9,0.9,0.8,0.75,5.5,8.55,2.7,2.992,0.8855,-4


## Data preparation

In [5]:
# Remove the identifier columns ('ResponseID', 'ExtendedSessionID', 'UserID')
# and 'Country'; none of them is used in the rest of the notebook.
df = df.drop(columns=['ResponseID', 'ExtendedSessionID', 'UserID', 'Country'])

In [6]:
# Numerical columns: these are the ones standardised later on.
num_cols = [
    # scenario size
    'NumberOfCharacters', 'DiffNumberOFCharacters',
    # per-character counts
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
    # float-valued indicators
    'Finance_access', 'ICT', 'Industry_activity', 'Overall_index',
    'Research_and_development', 'Skills', 'Total', 'Males', 'Females',
    'Passengers', 'Pedestrians',
]

# Categorical columns: label-encoded, not scaled. 'CrossingSignal' is stored
# as an integer but is treated as a category.
cat_cols = ['AttributeLevel', 'ScenarioTypeStrict', 'CrossingSignal']

# Binary flags.
binary_cols = ['PedPed', 'Barrier']

In [9]:
# Print the dtype of every remaining column as a sanity check.
print(df.dtypes)

PedPed                         int8
Barrier                        int8
CrossingSignal                 int8
AttributeLevel               object
ScenarioTypeStrict           object
NumberOfCharacters             int8
DiffNumberOFCharacters         int8
Saved                          int8
Man                            int8
Woman                          int8
Pregnant                       int8
Stroller                       int8
OldMan                         int8
OldWoman                       int8
Boy                            int8
Girl                           int8
Homeless                       int8
LargeWoman                     int8
LargeMan                       int8
Criminal                       int8
MaleExecutive                  int8
FemaleExecutive                int8
FemaleAthlete                  int8
MaleAthlete                    int8
FemaleDoctor                   int8
MaleDoctor                     int8
Dog                            int8
Cat                         

In [8]:
# Name each dtype group once instead of repeating the column lists.
text_cols = ['AttributeLevel', 'ScenarioTypeStrict']
int8_cols = [
    "PedPed", "Barrier", "CrossingSignal",
    "NumberOfCharacters", "DiffNumberOFCharacters",
    "Man", "Woman", "Pregnant", "Stroller", "OldMan", "OldWoman",
    "Boy", "Girl", "Homeless", "LargeWoman", "LargeMan", "Criminal",
    "MaleExecutive", "FemaleExecutive", "FemaleAthlete", "MaleAthlete",
    "FemaleDoctor", "MaleDoctor", "Dog", "Cat", "Saved",
]
float32_cols = [
    "Finance_access", "ICT", "Industry_activity", "Overall_index",
    "Research_and_development", "Skills", "Total", "Males", "Females",
    "Passengers", "Pedestrians",
]

# Text columns become plain strings.
df[text_cols] = df[text_cols].astype(str)

# Integer columns go through float first and are rounded before the narrowing
# cast to int8.
df[int8_cols] = df[int8_cols].astype(float).round().astype('int8')

# Continuous columns are stored as float32.
df[float32_cols] = df[float32_cols].astype('float32')

In [10]:
# Standardise the numerical columns to zero mean and unit variance.
# NOTE(review): the statistics are computed over the whole dataset, i.e. before
# the train/validation/test split performed further down, so the held-out rows
# influence the scaling. Confirm this is acceptable for the evaluation.
numeric = df[num_cols]

# Evaluate both statistics in one pass over the data instead of running a
# separate `.compute()` (and therefore a separate read of the CSV) for each.
mean, std = dask.compute(numeric.mean(), numeric.std())

# A column that is constant in this dataset has std == 0, and (x - mean) / 0
# would fill it with NaN and poison the network's loss. Dividing by 1 instead
# leaves such a column at exactly 0 after centring.
std = std.where(std > 0, 1.0)

df[num_cols] = (numeric - mean) / std

# Persist so that the scaled partitions are computed once and kept in memory
# rather than being recomputed by every later operation.
df = df.persist()

KeyboardInterrupt: 

In [None]:
# Label encode categorical columns
def encode_labels(df, cols, classes=None):
    """Return a copy of ``df`` in which ``cols`` hold integer label codes.

    Each column is converted to strings before encoding, so numeric and
    textual categories are handled the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame (or a single Dask partition) to encode. It is not modified.
    cols : list of str
        Names of the columns to encode.
    classes : dict, optional
        Maps each column name to the ordered list of its string labels; the
        code of a label is its position in that list, and labels missing from
        the list are encoded as -1. When omitted, a fresh ``LabelEncoder`` is
        fitted on the column, so the codes then depend only on the values
        present in ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the encoded columns stored as int64.
    """
    # Work on a copy: functions handed to `map_partitions` should not mutate
    # the partition they receive.
    encoded = df.copy()
    for col in cols:
        as_text = encoded[col].astype(str)
        if classes is None:
            encoded[col] = LabelEncoder().fit_transform(as_text)
        else:
            codes = pd.Categorical(as_text, categories=classes[col]).codes
            encoded[col] = codes.astype('int64')
    return encoded

# Collect the distinct labels of every categorical column over the WHOLE
# dataset (one pass). Fitting an encoder inside each partition would assign
# codes from that partition's values only, so the same label could receive
# different codes in different partitions.
label_sets = dask.compute(*(df[col].astype(str).unique() for col in cat_cols))
cat_classes = {col: sorted(labels) for col, labels in zip(cat_cols, label_sets)}

# Apply label encoding with the shared, dataset-wide label lists
df = df.map_partitions(encode_labels, cat_cols, classes=cat_classes)

# Trigger the computation to save the changes
df = df.persist()

In [14]:
# Label encode the categorical columns
# `LabelEncoder.fit_transform` returns a NumPy array, which cannot be assigned
# to a column of the Dask frame and would first pull the whole column into
# memory. Instead, gather the distinct values of each column in one pass and
# map every value to its rank in sorted order - the same codes LabelEncoder
# assigns - lazily, partition by partition.
# NOTE(review): this repeats the encoding done by the previous cell; on columns
# that are already encoded as 0..k-1 the mapping leaves them unchanged.
distinct_values = dask.compute(*(df[col].unique() for col in cat_cols))
for col, values in zip(cat_cols, distinct_values):
    code_of = {value: code for code, value in enumerate(sorted(values))}
    df[col] = df[col].map(code_of, meta=(col, 'int64'))

In [None]:
# Lift the column limit so that every column of the wide frame is displayed.
pd.options.display.max_columns = None
df.head()

## Train and evaluate the model

In [16]:
# Split dataset in features and target variable
# NOTE(review): only the numerical columns are used as features; the encoded
# categorical columns and the binary flags ('PedPed', 'Barrier') are left out.
# Confirm that this is intentional.
feature_cols = [
    'NumberOfCharacters', 'DiffNumberOFCharacters',
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
    'Finance_access', 'ICT', 'Industry_activity', 'Overall_index',
    'Research_and_development', 'Skills', 'Total', 'Males', 'Females',
    'Passengers', 'Pedestrians',
]
label = 'Saved'

# `df` is a lazy Dask frame, but the cells below hand X and y to scikit-learn's
# `train_test_split`, which needs in-memory data it can index by row position.
# Materialise both in a single pass:
# X is the feature matrix, y the target variable.
X, y = dask.compute(df[feature_cols], df[label])

In [None]:
# Hold out 15% of the rows as the test set, stratified on the label.
holdout_parts = train_test_split(X, y, test_size=0.15, stratify=y, random_state=0)
X_trainval, X_test, y_trainval, y_test = holdout_parts

# Split the remainder into training and validation sets. 0.17647059 is
# 0.15 / 0.85, so the validation set is also about 15% of the full data.
validation_parts = train_test_split(
    X_trainval, y_trainval,
    test_size=0.17647059, stratify=y_trainval, random_state=42,
)
X_train, X_val, y_train, y_val = validation_parts

# Report the shapes of the three feature matrices.
print(*(part.shape for part in (X_train, X_val, X_test)))

In [None]:
def build_model(n_features=None):
    """Build and compile the binary-classification MLP.

    The network has two hidden ReLU layers (32 and 16 units) and a single
    sigmoid output unit. It is compiled with the RMSprop optimizer,
    binary cross-entropy loss and accuracy as the reported metric.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to the number of columns of the
        global ``X_train``.

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    if n_features is None:
        n_features = X_train.shape[1]

    # Declare the input with an explicit `Input` object; passing `input_shape`
    # to the first Dense layer is deprecated in Keras 3.
    model = models.Sequential([
        Input(shape=(n_features,)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Train a fresh model, tracking accuracy on the validation set after each epoch.
model = build_model()

fit_options = {'epochs': 150, 'batch_size': 32, 'verbose': 1}
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    **fit_options,
)

In [None]:
def smooth_curve(points, factor=0.9):
    """Smooth a sequence with an exponential moving average.

    The first point is kept as is; every later point becomes
    ``previous_smoothed * factor + point * (1 - factor)``.

    Parameters
    ----------
    points : sequence of float
        Values to smooth, e.g. one metric value per epoch.
    factor : float, default 0.9
        Weight of the previous smoothed value; higher means smoother.

    Returns
    -------
    list of float
        The smoothed values, same length as ``points``.
    """
    smoothed = []
    for point in points:
        if smoothed:
            smoothed.append(smoothed[-1] * factor + point * (1 - factor))
        else:
            smoothed.append(point)
    return smoothed


def _plot_accuracy(train_acc, val_acc, title):
    """Plot training and validation accuracy per epoch on one figure."""
    fig, ax = plt.subplots()
    ax.plot(range(1, len(train_acc) + 1), train_acc, label='training')
    ax.plot(range(1, len(val_acc) + 1), val_acc, label='validation')
    ax.set(title=title, xlabel='Epochs', ylabel='Accuracy')
    ax.legend()
    plt.show()


# Raw per-epoch accuracy recorded by `model.fit`.
acc_history = history.history['accuracy']
val_acc_history = history.history['val_accuracy']
_plot_accuracy(acc_history, val_acc_history, 'Accuracy per epoch')

# Smoothed curves make the overall trend easier to read than the noisy raw ones.
smooth_acc_history = smooth_curve(acc_history)
val_smooth_acc_history = smooth_curve(val_acc_history)
_plot_accuracy(smooth_acc_history, val_smooth_acc_history,
               'Smoothed accuracy per epoch')

In [None]:
# Train the final model on the combined training + validation set, then
# measure its loss and accuracy on the held-out test set.
model = build_model()

final_fit_options = {'epochs': 50, 'batch_size': 32, 'verbose': 0}
model.fit(X_trainval, y_trainval, **final_fit_options)

test_loss_score, test_acc_score = model.evaluate(X_test, y_test)
print("Test accuracy:", test_acc_score)

In [None]:
# Predicted probability of the positive class for every test row.
y_prob = model.predict(X_test)
print(y_prob)

# Threshold at 0.5 to obtain the predicted class.
y_pred = (y_prob > 0.5)
print(y_pred)

# Show predictions and true labels side by side. `y_test` is a pandas Series,
# which has no `.reshape`, so convert it to a NumPy array first.
y_true_column = np.asarray(y_test).reshape(-1, 1)
print(np.concatenate((y_pred.reshape(-1, 1), y_true_column), axis=1))

In [None]:
# Confusion matrix of the thresholded predictions against the true labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Overall test accuracy, shown as the cell's output.
accuracy_score(y_true=y_test, y_pred=y_pred)