# MLP

In [1]:
import pandas as pd
import numpy as np
import logging
from keras import models, layers, optimizers, initializers, Input
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt

2024-10-21 12:31:52.758655: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('Data/cleaned_data.csv')

In [3]:
df[['ResponseID', 'ExtendedSessionID', 'UserID', 'AttributeLevel','ScenarioTypeStrict']] = df[['ResponseID', 'ExtendedSessionID', 'UserID', 'AttributeLevel','ScenarioTypeStrict']].astype(str)
df[["PedPed", "Barrier", "CrossingSignal", "NumberOfCharacters", "DiffNumberOFCharacters", "Man", "Woman", "Pregnant", "Stroller", "OldMan", "OldWoman", "Boy", "Girl", "Homeless", "LargeWoman", "LargeMan", "Criminal", "MaleExecutive", "FemaleExecutive", "FemaleAthlete", "MaleAthlete", "FemaleDoctor", "MaleDoctor", "Dog", "Cat", "Saved"]] = df[["PedPed", "Barrier", "CrossingSignal", "NumberOfCharacters", "DiffNumberOFCharacters", "Man", "Woman", "Pregnant", "Stroller", "OldMan", "OldWoman", "Boy", "Girl", "Homeless", "LargeWoman", "LargeMan", "Criminal", "MaleExecutive", "FemaleExecutive", "FemaleAthlete", "MaleAthlete", "FemaleDoctor", "MaleDoctor", "Dog", "Cat", "Saved"]].astype(float).round().astype('int8')
df[["Finance_access", "ICT", "Industry_activity", "Overall_index", "Research_and_development", "Skills", "Total", "Males", "Females", "Passengers", "Pedestrians"]] = df[["Finance_access", "ICT", "Industry_activity", "Overall_index", "Research_and_development", "Skills", "Total", "Males", "Females", "Passengers", "Pedestrians"]].astype('float32')

In [4]:
df.dtypes

ResponseID                   object
ExtendedSessionID            object
UserID                       object
PedPed                         int8
Barrier                        int8
CrossingSignal                 int8
AttributeLevel               object
ScenarioTypeStrict           object
NumberOfCharacters             int8
DiffNumberOFCharacters         int8
Saved                          int8
Country                      object
Man                            int8
Woman                          int8
Pregnant                       int8
Stroller                       int8
OldMan                         int8
OldWoman                       int8
Boy                            int8
Girl                           int8
Homeless                       int8
LargeWoman                     int8
LargeMan                       int8
Criminal                       int8
MaleExecutive                  int8
FemaleExecutive                int8
FemaleAthlete                  int8
MaleAthlete                 

In [5]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters,DiffNumberOFCharacters,Saved,Country,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,Finance_access,ICT,Industry_activity,Overall_index,Research_and_development,Skills,Total,Males,Females,Passengers,Pedestrians
0,2223Xu54ufgjcyMR3,1425316635_327833569077076.0,327833569077076.0,0,1,0,Old,Age,5,0,0,MEX,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.6,0.55,0.8,0.6,0.5,0.4,13.6,22.049999,5.45,2.5024,3.876
1,2223jMWDEGNeszivb,-1683127088_785070916172117.0,785070916172117.0,1,0,2,More,Utilitarian,5,2,0,CHE,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0.9,0.65,0.9,0.9,0.7,0.8,2.9,4.2,1.6,0.5076,0.6237
2,222HpiEf2LtAwEg62,-1232628507_1597557389,1597557389.0,0,1,0,Female,Gender,2,0,0,UKR,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.75,0.5,0.65,0.65,0.5,0.65,13.25,21.75,5.95,7.412,5.6984
3,222KuWty7pNeiv77a,1654911454_3639764894860440.0,3639764894860440.0,1,0,0,Low,Social Status,2,0,0,USA,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0.9,0.65,0.8,1.0,1.0,0.75,12.5,17.85,7.25,3.9603,1.9737
4,222LDp4wz24C3chzj,-1679158262_3623236506.0,3623236506.0,0,0,0,Fat,Fitness,2,0,0,DEU,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.8,0.8,0.9,0.9,0.8,0.75,4.2,6.35,2.15,1.912,0.612


In [6]:
# Delete the columns 'ResponseID', ExtendedSessionID' and 'UserID' and 'Country'
df = df.drop(['ResponseID', 'ExtendedSessionID', 'UserID', 'Country'], axis=1)

In [7]:
num_cols = ['NumberOfCharacters', 'DiffNumberOFCharacters', 'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman', 'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal', 'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete', 'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat', 'Finance_access', 'ICT', 'Industry_activity', 'Overall_index', 'Research_and_development', 'Skills', 'Total', 'Males', 'Females', 'Passengers', 'Pedestrians']
cat_cols = ['AttributeLevel', 'ScenarioTypeStrict', 'CrossingSignal'] #Categorical columns that shouldn't be scaled: 'CrossingSignal'
binary_cols = ['PedPed', 'Barrier']

In [8]:
# Normalize the numerical columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [9]:
# Label encode the categorical columns
labelencoder = LabelEncoder()
for col in cat_cols:
    df[col] = labelencoder.fit_transform(df[col])

In [14]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,Finance_access,ICT,Industry_activity,Overall_index,Research_and_development,Skills,Total,Males,Females,Passengers,Pedestrians
0,0,1,0,9,0,1.355313,-0.483363,0,-0.525093,-0.525323,-0.233628,-0.227924,3.519691,5.458979,-0.349847,-0.350054,-0.276851,-0.350099,-0.350039,-0.227544,-0.304694,-0.305035,-0.358794,-0.35871,-0.28825,-0.288368,-0.299649,-0.29939,-2.087185,-0.997931,0.062204,-1.920313,-1.103975,-2.651884,0.895208,1.028475,0.377931,-0.485556,2.023675
1,1,0,2,8,6,1.355313,1.287652,0,-0.525093,-0.525323,-0.233628,-0.227924,1.580369,-0.359223,-0.349847,-0.350054,-0.276851,-0.350099,-0.350039,3.776382,2.591354,-0.305035,1.581299,-0.35871,-0.28825,-0.288368,1.496821,-0.29939,0.806543,-0.199999,1.252444,0.266284,-0.170225,0.597756,-1.208276,-1.16604,-1.246748,-1.752301,-0.941406
2,0,1,0,1,2,-0.66781,-0.483363,0,-0.525093,-0.525323,-0.233628,-0.227924,-0.358953,1.580178,-0.349847,-0.350054,-0.276851,1.920806,-0.350039,-0.227544,-0.304694,-0.305035,-0.358794,-0.35871,-0.28825,-0.288368,-0.299649,-0.29939,-0.640321,-1.396898,-1.723157,-1.555881,-1.103975,-0.620859,0.826402,0.991592,0.588928,2.632155,3.685135
3,1,0,0,6,4,-0.66781,-0.483363,0,-0.525093,-0.525323,-0.233628,-0.227924,-0.358953,-0.359223,-0.349847,-0.350054,4.994596,-0.350099,-0.350039,-0.227544,-0.304694,-0.305035,-0.358794,-0.35871,-0.28825,-0.288368,-0.299649,-0.29939,0.806543,-0.199999,0.062204,0.99515,1.230399,0.191551,0.678962,0.512118,1.137521,0.440244,0.289372
4,0,0,0,0,1,-0.66781,-0.483363,0,1.167251,-0.525323,-0.233628,-0.227924,-0.358953,-0.359223,-0.349847,-0.350054,-0.276851,1.920806,-0.350039,-0.227544,-0.304694,-0.305035,-0.358794,-0.35871,-0.28825,-0.288368,-0.299649,-0.29939,-0.158033,0.9969,1.252444,0.266284,0.29665,0.191551,-0.952712,-0.901715,-1.014651,-0.860474,-0.952073


In [11]:
# Assume cleaned_data is already loaded as a pandas DataFrame
# Example: cleaned_data = pd.read_csv('your_large_dataset.csv')

# Define the batch size
batch_size = 100_000

# Split features (X) and labels (y)
X = df.drop(columns=['Saved'])  # Assuming 'target' is the label column
y = df['Saved']

# Optionally, you can split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Define a generator for batching the data
def data_generator(X, y, batch_size):
    num_samples = len(X)
    while True:  # Loop forever so the generator never terminates
        # Shuffle data at the start of each epoch
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X = X.iloc[indices]
        y = y.iloc[indices]
        
        # Yield batches
        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)
            yield X.iloc[start_idx:end_idx].values, y.iloc[start_idx:end_idx].values

# Load data in batches using the generator
train_data_gen = data_generator(X_train, y_train, batch_size)
test_data_gen = data_generator(X_test, y_test, batch_size)

# Print information about the dataset and generator
try:
    # Get number of samples in train and test sets
    num_train_samples = len(X_train)
    num_test_samples = len(X_test)
    
    print("Data loading completed successfully.")
    print(f"Number of training samples: {num_train_samples}")
    print(f"Number of test samples: {num_test_samples}")
    
except Exception as e:
    print(f"Error loading data: {e}")
    raise

Data loading completed successfully.
Number of training samples: 8499993
Number of test samples: 1499999


In [15]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(32, activation='relu', kernel_initializer=initializers.HeNormal(), input_shape=(df.shape[1] - 1,)))
    model.add(layers.Dense(16, activation='relu', kernel_initializer=initializers.HeNormal()))
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer=initializers.HeNormal()))
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = build_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Number of folds for cross-validation
k_folds = 5

# Define the batch size
batch_size = 100_000

# Assume cleaned_data is already loaded as a pandas DataFrame
# Example: cleaned_data = pd.read_csv('your_large_dataset.csv')

# Split features (X) and labels (y)
X = df.drop(columns=['Saved'])  # Assuming 'target' is the label column
y = df['Saved']

# Define a generator for batching the data
def data_generator(X, y, batch_size):
    num_samples = len(X)
    while True:  # Loop forever so the generator never terminates
        # Yield batches
        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)
            yield X.iloc[start_idx:end_idx].values, y.iloc[start_idx:end_idx].values

# Initialize KFold from scikit-learn
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Array to store accuracy for each fold
fold_accuracies = []

# Loop through each fold
for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print(f"Training fold {fold+1}/{k_folds}...")

    # Create train and validation sets for this fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Create generators for this fold
    train_data_gen = data_generator(X_train, y_train, batch_size)
    val_data_gen = data_generator(X_val, y_val, batch_size)

    # Define your model here
    # For simplicity, let's assume we have a Keras model defined as 'model'
    # You would need to reinitialize your model for each fold (since the model should be retrained from scratch)
    # model = create_model()  # Assuming this function creates a fresh instance of your model

    # Train the model on this fold
    model.fit(
        train_data_gen,
        steps_per_epoch=len(X_train) // batch_size,
        validation_data=val_data_gen,
        validation_steps=len(X_val) // batch_size,
        epochs=5  # You can adjust the number of epochs as needed
    )

    # Evaluate the model on the validation set (using the generator)
    val_predictions = model.predict(val_data_gen, steps=len(X_val) // batch_size)
    
    # Assuming your target values are categorical, you might need to convert predictions to class labels
    val_predictions = np.argmax(val_predictions, axis=1)  # Convert probabilities to class labels
    
    # Calculate accuracy for this fold
    accuracy = accuracy_score(y_val[:len(val_predictions)], val_predictions)
    fold_accuracies.append(accuracy)

    print(f"Fold {fold+1} accuracy: {accuracy}")

# Calculate average accuracy across all folds
average_accuracy = np.mean(fold_accuracies)
print(f"Average accuracy across {k_folds} folds: {average_accuracy}")

Training fold 1/5...
Epoch 1/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 174ms/step - accuracy: 0.5385 - loss: 0.7413 - val_accuracy: 0.6071 - val_loss: 0.6533
Epoch 2/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 122ms/step - accuracy: 0.6137 - loss: 0.6510 - val_accuracy: 0.6407 - val_loss: 0.6349
Epoch 3/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 152ms/step - accuracy: 0.6406 - loss: 0.6356 - val_accuracy: 0.6546 - val_loss: 0.6252
Epoch 4/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 123ms/step - accuracy: 0.6534 - loss: 0.6266 - val_accuracy: 0.6633 - val_loss: 0.6189
Epoch 5/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 106ms/step - accuracy: 0.6613 - loss: 0.6198 - val_accuracy: 0.6688 - val_loss: 0.6145
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step
Fold 1 accuracy: 0.5021460537610809
Training fold 2/5...
Epoch 1/5
[1m79/79[0m [32m━━━━━━━━━━

In [13]:
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(32, activation='relu', kernel_initializer=initializers.HeNormal(), input_shape=(X_train.shape[1],)))
    model.add(layers.Dense(16, activation='relu', kernel_initializer=initializers.HeNormal()))
    model.add(layers.Dense(1, activation='sigmoid', kernel_initializer=initializers.HeNormal()))
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = build_model()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Assume you have a Keras model called `model`
model.fit(
    train_data_gen,  # Training data generator
    steps_per_epoch=len(X_train) // batch_size,  # Total steps per epoch
    validation_data=val_data_gen,  # Test data generator
    validation_steps=len(X_val) // batch_size,  # Validation steps per epoch
    epochs=10  # Number of epochs
)

Epoch 1/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 207ms/step - accuracy: 0.5672 - loss: 0.6850 - val_accuracy: 0.6438 - val_loss: 0.6332
Epoch 2/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 273ms/step - accuracy: 0.6491 - loss: 0.6284 - val_accuracy: 0.6661 - val_loss: 0.6170
Epoch 3/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 362ms/step - accuracy: 0.6682 - loss: 0.6151 - val_accuracy: 0.6749 - val_loss: 0.6078
Epoch 4/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 297ms/step - accuracy: 0.6781 - loss: 0.6058 - val_accuracy: 0.6826 - val_loss: 0.6000
Epoch 5/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 366ms/step - accuracy: 0.6846 - loss: 0.5989 - val_accuracy: 0.6881 - val_loss: 0.5936
Epoch 6/10
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 361ms/step - accuracy: 0.6903 - loss: 0.5928 - val_accuracy: 0.6945 - val_loss: 0.5877
Epoch 7/10
[1m84/84[

<keras.src.callbacks.history.History at 0x1376bf860>

In [None]:
# Evaluate the model on the validation set (using the generator)
val_predictions = model.predict(val_data_gen, steps=len(X_val) // batch_size)
    
# Assuming your target values are categorical, you might need to convert predictions to class labels
val_predictions = np.argmax(val_predictions, axis=1)  # Convert probabilities to class labels

# Calculate accuracy for this fold
accuracy = accuracy_score(y_val[:len(val_predictions)], val_predictions)
fold_accuracies.append(accuracy)
print(f"Fold {fold+1} accuracy: {accuracy}")

# Calculate average accuracy across all folds
average_accuracy = np.mean(fold_accuracies)
print(f"Average accuracy across {k_folds} folds: {average_accuracy}")