In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


# Neural networks
from tensorflow import keras
#import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

pd.set_option('display.max_columns', None)

In [3]:
df= pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Owner,Residential areas,Unknown,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,na,na,na,na,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Owner,Office areas,Undivided Two way,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Owner,Recreational areas,other,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Driver or rider,Male,31-50,3,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Governmental,Office areas,other,Tangent road with mild grade and flat terrain,Y Shape,Earth roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Pedestrian,Female,18-30,3,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Owner,Industrial areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,Not a Pedestrian,Overtaking,Slight Injury


In [4]:
df.isnull().sum()

Time                           0
Day_of_week                    0
Age_band_of_driver             0
Sex_of_driver                  0
Educational_level              0
Vehicle_driver_relation        0
Driving_experience             0
Owner_of_vehicle               0
Area_accident_occured          0
Lanes_or_Medians               0
Road_allignment                0
Types_of_Junction              0
Road_surface_type              0
Road_surface_conditions        0
Light_conditions               0
Weather_conditions             0
Type_of_collision              0
Number_of_vehicles_involved    0
Number_of_casualties           0
Vehicle_movement               0
Casualty_class                 0
Sex_of_casualty                0
Age_band_of_casualty           0
Casualty_severity              0
Pedestrian_movement            0
Cause_of_accident              0
Accident_severity              0
dtype: int64

### Feature Engineering

In [5]:
# Convert time to datetime
df['Time']=pd.to_datetime(df['Time'], format= '%H:%M:%S')

# Extract hours from Datetime
df['hour']= df['Time'].dt.hour

### Drop correlated values

In [6]:
df.drop(columns= ['Time', 'Casualty_severity', 'Sex_of_casualty', 'Casualty_class'], axis=1,inplace=True)

In [7]:
df.head() 

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Owner_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Age_band_of_casualty,Pedestrian_movement,Cause_of_accident,Accident_severity,hour
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Owner,Residential areas,Unknown,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,na,Not a Pedestrian,Moving Backward,Slight Injury,17
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Owner,Office areas,Undivided Two way,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,na,Not a Pedestrian,Overtaking,Slight Injury,17
2,Monday,18-30,Male,Junior high school,Employee,1-2yr,Owner,Recreational areas,other,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside objects,2,2,Going straight,31-50,Not a Pedestrian,Changing lane to the left,Serious Injury,17
3,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Governmental,Office areas,other,Tangent road with mild grade and flat terrain,Y Shape,Earth roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,18-30,Not a Pedestrian,Changing lane to the right,Slight Injury,1
4,Sunday,18-30,Male,Junior high school,Employee,2-5yr,Owner,Industrial areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,na,Not a Pedestrian,Overtaking,Slight Injury,1


**During the Analysis with the tree-based models, i dropped some features such as Time and other features with high correlation. I would do the same here** 
- Casualty severity, Sex of casualty, Casualty class all had high correlations with each other

### Select the categorical & numerical variables

In [8]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['Day_of_week',
 'Age_band_of_driver',
 'Sex_of_driver',
 'Educational_level',
 'Vehicle_driver_relation',
 'Driving_experience',
 'Owner_of_vehicle',
 'Area_accident_occured',
 'Lanes_or_Medians',
 'Road_allignment',
 'Types_of_Junction',
 'Road_surface_type',
 'Road_surface_conditions',
 'Light_conditions',
 'Weather_conditions',
 'Type_of_collision',
 'Vehicle_movement',
 'Age_band_of_casualty',
 'Pedestrian_movement',
 'Cause_of_accident',
 'Accident_severity']

In [9]:
numerical_cols = df.select_dtypes(include=['int32', 'int64', 'float']).columns.tolist()
numerical_cols

['Number_of_vehicles_involved', 'Number_of_casualties', 'hour']

In [10]:
# Label encode target variable
Target_mapping={'Slight Injury':0, 'Serious Injury':1, 'Fatal injury':2}
df['Accident_severity']= df['Accident_severity'].map(Target_mapping)

### Data Preprcoessing

In [11]:
# Seperate features from target
X= df.drop(columns='Accident_severity' , axis=1)
y= df['Accident_severity']

In [12]:
y.value_counts()

Accident_severity
0    10415
1     1743
2      158
Name: count, dtype: int64

The value_counts show a clear sign of imbalance

### Train Test split

In [13]:
# Train test split
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [14]:
# Dropped Accident_severity from my list of categorical columns since it is now a target variable and not a feature. 
# I had an error of column name not found in column after running the preprocessor
categorical_cols= categorical_cols[:-1]
categorical_cols

['Day_of_week',
 'Age_band_of_driver',
 'Sex_of_driver',
 'Educational_level',
 'Vehicle_driver_relation',
 'Driving_experience',
 'Owner_of_vehicle',
 'Area_accident_occured',
 'Lanes_or_Medians',
 'Road_allignment',
 'Types_of_Junction',
 'Road_surface_type',
 'Road_surface_conditions',
 'Light_conditions',
 'Weather_conditions',
 'Type_of_collision',
 'Vehicle_movement',
 'Age_band_of_casualty',
 'Pedestrian_movement',
 'Cause_of_accident']

### Initiating a Column Transformer

I used a column transformer to make the code more tidy and streamlined, showing a clear process

Since there is a clear imbalance, i introduced SMOTE to balance the minorities

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessor
preprocessor= ColumnTransformer(transformers= [
    ('num_pipeline', StandardScaler(), numerical_cols),
    ('cat_pipeline', OneHotEncoder(drop='first', handle_unknown= 'ignore', sparse_output= False), categorical_cols)
])

# fit-transform preprocessor to the training set
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Apply SMOTE to solve imbalance
smote= SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_preprocessed, y_train)

In [16]:
# check to see if there is balance
y_train_sm.value_counts()

Accident_severity
0    8331
1    8331
2    8331
Name: count, dtype: int64

### Tensorflow/Keras Implementation

In [None]:
# One hot encode the transformed smote target variable for neural network use
#y_train_enc= keras.utils.to_categorical(y_sm)
#y_test_enc = keras.utils.to_categorical(y_test)

In [38]:
from tensorflow.keras.regularizers import l2
model = Sequential(
    [
        Dense(128, activation='relu', input_shape= X_train_sm.shape[1:], kernel_regularizer=l2(0.01)),# l2 to shrink the weights
        Dropout(0.4),   # Implementing a 40% dropout regularization layer in this hidden layer
        Dense(64, activation='relu', kernel_regularizer=l2(0.002)),
        Dropout(0.2),   # Implementing a 20% dropout regularization layer in this hidden layer
        Dense(32, activation='relu'),
        Dense(3, activation='softmax')
    ]
)

Softmax is used in the output layer becasue it converts each output value into a probability for each classification and the problem is a multiclass classification

In [39]:
model.summary()

In [40]:
# compile the model
model.compile(loss= 'sparse_categorical_crossentropy', optimizer= 'Adam', metrics= ['accuracy'])

sparse_categorical_crossentropy is used as the loss function instead of BinaryCrossEntropy as the problem being solved is not a binary classification

In [41]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15, # extra chances for improvement
    restore_best_weights=True,  # return to best weights
    verbose=1
)

history = model.fit(X_train_sm, y_train_sm,
                    batch_size=32,
                    epochs= 100,
                    verbose= 2,
                    validation_data=(X_test_preprocessed, y_test),
                    callbacks= [early_stopping]) 

scores= model.evaluate(X_test_preprocessed, y_test, verbose=1)

# loss
print('Overall Test loss:', scores[0])

# Accuracy
print('Overall Test accuracy:', scores[1])

Epoch 1/100
782/782 - 8s - 10ms/step - accuracy: 0.6100 - loss: 1.0418 - val_accuracy: 0.4696 - val_loss: 0.9777
Epoch 2/100
782/782 - 3s - 4ms/step - accuracy: 0.7296 - loss: 0.6634 - val_accuracy: 0.6822 - val_loss: 0.8083
Epoch 3/100
782/782 - 4s - 5ms/step - accuracy: 0.7532 - loss: 0.6151 - val_accuracy: 0.5069 - val_loss: 0.9563
Epoch 4/100
782/782 - 4s - 5ms/step - accuracy: 0.7765 - loss: 0.5838 - val_accuracy: 0.7642 - val_loss: 0.7373
Epoch 5/100
782/782 - 4s - 5ms/step - accuracy: 0.7818 - loss: 0.5784 - val_accuracy: 0.6266 - val_loss: 0.8510
Epoch 6/100
782/782 - 4s - 5ms/step - accuracy: 0.7922 - loss: 0.5742 - val_accuracy: 0.6875 - val_loss: 0.8251
Epoch 7/100
782/782 - 4s - 5ms/step - accuracy: 0.8014 - loss: 0.5652 - val_accuracy: 0.4769 - val_loss: 1.0600
Epoch 8/100
782/782 - 5s - 7ms/step - accuracy: 0.8085 - loss: 0.5478 - val_accuracy: 0.6725 - val_loss: 0.8240
Epoch 9/100
782/782 - 4s - 5ms/step - accuracy: 0.8095 - loss: 0.5580 - val_accuracy: 0.7058 - val_loss

#### Parameters used to tackle overfitting
- Drop out regularization: In this case, At each training step, a certain percentage of neurons (e.g., 30%) is temporarily deactivated (set to zero output), forcing the network to learn robust representations insetad of relying on specific neurons. 
- l2 Regularization (Ridge): Adds a penalty to shrink the weights.
- Early stop: The point where the raining stops when the model sense no further improvement
- Patience: The extra chances give to the model for improvement
- restore_best_weights: returns the best weights from the epcohs with early stopping.

In [42]:
# predict Probabilities
y_pred_proba= model.predict(X_test_preprocessed)

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


The outputs of a Neural network are probabilities for each class, so to view how well the model is evaluated, it is important to convert to class predictions.

In [43]:
# convert probabilities into class predictions for evaluation
y_pred= np.argmax(y_pred_proba, axis=1)

from sklearn.metrics import accuracy_score, recall_score, f1_score

# Evaluating the model: callculating the metrics
accuracy= accuracy_score(y_test, y_pred)
recall= recall_score(y_test, y_pred, average='weighted')
f1= f1_score(y_test, y_pred,  average='weighted')
         
# create an empty list to store results
results=[]

# append evaluation metrics to the result
results.append({
    'model': 'Neural network',
    'Accuracy':f'{accuracy:.2f}' ,
    'Recall': f'{recall:.2f}',
    'F1 score': f'{f1:.2f}'
})
print('Evaluation on Test set')
# display results in tabular form
result_df= pd.DataFrame(results)
result_df

Evaluation on Test set


Unnamed: 0,model,Accuracy,Recall,F1 score
0,Neural network,0.8,0.8,0.78
