# Libraries and Packages

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report,accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

# Exploratory Data Analysis

In [12]:
#reading the raw data file from local
acc_sev=pd.read_csv('E:\BIA Data Analytics and AI\Capstone Project\Road_Raw_Data.csv') 

In [13]:
#shows the first 10 values from top of table to display all columns
pd.set_option('display.max_columns', None)  
acc_sev.head(10)

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,Residential areas,,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,Office areas,Undivided Two way,Tangent road with flat terrain,No junction,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,No defect,Recreational areas,other,,No junction,Asphalt roads,Dry,Daylight,Normal,Collision with roadside objects,2,2,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,No defect,Office areas,other,Tangent road with mild grade and flat terrain,Y Shape,Earth roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,No defect,Industrial areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Darkness - lights lit,Normal,Vehicle with vehicle collision,2,2,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
5,14:15:00,Friday,31-50,Male,,Unknown,,,,,,,,,Y Shape,,Dry,Daylight,Normal,Vehicle with vehicle collision,1,1,U-Turn,Driver or rider,Male,31-50,3,Driver,Normal,Not a Pedestrian,Overloading,Slight Injury
6,17:30:00,Wednesday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,,No defect,Residential areas,Undivided Two way,Tangent road with flat terrain,Crossing,,Dry,Daylight,Normal,Vehicle with vehicle collision,1,1,Moving Backward,Driver or rider,Female,18-30,3,Driver,Normal,Not a Pedestrian,Other,Slight Injury
7,17:20:00,Friday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Governmental,Above 10yr,No defect,Residential areas,other,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Daylight,Normal,Vehicle with vehicle collision,2,1,U-Turn,na,na,na,na,,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
8,17:20:00,Friday,18-30,Male,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,1-2yr,No defect,Industrial areas,other,Tangent road with flat terrain,Y Shape,Earth roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,1,Going straight,Pedestrian,Male,Under 18,3,Driver,Normal,Crossing from driver's nearside,Changing lane to the right,Slight Injury
9,17:20:00,Friday,18-30,Male,Junior high school,Employee,1-2yr,Automobile,Owner,2-5yrs,No defect,Residential areas,Undivided Two way,Tangent road with flat terrain,Y Shape,Asphalt roads,Dry,Daylight,Normal,Collision with roadside-parked vehicles,2,1,U-Turn,Passenger,Male,18-30,3,Driver,Normal,Not a Pedestrian,Moving Backward,Serious Injury


# Pre Processing

In [14]:
#replacing the missing categorical values with the most frequent categorical values from dataset
acc_sev['Educational_level']=acc_sev['Educational_level'].fillna('Junior high school')
acc_sev['Vehicle_driver_relation']=acc_sev['Vehicle_driver_relation'].fillna('Employee')
acc_sev['Driving_experience']=acc_sev['Driving_experience'].fillna('5-10yr')
acc_sev['Type_of_vehicle']=acc_sev['Type_of_vehicle'].fillna('Automobile')
acc_sev['Owner_of_vehicle']=acc_sev['Owner_of_vehicle'].fillna('Owner')
acc_sev['Service_year_of_vehicle']=acc_sev['Service_year_of_vehicle'].fillna('Unknown')
acc_sev['Defect_of_vehicle']=acc_sev['Defect_of_vehicle'].fillna('No defect')
acc_sev['Area_accident_occured']=acc_sev['Area_accident_occured'].fillna('Other')
acc_sev['Lanes_or_Medians']=acc_sev['Lanes_or_Medians'].fillna('Two-way (divided with broken lines road marking)')
acc_sev['Road_allignment']=acc_sev['Road_allignment'].fillna('Tangent road with flat terrain')
acc_sev['Types_of_Junction']=acc_sev['Types_of_Junction'].fillna('Y Shape')
acc_sev['Road_surface_type']=acc_sev['Road_surface_type'].fillna('Asphalt roads')
acc_sev['Type_of_collision']=acc_sev['Type_of_collision'].fillna('Vehicle with vehicle collision')
acc_sev['Vehicle_movement']=acc_sev['Vehicle_movement'].fillna('Going straight')
acc_sev['Work_of_casuality']=acc_sev['Work_of_casuality'].fillna('Driver')
acc_sev['Fitness_of_casuality']=acc_sev['Fitness_of_casuality'].fillna('Normal')

In [15]:
#remove beginning spaces from the data points in specified column
acc_sev['Area_accident_occured']=acc_sev['Area_accident_occured'].str.lstrip()

# Encoding 

In [16]:
#replacing the values that can't be converted by encoder
acc_sev['Driving_experience']=acc_sev['Driving_experience'].replace(['Below 1yr','1-2yr','2-5yr','5-10yr','Above 10yr'], 
                                                                    ['a','b','c','d','e'])
acc_sev['Age_band_of_driver']=acc_sev['Age_band_of_driver'].replace(['Under 18','18-30','31-50','Over 51'],
                                                                    ['a','b','c','d'])
acc_sev['Type_of_vehicle']=acc_sev['Type_of_vehicle'].replace(['Lorry (41?100Q)','Pick up upto 10Q','Public (12 seats)','Lorry (11?40Q)'
                                                             ,'Public (13?45 seats)','Public (> 45 seats)'],
                                                             ['a','b','c','d','e','f'])
acc_sev['Service_year_of_vehicle']=acc_sev['Service_year_of_vehicle'].replace(['Below 1yr','1-2yr','2-5yrs','5-10yrs','Above 10yr'],
                                                                              ['a','b','c','d','e'])
acc_sev['Age_band_of_casualty']=acc_sev['Age_band_of_casualty'].replace(['Under 18','18-30','31-50','Over 51'],
                                                                        ['a','b','c','d'])
acc_sev['Fitness_of_casuality']=acc_sev['Fitness_of_casuality'].replace(['NormalNormal'],['Normal'])

In [17]:
#encoding categorical data using label encoder(non-null value columns)
encoder1=LabelEncoder() 
acc_sev['Accident_severity']=encoder1.fit_transform(acc_sev['Accident_severity'])#0 = Fatal Injury, 1= Serious Injury, 2=Slight Injury 
acc_sev['Sex_of_driver']=encoder1.fit_transform(acc_sev['Sex_of_driver'])
acc_sev['Road_surface_conditions']=encoder1.fit_transform(acc_sev['Road_surface_conditions'])
acc_sev['Light_conditions']=encoder1.fit_transform(acc_sev['Light_conditions'])
acc_sev['Casualty_class']=encoder1.fit_transform(acc_sev['Casualty_class'])
acc_sev['Sex_of_casualty']=encoder1.fit_transform(acc_sev['Sex_of_casualty'])
acc_sev['Day_of_week']=encoder1.fit_transform(acc_sev['Day_of_week'])
acc_sev['Pedestrian_movement']=encoder1.fit_transform(acc_sev['Pedestrian_movement'])
acc_sev['Weather_conditions']=encoder1.fit_transform(acc_sev['Weather_conditions'])
acc_sev['Cause_of_accident']=encoder1.fit_transform(acc_sev['Cause_of_accident'])
acc_sev['Age_band_of_driver']=encoder1.fit_transform(acc_sev['Age_band_of_driver'])
acc_sev['Age_band_of_casualty']=encoder1.fit_transform(acc_sev['Age_band_of_casualty'])
acc_sev['Casualty_severity']=encoder1.fit_transform(acc_sev['Casualty_severity'])


#encoding categorical data using label encoder(null value columns)
encoder2=LabelEncoder()
acc_sev['Educational_level']=encoder2.fit_transform(acc_sev['Educational_level'])
acc_sev['Vehicle_driver_relation']=encoder2.fit_transform(acc_sev['Vehicle_driver_relation'])
acc_sev['Owner_of_vehicle']=encoder2.fit_transform(acc_sev['Owner_of_vehicle'])
acc_sev['Defect_of_vehicle']=encoder2.fit_transform(acc_sev['Defect_of_vehicle'])
acc_sev['Fitness_of_casuality']=encoder2.fit_transform(acc_sev['Fitness_of_casuality'])
acc_sev['Work_of_casuality']=encoder2.fit_transform(acc_sev['Work_of_casuality'])
acc_sev['Vehicle_movement']=encoder2.fit_transform(acc_sev['Vehicle_movement'])
acc_sev['Type_of_collision']=encoder2.fit_transform(acc_sev['Type_of_collision'])
acc_sev['Road_surface_type']=encoder2.fit_transform(acc_sev['Road_surface_type'])
acc_sev['Area_accident_occured']=encoder2.fit_transform(acc_sev['Area_accident_occured'])
acc_sev['Lanes_or_Medians']=encoder2.fit_transform(acc_sev['Lanes_or_Medians'])
acc_sev['Types_of_Junction']=encoder2.fit_transform(acc_sev['Types_of_Junction'])
acc_sev['Road_allignment']=encoder2.fit_transform(acc_sev['Road_allignment'])
acc_sev['Type_of_vehicle']=encoder2.fit_transform(acc_sev['Type_of_vehicle'])
acc_sev['Driving_experience']=encoder2.fit_transform(acc_sev['Driving_experience'])
acc_sev['Service_year_of_vehicle']=encoder2.fit_transform(acc_sev['Service_year_of_vehicle'])

In [18]:
#gives the described information of data
acc_sev.describe()

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
count,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0,12316.0
mean,2.980513,2.270867,0.957535,3.208347,0.333793,3.326486,6.689834,2.693975,1.546931,1.987983,4.938698,2.946736,4.930091,3.635677,0.177899,0.718902,2.177493,2.414907,6.405408,2.040679,1.548149,2.864566,1.453962,1.148019,3.319341,2.293927,0.674082,1.997077,4.836067,6.916044,1.832819
std,2.059712,1.158926,0.263767,1.361538,0.741375,1.287379,5.648358,0.877637,1.886462,0.133116,2.442829,1.716511,0.902867,3.094667,0.696446,1.276059,1.324187,1.146012,2.726791,0.68879,1.007179,2.057282,1.331309,0.742709,1.480446,0.587734,1.221069,0.093602,0.893611,5.101751,0.406082
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,1.0,2.0,0.0,2.0,0.0,3.0,0.0,2.0,4.0,2.0,5.0,1.0,0.0,0.0,0.0,2.0,6.0,2.0,1.0,2.0,0.0,1.0,2.0,2.0,0.0,2.0,5.0,1.0,2.0
50%,3.0,2.0,1.0,4.0,0.0,4.0,8.0,3.0,0.0,2.0,5.0,2.0,5.0,2.0,0.0,0.0,3.0,2.0,8.0,2.0,1.0,2.0,1.0,1.0,3.0,2.0,0.0,2.0,5.0,9.0,2.0
75%,5.0,3.0,1.0,4.0,0.0,4.0,11.0,3.0,3.0,2.0,6.0,4.0,5.0,7.0,0.0,0.0,3.0,2.0,8.0,2.0,2.0,3.0,3.0,2.0,5.0,3.0,1.0,2.0,5.0,11.0,2.0
max,6.0,4.0,2.0,6.0,3.0,6.0,16.0,3.0,5.0,2.0,12.0,6.0,8.0,7.0,4.0,3.0,3.0,8.0,9.0,7.0,8.0,12.0,3.0,2.0,5.0,3.0,6.0,3.0,8.0,19.0,2.0


# Splitting the data

In [25]:
#spliting the features and target from the dataset
# Specify the columns we want to use
specified_columns = ['Day_of_week','Sex_of_casualty','Light_conditions','Age_band_of_casualty','Number_of_vehicles_involved']

# Filter the DataFrame to include only the specified columns
x = acc_sev[specified_columns]
y = acc_sev['Accident_severity']
# sm = SMOTE(random_state=42)
# x_smote,y_smote = sm.fit_resample(x,y)

# ML Model Implementation

### Random Forest Classifiers

In [26]:
#using train_test_split method to give model training and testing data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
x_train = np.ascontiguousarray(x_train)
x_test = np.ascontiguousarray(x_test)
y_train = np.ascontiguousarray(y_train)
y_test = np.ascontiguousarray(y_test)

In [27]:
#training the model from the obtained best parameters
param = {
    'bootstrap': False,
    'class_weight': None,
    'max_depth': 20,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300
}

rfc= RandomForestClassifier(**param)
rfc_model= rfc.fit(x_train,y_train)
rfc_model

In [28]:
#testing the model once after completing training
y_pred=rfc_model.predict(x_test)

In [29]:
#evaluation metrics for model's performance
RFC_accuracy=accuracy_score(y_test,y_pred)

print('Accuracy of Random Forest Classifier: {:.2f}%'.format(RFC_accuracy * 100))

#report for classfication to evaluate performance
Report=classification_report(y_test,y_pred)

print('Classification Report of Random Forest:','\n',Report)

Accuracy of Random Forest Classifier: 83.06%
Classification Report of Random Forest: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.24      0.05      0.08       531
           2       0.85      0.97      0.91      3126

    accuracy                           0.83      3695
   macro avg       0.36      0.34      0.33      3695
weighted avg       0.75      0.83      0.78      3695



# Model Deployment

In [30]:
from joblib import dump
from joblib import load
import streamlit as st
from streamlit_jupyter import StreamlitPatcher, tqdm
StreamlitPatcher().jupyter()

In [31]:
dump(rfc,'Accident_severity.joblib')

['Accident_severity.joblib']