In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
# read csv file
data=pd.read_csv("./road_accident1.csv")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,age_of_driver,Sex_of_driver,car_age,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Cause_of_accident,Accident_severity,accident history
0,0,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,Moving Backward,2,12
1,1,31-50,Male,5-10yrs,Asphalt roads,Dry,Daylight,Normal,Overtaking,2,1665
2,4,18-30,Male,5-10yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Overtaking,2,2554
3,7,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,No priority to vehicle,2,1746
4,8,18-30,Male,1-2yr,Earth roads,Dry,Daylight,Normal,Changing lane to the right,2,14


# PREPROCESSING

In [5]:
# display maximum 99 columns 
pd.options.display.max_columns=99

In [6]:
df=data.copy(deep=True)
df.head()

Unnamed: 0.1,Unnamed: 0,age_of_driver,Sex_of_driver,car_age,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Cause_of_accident,Accident_severity,accident history
0,0,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,Moving Backward,2,12
1,1,31-50,Male,5-10yrs,Asphalt roads,Dry,Daylight,Normal,Overtaking,2,1665
2,4,18-30,Male,5-10yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Overtaking,2,2554
3,7,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,No priority to vehicle,2,1746
4,8,18-30,Male,1-2yr,Earth roads,Dry,Daylight,Normal,Changing lane to the right,2,14


# TREATING WITH NULL VALUES

# REMOVING UNWANTED FEATURES, keeping required columns..

In [10]:
# checking null values
df.isnull().sum()

Unnamed: 0                 0
age_of_driver              0
Sex_of_driver              0
car_age                    0
Road_surface_type          0
Road_surface_conditions    0
Light_conditions           0
Weather_conditions         0
Cause_of_accident          0
Accident_severity          0
accident history           0
dtype: int64

In [15]:
#drop unwanted columns
df.drop("Unnamed: 0",axis=1,inplace=True)

In [16]:
# shape of data
df.shape

(5380, 10)

In [17]:
df.head()

Unnamed: 0,age_of_driver,Sex_of_driver,car_age,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Cause_of_accident,Accident_severity,accident history
0,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,Moving Backward,2,12
1,31-50,Male,5-10yrs,Asphalt roads,Dry,Daylight,Normal,Overtaking,2,1665
2,18-30,Male,5-10yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Overtaking,2,2554
3,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,No priority to vehicle,2,1746
4,18-30,Male,1-2yr,Earth roads,Dry,Daylight,Normal,Changing lane to the right,2,14


In [18]:
# checking for the columns of object dtypes with their unique values
for i in df.columns:
    if df[i].dtypes== object:
        print(i)
        print(df[i].unique())
        print(df[i].nunique())
        print()

age_of_driver
['18-30' '31-50' 'Over 51' 'Under 18' 'Unknown']
5

Sex_of_driver
['Male' 'Unknown' 'Female']
3

car_age
['Above 10yr' '5-10yrs' '1-2yr' '2-5yrs' 'Unknown' 'Below 1yr']
6

Road_surface_type
['Asphalt roads' 'Earth roads' 'Gravel roads' 'Other'
 'Asphalt roads with some distress']
5

Road_surface_conditions
['Dry' 'Wet or damp' 'Snow' 'Flood over 3cm. deep']
4

Light_conditions
['Daylight' 'Darkness - lights lit' 'Darkness - no lighting'
 'Darkness - lights unlit']
4

Weather_conditions
['Normal' 'Raining' 'Raining and Windy' 'Cloudy' 'Windy' 'Other' 'Snow'
 'Unknown' 'Fog or mist']
9

Cause_of_accident
['Moving Backward' 'Overtaking' 'No priority to vehicle'
 'Changing lane to the right' 'Changing lane to the left' 'No distancing'
 'No priority to pedestrian' 'Other' 'Unknown' 'Overturning'
 'Driving carelessly' 'Turnover' 'Driving to the left'
 'Driving at high speed' 'Driving under the influence of drugs'
 'Getting off the vehicle improperly' 'Overspeed' 'Drunk driving'

In [19]:
lst = []
for i in df.columns:
    if df[i].dtypes== object:
        lst.append(i)
print(lst)

['age_of_driver', 'Sex_of_driver', 'car_age', 'Road_surface_type', 'Road_surface_conditions', 'Light_conditions', 'Weather_conditions', 'Cause_of_accident']


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5380 entries, 0 to 5379
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   age_of_driver            5380 non-null   object
 1   Sex_of_driver            5380 non-null   object
 2   car_age                  5380 non-null   object
 3   Road_surface_type        5380 non-null   object
 4   Road_surface_conditions  5380 non-null   object
 5   Light_conditions         5380 non-null   object
 6   Weather_conditions       5380 non-null   object
 7   Cause_of_accident        5380 non-null   object
 8   Accident_severity        5380 non-null   int64 
 9   accident history         5380 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 462.3+ KB


In [21]:
df.tail()

Unnamed: 0,age_of_driver,Sex_of_driver,car_age,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Cause_of_accident,Accident_severity,accident history
5375,31-50,Male,Below 1yr,Asphalt roads,Dry,Daylight,Normal,Moving Backward,2,1712
5376,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,Changing lane to the left,1,3723
5377,Over 51,Male,2-5yrs,Asphalt roads,Dry,Daylight,Normal,Changing lane to the right,1,860
5378,18-30,Female,2-5yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Driving under the influence of drugs,2,846
5379,18-30,Male,2-5yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Changing lane to the right,2,849


In [22]:
len(df["accident history"].unique())

3869

In [23]:
df.columns

Index(['age_of_driver', 'Sex_of_driver', 'car_age', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Cause_of_accident', 'Accident_severity', 'accident history'],
      dtype='object')

In [24]:
df["age_of_driver"].unique()

array(['18-30', '31-50', 'Over 51', 'Under 18', 'Unknown'], dtype=object)

In [26]:
df.head()

Unnamed: 0,age_of_driver,Sex_of_driver,car_age,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Cause_of_accident,Accident_severity,accident history
0,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,Moving Backward,2,12
1,31-50,Male,5-10yrs,Asphalt roads,Dry,Daylight,Normal,Overtaking,2,1665
2,18-30,Male,5-10yrs,Asphalt roads,Dry,Darkness - lights lit,Normal,Overtaking,2,2554
3,18-30,Male,Above 10yr,Asphalt roads,Dry,Daylight,Normal,No priority to vehicle,2,1746
4,18-30,Male,1-2yr,Earth roads,Dry,Daylight,Normal,Changing lane to the right,2,14


# Creating Model

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,recall_score,f1_score,precision_score
from imblearn.over_sampling import RandomOverSampler

In [28]:
X=df.drop('Accident_severity',axis=1)
y=df['Accident_severity']

# our class in imbalance we can't perform classification model on imbalance class so we need to balance the class

In [31]:
# Create an instance of RandomUnderSampler
over_sampler = RandomOverSampler(random_state=42)

# Perform oversampling on the majority class
X_resampled, y_resampled = over_sampler.fit_resample(X, y)

# Verify the class distribution after oversampling
class_distribution = pd.Series(y_resampled).value_counts()
print("Class distribution after oversampling:")
print(class_distribution)

Class distribution after oversampling:
0    4548
1    4548
2    4548
Name: Accident_severity, dtype: int64


In [30]:
df.shape

(5380, 10)

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X_resampled, y_resampled,test_size=0.3,random_state=40)
column_transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), ['Sex_of_driver', 'car_age', 'Road_surface_type', 'Road_surface_conditions', 'Light_conditions', 'Weather_conditions', 'age_of_driver', 'Cause_of_accident'])  # Specify the columns to encode
    ],
    remainder='passthrough'  # Pass through the remaining columns without any transformation
)

# Create the pipeline
make_pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', RandomForestClassifier(max_depth=40, random_state=42))
])

# Fit the pipeline to the training data
make_pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = make_pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9433317049340498


In [79]:
X_train.columns

Index(['age_of_driver', 'Sex_of_driver', 'car_age', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Cause_of_accident', 'accident history'],
      dtype='object')

In [38]:
print('confusion matrix :\n',confusion_matrix(y_test,y_pred))

print()

print("===============================================")
print('classification report:\n',classification_report(y_test,y_pred))

print()

print("===============================================")
print('accuracy :',round(accuracy_score(y_test,y_pred),2))
print('precision :',round(precision_score(y_test,y_pred,average='weighted'),2))
print('recall :',round(recall_score(y_test,y_pred,average='weighted'),2))
print('f1 :',round(f1_score(y_test,y_pred,average='weighted'),2))
print()

confusion matrix :
 [[1411    0    0]
 [   0 1329   23]
 [  10  199 1122]]

classification report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1411
           1       0.87      0.98      0.92      1352
           2       0.98      0.84      0.91      1331

    accuracy                           0.94      4094
   macro avg       0.95      0.94      0.94      4094
weighted avg       0.95      0.94      0.94      4094


accuracy : 0.94
precision : 0.95
recall : 0.94
f1 : 0.94



In [39]:
with open("safe_road.pkl", "wb") as f:
    pickle.dump(make_pipeline, f)

In [None]:
import seaborn as sns

In [40]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report

# Assuming you have already calculated y_test and y_pred

# Generate the classification report
report = classification_report(y_test, y_pred)

# Create a DataFrame from the classification report
df_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()

# Plot the heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(df_report.iloc[:-1, :].T, annot=True, cmap="YlGnBu", cbar=False)
plt.title('Classification Report Heatmap')
plt.xlabel('Metrics')
plt.ylabel('Classes')
plt.tight_layout()
plt.savefig('Road_data_plot1.png')
plt.close()

In [41]:
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="YlGnBu", cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
# Save the plot as an image
plt.savefig('Road_data_plot.png')
plt.close()