In [1]:
# importing os (for the path override) and pandas
import os
import pandas as pd

# Location of the RTA dataset CSV. The default is the original Colab upload
# path; set the RTA_DATASET_PATH environment variable to load the file from
# somewhere else without editing the notebook (the hard-coded path raises
# FileNotFoundError on any machine where the file was not uploaded there).
DATA_PATH = os.environ.get("RTA_DATASET_PATH", "/content/RTA Dataset.csv")

# using pandas read_csv function to load the dataset, then preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/RTA Dataset.csv'

In [None]:
# Print the dataset summary: column names, non-null counts and dtypes
df.info()


In [None]:
# Per-column count of missing (NaN) entries
df.isna().sum()

In [None]:
# Class balance of the target: print the per-class counts, then chart them
import matplotlib.pyplot as plt

severity_counts = df['Accident_severity'].value_counts()
print(severity_counts)
severity_counts.plot(kind='bar')

In [None]:
# Bar chart of how many records fall into each driver education level
education_counts = df['Educational_level'].value_counts()
education_counts.plot.bar()

In [None]:
pip install git+https://github.com/amueller/dabl/

In [None]:
# Visualizing the dataset with the dabl library: dabl.plot is called on the
# full frame with 'Accident_severity' named as the target column.
import dabl

dabl.plot(df, target_col='Accident_severity')

In [None]:
import seaborn as sns

stackup_username = "Protectorate"
print("Submission by:", stackup_username)

# Count of records per road surface type, split by accident severity
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x="Road_surface_type", hue="Accident_severity", ax=ax)
ax.set_xlabel("Road surface type")
plt.xticks(rotation=60)  # tilt the x tick labels by 60 degrees
plt.show()

In [None]:
# Parse the object-typed 'Time' column into datetimes.
# Note: this overwrites the column on df itself.
df['Time'] = pd.to_datetime(df['Time'])

# Extracting the 'Hour_of_Day' feature from the parsed Time column into a new
# frame (df gains no extra column), then dropping Time from the result
new_df = df.assign(Hour_of_Day=df['Time'].dt.hour)
n_df = new_df.drop(columns='Time')
n_df.head()

In [None]:
# Columns kept as model inputs, chosen from the visualizations (to be continued).
# The order here is the column order of every frame built from this list.
features = [
    'Day_of_week',
    'Number_of_vehicles_involved',
    'Number_of_casualties',
    'Area_accident_occured',
    'Types_of_Junction',
    'Age_band_of_driver',
    'Sex_of_driver',
    'Educational_level',
    'Vehicle_driver_relation',
    'Type_of_vehicle',
    'Driving_experience',
    'Service_year_of_vehicle',
    'Type_of_collision',
    'Sex_of_casualty',
    'Age_band_of_casualty',
    'Cause_of_accident',
    'Hour_of_Day',
]
len(features)

In [None]:
# Split the frame into the chosen input columns and the label column
featureset_df = n_df.loc[:, features]
target = n_df.loc[:, 'Accident_severity']

In [None]:
# Summary (columns, non-null counts, dtypes) of the selected-feature subset
featureset_df.info()

In [None]:
# Replace NaNs in the listed columns with an explicit placeholder label, on the
# reasoning that a missing value may simply mean the detail was not available
# (e.g. vehicle service info). Columns not listed here are left untouched.
# NOTE(review): 'unknown' (lowercase) and 'Other' differ from the 'Unknown'
# used for the other columns - presumably chosen to match labels already
# present in those columns; confirm against the data.
fill_values = {
    'Service_year_of_vehicle': 'Unknown',
    'Types_of_Junction': 'Unknown',
    'Area_accident_occured': 'Unknown',
    'Driving_experience': 'unknown',
    'Type_of_vehicle': 'Other',
    'Vehicle_driver_relation': 'Unknown',
    'Educational_level': 'Unknown',
    'Type_of_collision': 'Unknown',
}

# fillna returns a new frame, so featureset_df itself is not modified
feature_df = featureset_df.fillna(value=fill_values)

# features information
feature_df.info()

In [None]:
# Model inputs X (the selected feature columns) and label y
X = feature_df.loc[:, features]
y = n_df.loc[:, 'Accident_severity']

# One-hot encoding with pandas get_dummies; drop_first=True drops the first
# level of every encoded column
encoded_df = pd.get_dummies(data=X, drop_first=True)
encoded_df.shape


In [None]:
# LabelEncoder maps the target labels to integer codes
from sklearn.preprocessing import LabelEncoder

# learn the classes and encode y in a single pass
lb = LabelEncoder()
y_encoded = lb.fit_transform(y)
print("Encoded labels:",lb.classes_)

# wrap the encoded array in a Series for the following steps
y_en = pd.Series(y_encoded)


In [None]:
# Chi-squared scoring (categorical input, categorical output) keeping the 50
# highest-scoring encoded columns
from sklearn.feature_selection import SelectKBest, chi2

fs = SelectKBest(score_func=chi2, k=50)
fs.fit(encoded_df, y_en)
X_new = fs.transform(encoded_df)

# Names of the columns that were kept
cols = fs.get_feature_names_out()

# Rebuild a labelled DataFrame from the selected columns
fs_df = pd.DataFrame(data=X_new, columns=cols)


In [None]:
import numpy as np

# importing the SMOTENC object from imblearn library
from imblearn.over_sampling import SMOTENC

# Positions of the categorical (one-hot) columns in fs_df, as SMOTENC expects.
# They are derived from the column names rather than a fixed range(3, 50):
# that range was only correct if exactly three numeric columns survived the
# SelectKBest step and sat at positions 0-2. Any fs_df column that is not one
# of X's numeric columns is a dummy produced by get_dummies.
numeric_cols = set(X.select_dtypes(include='number').columns)
n_cat_index = np.array(
    [pos for pos, col in enumerate(fs_df.columns) if col not in numeric_cols],
    dtype=int,
)

# creating smote object with SMOTENC class.
# n_jobs is intentionally not passed: True is not a meaningful job count, and
# the argument was deprecated and later removed from imbalanced-learn.
smote = SMOTENC(categorical_features=n_cat_index, random_state=42)
X_n, y_n = smote.fit_resample(fs_df, y_en)

# shape of the new upsampled dataset
X_n.shape, y_n.shape

In [None]:
# Class distribution of the resampled target
resampled_counts = y_n.value_counts()
print(resampled_counts)


In [None]:
# Imports for splitting, modelling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Hold out 20% of the resampled data for evaluation.
# NOTE(review): X_n / y_n are the SMOTENC output, so this split happens after
# oversampling; held-out rows may be synthetic or derived from training rows.
# Consider splitting before resampling.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_n,
    y_n,
    test_size=0.2,
    random_state=42,
)

# Random forest baseline, fitted on the training split
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=20,
    random_state=42,
).fit(X_trn, y_trn)

# Predictions for the held-out rows
predics = rf.predict(X_tst)

# Score on the training rows (shown as the cell output)
rf.score(X_trn, y_trn)

In [None]:
stackup_username = "Protectorate"
print("Submission by:", stackup_username)

# Classification report for the predictions on the held-out split
classif_re = classification_report(y_true=y_tst, y_pred=predics)
print(classif_re)

In [None]:
# Weighted-average F1 of the model on the held-out split
f1score = f1_score(y_true=y_tst, y_pred=predics, average='weighted')
print(f1score)