In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder



# Load the dataset

In [5]:

data = pd.read_csv(r'c:\Users\Lenovo\Desktop\data_analytics\data_set\traffic_accidents.csv')


# Convert crash_date to datetime format

In [6]:
data['crash_date'] = pd.to_datetime(data['crash_date'], errors='coerce')

# Drop rows with missing target or irrelevant features

In [7]:
columns_to_drop = ['crash_date', 'injuries_total', 'crash_hour', 'crash_day_of_week', 'crash_month']
data = data.drop(columns=columns_to_drop, errors='ignore')


# Encode categorical features using LabelEncoder

In [8]:
label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Define target and features

In [9]:
target = 'most_severe_injury'
X = data.drop(columns=[target], errors='ignore')
y = data[target]


# Split data into training and testing sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train a Random Forest Classifier

In [11]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)



# Make predictions

In [12]:
y_pred = clf.predict(X_test)


# Evaluate the model

In [13]:
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        56
           1       1.00      1.00      1.00      1338
           2       1.00      1.00      1.00     30831
           3       1.00      1.00      1.00      6384
           4       1.00      1.00      1.00      3253

    accuracy                           1.00     41862
   macro avg       1.00      0.99      0.99     41862
weighted avg       1.00      1.00      1.00     41862



# Feature importance

In [14]:
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': clf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)


Feature Importances:
                          Feature  Importance
15    injuries_non_incapacitating    0.373404
16  injuries_reported_not_evident    0.212490
17         injuries_no_indication    0.148141
8                      crash_type    0.111851
14        injuries_incapacitating    0.102024
3                first_crash_type    0.022955
12                      num_units    0.010365
13                 injuries_fatal    0.006153
11        prim_contributory_cause    0.004292
10                         damage    0.003891
0          traffic_control_device    0.001432
4                 trafficway_type    0.000889
2              lighting_condition    0.000682
1               weather_condition    0.000378
9          intersection_related_i    0.000354
6            roadway_surface_cond    0.000287
7                     road_defect    0.000262
5                       alignment    0.000149
