### Import the necessary libraries

In [14]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

#### Read the data

In [15]:
# Load the raw accident records and discard the crash timestamp column
# (errors='ignore' keeps this a no-op when the column is absent).
df = (
    pd.read_csv('traffic_accidents.csv')
    .drop(columns=['crash_date'], errors='ignore')
)

#### Encode the categorical columns with LabelEncoder

In [16]:
# Names of every object-dtype column in df; these are the ones to be encoded.
categorical_columns = df.select_dtypes(include=['object']).columns
# Will map column name -> the fitted encoder for that column.
label_encoders = dict()

In [17]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder per column so the mapping can be reused or inverted later.
for column in categorical_columns:
    encoder = LabelEncoder().fit(df[column])
    df[column] = encoder.transform(df[column])
    label_encoders[column] = encoder

In [18]:
# Persist the column -> encoder mapping to disk.
with open('label_encoders.pkl', 'wb') as encoders_file:
    pickle.dump(label_encoders, encoders_file)

In [19]:
# Separate the target ('crash_type') from the features, then hold out 20% of
# the rows as a test set using a fixed seed.
target_column = 'crash_type'
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Persist the ordered list of training feature names to disk.
with open('feature_names.pkl', 'wb') as features_file:
    pickle.dump(X_train.columns.tolist(), features_file)

#### Define the candidate models

In [21]:
# Candidate classifiers, keyed by the display name used in reports and in the
# pickle file names.
#
# Every estimator with a stochastic component gets a fixed random_state (the
# same seed value as the train/test split) so a fresh Restart & Run All
# reproduces the same fitted models and the same metrics. LogisticRegression is
# left at its defaults: its default solver does not use random_state.
#
# NOTE(review): warnings are suppressed globally in the import cell, so any
# ConvergenceWarning from LogisticRegression / MLPClassifier at their default
# iteration limits would be hidden -- confirm these models actually converge.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
    'Multi-Layer Perceptron': MLPClassifier(random_state=42)
}

In [22]:
# Hyperparameter search spaces, keyed by model display name. Only the models
# listed here are tuned with a grid search.
param_grids = {}
param_grids['Logistic Regression'] = {'C': [0.1, 1, 10]}
param_grids['Decision Tree Classifier'] = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}

In [23]:

# Running "best so far" state: no model yet, and a score that any real
# accuracy value will exceed.
best_model, best_score = None, float('-inf')

In [24]:
# Fit every candidate model, report its metrics on the held-out test split,
# pickle it under a file name derived from its display name, and keep track of
# the estimator with the highest test accuracy.
#
# NOTE(review): the winner is chosen using the same test split that is reported
# as its score, so "Best Accuracy Score" is an optimistic estimate -- consider
# selecting on a separate validation split or on cross-validation scores.

# Reset the running best inside this cell so that re-running the cell on its
# own starts from a clean state instead of comparing against a previous run.
best_model = None
best_score = -np.inf

for model_name, model in models.items():
    if model_name in param_grids:
        # A grid is defined for this model: tune it with 5-fold cross-validation
        # on the training split and keep the refitted best estimator.
        grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        best_estimator = grid_search.best_estimator_
    else:
        # No grid: fit the model with the parameters it was constructed with.
        model.fit(X_train, y_train)
        best_estimator = model

    y_pred = best_estimator.predict(X_test)
    # Compute the accuracy once and reuse it for both reporting and selection.
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy Score: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("=" * 60)

    # e.g. 'Random Forest Classifier' -> 'random_forest_classifier.pkl'
    with open(f'{model_name.replace(" ", "_").lower()}.pkl', 'wb') as file:
        pickle.dump(best_estimator, file)

    # Strict '>' keeps the earliest model when accuracies tie.
    if accuracy > best_score:
        best_model = best_estimator
        best_score = accuracy


with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best Model: {best_model}")
print(f"Best Accuracy Score: {best_score:.4f}")

Model: Logistic Regression
Accuracy Score: 0.8311
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.67      0.78     18512
           1       0.79      0.96      0.86     23350

    accuracy                           0.83     41862
   macro avg       0.86      0.81      0.82     41862
weighted avg       0.85      0.83      0.83     41862

Confusion Matrix:
 [[12382  6130]
 [  942 22408]]
Model: Decision Tree Classifier
Accuracy Score: 0.8408
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.69      0.79     18512
           1       0.80      0.96      0.87     23350

    accuracy                           0.84     41862
   macro avg       0.87      0.82      0.83     41862
weighted avg       0.86      0.84      0.84     41862

Confusion Matrix:
 [[12732  5780]
 [  883 22467]]
Model: Random Forest Classifier
Accuracy Score: 0.8368
Classification Report:
               