In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the raw accident records from the local CSV export.
df = pd.read_csv(r"D:\\Codes\\Projects\\ML\\Limi\data\\RTA Dataset.csv")

# Validate the target against the known severity labels.
# pd.Categorical maps every value that is not listed in `categories` (and
# every value that was already missing) to NaN, so the dropna has to run
# AFTER this conversion. Dropping first would let unexpected labels slip
# through and reach the encoder as NaN.
severity_order = ['Slight Injury', 'Serious Injury', 'Fatal injury']
df['Accident_severity'] = pd.Categorical(df['Accident_severity'], categories=severity_order, ordered=True)
df.dropna(subset=['Accident_severity'], inplace=True)

# Encode target as integers.
# NOTE: LabelEncoder numbers classes by sorted label text, not by
# `severity_order`, so the codes are 0='Fatal injury', 1='Serious Injury',
# 2='Slight Injury'. That is the reverse of severity_order but still monotone
# in severity, so absolute/squared differences between codes remain usable.
# Use `le.inverse_transform(...)` to map predictions back to label text.
le = LabelEncoder()
df['Severity_encoded'] = le.fit_transform(df['Accident_severity'])

# Fill missing values: most frequent value for text columns, mean for
# numeric columns.
# NOTE(review): these statistics are computed on the full dataset, i.e.
# before the train/test split, so the test rows influence the imputation.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].mean())

# Drop the text target (replaced by Severity_encoded) and the Time column;
# errors='ignore' tolerates either column being absent.
df.drop(columns=['Accident_severity', 'Time'], inplace=True, errors='ignore')

# One-hot encode the remaining categorical features; drop_first removes one
# indicator per feature to avoid a redundant column.
df = pd.get_dummies(df, drop_first=True)

# Features and labels
X = df.drop(columns=['Severity_encoded'])
y = df['Severity_encoded']

# Hold out a test partition; the fixed seed keeps the split reproducible and
# the split proportions are left at the library defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a random forest with default hyper-parameters (seeded) and predict the
# held-out rows. `fit` returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report each metric on the held-out set, in a fixed order.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

# Persist the fitted model next to the notebook.
joblib.dump(rf, 'random_forest_model.pkl')


Accuracy: 0.835985709645989
MSE: 0.2078596947060734
MAE: 0.17862942513803182


['random_forest_model.pkl']