In [1]:


import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier


# Location of the source CSV. It can be overridden with the ACCIDENT_REASON_CSV
# environment variable so the notebook is runnable on machines other than the
# author's; when the variable is unset the original absolute path is used.
DATA_PATH = os.environ.get(
    "ACCIDENT_REASON_CSV",
    r"C:\Users\KIIT\Desktop\1\RoadAccidentsInIndia\ModifiedDatabase\reasonOfAccident.csv",
)

# NOTE: on_bad_lines="skip" drops malformed rows without raising, so the shape
# printed below is the only visible check that no rows were lost.
df = pd.read_csv(DATA_PATH, on_bad_lines="skip")
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())



def find_reason_column(df):
    """Heuristically pick the column that describes the accident reason.

    Selection order:
      1. the first column whose label contains one of the keywords
         ("reason", "cause", "fault", "accident"), case-insensitively;
      2. otherwise the first column with an object or string dtype;
      3. otherwise the first column of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Column labels do not have to be strings.

    Returns
    -------
    The label of the selected column.

    Raises
    ------
    IndexError
        If ``df`` has no columns at all (from ``df.columns[0]``).
    """
    keywords = ["reason", "cause", "fault", "accident"]
    for col in df.columns:
        # Labels may be ints/tuples (e.g. header=None); str() avoids an
        # AttributeError from calling .lower() on a non-string label.
        label = str(col).lower()
        if any(k in label for k in keywords):
            return col

    # fallback → first text-like column (object dtype or a dedicated string dtype)
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == "object" or pd.api.types.is_string_dtype(dtype):
            return col

    return df.columns[0]


# Resolve the "reason" column once; later statements refer to it via reason_col.
# NOTE(review): find_reason_column returns the first column whose name contains
# a keyword. For this dataset that appears to be the first
# "Fault of Driver-Total No. of Road Accidents - 2014" column (a count, not
# free text) — confirm this is the intended column.
reason_col = find_reason_column(df)
print("Selected reason column →", reason_col)


# ------------------------------------------------------------
# 3. Clean reason text
# ------------------------------------------------------------
def clean_text(s):
    """Return a normalised, lowercase version of ``s`` for text matching.

    Missing values (as reported by ``pd.isna``) become the empty string.
    Anything else is converted with ``str``, lowercased, stripped of every
    character that is not a word character, whitespace or a hyphen (each such
    character is replaced by a space), and finally has runs of whitespace
    collapsed to a single space with leading/trailing whitespace removed.
    """
    if pd.isna(s):
        return ""

    lowered = str(s).lower()
    # Punctuation becomes a space rather than being deleted so that adjacent
    # words are not glued together.
    without_punctuation = re.sub(r"[^\w\s\-]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

# Normalise the selected column into lowercase, punctuation-free text.
# clean_text() is applied to the raw values (no prior .astype(str)): converting
# first would turn missing values into the literal string "nan", bypassing the
# pd.isna() guard inside clean_text() that maps them to "".
df["reason_clean"] = df[reason_col].map(clean_text)


# ------------------------------------------------------------
# 4. Create reason categories (target variable)
# ------------------------------------------------------------
# Since this dataset contains statistics by state/fault type,
# create categories based on the fault type columns present
def create_categories_from_columns(df):
    """Label each row with the fault category that has the most accidents.

    For every row, the "Total No. of Road Accidents - 2014" columns listed in
    ``checks`` (only those actually present in ``df``) are compared and the
    category of the largest count is returned.

    Rules:
      * ties are resolved in favour of the category listed first in ``checks``;
      * missing values, values that cannot be parsed as numbers, and values
        not greater than -1 never win;
      * a row with no eligible value (or a frame with none of the expected
        columns) is labelled "Other";
      * a row whose counts are all zero is labelled with the first present
        category, because 0 is greater than the -1 threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-state accident statistics.

    Returns
    -------
    list of str
        One category label per row of ``df``, in row order.
    """
    # (category label, source column) pairs, in tie-breaking priority order.
    checks = [
        ("Driver Fault", "Fault of Driver-Total No. of Road Accidents - 2014"),
        ("Other Driver Fault", "Fault of Driver of other vehicles-Total No. of Road Accidents - 2014"),
        ("Pedestrian Fault", "Fault of Pedestrian-Total No. of Road Accidents - 2014"),
        ("Vehicle Defect", "Defect in Condition of Motor Vehicle-Total No. of Road Accidents - 2014"),
        ("Road Defect", "Defect in Road Condition-Total No. of Road Accidents - 2014"),
        ("Weather", "Weather Condition-Total No. of Road Accidents - 2014"),
        ("Passenger Fault", "Fault of Passenger-Total No. of Road Accidents - 2014"),
        ("Poor Light", "Poor light-Total No. of Road Accidents - 2014"),
    ]

    present = [(cat_name, col_name) for cat_name, col_name in checks if col_name in df.columns]
    if not present:
        return ["Other"] * len(df)

    labels = [cat_name for cat_name, _ in present]
    # Coerce to float so stray text in a count column becomes NaN instead of
    # raising TypeError during the comparison.
    counts = (
        df[[col_name for _, col_name in present]]
        .apply(pd.to_numeric, errors="coerce")
        .to_numpy(dtype=float, na_value=np.nan)
    )

    # Ineligible cells are pushed to -inf so they can only "win" when every
    # cell in the row is ineligible, which is detected below.
    eligible = np.where(np.isnan(counts) | (counts <= -1), -np.inf, counts)
    winners = eligible.argmax(axis=1)  # argmax returns the first maximum → priority order on ties
    has_winner = eligible.max(axis=1) > -np.inf

    return [labels[i] if ok else "Other" for i, ok in zip(winners, has_winner)]

# Derive the target label for every row and show how the classes are balanced.
df["reason_category"] = create_categories_from_columns(df)
category_counts = df["reason_category"].value_counts()

print("\nCategory distribution:")
print(category_counts)

# ------------------------------------------------------------
# 5. Select features (X) and target (y) for ML
# ------------------------------------------------------------
# Use all numeric columns as features
# Every int64/float64 column becomes a model feature.
# NOTE(review): reason_category is derived from the
# "...-Total No. of Road Accidents - 2014" columns, which are presumably
# numeric and therefore also part of X — the models can recover the target
# directly from their inputs (target leakage), so reported accuracy is
# optimistic. An identifier such as "Sl. No" would also be picked up if it is
# numeric — confirm the intended feature set.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
if not numeric_cols:
    raise Exception("No numeric columns available for ML model!")

y = df["reason_category"]
X = df[numeric_cols]

# Keep only rows that have a target label (very few / none missing in this dataset).
mask = y.notna()
X, y = X.loc[mask], y.loc[mask]


# ------------------------------------------------------------
# 6. Encode target classes
# ------------------------------------------------------------
# Map the string categories to integer class ids (the encoder is saved later
# so predictions can be mapped back to names).
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


# ------------------------------------------------------------
# 7. Train-test split
# ------------------------------------------------------------
# Stratify only when every class has at least 2 samples; otherwise fall back
# to a plain random split.
# NOTE(review): without stratification a class may be absent from y_train;
# verify that all three models tolerate that for this data.
class_counts = pd.Series(y_encoded).value_counts()
if (class_counts >= 2).all():
    stratify_option = y_encoded
else:
    stratify_option = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=stratify_option,
)


# ------------------------------------------------------------
# 8. Impute and scale features
# ------------------------------------------------------------
# Both transformers learn their statistics from the training split only and
# are then applied unchanged to the test split.

# Median imputation of missing numeric values
imputer = SimpleImputer(strategy="median").fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation of the imputed features
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# ------------------------------------------------------------
# 9. Train multiple ML models
# ------------------------------------------------------------

def _fit_and_score(model, train_features, test_features):
    """Fit ``model`` on the training split; return (test predictions, test accuracy)."""
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    return predictions, accuracy_score(y_test, predictions)


# 1. Logistic Regression — trained and evaluated on the standardised features
log_reg = LogisticRegression(max_iter=2000)
pred_lr, acc_lr = _fit_and_score(log_reg, X_train_scaled, X_test_scaled)

# 2. Random Forest — tree-based, so the imputed (unscaled) features are used
rf = RandomForestClassifier(n_estimators=400, random_state=42)
pred_rf, acc_rf = _fit_and_score(rf, X_train_imputed, X_test_imputed)

# 3. XGBoost — tree-based, so the imputed (unscaled) features are used
xgb_params = {
    "learning_rate": 0.1,
    "max_depth": 5,
    "n_estimators": 250,
    "subsample": 0.8,
    "eval_metric": "mlogloss",
}
xgb = XGBClassifier(**xgb_params)
pred_xgb, acc_xgb = _fit_and_score(xgb, X_train_imputed, X_test_imputed)

# ------------------------------------------------------------
# 10. Print performance
# ------------------------------------------------------------
print("\n====================== MODEL ACCURACY ======================")
print("Logistic Regression:", acc_lr)
print("Random Forest:", acc_rf)
print("XGBoost:", acc_xgb)

# Candidates in priority order: max() returns the first entry with the highest
# accuracy, so ties go to Logistic Regression, then Random Forest, then XGBoost.
candidates = [
    ("Logistic Regression", log_reg, acc_lr),
    ("Random Forest", rf, acc_rf),
    ("XGBoost", xgb, acc_xgb),
]
best_name, best_model, best_acc = max(candidates, key=lambda entry: entry[2])

# ------------------------------------------------------------
# 11. Save trained model + scaler + label encoder + imputer
# ------------------------------------------------------------
# Everything needed to reuse the model later, keyed by output file name.
# NOTE(review): the scaler is always saved, but only Logistic Regression was
# trained on scaled features — a consumer has to know which model won.
artifacts = {
    "accident_reason_model.pkl": best_model,
    "scaler.pkl": scaler,
    "label_encoder.pkl": label_encoder,
    "imputer.pkl": imputer,
}
for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, artifact_file)

print("\nBest Model:", best_name)
print("\nSaved:")
for artifact_file in artifacts:
    print(f"- {artifact_file}")

print("\nTraining Complete!")


Dataset shape: (37, 63)
Columns: ['Sl. No', 'States/UTs', 'Fault of Driver-Total No. of Road Accidents - 2014', 'Fault of Driver-Total No. of Road Accidents - 2014 per 1L people', 'Fault of Driver-Number of Persons-Killed - 2014', 'Fault of Driver-Number of Persons-Killed - 2014 per 1L people', 'Fault of Driver-Number of Persons-Injured - 2014', 'Fault of Driver-Number of Persons-Injured - 2014 per 1L people', 'Fault of Driver of other vehicles-Total No. of Road Accidents - 2014', 'Fault of Driver of other vehicles-Total No. of Road Accidents - 2014 per 1L people', 'Fault of Driver of other vehicles-Number of Persons-Killed - 2014', 'Fault of Driver of other vehicles-Number of Persons-Killed - 2014 per 1L people', 'Fault of Driver of other vehicles-Number of Persons-Injured - 2014', 'Fault of Driver of other vehicles-Number of Persons-Injured - 2014 per 1L people', 'Fault of Pedestrian-Total No. of Road Accidents - 2014', 'Fault of Pedestrian-Total No. of Road Accidents - 2014 per 1L p

In [2]:
# Predict accident reason categories for all states.
# NOTE: X contains the rows the models were trained on, so agreement between
# reason_category and predicted_reason here is not a measure of generalisation.
X_all_imputed = imputer.transform(X)
X_all_scaled = scaler.transform(X_all_imputed)

# Feed the winning model the representation it was trained on: Logistic
# Regression was fitted on scaled features, the tree-based models on the
# imputed (unscaled) features. Passing scaled data to a tree model would
# silently produce wrong predictions.
X_all_input = X_all_scaled if best_model is log_reg else X_all_imputed
predictions_encoded = best_model.predict(X_all_input)
predictions_labels = label_encoder.inverse_transform(predictions_encoded)

# Add predictions to the dataframe, aligned on X's index so the assignment
# stays correct even if rows were dropped from X when the target was missing.
df['predicted_reason'] = pd.Series(predictions_labels, index=X.index)

result_columns = ['States/UTs', 'reason_category', 'predicted_reason']

# Display results
print("\n====================== PREDICTIONS FOR ALL STATES ======================")
print(df[result_columns].to_string())

# Save results to CSV
df[result_columns].to_csv('accident_predictions_by_state.csv', index=False)
print("\nResults saved to 'accident_predictions_by_state.csv'")


               States/UTs reason_category predicted_reason
0          Andhra Pradesh    Driver Fault     Driver Fault
1       Arunachal Pradesh      Poor Light       Poor Light
2                   Assam    Driver Fault     Driver Fault
3                   Bihar    Driver Fault     Driver Fault
4            Chhattisgarh    Driver Fault     Driver Fault
5                     Goa    Driver Fault     Driver Fault
6                 Gujarat    Driver Fault     Driver Fault
7                 Haryana    Driver Fault     Driver Fault
8        Himachal Pradesh    Driver Fault     Driver Fault
9         Jammu & Kashmir    Driver Fault     Driver Fault
10              Jharkhand    Driver Fault     Driver Fault
11              Karnataka    Driver Fault     Driver Fault
12                 Kerala    Driver Fault     Driver Fault
13         Madhya Pradesh    Driver Fault     Driver Fault
14            Maharashtra    Driver Fault     Driver Fault
15                Manipur  Vehicle Defect   Vehicle Def