In [1]:

import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

DATA_PATH = r"C:\Users\KIIT\Desktop\1\RoadAccidentsInIndia\ModifiedDatabase\typeOfVehicle.csv"
assert os.path.exists(DATA_PATH), f"File not found: {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH, "shape:", df.shape)
display(df.head())


Loaded: C:\Users\KIIT\Desktop\1\RoadAccidentsInIndia\ModifiedDatabase\typeOfVehicle.csv shape: (37, 59)


Unnamed: 0,Sl. No.,States/UTs,Two-Wheelers - Number of Road Accidents - Fatal - 2014,Two-Wheelers - Number of Road Accidents - Fatal - 2014 per 1L people,Two-Wheelers - Number of Road Accidents - Total - 2014,Two-Wheelers - Number of Road Accidents - Total - 2014 per 1L people,Two-Wheelers - Number of Persons - Killed - 2014,Two-Wheelers - Number of Persons - Killed - 2014 per 1L people,Two-Wheelers - Number of Persons - Injured - 2014,Two-Wheelers - Number of Persons - Injured - 2014 per 1L people,...,Other Motor Vehicles - Number of Persons - Injured - 2014 per 1L people,Other Vehicles/Objects - Number of Road Accidents - Fatal - 2014,Other Vehicles/Objects - Number of Road Accidents - Fatal - 2014 per 1L people,Other Vehicles/Objects - Number of Road Accidents - Total - 2014,Other Vehicles/Objects - Number of Road Accidents - Total - 2014 per 1L people,Other Vehicles/Objects - Number of Persons - Killed - 2014,Other Vehicles/Objects - Number of Persons - Killed - 2014 per 1L people,Other Vehicles/Objects - Number of Persons - Injured - 2014,Other Vehicles/Objects - Number of Persons - Injured - 2014 per 1L people,Population
0,1,Andhra Pradesh,1961,3.741606,7239,13.812077,2003,3.821742,7333,13.99143,...,2.52048,466,0.889132,1279,2.440344,884,1.68668,1289,2.459424,52410653
1,2,Arunachal Pradesh,15,1.084029,34,2.457132,21,1.51764,30,2.168058,...,0.0,27,1.951252,55,3.974772,33,2.384863,58,4.191578,1383727
2,3,Assam,489,1.567028,1508,4.83247,537,1.720846,1342,4.300513,...,2.868077,73,0.233933,249,0.797934,93,0.298024,218,0.698593,31205576
3,4,Bihar,769,0.738717,1750,1.681085,820,0.787708,1131,1.086461,...,0.425555,614,0.589821,1577,1.514898,690,0.662828,1165,1.119122,104099452
4,5,Chhattisgarh,1074,4.204313,4787,18.739334,1126,4.407873,4295,16.813336,...,4.662324,142,0.555877,515,2.016034,154,0.602853,438,1.714608,25545198


In [2]:

pd.set_option('display.max_columns', 200)
print("Columns (count={}):".format(len(df.columns)))
for i,c in enumerate(df.columns):
    print(f"{i+1:02d}. {c}  (dtype={df[c].dtype}, non-null={df[c].notna().sum()}, unique={df[c].nunique(dropna=True)})")


display(df.describe(include='all').T)


Columns (count=59):
01. Sl. No.  (dtype=object, non-null=37, unique=37)
02. States/UTs  (dtype=object, non-null=37, unique=37)
03. Two-Wheelers - Number of Road Accidents - Fatal - 2014  (dtype=int64, non-null=37, unique=36)
04. Two-Wheelers - Number of Road Accidents - Fatal - 2014 per 1L people  (dtype=float64, non-null=37, unique=37)
05. Two-Wheelers - Number of Road Accidents - Total - 2014  (dtype=int64, non-null=37, unique=37)
06. Two-Wheelers - Number of Road Accidents - Total - 2014 per 1L people  (dtype=float64, non-null=37, unique=37)
07. Two-Wheelers - Number of Persons - Killed - 2014  (dtype=int64, non-null=37, unique=35)
08. Two-Wheelers - Number of Persons - Killed - 2014 per 1L people  (dtype=float64, non-null=37, unique=37)
09. Two-Wheelers - Number of Persons - Injured - 2014  (dtype=int64, non-null=37, unique=36)
10. Two-Wheelers - Number of Persons - Injured - 2014 per 1L people  (dtype=float64, non-null=37, unique=37)
11. Auto-Rickshaws - Number of Road Accidents -

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Sl. No.,37.0,37.0,1,1.0,,,,,,,
States/UTs,37.0,37.0,Andhra Pradesh,1.0,,,,,,,
Two-Wheelers - Number of Road Accidents - Fatal - 2014,37.0,,,,1625.081081,4928.288707,0.0,20.0,300.0,1342.0,30064.0
Two-Wheelers - Number of Road Accidents - Fatal - 2014 per 1L people,37.0,,,,2.364123,1.773977,0.0,1.084029,1.894926,3.305915,8.912992
Two-Wheelers - Number of Road Accidents - Total - 2014,37.0,,,,7223.459459,22096.152669,1.0,92.0,1305.0,4899.0,133634.0
Two-Wheelers - Number of Road Accidents - Total - 2014 per 1L people,37.0,,,,13.724032,22.256395,1.51674,3.849857,6.282428,13.812077,130.746737
Two-Wheelers - Number of Persons - Killed - 2014,37.0,,,,1758.054054,5331.111256,0.0,21.0,318.0,1399.0,32524.0
Two-Wheelers - Number of Persons - Killed - 2014 per 1L people,37.0,,,,2.555299,1.866243,0.0,1.24717,2.041255,3.620044,9.187238
Two-Wheelers - Number of Persons - Injured - 2014,37.0,,,,6889.297297,21158.70558,1.0,98.0,1083.0,4295.0,127452.0
Two-Wheelers - Number of Persons - Injured - 2014 per 1L people,37.0,,,,11.905215,13.388724,0.842633,2.729338,7.20072,13.99143,57.454518


In [3]:
# Cell 3: create an aggregated "per-1L" accidents score and a binary label HighAccident
# Heuristic:
#  - find all numeric columns whose name contains "per 1L" or "per1L" (case-insensitive).
#  - sum them into a single metric 'accidents_per_1L_sum'
#  - create binary label 'HighAccident' = 1 if above median else 0

# find candidates
per1l_cols = [c for c in df.columns if isinstance(c, str) and ("per 1l" in c.lower() or "per1l" in c.lower() or "per 100000" in c.lower())]
# also include columns with 'per 1L people' or 'per 1L' etc.
if not per1l_cols:
    # backup: include any numeric column with 'per' in name
    per1l_cols = [c for c in df.columns if isinstance(c, str) and "per" in c.lower() and pd.api.types.is_numeric_dtype(df[c])]

print("Per-1L candidate columns ({}):".format(len(per1l_cols)))
for c in per1l_cols:
    print(" -", c)

if not per1l_cols:
    # fallback: choose an obvious numeric total column if present
    fallback = [c for c in df.columns if 'Total' in c and pd.api.types.is_numeric_dtype(df[c])]
    if fallback:
        per1l_cols = [fallback[0]]
        print("No per-1L columns found — using fallback:", fallback[0])
    else:
        raise RuntimeError("Couldn't find suitable numeric columns to build a label. Inspect your dataset manually.")

# create aggregated metric (sum of selected per-1L numeric columns)
df['accidents_per_1L_sum'] = df[per1l_cols].select_dtypes(include='number').sum(axis=1)

# create binary label by median threshold (ensures approx balanced classes)
median_val = df['accidents_per_1L_sum'].median()
df['HighAccident'] = (df['accidents_per_1L_sum'] > median_val).astype(int)

print("Created 'accidents_per_1L_sum' and binary label 'HighAccident' (median={:.4f})".format(median_val))
display(df[['accidents_per_1L_sum', 'HighAccident']].head(10))
print("Label distribution:")
print(df['HighAccident'].value_counts())


Per-1L candidate columns (28):
 - Two-Wheelers - Number of Road Accidents - Fatal - 2014 per 1L people
 - Two-Wheelers - Number of Road Accidents - Total - 2014 per 1L people
 - Two-Wheelers - Number of Persons - Killed - 2014 per 1L people
 - Two-Wheelers - Number of Persons - Injured - 2014 per 1L people
 - Auto-Rickshaws - Number of Road Accidents - Fatal - 2014 per 1L people
 - Auto-Rickshaws - Number of Road Accidents - Total - 2014 per 1L people
 - Auto-Rickshaws - Number of Persons - Killed - 2014 per 1L people
 - Auto-Rickshaws - Number of Persons - Injured - 2014 per 1L people
 - Cars, Jeeps,Taxis - Number of Road Accidents - Fatal - 2014 per 1L people
 - Cars, Jeeps,Taxis - Number of Road Accidents - Total - 2014 per 1L people
 - Cars, Jeeps,Taxis - Number of Persons - Killed - 2014 per 1L people
 - Cars, Jeeps,Taxis - Number of Persons - Injured - 2014 per 1L people
 - Buses - Number of Road Accidents - Fatal - 2014 per 1L people
 - Buses - Number of Road Accidents - Total -

Unnamed: 0,accidents_per_1L_sum,HighAccident
0,132.713477,1
1,52.32246,0
2,58.75232,0
3,24.605317,0
4,135.328761,1
5,458.05923,1
6,101.296016,0
7,111.6977,1
8,155.347681,1
9,125.840204,1


Label distribution:
HighAccident
0    19
1    18
Name: count, dtype: int64


In [4]:
# Cell 4: Preprocessing and train/test split
# Drop identifier-like columns and the target columns from features
# Adjust the drop list if your notebook has specific index/id columns
drop_like = ['Sl. No.', 'Sl No', 'Sl_No', 'Sl', 'States/UTs', 'State/UTs', 'States', 'State']  # common id-like names
features = [c for c in df.columns if c not in drop_like and c not in ['HighAccident', 'accidents_per_1L_sum']]

# Optionally drop any text-heavy or near-unique columns
# filter features to only those present (defensive)
features = [c for c in features if c in df.columns]

X = df[features].copy()
y = df['HighAccident'].copy()

print("Feature count:", len(X.columns))
print("Some features:", X.columns.tolist()[:20])

# quick cleaning: drop columns that are constant or all null
const_or_empty = [c for c in X.columns if X[c].nunique(dropna=True) <= 1 or X[c].isna().all()]
if const_or_empty:
    print("Dropping constant/empty:", const_or_empty)
    X = X.drop(columns=const_or_empty)

# separate numeric & categorical
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
print("Numeric cols:", numeric_cols)
print("Categorical cols:", cat_cols)

# If dataset small (it is ~37 rows), set test_size small but keep stratify
test_size = 0.2 if len(df) >= 10 else 0.25

# be careful: stratify requires both classes present in train/test. Our binary creation uses median -> both classes exist.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y
)
print("Train/test shapes:", X_train.shape, X_test.shape)


Feature count: 57
Some features: ['Two-Wheelers - Number of Road Accidents - Fatal - 2014', 'Two-Wheelers - Number of Road Accidents - Fatal - 2014 per 1L people', 'Two-Wheelers - Number of Road Accidents - Total - 2014', 'Two-Wheelers - Number of Road Accidents - Total - 2014 per 1L people', 'Two-Wheelers - Number of Persons - Killed - 2014', 'Two-Wheelers - Number of Persons - Killed - 2014 per 1L people', 'Two-Wheelers - Number of Persons - Injured - 2014', 'Two-Wheelers - Number of Persons - Injured - 2014 per 1L people', 'Auto-Rickshaws - Number of Road Accidents - Fatal - 2014', 'Auto-Rickshaws - Number of Road Accidents - Fatal - 2014 per 1L people', 'Auto-Rickshaws - Number of Road Accidents - Total - 2014', 'Auto-Rickshaws - Number of Road Accidents - Total - 2014 per 1L people', 'Auto-Rickshaws - Number of Persons - Killed - 2014', 'Auto-Rickshaws - Number of Persons - Killed - 2014 per 1L people', 'Auto-Rickshaws - Number of Persons - Injured - 2014', 'Auto-Rickshaws - Numbe

In [5]:
# Cell 5: Build pipelines, train models, evaluate and save best
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
], remainder='drop')

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42)
}

results = {}
best_name = None
best_score = -1
best_pipeline = None

for name, model in models.items():
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', model)])
    print(f"\nTraining {name} ...")
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    try:
        prob = pipe.predict_proba(X_test)[:,1]
        auc = roc_auc_score(y_test, prob)
    except Exception:
        auc = None
    print(f"{name} -> accuracy: {acc:.4f}, AUC: {auc}")
    print("Classification report:")
    print(classification_report(y_test, preds, zero_division=0))
    cm = confusion_matrix(y_test, preds)
    print("Confusion matrix:\n", cm)
    results[name] = {'accuracy': acc, 'auc': auc, 'pipeline': pipe}
    if acc > best_score:
        best_score = acc
        best_name = name
        best_pipeline = pipe

print(f"\nBest model: {best_name} with accuracy {best_score:.4f}")



Training LogisticRegression ...
LogisticRegression -> accuracy: 0.7500, AUC: 1.0
Classification report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       1.00      0.50      0.67         4

    accuracy                           0.75         8
   macro avg       0.83      0.75      0.73         8
weighted avg       0.83      0.75      0.73         8

Confusion matrix:
 [[4 0]
 [2 2]]

Training RandomForest ...
RandomForest -> accuracy: 0.8750, AUC: 0.9375
Classification report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.75      0.86         4

    accuracy                           0.88         8
   macro avg       0.90      0.88      0.87         8
weighted avg       0.90      0.88      0.87         8

Confusion matrix:
 [[4 0]
 [1 3]]

Best model: RandomForest with accuracy 0.8750


In [6]:
# Cell 6: Save best model pipeline and show how to load & predict later
out_path = r"C:\Users\KIIT\Desktop\1\RoadAccidentsInIndia\best_model.pkl"
to_save = {
    'model_pipeline': best_pipeline,
    'feature_columns': X.columns.tolist(),
    'label_name': 'HighAccident',
    'aggregation_cols_used_for_label': per1l_cols
}
with open(out_path, "wb") as f:
    pickle.dump(to_save, f)
print("Saved best pipeline to:", out_path)

# Usage example
print("""
To load and predict later:

import pickle
import pandas as pd
with open('/mnt/data/best_model.pkl','rb') as f:
    saved = pickle.load(f)
model = saved['model_pipeline']
# prepare new_X that has columns saved['feature_columns'] in same order
# preds = model.predict(new_X)
""")


Saved best pipeline to: C:\Users\KIIT\Desktop\1\RoadAccidentsInIndia\best_model.pkl

To load and predict later:

import pickle
import pandas as pd
with open('/mnt/data/best_model.pkl','rb') as f:
    saved = pickle.load(f)
model = saved['model_pipeline']
# prepare new_X that has columns saved['feature_columns'] in same order
# preds = model.predict(new_X)



In [7]:
# Show predicted labels for the test set using the best model pipeline
predicted_labels = best_pipeline.predict(X_test)
print("Predicted labels for test set:", predicted_labels)
print("True labels:", y_test.values)

Predicted labels for test set: [0 0 1 0 0 1 0 1]
True labels: [0 0 1 1 0 1 0 1]


In [8]:
# Explanation of binary labels used in this notebook
mapping = {
    0: f"LowAccident  -> accidents_per_1L_sum <= median ({median_val:.4f})",
    1: f"HighAccident -> accidents_per_1L_sum >  median ({median_val:.4f})"
}

print("Binary label meaning:")
for k, v in mapping.items():
    print(f" {k} : {v}")

print("\nClass distribution in 'HighAccident':")
print(y.value_counts().sort_index())

# show a few examples from each class
print("\nExamples (accidents_per_1L_sum, HighAccident):")
display(df.loc[y==1, ['accidents_per_1L_sum', 'HighAccident']].head(3))
display(df.loc[y==0, ['accidents_per_1L_sum', 'HighAccident']].head(3))

Binary label meaning:
 0 : LowAccident  -> accidents_per_1L_sum <= median (101.2960)
 1 : HighAccident -> accidents_per_1L_sum >  median (101.2960)

Class distribution in 'HighAccident':
HighAccident
0    19
1    18
Name: count, dtype: int64

Examples (accidents_per_1L_sum, HighAccident):


Unnamed: 0,accidents_per_1L_sum,HighAccident
0,132.713477,1
4,135.328761,1
5,458.05923,1


Unnamed: 0,accidents_per_1L_sum,HighAccident
1,52.32246,0
2,58.75232,0
3,24.605317,0
