# Accident Severity Project — Merged Notebook

This merged notebook combines the **clean, production-ready pipeline** (no leakage, ColumnTransformer pipeline, sensible defaults) with the **exploratory experiments** from your original notebook (KNN, Decision Tree, bagging/boosting experiments, elbow plots, extra visualizations).  

**How it's organized**:
1. Data load & inspection
2. Cleaning: sentinel handling, leakage decision, identifiers removal
3. Feature engineering & cardinality reduction
4. Preprocessing Pipeline (ColumnTransformer)
5. Main Models: Random Forest & Logistic Regression (GridSearchCV)
6. Experiments: KNN (elbow-method error rates + bagging; AdaBoost is imported but no boosting run is included), Decision Tree with a corrected GridSearchCV parameter grid
7. Evaluation, permutation importance, saving pipeline

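Run the cells in order, top to bottom; later cells depend on variables created by earlier ones. If runtime is long, reduce the number of CV folds (`n_splits` / `cv`) or shrink the `*_param_grid` dictionaries in the "Main models" and "Experiments" cells.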

In [None]:
# Imports & global settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
# Apply seaborn's 'darkgrid' style globally.
sns.set(style='darkgrid')

# Seed passed as `random_state` to the split, the CV splitter and the estimators below.
RANDOM_STATE = 42
DATA_PATH = 'merged_data.csv'  # change if your merged CSV is elsewhere


In [None]:
# Load merged data
# Read the CSV at DATA_PATH into `df`, the dataframe the following cells work on.
df = pd.read_csv(DATA_PATH)
print('Loaded dataframe shape:', df.shape)
# Preview the first rows.
display(df.head())


In [None]:
# Quick inspection
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only rendered a stray "None".
df.info()
# The 40 columns with the most missing values, largest count first.
display(df.isnull().sum().sort_values(ascending=False).head(40))


In [None]:
# Treat -1 as a "missing" sentinel: for every numeric column that contains -1
# and whose smallest value is exactly -1, swap each -1 for NaN and record how
# many cells were replaced.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
cols_replaced = []
for c in num_cols:
    cnt = int((df[c] == -1).sum())
    # Skip columns without any -1, or where -1 is not the minimum (then it is
    # presumably an ordinary value rather than a sentinel).
    if cnt == 0 or df[c].min() != -1:
        continue
    df[c] = df[c].replace(-1, np.nan)
    cols_replaced.append((c, cnt))
print('Columns where -1 replaced with NaN (col, count):', cols_replaced)


In [None]:
# Decide on leakage: by default we drop casualty-derived features for a pre-event model.
# Match on the stem 'casualt' so that both 'casualty...' names and the plural
# 'casualties' (e.g. a 'number_of_casualties' column) are caught. The keyword
# 'casualty' is not a substring of 'casualties', so such columns slipped through.
leakage_keywords = ['casualt']
leak_cols = [c for c in df.columns if any(k in str(c).lower() for k in leakage_keywords)]
print('Detected possible casualty-derived columns:', leak_cols)
# Drop them by default -- if you want a retrospective model, comment out the next line
# (leak_cols is derived from df.columns, so every listed column exists).
df = df.drop(columns=leak_cols)
print('Shape after dropping casualty-derived columns:', df.shape)


In [None]:
# Remove identifier-like columns, plus latitude/longitude when they hold no data.
ids_to_drop = ['collision_index','collision_reference','status']
# Only the identifiers actually present in the frame are dropped.
df = df.drop(columns=[col for col in ids_to_drop if col in df.columns])
# A coordinate column is dropped only when every one of its values is missing.
for geo_col in ('latitude', 'longitude'):
    if geo_col in df.columns and df[geo_col].isna().all():
        df = df.drop(columns=[geo_col], errors='ignore')
print('Shape after dropping ids/empty geos:', df.shape)


In [None]:
# Feature engineering: derive hour, weekday name and month from 'date' + 'time'.
# Rows whose combined string cannot be parsed become NaT (errors='coerce') and
# therefore yield missing values in all three derived columns.
# NOTE(review): no explicit format/dayfirst is passed, so ambiguous strings such
# as 01/02/2020 are left to pandas' inference -- confirm the CSV's date layout.
if {'date', 'time'}.issubset(df.columns):
    stamp_text = df['date'].astype(str) + ' ' + df['time'].astype(str)
    df['datetime'] = pd.to_datetime(stamp_text, errors='coerce')
    stamp_parts = df['datetime'].dt
    df['hour_of_day'] = stamp_parts.hour
    df['day_of_week'] = stamp_parts.day_name()
    df['month'] = stamp_parts.month
    # The raw and intermediate columns are no longer needed once the parts exist.
    df = df.drop(columns=['date','time','datetime'])
    print('Created temporal features: hour_of_day, day_of_week, month')

# Reduce cardinality for very high-cardinality object columns: keep the TOP_N
# most frequent labels of each such column and pool every other label into 'OTHER'.
# NOTE(review): the frequent labels are computed on the full frame, i.e. before
# the train/test split performed later in the notebook.
TOP_N = 50  # single knob: detection threshold and number of labels kept
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
high_card = [c for c in cat_cols if df[c].nunique() > TOP_N]
for c in high_card:
    top_vals = df[c].value_counts().nlargest(TOP_N).index
    # value_counts() ignores NaN, so missing values are never among top_vals.
    # Keep them as NaN explicitly; otherwise they would be relabelled 'OTHER'
    # and never reach the categorical imputer in the preprocessing pipeline.
    keep_mask = df[c].isin(top_vals) | df[c].isna()
    df[c] = df[c].where(keep_mask, other='OTHER')
print('Reduced high-cardinality columns:', high_card)


In [None]:
# Prepare target & features, then hold out a stratified 20% test set.
from sklearn.model_selection import train_test_split

TARGET = 'legacy_collision_severity'
assert TARGET in df.columns, f"Target column {TARGET} not found."

# Keep only the rows that actually have a target value.
df = df[df[TARGET].notna()].copy()
y = df[TARGET].astype('int')
X = df.drop(columns=[TARGET])

print('Target distribution:')
display(y.value_counts(normalize=True))
# Stratify on y so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
print('Train/test shapes:', X_train.shape, X_test.shape)


In [None]:
# Preprocessing pipeline with ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column lists are taken from the training split only. Columns of any other
# dtype are in neither list and fall under ColumnTransformer's default
# remainder='drop'.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()

print('Numeric cols:', len(numeric_cols), 'Categorical cols:', len(categorical_cols))

# Numeric branch: median imputation, then standardisation.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical branch: most-frequent imputation, then one-hot encoding.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Rely on the encoder's default sparse output instead of passing `sparse=True`:
    # that keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4, where passing it raises a TypeError.
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, categorical_cols)
], n_jobs=-1)


In [None]:
# Main models: RandomForest and LogisticRegression, each tuned with GridSearchCV
# on macro F1 over the same cross-validation splitter.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# One shuffled, stratified 3-fold splitter shared by both searches.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# --- Random forest -----------------------------------------------------------
rf_pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1)),
])
rf_param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, None],
    'clf__max_features': ['sqrt'],
}
rf_gs = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
print('Fitting RandomForest GridSearch (this may take a while)...')
rf_gs.fit(X_train, y_train)
print('Best RF params:', rf_gs.best_params_)
print('Best RF CV score:', rf_gs.best_score_)

# --- Logistic regression -----------------------------------------------------
lr_pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)),
])
lr_param_grid = {'clf__C': [0.1, 1.0, 10.0]}
lr_gs = GridSearchCV(
    estimator=lr_pipe,
    param_grid=lr_param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
print('Fitting Logistic Regression GridSearch...')
lr_gs.fit(X_train, y_train)
print('Best LR params:', lr_gs.best_params_)
print('Best LR CV score:', lr_gs.best_score_)


## Experiments: KNN, K-Optimal (Elbow), Bagging/Boosting, Decision Tree GridSearch (corrected)

In [None]:
# KNN experiment: elbow scan over K, a small grid search, and a bagged KNN.
# (AdaBoostClassifier is imported but no boosting run is performed in this cell.)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# Elbow method (error rate) - small sample for speed
sample = X_train.sample(n=min(2000, X_train.shape[0]), random_state=RANDOM_STATE)
y_sample = y_train.loc[sample.index]
error_rates = []
K_vals = range(1, 16)
for k in K_vals:
    # Preprocess inside the cross-validated pipeline so imputation, scaling and
    # encoding are learned from each training fold only. Fitting the shared
    # `preprocessor` on the whole sample up front leaked validation-fold
    # statistics into every score and left that shared object fitted on a
    # subsample. cross_val_score clones the pipeline for each fold, so the
    # shared `preprocessor` is not modified here.
    knn = Pipeline([('preproc', preprocessor), ('clf', KNeighborsClassifier(n_neighbors=k))])
    scores = cross_val_score(knn, sample, y_sample, cv=3, scoring='accuracy', n_jobs=-1)
    error_rates.append(1 - scores.mean())
print('Error rates (1-accuracy) for K=1..15:', error_rates)

# Small grid for KNN on the full training split, scored with macro F1.
knn_pipe = Pipeline([('preproc', preprocessor), ('clf', KNeighborsClassifier())])
knn_param_grid = {'clf__n_neighbors': [3,5,7]}
knn_gs = GridSearchCV(knn_pipe, knn_param_grid, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
knn_gs.fit(X_train, y_train)
print('Best KNN params:', knn_gs.best_params_, 'Best CV score:', knn_gs.best_score_)

# Bagging KNN: 10 bagged copies of KNN using the K chosen by the grid search.
bag_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=knn_gs.best_params_['clf__n_neighbors']), n_estimators=10, random_state=RANDOM_STATE)
bag_pipe = Pipeline([('preproc', preprocessor), ('clf', bag_knn)])
bag_pipe.fit(X_train, y_train)
print('Trained Bagging KNN (10 estimators)')


In [None]:
# Decision Tree experiment: grid search over split criterion, depth and
# leaf/split sizes, scored with macro F1 on 3-fold CV.
from sklearn.tree import DecisionTreeClassifier

dt_param_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [3, 5, 10, None],
    'clf__min_samples_leaf': [1, 2, 5],
    'clf__min_samples_split': [2, 5, 10],
}
dt_pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE)),
])
dt_gs = GridSearchCV(
    estimator=dt_pipe,
    param_grid=dt_param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
dt_gs.fit(X_train, y_train)
print('Best DT params:', dt_gs.best_params_, 'Best score:', dt_gs.best_score_)


In [None]:
# Final evaluation on the held-out test set: the refit winner of each grid
# search (RF, LR, KNN, Decision Tree) plus the bagged KNN pipeline.
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Display name -> fitted pipeline, in reporting order.
models = {
    label: search.best_estimator_
    for label, search in (
        ('RandomForest', rf_gs),
        ('LogisticRegression', lr_gs),
        ('KNN', knn_gs),
        ('DecisionTree', dt_gs),
    )
}
models['BaggingKNN'] = bag_pipe

for name in models:
    model = models[name]
    print('---', name, '---')
    # Predict once and reuse the predictions for all three reports.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))
    print('Confusion matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Macro F1:', f1_score(y_test, y_pred, average='macro'))
    print('\n')


In [None]:
# Choose the best model by macro F1 on the test set and save its pipeline
from sklearn.metrics import f1_score

# NOTE(review): picking the winner on the test set makes the reported test score
# optimistic; the grid searches' cross-validated scores would be a cleaner selector.
test_scores = {
    name: f1_score(y_test, model.predict(X_test), average='macro')
    for name, model in models.items()
}
# max() returns the first maximal key, so ties resolve to the earliest model,
# matching a strict '>' comparison loop.
best_name = max(test_scores, key=test_scores.get)
best_score = test_scores[best_name]
best_model = models[best_name]
print('Best on test set:', best_name, 'with macro F1:', best_score)

# The target directory may not exist (or be writable) on the machine running the
# notebook. Create it when possible and otherwise fall back to the working
# directory, so a missing folder does not discard the trained model.
model_path = Path('/mnt/data/accident_severity_pipeline_merged.joblib')
try:
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_model, model_path)
except OSError:
    model_path = Path.cwd() / model_path.name
    joblib.dump(best_model, model_path)
print('Saved merged best pipeline to', model_path)


In [None]:
# Permutation importance (sampled) for the chosen best_model
from sklearn.inspection import permutation_importance

# Draw at most 2000 test rows with a seeded generator so the ranking is
# reproducible between runs.
rng = np.random.default_rng(RANDOM_STATE)
sample_idx = rng.choice(X_test.shape[0], size=min(2000, X_test.shape[0]), replace=False)
X_test_sample = X_test.iloc[sample_idx]
y_test_sample = y_test.iloc[sample_idx]
# Score with macro F1, the metric used for model selection elsewhere in the notebook.
r = permutation_importance(
    best_model,
    X_test_sample,
    y_test_sample,
    scoring='f1_macro',
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
# best_model is a full pipeline that receives the raw dataframe, so the columns
# being permuted are the raw input columns: importances_mean holds one value per
# column of X_test_sample, not one per one-hot encoded feature. Indexing it with
# the expanded encoder names has the wrong length and fails.
imp = pd.Series(r.importances_mean, index=X_test_sample.columns).sort_values(ascending=False)
display(imp.head(30))


## Conclusion & Next Steps

This merged notebook retains best practices (pipelines, no leakage) while including all exploratory experiments. Next steps:
- Run SHAP on the best tree-based model for deeper explainability.
- Consider balanced-batch training or SMOTE for rare classes if you want to improve recall for the minority class.
- Prepare a small Flask/FastAPI wrapper to serve the saved pipeline.
