In [1]:
!pip install catboost
!pip install fastparquet

import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import IsolationForest

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

from catboost import CatBoostClassifier

import joblib
import json



In [5]:
# Show every column when a wide frame is rendered.
pd.set_option('display.max_columns', None)

# Read the two parquet files with the fastparquet engine: the first becomes
# `df`, the second `df_test`.
df, df_test = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in (
        '/content/train_val_data.parquet',
        '/content/test_data.parquet',
    )
)

In [6]:
# Cyclical sin/cos encodings, generated in "<period>_sin, <period>_cos" order.
_cyclical_time_features = [
    f'{period}_{wave}'
    for period in ('hour', 'day_of_week', 'month', 'day_of_year')
    for wave in ('sin', 'cos')
]

# Columns that are removed from the data frames before modelling.
columns_to_drop = (
    ['latitude', 'longitude', 'pedestrian_location', 'pedestrian_action']
    + _cyclical_time_features
    + ['age']
)

# Attributes that exist both for a vehicle and for its "_other" counterpart.
_paired_vehicle_features = [
    'fixed_obstacle_struck',
    'mobile_obstacle_struck',
    'initial_point_of_impact',
    'main_maneuver_before_accident',
    'motor_type',
]

# NOTE(review): the name suggests a missing value in these columns is itself
# informative; the list is not consumed in the cells visible here - confirm usage.
na_with_meaning = (
    _paired_vehicle_features
    + [f'{feature}_other' for feature in _paired_vehicle_features]
    + ['sex', 'vehicle_category_simplified', 'vehicle_category_simplified_other']
)

In [7]:
# Remove the unwanted columns from both frames. errors='ignore' means a name
# that is absent from a frame is skipped instead of raising.
df, df_test = (
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (df, df_test)
)

In [8]:
# Nominal (unordered) categorical columns.
categorical_unordered = [
    'location',
    'type_of_collision',
    'horizontal_alignment',
    'infrastructure',
    'accident_situation',
    'position',
    'sex',
    'fixed_obstacle_struck',
    'mobile_obstacle_struck',
    'initial_point_of_impact',
    'main_maneuver_before_accident',
    'motor_type',
    'fixed_obstacle_struck_other',
    'mobile_obstacle_struck_other',
    'initial_point_of_impact_other',
    'main_maneuver_before_accident_other',
    'motor_type_other',
    'used_belt',
    'used_helmet',
    'used_child_restraint',
    'used_airbag',
    'vehicle_category_simplified',
    'vehicle_category_simplified_other',
    'surface_quality_indicator',
    'role',
    'cluster',
]


def _ordered_dtype(levels=None):
    """Return an ordered CategoricalDtype; with no levels the categories are inferred on cast."""
    return pd.CategoricalDtype(levels, ordered=True)


# Ordered categorical columns mapped to their dtype. Entries built without
# explicit levels take their categories from whichever frame is being cast.
categorical_ordered = {
    'time_of_day': _ordered_dtype(['Morning_Rush', 'Midday', 'Evening_Rush', 'Night']),
    'day_of_week': _ordered_dtype([0, 1, 2, 3, 4, 5, 6]),
    'age_group': _ordered_dtype(
        ['Unknown', 'child_teen', 'young_adult', 'adult', 'middle_aged', 'senior']
    ),
    'lighting_ordinal': _ordered_dtype([0, 1, 2, 3]),
    'weather_ordinal': _ordered_dtype([0, 1, 2, 3, 4]),
    'injury_target': _ordered_dtype([0, 1, 2]),
    'reserved_lane_present': _ordered_dtype(),
    'impact_score': _ordered_dtype(),
    'impact_score_other': _ordered_dtype(),
    'impact_delta': _ordered_dtype(),
}

# Plain numeric columns, stored as int16.
numeric = ['speed_limit']


def _cast_feature_dtypes(frame):
    """Apply the unordered, ordered and numeric casts to `frame` and return the result.

    The unordered cast is assigned into the frame that was passed in; the
    ordered cast then produces a new frame, which receives the numeric cast.
    """
    # NOTE(review): 'category' (and the level-less ordered dtypes) infer their
    # categories per frame, so train and test may end up with different
    # category sets - confirm this is acceptable downstream.
    frame[categorical_unordered] = frame[categorical_unordered].astype('category')
    frame = frame.astype(categorical_ordered)
    frame[numeric] = frame[numeric].astype('int16')
    return frame


df = _cast_feature_dtypes(df)
df_test = _cast_feature_dtypes(df_test)

In [9]:
def _split_features_target(frame, target='injury_target'):
    """Return (features, target) where features is `frame` without the target column."""
    return frame.drop(columns=target), frame[target]


# Separate the predictors from the label for both frames.
X_train, y_train = _split_features_target(df)
X_test, y_test = _split_features_target(df_test)

In [10]:
# One flag per column of X_train: True when the column has a categorical dtype.
is_categorical = [isinstance(dtype, pd.CategoricalDtype) for dtype in X_train.dtypes]

# Positional indices and names of the categorical columns, plus the remaining names.
cat_idx = [position for position, flag in enumerate(is_categorical) if flag]
cat_cols = [name for name, flag in zip(X_train.columns, is_categorical) if flag]
num_cols = [name for name, flag in zip(X_train.columns, is_categorical) if not flag]

# 1) Numeric view of X_train for IsolationForest: categorical columns are
#    ordinal-encoded (unknown values map to -1), the others pass through unchanged.
iso_transformer = ColumnTransformer(
    [
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            cat_cols,
        ),
        ('num', 'passthrough', num_cols),
    ],
    remainder='drop',
)
X_train_iso = iso_transformer.fit_transform(X_train)

# 2) Fit the Isolation Forest on that view; fit_predict labels inliers 1 and outliers -1.
pruner = IsolationForest(n_estimators=1000, random_state=42, n_jobs=-1)
mask = pruner.fit_predict(X_train_iso)
inlier_mask = mask == 1

# 3) Keep only the inlier rows of the *original* (non-encoded) features and labels.
#    This overwrites X_train / y_train, so re-running the cell prunes again.
X_train = X_train[inlier_mask]
y_train = y_train[inlier_mask]

In [11]:
# Keep a handle on the training data as it was before rebalancing.
X_train_unbalanced, y_train_unbalanced = X_train, y_train

# Randomly undersample every class except the minority class (seeded).
under_sampler = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
X_train_rebalanced, y_train_rebalanced = under_sampler.fit_resample(
    X_train_unbalanced, y_train_unbalanced
)

# Clear the generic names so later cells must pick one variant explicitly.
X_train = None
y_train = None

In [13]:
# --- 1. Train Model ---

print("üöÄ Starting training...")

# Derive the categorical columns from the exact frame the model is fitted on.
# The same list configures the model and is exported as metadata, so the two
# cannot drift apart and the cell does not depend on an index list computed in
# an earlier cell from a different frame.
cat_features = list(X_train_rebalanced.select_dtypes(include=['category']).columns)
print(f"‚ÑπÔ∏è Found {len(cat_features)} categorical features.")

# Use the Best Params found in model_selection.ipynb
classifier_cat = CatBoostClassifier(
    # Objective / metric
    loss_function='MultiClassOneVsAll',
    eval_metric='TotalF1:average=Macro',
    auto_class_weights='Balanced',
    # Hardware: requires a CUDA device with id 0
    task_type='GPU',
    devices='0',
    # Data description: column names, resolved against the DataFrame passed to fit()
    cat_features=cat_features,
    # Reproducibility / logging (one log line every 500 iterations)
    random_seed=42,
    verbose=500,
    # Tuned hyperparameters
    border_count=255,
    depth=10,
    iterations=4139,
    l2_leaf_reg=0.18865998362606232,
    learning_rate=0.01
)

# Train on the rebalanced data
classifier_cat.fit(X_train_rebalanced, y_train_rebalanced)
print("‚úÖ Training complete!")

# --- 2. Export for Dashboard ---
# NOTE(review): this is a pickle-based dump; the consumer presumably needs a
# compatible catboost version and should only load files it trusts - confirm.
joblib.dump(classifier_cat, '/content/catboost_model.pkl')

# Save metadata so the app knows the exact column names (and which of them are
# categorical) that the model was trained with.
meta_data = {
    "feature_names": list(X_train_rebalanced.columns),
    "cat_features": cat_features
}

with open('/content/model_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(meta_data, f)

print("‚úÖ Export successful! The files 'catboost_model.pkl' and 'model_metadata.json' are now in your folder.")

üöÄ Starting training...
‚ÑπÔ∏è Found 35 categorical features.
0:	learn: 0.5797780	total: 75.1ms	remaining: 5m 10s
500:	learn: 0.6872410	total: 23s	remaining: 2m 47s
1000:	learn: 0.7070811	total: 45.4s	remaining: 2m 22s
1500:	learn: 0.7210474	total: 1m 12s	remaining: 2m 7s
2000:	learn: 0.7318924	total: 1m 43s	remaining: 1m 50s
2500:	learn: 0.7417645	total: 2m 6s	remaining: 1m 23s
3000:	learn: 0.7505742	total: 2m 30s	remaining: 57s
3500:	learn: 0.7588952	total: 2m 53s	remaining: 31.6s
4000:	learn: 0.7670101	total: 3m 16s	remaining: 6.77s
4138:	learn: 0.7693255	total: 3m 23s	remaining: 0us
‚úÖ Training complete!
‚úÖ Export successful! The files 'catboost_model.pkl' and 'model_metadata.json' are now in your folder.
