In [None]:
import pandas as pd
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Training data: one CSV per year, stacked into a single frame.
# ignore_index=True renumbers the rows 0..n-1. Without it each yearly file keeps
# its own 0-based index, the labels repeat across years, and any later
# label-based operation (e.g. DataFrame.drop(index=...)) hits rows from several
# years at once.
train_years = [2019, 2020, 2021, 2022]
df = pd.concat(
    [
        pd.read_csv(f'../data/F_feature_selection/feature_selection-{year}.csv', sep=';')
        for year in train_years
    ],
    ignore_index=True,
)

# Hold-out data: the 2023 file, read with the same separator.
df_test = pd.read_csv('../data/F_feature_selection/feature_selection-2023.csv', sep=';')

In [None]:
# Attributes that appear twice in the column lists below: once under their own
# name and once with an `_other` suffix.
_vehicle_attributes = [
    'fixed_obstacle_struck',
    'mobile_obstacle_struck',
    'initial_point_of_impact',
    'main_maneuver_before_accident',
    'motor_type',
]
_vehicle_attributes_other = [f'{name}_other' for name in _vehicle_attributes]

# Columns removed from both frames at the start of preprocessing.
columns_to_drop = [
    'latitude',
    'longitude',
    'pedestrian_location',
    'pedestrian_action',
    'hour_sin',
    'hour_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
    'day_of_year_sin',
    'day_of_year_cos',
    'age',
]

# NOTE(review): presumably the text-valued columns; this list is not referenced
# by any cell visible in this part of the notebook -- confirm before removing.
object_type = [
    'role',
    'age_group',
    'vehicle_category_simplified',
    'vehicle_category_simplified_other',
    'time_of_day',
]

# Columns whose missing values are kept (re-encoded as -1) instead of causing
# the row to be dropped.
na_with_meaning = [
    *_vehicle_attributes,
    *_vehicle_attributes_other,
    'sex',
]

# Columns cast to int32 once no missing values remain.
columns_to_int32 = [
    'location',
    'type_of_collision',
    'reserved_lane_present',
    'horizontal_alignment',
    'infrastructure',
    'accident_situation',
    'position',
    *_vehicle_attributes,
    *_vehicle_attributes_other,
    'vehicle_category_involved_bicycle',
    'vehicle_category_involved_bus_coach',
    'vehicle_category_involved_hgv_truck',
    'vehicle_category_involved_light_motor_vehicle',
    'vehicle_category_involved_other',
    'vehicle_category_involved_powered_2_3_wheeler',
    'used_belt',
    'used_helmet',
    'used_child_restraint',
    'used_airbag',
    'impact_score',
    'impact_score_other',
    'impact_delta',
    'surface_quality_indicator',
    'lighting_ordinal',
    'weather_ordinal',
    'injury_target',
    'sex',
    'day_of_week',
    'speed_limit',
]

In [None]:
# Clean both frames in place with the same steps, so the train and test data
# end up with identical columns, encodings and dtypes.
for x in [df, df_test]:
    # Index hygiene --------------------------------------

    # Give every row a unique label. `df` is stacked from several yearly files,
    # so its labels can repeat; the speed-limit cleanup below drops rows BY
    # LABEL and would otherwise also delete valid rows that merely share a
    # label with an offending row.
    x.reset_index(drop=True, inplace=True)

    # Encode Na Values -----------------------------------

    # Discard the columns listed in columns_to_drop.
    x.drop(columns=columns_to_drop, inplace=True)
    # -1 is used as the "missing" marker: turn it into a real NA everywhere ...
    x.replace(-1, pd.NA, inplace=True)
    # ... then put -1 back where a missing value is itself informative.
    x[na_with_meaning] = x[na_with_meaning].fillna(-1)
    # A missing vehicle category becomes its own 'no vehicle' label.
    x[['vehicle_category_simplified', 'vehicle_category_simplified_other']] = x[['vehicle_category_simplified', 'vehicle_category_simplified_other']].fillna('no vehicle')
    # Whatever is still missing does not encode information: drop those rows.
    x.dropna(inplace=True)
    # Safe only now: a plain int32 column cannot hold missing values.
    x[columns_to_int32] = x[columns_to_int32].astype('int32')

    # Encodings ------------------------------------------

    # Encode Ordinal Text Attributes.
    # A label that is absent from a mapping becomes NaN, which makes the int16
    # cast raise instead of silently producing a wrong code.
    x['time_of_day'] = x['time_of_day'].map({'Morning_Rush': 0, 'Midday': 1, 'Evening_Rush': 2, 'Night': 3}).astype('int16')
    x['age_group'] = x['age_group'].map({'Unknown': -1, 'child_teen': 0, 'young_adult': 1, 'adult': 2, 'middle_aged': 3, 'senior': 4}).astype('int16')

    # Data Cleanup ---------------------------------------

    # Drop rows with impossible speed limit
    # France has a maximum speed limit of 130
    x.drop(index=x.index[x['speed_limit'] > 130], inplace=True)

    # Data Type ------------------------------------------

    # NOTE(review): `cols` is not used by any cell visible here; it is kept
    # because later cells may read it -- confirm, then remove if unused.
    cols = list(x.select_dtypes('int32').columns) + list(x.select_dtypes('int64').columns)

In [None]:
from catboost import CatBoostClassifier
import joblib
import json

# df contains data from 2019-2022
X_train = df.drop(columns='injury_target')
y_train = df['injury_target']

# --- 1. Train Model (CPU) ---
print("🚀 Starting training on CPU...")

# Tell CatBoost which columns are categorical.
# Preprocessing leaves the text columns as plain string/object columns (nothing
# is cast to the pandas 'category' dtype), so looking for 'category' alone finds
# none of them and the model would be handed raw text as numeric features.
# infer_dtype inspects the values: 'categorical' covers real category columns,
# 'string' covers text columns; object columns that only hold numbers are
# reported as numeric kinds and therefore stay numeric features.
cat_features = [
    col for col in X_train.columns
    if pd.api.types.infer_dtype(X_train[col], skipna=True) in ('categorical', 'string')
]
print(f"ℹ️ Found {len(cat_features)} categorical features.")

classifier_cat = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    task_type="CPU",       # train on CPU (the original author targeted Mac M1/M2/M3)
    cat_features=cat_features,
    verbose=100            # log progress every 100 iterations
)

classifier_cat.fit(X_train, y_train)
print("✅ Training complete!")

# --- 2. Export for Dashboard ---
# The model is stored as a pickle; only load this file from a trusted location.
joblib.dump(classifier_cat, '../dashboard/catboost_model.pkl')

# Save metadata so the app knows the exact column names (and their order) and
# which of them are categorical.
meta_data = {
    "feature_names": list(X_train.columns),
    "cat_features": cat_features
}

with open('../dashboard/model_metadata.json', 'w') as f:
    json.dump(meta_data, f)

print("✅ Export successful! The files 'catboost_model.pkl' and 'model_metadata.json' are now in your folder.")
