## Load the Data

In [2]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report#
from sklearn.isotonic import IsotonicRegression

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTEN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

from mord import LogisticAT

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df_train_2021 = pd.read_csv('../data/new/feature_selection/feature_selection-2021.csv', sep=';')
df_train_2022 = pd.read_csv('../data/new/feature_selection/feature_selection-2022.csv', sep=';')
df = pd.concat([df_train_2021, df_train_2022])

df_test = pd.read_csv('../data/new/feature_selection/feature_selection-2023.csv', sep=';')

In [5]:
df.head()

Unnamed: 0,day,month,lighting_condition,location,intersection,weather_condition,type_of_collision,latitude,longitude,road_category,traffic_regime,number_of_traffic_lanes,reserved_lane_present,longitudinal_profile,horizontal_alignment,pavement_condition,infrastructure,accident_situation,speed_limit,position,user_category,injury_severity,sex,pedestrian_location,pedestrian_action,direction_of_travel,vehicle_category,fixed_obstacle_struck,mobile_obstacle_struck,initial_point_of_impact,main_maneuver_before_accident,motor_type,direction_of_travel_other,vehicle_category_other,fixed_obstacle_struck_other,mobile_obstacle_struck_other,initial_point_of_impact_other,main_maneuver_before_accident_other,motor_type_other,vehicle_category_involved_bicycle,vehicle_category_involved_bus_coach,vehicle_category_involved_hgv_truck,vehicle_category_involved_light_motor_vehicle,vehicle_category_involved_other,vehicle_category_involved_powered_2_3_wheeler,hour,minute,age,used_belt,used_helmet,used_child_restraint,used_airbag
0,30,11,2,1,1,1,1,44.038958,4.348022,3,2,2.0,0,1,1,1,0,1,80,1,1,3,1,0,0,1.0,bicycle,0.0,2.0,1.0,1.0,5.0,1.0,light_motor_vehicle,0.0,9.0,3.0,17.0,1.0,1,0,0,1,0,0,7,32,21.0,0,0,0,0
1,30,11,2,1,1,1,1,44.038958,4.348022,3,2,2.0,0,1,1,1,0,1,80,1,1,1,1,0,0,1.0,light_motor_vehicle,0.0,9.0,3.0,17.0,1.0,1.0,bicycle,0.0,2.0,1.0,1.0,5.0,1,0,0,1,0,0,7,32,43.0,1,0,0,0
2,25,9,1,1,3,1,3,49.242129,4.554546,3,2,2.0,0,1,1,1,0,1,80,1,1,4,1,0,0,0.0,light_motor_vehicle,2.0,2.0,1.0,1.0,0.0,0.0,light_motor_vehicle,0.0,2.0,1.0,9.0,0.0,0,0,0,2,0,0,14,20,38.0,1,0,0,0
3,25,9,1,1,3,1,3,49.242129,4.554546,3,2,2.0,0,1,1,1,0,1,80,1,1,3,1,0,0,0.0,light_motor_vehicle,0.0,2.0,1.0,9.0,0.0,0.0,light_motor_vehicle,2.0,2.0,1.0,1.0,0.0,0,0,0,2,0,0,14,20,28.0,1,0,0,0
4,15,7,1,2,1,7,6,46.92195,-0.96446,4,2,2.0,0,1,1,1,0,1,50,1,1,1,1,0,0,1.0,light_motor_vehicle,0.0,1.0,3.0,1.0,1.0,,,,,,,,0,0,0,1,0,0,7,55,26.0,1,0,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255910 entries, 0 to 126661
Data columns (total 52 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   day                                            255910 non-null  int64  
 1   month                                          255910 non-null  int64  
 2   lighting_condition                             255910 non-null  int64  
 3   location                                       255910 non-null  int64  
 4   intersection                                   255910 non-null  int64  
 5   weather_condition                              255910 non-null  int64  
 6   type_of_collision                              255910 non-null  int64  
 7   latitude                                       255910 non-null  float64
 8   longitude                                      255910 non-null  float64
 9   road_category                             

In [7]:
# AI generated helper function
def not_reported_summary(
        df: pd.DataFrame,
        zeros_as_na_cols=None,  # columns where 0 means "Sans objet/Aucun"
        extra_markers_by_col=None  # dict like {"col": {"-9","99"}}
) -> pd.DataFrame:
    zeros_as_na_cols = set(zeros_as_na_cols or [])
    extra_markers_by_col = extra_markers_by_col or {}

    rows = []
    for col in df.columns:
        s = df[col]

        # work on strings to catch both string and numeric codes
        s_str = s.astype(str).str.strip()
        s_num = pd.to_numeric(s_str, errors="coerce")

        nulls = s.isna().sum()
        blanks = (s_str == "").sum()
        dots = (s_str == ".").sum()
        minus1 = (s_num == -1).sum()  # "-1 = Non renseign√©" in many fields
        zero_na = ((s_num == 0) | (s_str == "0")).sum() if col in zeros_as_na_cols else 0

        # column-specific extra markers (e.g., {"grav": {"9"}, "trajet": {"0"}})
        extras = 0
        if col in extra_markers_by_col:
            extras = s_str.isin(set(extra_markers_by_col[col])).sum()

        any_na = nulls + blanks + dots + minus1 + zero_na + extras
        pct = any_na / len(df) * 100 if len(df) else 0

        rows.append({
            "column": col,
            "rows": len(df),
            "null": int(nulls),
            "blank": int(blanks),
            "dot(.)": int(dots),
            "-1_not_reported": int(minus1),
            "zero_as_na": int(zero_na),
            "extra_markers": int(extras),
            "any_missing_like": int(any_na),
            "any_missing_like_pct": round(pct, 2),
        })

    out = pd.DataFrame(rows).sort_values("any_missing_like_pct", ascending=False)
    return out

In [8]:
not_reported_summary(df)

Unnamed: 0,column,rows,null,blank,dot(.),-1_not_reported,zero_as_na,extra_markers,any_missing_like,any_missing_like_pct
23,pedestrian_location,255910,0,0,0,115712,0,0,115712,45.22
24,pedestrian_action,255910,0,0,0,112732,0,0,112732,44.05
32,direction_of_travel_other,255910,54481,0,0,0,0,0,54481,21.29
38,motor_type_other,255910,54116,0,0,0,0,0,54116,21.15
36,initial_point_of_impact_other,255910,53861,0,0,0,0,0,53861,21.05
34,fixed_obstacle_struck_other,255910,53849,0,0,0,0,0,53849,21.04
35,mobile_obstacle_struck_other,255910,53835,0,0,0,0,0,53835,21.04
37,main_maneuver_before_accident_other,255910,53804,0,0,0,0,0,53804,21.02
33,vehicle_category_other,255910,53759,0,0,0,0,0,53759,21.01
25,direction_of_travel,255910,19898,0,0,0,0,0,19898,7.78


## Some Preprocessing

In [9]:
columns_to_drop = [
    'pedestrian_location', 'pedestrian_action', # Too many na values.
    'day', 'month', 'latitude', 'longitude', 'hour', 'minute'
]

na_with_meaning = [
    'fixed_obstacle_struck', 'mobile_obstacle_struck', 'initial_point_of_impact', 'main_maneuver_before_accident', 'motor_type',
    'fixed_obstacle_struck_other', 'mobile_obstacle_struck_other', 'initial_point_of_impact_other', 'main_maneuver_before_accident_other', 'motor_type_other',
    'age', 'sex', 'direction_of_travel', 'direction_of_travel_other'
]

extra = ['vehicle_category', 'vehicle_category_other']


for x in [df, df_test]:
    # Remove rows or columns with missing values that do not encode info.
    x.drop(columns=columns_to_drop, inplace=True)
    x.replace(-1, pd.NA, inplace=True)
    x[na_with_meaning] = x[na_with_meaning].fillna(-1)
    x[extra] = x[extra].fillna('no vehicle')
    x.dropna(inplace=True)

    # Make sure injury severity score is ascending
    x['injury_severity'] = x['injury_severity'].map({1:1, 2:4, 3:3, 4:2}) # Mapping injury severity to be ordinal ascending.
    x[x.columns.difference(extra)] = x[x.columns.difference(extra)].astype(int)



  x[na_with_meaning] = x[na_with_meaning].fillna(-1)
  x[na_with_meaning] = x[na_with_meaning].fillna(-1)


In [10]:
not_reported_summary(df)

Unnamed: 0,column,rows,null,blank,dot(.),-1_not_reported,zero_as_na,extra_markers,any_missing_like,any_missing_like_pct
26,direction_of_travel_other,234572,0,0,0,49091,0,0,49091,20.93
32,motor_type_other,234572,0,0,0,48728,0,0,48728,20.77
30,initial_point_of_impact_other,234572,0,0,0,48555,0,0,48555,20.7
29,mobile_obstacle_struck_other,234572,0,0,0,48531,0,0,48531,20.69
28,fixed_obstacle_struck_other,234572,0,0,0,48543,0,0,48543,20.69
31,main_maneuver_before_accident_other,234572,0,0,0,48503,0,0,48503,20.68
19,direction_of_travel,234572,0,0,0,17548,0,0,17548,7.48
25,motor_type,234572,0,0,0,17044,0,0,17044,7.27
23,initial_point_of_impact,234572,0,0,0,16935,0,0,16935,7.22
22,mobile_obstacle_struck,234572,0,0,0,16931,0,0,16931,7.22


In [11]:
X_train = df.drop(columns='injury_severity')
y_train = df['injury_severity']

In [12]:
X_test = df_test.drop(columns='injury_severity')
y_test = df_test['injury_severity']

## Model Exploration

### Baseline: K-NN Classifier with Resampling

In [13]:
num_cols = ['age', 'number_of_traffic_lanes', 'speed_limit']
cat_cols = X_train.columns.difference(num_cols)

# cat_cols = [
#     'lighting_condition', 'location', 'intersection', 'weather_condition', 'type_of_collision', 'road_category', 'traffic_regime', 'reserved_lane_present', 'longitudinal_profile',
#     'horizontal_alignment', 'pavement_condition', 'infrastructure', 'accident_situation', 'user_category', 'sex', 'direction_of_travel', 'vehicle_category', 'fixed_obstacle_struck',
#     'mobile_obstacle_struck', 'initial_point_of_impact', 'main_maneuver_before_accident', 'motor_type', 'direction_of_travel_other', 'vehicle_category_other', 'fixed_obstacle_struck_other', 'mobile_obstacle_struck_other',
#     'initial_point_of_impact_other', 'main_maneuver_before_accident_other', 'motor_type_other'
# ]
#num_cols = X_train.columns.difference(cat_cols)

In [None]:
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

pipeline = ImbPipeline(steps=[
    ('pre', transformer),
    ('os', BorderlineSMOTE(k_neighbors=NearestNeighbors(n_jobs=-1), m_neighbors=NearestNeighbors(n_jobs=-1))),  # try RandomUnderSampler() or SMOTE for numeric-only
    ('clf', KNeighborsClassifier(n_neighbors=20, n_jobs=-1)),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [None]:
print(classification_report(y_test, y_pred, target_names=['not injured', 'slightly injured', 'heavily injured', 'killed']))

### Logistic Regression

In [14]:
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols)
], remainder='passthrough')

pipeline = ImbPipeline(steps=[
    ('pre', transformer),
    ('os', BorderlineSMOTE(k_neighbors=NearestNeighbors(n_jobs=-1), m_neighbors=NearestNeighbors(n_jobs=-1), random_state=42, sampling_strategy='minority')),  # try RandomUnderSampler() or SMOTE for numeric-only
    ('clf', LogisticRegression(max_iter=100, n_jobs=-1, class_weight={1:1, 2:2, 3:3, 4:5}, penalty='l2')),
])
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('os', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'minority'
,random_state,42
,k_neighbors,NearestNeighbors(n_jobs=-1)
,m_neighbors,NearestNeighbors(n_jobs=-1)
,kind,'borderline-1'

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,-1

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'minkowski'
,p,2
,metric_params,
,n_jobs,-1

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{1: 1, 2: 2, 3: 3, 4: 5}"
,random_state,
,solver,'lbfgs'
,max_iter,100


In [15]:
y_predict = pipeline.predict(X_test)
print(classification_report(y_test, y_predict))



              precision    recall  f1-score   support

           1       0.87      0.53      0.66     48829
           2       0.57      0.64      0.60     45378
           3       0.36      0.04      0.08     17845
           4       0.08      0.82      0.15      3143

    accuracy                           0.51    115195
   macro avg       0.47      0.51      0.37    115195
weighted avg       0.65      0.51      0.53    115195



## CatBoost Model

### With Resampling

In [None]:
from catboost import CatBoostClassifier, Pool
import numpy as np

transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

X_train_en = transformer.fit_transform(X_train, y_train)  # returns X only
X_test_en  = transformer.transform(X_test)

resampler = BorderlineSMOTE(k_neighbors=NearestNeighbors(n_jobs=-1), m_neighbors=NearestNeighbors(n_jobs=-1))
X_train_sam, y_train_sam = resampler.fit_resample(X_train_en, y_train)  # use original y

# Do NOT pass cat_features after OHE
train_pool = Pool(X_train_sam, y_train_sam)
test_pool  = Pool(X_test_en)

model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    task_type='GPU',
    devices='0',
    auto_class_weights='Balanced'
)
model.fit(train_pool, verbose=False)

y_pred = model.predict(test_pool)

In [None]:
print(classification_report(y_test, y_pred))

### With Under and Oversampling

In [18]:
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import TomekLinks

ord_enc = OrdinalEncoder()
X_train_ord = X_train.copy()
X_test_ord = X_test.copy()

#X_train_ord[['vehicle_category', 'vehicle_category_other']] = ord_enc.fit_transform(X_train_ord[['vehicle_category', 'vehicle_category_other']], y_train).astype(int)
#X_test_ord[['vehicle_category', 'vehicle_category_other']] = ord_enc.transform(X_test_ord[['vehicle_category', 'vehicle_category_other']]).astype(int)

# 1) Oversample mixed-type data (categoricals handled correctly)
over_sampler = SMOTENC(sampling_strategy='minority', categorical_features=list(X_train.columns.difference(['age'])))
under_sampler = RandomUnderSampler(sampling_strategy='majority')

X_train_res, y_train_res = over_sampler.fit_resample(X_train_ord, y_train)
X_train_res, y_train_res = under_sampler.fit_resample(X_train_res, y_train_res)

# 43 Train CatBoost on raw categoricals
train_pool = Pool(X_train_res, y_train_res, cat_features=range(len(X_train_res.columns)))
test_pool  = Pool(X_test_ord, cat_features=range(len(X_train_ord.columns)))

model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    task_type='GPU',
    devices='0',
    auto_class_weights='Balanced'
)
model.fit(train_pool, verbose=False)

y_pred = model.predict(test_pool)

['accident_situation',
 'direction_of_travel',
 'direction_of_travel_other',
 'fixed_obstacle_struck',
 'fixed_obstacle_struck_other',
 'horizontal_alignment',
 'infrastructure',
 'initial_point_of_impact',
 'initial_point_of_impact_other',
 'intersection',
 'lighting_condition',
 'location',
 'longitudinal_profile',
 'main_maneuver_before_accident',
 'main_maneuver_before_accident_other',
 'mobile_obstacle_struck',
 'mobile_obstacle_struck_other',
 'motor_type',
 'motor_type_other',
 'number_of_traffic_lanes',
 'pavement_condition',
 'position',
 'reserved_lane_present',
 'road_category',
 'sex',
 'speed_limit',
 'traffic_regime',
 'type_of_collision',
 'used_airbag',
 'used_belt',
 'used_child_restraint',
 'used_helmet',
 'user_category',
 'vehicle_category',
 'vehicle_category_involved_bicycle',
 'vehicle_category_involved_bus_coach',
 'vehicle_category_involved_hgv_truck',
 'vehicle_category_involved_light_motor_vehicle',
 'vehicle_category_involved_other',
 'vehicle_category_invol

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.85      0.75      0.80     48829
           2       0.67      0.59      0.62     45378
           3       0.39      0.57      0.46     17845
           4       0.16      0.29      0.20      3143

    accuracy                           0.65    115195
   macro avg       0.51      0.55      0.52    115195
weighted avg       0.69      0.65      0.66    115195



### Without Resampling

In [16]:
from catboost import CatBoostClassifier, Pool
import numpy as np

# keep only categorical columns that still exist
cat_cols = [c for c in cat_cols if c in X_train.columns]

# build 0-based positions
cat_features_idx = [X_train.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
test_pool  = Pool(X_test, cat_features=cat_features_idx)

model = CatBoostClassifier(
    # depth=6,
    # learning_rate=0.005,
    # iterations=1600,
    loss_function='MultiClass',  # or RMSE
    eval_metric='TotalF1',
    task_type='GPU',
    devices='0',
    auto_class_weights='Balanced'
)
model.fit(train_pool, verbose=False)

y_pred = model.predict(test_pool)

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.84      0.77      0.80     48829
           2       0.68      0.57      0.62     45378
           3       0.39      0.49      0.43     17845
           4       0.17      0.56      0.27      3143

    accuracy                           0.64    115195
   macro avg       0.52      0.60      0.53    115195
weighted avg       0.69      0.64      0.66    115195



## Isotonic Regression

In [None]:
# y in {0,1,...,K-1}; s_val = regressor predictions on a *validation* set
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

iso = IsotonicRegression(y_min=1, y_max=None, increasing=True, out_of_bounds="clip")

pipeline = Pipeline([
    ('pre', transformer),
    ('reg', iso)
])

pipeline.fit(X_train, y_train)

def predict_classes(s):
    y_hat = np.rint(iso.predict(s))        # round to nearest integer
    return np.clip(y_hat, 1, 4).astype(int)

y_pred = predict_classes(X_test)

ValueError: Isotonic regression input X should be a 1d array or 2d array with 1 feature

In [None]:
print(classification_report(y_test, y_pred))

## EasyEnsembleClassifier

In [18]:
from imblearn.ensemble import EasyEnsembleClassifier

In [19]:
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

pipeline = Pipeline([
    ('pre', transformer),
    ('en', EasyEnsembleClassifier(random_state=42, n_jobs=-1))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.73      0.66      0.69     48829
           2       0.56      0.45      0.50     45378
           3       0.32      0.33      0.33     17845
           4       0.10      0.53      0.17      3143

    accuracy                           0.52    115195
   macro avg       0.43      0.49      0.42    115195
weighted avg       0.58      0.52      0.54    115195



## Balanced Random Forest Classifier

In [21]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [36]:
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

pipeline = Pipeline([
    ('pre', transformer),
    ('en', BalancedRandomForestClassifier(n_jobs=-1, n_estimators=100, sampling_strategy='all'))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.82      0.77      0.80     48829
           2       0.67      0.57      0.62     45378
           3       0.39      0.50      0.44     17845
           4       0.18      0.49      0.27      3143

    accuracy                           0.64    115195
   macro avg       0.52      0.58      0.53    115195
weighted avg       0.68      0.64      0.65    115195



## BalancedBaggingClassifier

In [50]:
from imblearn.ensemble import BalancedBaggingClassifier

In [55]:
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), num_cols),
], remainder='passthrough')

pipeline = Pipeline([
    ('pre', transformer),
    ('en', BalancedBaggingClassifier(n_estimators=100, n_jobs=8))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [58]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.84      0.75      0.79     48829
           2       0.66      0.57      0.61     45378
           3       0.38      0.49      0.43     17845
           4       0.17      0.52      0.26      3143

    accuracy                           0.63    115195
   macro avg       0.51      0.58      0.52    115195
weighted avg       0.68      0.63      0.65    115195

