# Load the Dataset

In [1]:
import pandas as pd

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTEN, SMOTENC, BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from mord import LAD, LogisticAT, LogisticIT
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

In [2]:
# Lift the column display limit so wide frames are rendered without column truncation.
pd.set_option('display.max_columns', None)

In [3]:
# Load the two yearly extracts (semicolon-separated CSV files):
# the 2022 file into `df` and the 2023 file into `df_test`.
df = pd.read_csv('../data/new/feature_selection/feature_selection-2022.csv', sep=';')
df_test = pd.read_csv('../data/new/feature_selection/feature_selection-2023.csv', sep=';')

In [4]:
# Inspect column dtypes and non-null counts of the 2022 frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126662 entries, 0 to 126661
Data columns (total 52 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   day                                            126662 non-null  int64  
 1   month                                          126662 non-null  int64  
 2   lighting_condition                             126662 non-null  int64  
 3   location                                       126662 non-null  int64  
 4   intersection                                   126662 non-null  int64  
 5   weather_condition                              126662 non-null  int64  
 6   type_of_collision                              126662 non-null  int64  
 7   latitude                                       126662 non-null  float64
 8   longitude                                      126662 non-null  float64
 9   road_category                        

In [5]:
def not_reported_summary(
        df: pd.DataFrame,
        zeros_as_na_cols=None,  # columns where 0 means "Sans objet/Aucun"
        extra_markers_by_col=None  # dict like {"col": {"-9","99"}}
) -> pd.DataFrame:
    """Count "missing-like" values for every column of ``df``.

    A cell is missing-like when it is null, blank after stripping whitespace,
    a lone ".", numerically equal to -1, numerically equal to 0 (only for
    columns listed in ``zeros_as_na_cols``), or equal to one of the string
    markers given for its column in ``extra_markers_by_col``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is not modified.
    zeros_as_na_cols : iterable of str, optional
        Columns in which the value 0 should also be counted as missing.
    extra_markers_by_col : dict, optional
        Maps a column name to a collection of string markers that are
        compared against the stripped string form of each value.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the per-category counts,
        ``any_missing_like`` (number of rows matching at least one category)
        and ``any_missing_like_pct``, sorted by that percentage in descending
        order. The frame is empty, but keeps these columns, when ``df`` has
        no columns.
    """
    zeros_as_na_cols = set(zeros_as_na_cols or [])
    extra_markers_by_col = extra_markers_by_col or {}
    n_rows = len(df)

    report_columns = [
        "column", "rows", "null", "blank", "dot(.)", "-1_not_reported",
        "zero_as_na", "extra_markers", "any_missing_like", "any_missing_like_pct",
    ]

    rows = []
    for col in df.columns:
        s = df[col]

        # work on strings to catch both string and numeric codes
        s_str = s.astype(str).str.strip()
        s_num = pd.to_numeric(s_str, errors="coerce")

        null_mask = s.isna()
        blank_mask = s_str == ""
        dot_mask = s_str == "."
        minus1_mask = s_num == -1  # "-1 = Non renseigné" in many fields

        # Rows are combined with a boolean OR so that a value matching several
        # categories is counted once in the total instead of once per category.
        missing_mask = null_mask | blank_mask | dot_mask | minus1_mask

        zero_na = 0
        if col in zeros_as_na_cols:
            # The string "0" also parses to 0, so the numeric test covers both forms.
            zero_mask = s_num == 0
            zero_na = zero_mask.sum()
            missing_mask = missing_mask | zero_mask

        # column-specific extra markers (e.g., {"grav": {"9"}, "trajet": {"0"}})
        extras = 0
        if col in extra_markers_by_col:
            extra_mask = s_str.isin(set(extra_markers_by_col[col]))
            extras = extra_mask.sum()
            missing_mask = missing_mask | extra_mask

        any_na = missing_mask.sum()
        pct = any_na / n_rows * 100 if n_rows else 0

        rows.append({
            "column": col,
            "rows": n_rows,
            "null": int(null_mask.sum()),
            "blank": int(blank_mask.sum()),
            "dot(.)": int(dot_mask.sum()),
            "-1_not_reported": int(minus1_mask.sum()),
            "zero_as_na": int(zero_na),
            "extra_markers": int(extras),
            "any_missing_like": int(any_na),
            "any_missing_like_pct": round(pct, 2),
        })

    # Passing the column list keeps the sort key present even when `rows` is empty.
    out = pd.DataFrame(rows, columns=report_columns)
    return out.sort_values("any_missing_like_pct", ascending=False)

In [6]:
# Summarise missing-like markers per column of the 2022 frame, using the defaults
# (no zero-as-missing columns, no column-specific extra markers).
not_reported_summary(df)

Unnamed: 0,column,rows,null,blank,dot(.),-1_not_reported,zero_as_na,extra_markers,any_missing_like,any_missing_like_pct
23,pedestrian_location,126662,0,0,0,51092,0,0,51092,40.34
24,pedestrian_action,126662,0,0,0,49604,0,0,49604,39.16
32,direction_of_travel_other,126662,27430,0,0,0,0,0,27430,21.66
38,motor_type_other,126662,27314,0,0,0,0,0,27314,21.56
34,fixed_obstacle_struck_other,126662,27205,0,0,0,0,0,27205,21.48
36,initial_point_of_impact_other,126662,27200,0,0,0,0,0,27200,21.47
35,mobile_obstacle_struck_other,126662,27177,0,0,0,0,0,27177,21.46
37,main_maneuver_before_accident_other,126662,27162,0,0,0,0,0,27162,21.44
33,vehicle_category_other,126662,27129,0,0,0,0,0,27129,21.42
25,direction_of_travel,126662,9895,0,0,0,0,0,9895,7.81


In [7]:
# Preview the first rows of the 2022 frame before any cleaning.
df.head()

Unnamed: 0,day,month,lighting_condition,location,intersection,weather_condition,type_of_collision,latitude,longitude,road_category,traffic_regime,number_of_traffic_lanes,reserved_lane_present,longitudinal_profile,horizontal_alignment,pavement_condition,infrastructure,accident_situation,speed_limit,position,user_category,injury_severity,sex,pedestrian_location,pedestrian_action,direction_of_travel,vehicle_category,fixed_obstacle_struck,mobile_obstacle_struck,initial_point_of_impact,main_maneuver_before_accident,motor_type,direction_of_travel_other,vehicle_category_other,fixed_obstacle_struck_other,mobile_obstacle_struck_other,initial_point_of_impact_other,main_maneuver_before_accident_other,motor_type_other,vehicle_category_involved_bicycle,vehicle_category_involved_bus_coach,vehicle_category_involved_hgv_truck,vehicle_category_involved_light_motor_vehicle,vehicle_category_involved_other,vehicle_category_involved_powered_2_3_wheeler,hour,minute,age,used_belt,used_helmet,used_child_restraint,used_airbag
0,19,10,1,2,3,1,3,44.55942,4.72572,4,2,2.0,0,1,1,1,0,1,50,1,1,3,1,-1,-1,1.0,powered_2_3_wheeler,0.0,2.0,1.0,9.0,1.0,1.0,light_motor_vehicle,0.0,2.0,2.0,1.0,1.0,0,0,0,1,0,1,16,15,14.0,0,1,0,0
1,19,10,1,2,3,1,3,44.55942,4.72572,4,2,2.0,0,1,1,1,0,1,50,1,1,1,1,-1,-1,1.0,light_motor_vehicle,0.0,2.0,2.0,1.0,1.0,1.0,powered_2_3_wheeler,0.0,2.0,1.0,9.0,1.0,0,0,0,1,0,1,16,15,74.0,1,0,0,0
2,20,10,1,2,3,1,3,46.92581,6.3462,4,2,2.0,0,1,1,1,0,1,50,1,1,4,1,0,0,2.0,light_motor_vehicle,0.0,2.0,8.0,15.0,1.0,2.0,light_motor_vehicle,0.0,2.0,1.0,1.0,1.0,0,0,0,2,0,0,8,34,34.0,1,0,0,0
3,20,10,1,2,3,1,3,46.92581,6.3462,4,2,2.0,0,1,1,1,0,1,50,1,1,1,1,0,0,2.0,light_motor_vehicle,0.0,2.0,1.0,1.0,1.0,2.0,light_motor_vehicle,0.0,2.0,8.0,15.0,1.0,0,0,0,2,0,0,8,34,52.0,1,0,0,0
4,20,10,1,2,6,1,2,48.493162,-2.760439,3,-1,2.0,0,1,1,1,5,1,50,1,1,1,1,-1,-1,2.0,light_motor_vehicle,0.0,2.0,1.0,2.0,1.0,1.0,light_motor_vehicle,0.0,2.0,4.0,2.0,1.0,0,0,0,2,0,0,17,15,20.0,1,0,0,0


In [8]:
# Apply the same in-place cleaning steps to the 2022 and 2023 frames.
for frame in (df, df_test):
    # Recode the target; codes absent from the mapping become NaN at this point.
    # NOTE(review): presumably this reorders the raw codes into an ordinal scale -
    # confirm against the data dictionary.
    frame['injury_severity'] = frame['injury_severity'].map({1: 1, 2: 4, 3: 3, 4: 2})

    # Missing vehicle categories get an explicit label; every other gap becomes -1.
    vehicle_cols = ['vehicle_category', 'vehicle_category_other']
    frame[vehicle_cols] = frame[vehicle_cols].fillna('no_vehicle')
    frame.fillna(-1, inplace=True)

    # Date, time and coordinate columns are removed from the frame.
    frame.drop(columns=['day', 'month', 'latitude', 'longitude', 'hour', 'minute'], inplace=True)

    # Rows whose target is -1 after the fill above are removed.
    unlabeled_rows = frame.index[frame['injury_severity'] == -1]
    frame.drop(index=unlabeled_rows, inplace=True)

    # Non-numeric pedestrian_action entries are coerced to NaN and then set to -1.
    frame['pedestrian_action'] = pd.to_numeric(frame['pedestrian_action'], errors='coerce').fillna(-1)

    # Every float64 column is cast to int.
    float_cols = frame.select_dtypes(include='float64').columns
    frame[float_cols] = frame[float_cols].astype(int)

    frame.dropna(inplace=True)

In [9]:
# Display the cleaned 2022 frame.
df

Unnamed: 0,lighting_condition,location,intersection,weather_condition,type_of_collision,road_category,traffic_regime,number_of_traffic_lanes,reserved_lane_present,longitudinal_profile,horizontal_alignment,pavement_condition,infrastructure,accident_situation,speed_limit,position,user_category,injury_severity,sex,pedestrian_location,pedestrian_action,direction_of_travel,vehicle_category,fixed_obstacle_struck,mobile_obstacle_struck,initial_point_of_impact,main_maneuver_before_accident,motor_type,direction_of_travel_other,vehicle_category_other,fixed_obstacle_struck_other,mobile_obstacle_struck_other,initial_point_of_impact_other,main_maneuver_before_accident_other,motor_type_other,vehicle_category_involved_bicycle,vehicle_category_involved_bus_coach,vehicle_category_involved_hgv_truck,vehicle_category_involved_light_motor_vehicle,vehicle_category_involved_other,vehicle_category_involved_powered_2_3_wheeler,age,used_belt,used_helmet,used_child_restraint,used_airbag
0,1,2,3,1,3,4,2,2,0,1,1,1,0,1,50,1,1,3,1,-1,-1,1,powered_2_3_wheeler,0,2,1,9,1,1,light_motor_vehicle,0,2,2,1,1,0,0,0,1,0,1,14,0,1,0,0
1,1,2,3,1,3,4,2,2,0,1,1,1,0,1,50,1,1,1,1,-1,-1,1,light_motor_vehicle,0,2,2,1,1,1,powered_2_3_wheeler,0,2,1,9,1,0,0,0,1,0,1,74,1,0,0,0
2,1,2,3,1,3,4,2,2,0,1,1,1,0,1,50,1,1,2,1,0,0,2,light_motor_vehicle,0,2,8,15,1,2,light_motor_vehicle,0,2,1,1,1,0,0,0,2,0,0,34,1,0,0,0
3,1,2,3,1,3,4,2,2,0,1,1,1,0,1,50,1,1,1,1,0,0,2,light_motor_vehicle,0,2,1,1,1,2,light_motor_vehicle,0,2,8,15,1,0,0,0,2,0,0,52,1,0,0,0
4,1,2,6,1,2,3,-1,2,0,1,1,1,5,1,50,1,1,1,1,-1,-1,2,light_motor_vehicle,0,2,1,2,1,1,light_motor_vehicle,0,2,4,2,1,0,0,0,2,0,0,20,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126657,1,1,3,1,3,3,2,2,0,1,1,1,0,1,80,1,1,1,2,0,0,2,light_motor_vehicle,0,0,8,19,1,2,light_motor_vehicle,0,2,1,1,1,0,0,0,2,0,0,20,1,0,0,0
126658,1,1,3,1,3,3,2,2,0,1,1,1,0,1,80,8,2,3,2,0,0,2,light_motor_vehicle,0,0,8,19,1,2,light_motor_vehicle,0,2,1,1,1,0,0,0,2,0,0,18,1,0,0,0
126659,1,1,3,1,3,3,2,2,0,1,1,1,0,1,80,1,1,2,2,0,0,2,light_motor_vehicle,0,2,1,1,1,2,light_motor_vehicle,0,0,8,19,1,0,0,0,2,0,0,69,1,0,0,0
126660,1,2,1,1,2,3,3,4,0,1,1,1,0,1,70,1,1,3,1,-1,-1,1,powered_2_3_wheeler,0,2,1,1,1,1,light_motor_vehicle,0,0,4,21,1,0,0,0,1,0,1,30,0,1,0,0


## Create Train and Test Set

In [27]:
# Training features and target both come from the cleaned 2022 frame.
X_train = df.drop(columns='injury_severity')
y_train = df['injury_severity']

In [28]:
# Test features and target come from the cleaned 2023 frame.
X_test = df_test.drop(columns='injury_severity')
y_test = df_test['injury_severity']

## Create a Baseline Model - K Nearest Neighbor Classifier

In [None]:
# Columns that are one-hot encoded by the pipelines below.
cat_cols = [
    # accident context
    'lighting_condition', 'location', 'intersection', 'weather_condition', 'type_of_collision',
    # road description
    'road_category', 'traffic_regime', 'reserved_lane_present', 'longitudinal_profile',
    'horizontal_alignment', 'pavement_condition', 'infrastructure', 'accident_situation',
    # person
    'user_category', 'sex', 'pedestrian_location', 'pedestrian_action',
    # vehicle of the person
    'direction_of_travel', 'vehicle_category', 'fixed_obstacle_struck',
    'mobile_obstacle_struck', 'initial_point_of_impact', 'main_maneuver_before_accident', 'motor_type',
    # the "_other" counterparts of the vehicle columns
    'direction_of_travel_other', 'vehicle_category_other', 'fixed_obstacle_struck_other',
    'mobile_obstacle_struck_other', 'initial_point_of_impact_other',
    'main_maneuver_before_accident_other', 'motor_type_other',
]

# Every remaining feature column is handled as numeric.
num_cols = X_train.columns.difference(cat_cols)

In [None]:
# Baseline: one-hot encode categoricals, scale numerics to [0, 1], oversample, then fit KNN.
transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', MinMaxScaler(), num_cols),
    ],
    remainder='passthrough',
)

pipeline = ImbPipeline(steps=[
    ('pre', transformer),
    # TODO: try RandomUnderSampler() or SMOTE for numeric-only
    ('os', SMOTE(k_neighbors=50, random_state=42)),
    ('clf', KNeighborsClassifier(n_neighbors=50, n_jobs=-1)),
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('os', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,50

0,1,2
,n_neighbors,50
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,-1


In [32]:
# Predict on the 2023 test features with the fitted pipeline and print
# per-class precision, recall and F1.
y_predict = pipeline.predict(X_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           1       0.83      0.68      0.75     53399
           2       0.66      0.33      0.44     49603
           3       0.28      0.49      0.36     19271
           4       0.09      0.62      0.15      3398

    accuracy                           0.51    125671
   macro avg       0.47      0.53      0.42    125671
weighted avg       0.66      0.51      0.55    125671



## Logistic Regression Model

In [33]:
# One-hot encode the categorical columns; all remaining columns pass through unchanged.
transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols)
], remainder='passthrough')

pipeline = ImbPipeline(steps=[
    ('pre', transformer),
    # NOTE(review): SMOTEN is designed for purely categorical inputs, yet the
    # passthrough columns (e.g. age, speed_limit) reach it as well - confirm this is intended.
    # TODO: try RandomUnderSampler() or SMOTE for numeric-only
    ('os', SMOTEN(k_neighbors=50, random_state=42, sampling_strategy='minority')),
    # `multi_class` is deprecated in scikit-learn; with the default lbfgs solver a
    # target with more than two classes is fitted as multinomial anyway, so the argument is dropped.
    # max_iter=2000 matches the estimator that produced the recorded report
    # (its parameter table shows 2000, not 100).
    ('clf', LogisticRegression(max_iter=2000, n_jobs=-1, class_weight={1:1, 2:8, 3:27, 4:64}, penalty='l2')),
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('os', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categorical_encoder,
,sampling_strategy,'minority'
,random_state,42
,k_neighbors,50

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{1: 1, 2: 8, 3: 27, 4: 64}"
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [34]:
# Score the logistic-regression pipeline on the 2023 test set.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.96      0.50      0.66     53399
           2       0.57      0.61      0.59     49603
           3       0.34      0.58      0.43     19271
           4       0.14      0.43      0.21      3398

    accuracy                           0.56    125671
   macro avg       0.50      0.53      0.47    125671
weighted avg       0.69      0.56      0.58    125671



## Ordinal Logistic Regression

In [56]:
# Encoding/scaling step that runs after resampling.
pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder='passthrough',
)

# SMOTENC takes positional indices, so translate the categorical column names
# into their positions within X_train.
col_dict = dict(zip(X_train.columns, range(len(X_train.columns))))
col_numbers = [col_dict[name] for name in cat_cols]

# Resample the raw features first, then encode/scale, then fit the ordinal model.
pipeline = ImbPipeline(steps=[
    ('sd', SMOTENC(categorical_features=col_numbers, k_neighbors=50,
                   sampling_strategy='minority', random_state=42)),
    ('pre', pre),
    ('ord', LogisticIT()),  # mord ordinal logistic regression (immediate-threshold variant)
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('sd', ...), ('pre', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categorical_features,"[0, 1, ...]"
,categorical_encoder,
,sampling_strategy,'minority'
,random_state,42
,k_neighbors,50

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1.0
,verbose,0.0
,max_iter,1000.0


In [57]:
# Score the SMOTENC + LogisticIT pipeline on the 2023 test set.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.78      0.77      0.77     53399
           2       0.56      0.70      0.63     49603
           3       0.46      0.23      0.31     19271
           4       0.17      0.05      0.08      3398

    accuracy                           0.64    125671
   macro avg       0.49      0.44      0.45    125671
weighted avg       0.63      0.64      0.63    125671



In [59]:
# Same encoder/scaler as the previous ordinal model, but here the data is
# encoded first and oversampled afterwards with BorderlineSMOTE.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
    ('num', StandardScaler(), num_cols)
], remainder='passthrough')

pipeline = ImbPipeline([
    ('pre', pre),
    # Seeded like the other samplers in this notebook so the resampling (and the
    # report below) is reproducible across runs.
    ('sd', BorderlineSMOTE(random_state=42)),
    ('ord', LogisticIT())  # mord ordinal logistic regression (immediate-threshold variant)
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('sd', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,sampling_strategy,'auto'
,random_state,
,k_neighbors,5
,m_neighbors,10
,kind,'borderline-1'

0,1,2
,alpha,1.0
,verbose,0.0
,max_iter,1000.0


In [60]:
# Score the BorderlineSMOTE + LogisticIT pipeline on the 2023 test set.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.77      0.74      0.76     53399
           2       0.59      0.45      0.51     49603
           3       0.31      0.38      0.34     19271
           4       0.15      0.56      0.24      3398

    accuracy                           0.57    125671
   macro avg       0.46      0.53      0.46    125671
weighted avg       0.61      0.57      0.58    125671



In [None]:
import numpy as np
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# 1. Encode the features. statsmodels works on dense arrays, so the encoder must
#    not emit a sparse matrix (the default one-hot output can make the whole
#    ColumnTransformer result sparse).
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), cat_cols),
], remainder='passthrough')

Xtr = pre.fit_transform(X_train)
Xte = pre.transform(X_test)

# 2. Build ordered logit model
#    `distr="logit"` gives proportional-odds logistic regression.
#    No intercept column is added: OrderedModel estimates the class thresholds
#    itself and its exog must not contain a constant.
model = OrderedModel(
    y_train.astype(int),
    Xtr,
    distr="logit"
)

# 3. Fit with the BFGS optimizer and print the coefficient table.
res = model.fit(method="bfgs")
print(res.summary())

In [None]:
# --- predict class probabilities for every test row; show only the first 5 rows ---
probs = res.model.predict(res.params, exog=Xte, which='prob')
print("Class probability rows:\n", probs[:5])

# Column k of `probs` is matched to the k-th smallest label seen in training.
classes = np.sort(y_train.unique())

# Predicted label = class with the highest probability in each row.
y_pred = classes[probs.argmax(axis=1)]

In [None]:
# Per-class precision/recall/F1 of the current `y_pred` against the 2023 labels.
print(classification_report(y_test, y_pred))

In [None]:
# Dense one-hot encoding for the categoricals, [0, 1] scaling for the numerics.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('num', MinMaxScaler(), list(num_cols)),
], remainder='passthrough')

pipeline = Pipeline([
    ('pre', pre),
    # `LAD` is imported by name from mord in the import cell; the module name
    # `mord` itself is never bound, so `mord.LAD()` would raise NameError.
    # LAD is mord's least-absolute-deviation (regression-based) ordinal model,
    # not a proportional-odds logistic regression.
    ('ord', LAD())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [None]:
# Report for the predictions currently held in `y_pred`, scored against the 2023 labels.
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Compute correlation matrix
corr_matrix = df.select_dtypes(include=['number']).corr()

# Plot heatmap
plt.figure(figsize=(30, 20))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, vmin=-1, vmax=1)
plt.title("Feature Correlation Matrix With Target", fontsize=16)
plt.show()

In [None]:
from catboost import CatBoostRegressor, Pool
import numpy as np

# NOTE(review): this flags *every* feature column as categorical, including
# numeric-looking ones such as age and speed_limit - confirm that is intended.
cat_features_idx = range(X_train.shape[1])

train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_features_idx)

# Treat the severity code as a continuous target and fit a GPU regressor.
model = CatBoostRegressor(
    loss_function='RMSE',
    num_trees=200,
    depth=10,
    learning_rate=0.05,
    task_type='GPU',
    devices='0',
)

model.fit(train_pool, verbose=False)

# Round the continuous predictions to the nearest integer and clamp to the label range 1..4.
y_pred_cont = model.predict(test_pool)
y_pred = np.clip(np.rint(y_pred_cont), 1, 4).astype(int)

In [None]:
# Classification metrics for the rounded regressor predictions held in `y_pred`.
print(classification_report(y_test, y_pred))

In [None]:
# Randomly duplicate rows of the smallest class. Seeded like the other samplers
# in this notebook so the resampled training set is reproducible.
# The global `cat_cols` / `num_cols` lists are deliberately left untouched: this
# cell never used its own versions, and overwriting them would silently change
# the earlier pipelines on a re-run.
X_res, y_res = RandomOverSampler(sampling_strategy='minority', random_state=42).fit_resample(X_train, y_train)

# NOTE(review): every feature column is flagged as categorical, including
# numeric-looking ones such as age and speed_limit - confirm that is intended.
cat_features_idx = range(len(X_train.columns))

train_pool = Pool(X_res, y_res, cat_features=cat_features_idx)
test_pool  = Pool(X_test, y_test, cat_features=cat_features_idx)

model = CatBoostRegressor(
    depth=10,
    learning_rate=0.05,
    iterations=800,
    loss_function='MAPE',  # alternative: 'RMSE'
    task_type='GPU',
    devices='0'
)

model.fit(train_pool, verbose=False)

# Round the continuous predictions to the nearest integer and clamp to the label range 1..4.
y_pred_cont = model.predict(test_pool)
y_pred = np.clip(np.rint(y_pred_cont), 1, 4).astype(int)

In [None]:
# Classification metrics for the `y_pred` produced by the preceding cell.
print(classification_report(y_test, y_pred))


In [None]:
from catboost import CatBoostClassifier, Pool
import numpy as np

# NOTE(review): every feature column is flagged as categorical, including
# numeric-looking ones such as age and speed_limit - confirm that is intended.
cat_features_idx = range(X_train.shape[1])

train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_features_idx)

# Multiclass GPU classifier; class weights are balanced automatically by CatBoost.
model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='TotalF1',
    auto_class_weights='Balanced',
    iterations=1600,
    depth=6,
    learning_rate=0.005,
    task_type='GPU',
    devices='0',
)
model.fit(train_pool, verbose=False)

y_pred = model.predict(test_pool)

In [None]:
# Per-class precision/recall/F1 of the latest `y_pred` against the 2023 labels.
print(classification_report(y_test, y_pred))