In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
config = {
    # Model Dependent
    "iterations": 500,  
    "learning_rate": 0.05,  
    "depth": 6,           
    "l2_leaf_reg": 6,                 
    "bagging_temperature":0.4,                         
    "random_state":69
}

In [3]:
# Import dataset
train_df = pd.read_csv('preprocessed_data/train_data.csv', index_col="Claim Identifier")

In [4]:
train_df["Claim Injury Type Encoded"].value_counts()

Claim Injury Type Encoded
1    290695
3    148370
2     68858
4     48248
0     12414
5      4203
7       457
6        97
Name: count, dtype: int64

In [5]:
def map_claim_injury_type(y_series):
    # Define the mapping rules
    mapping = {
        0: [1, 3],        # Map values 1 and 3 to 0
        1: [2, 4, 0, 5, 6, 7]   # Map values 2, 4, and 0 to 1
    }

    # Create a reverse mapping for efficient lookup
    reverse_mapping = {value: key for key, values in mapping.items() for value in values}

    # Map the series using the reverse mapping
    return y_series.map(reverse_mapping)

In [6]:
X = train_df.drop(["Claim Injury Type Encoded"], axis = 1)
y = train_df["Claim Injury Type Encoded"]

In [7]:
feature_selection = []

In [8]:
y_mapped = map_claim_injury_type(train_df["Claim Injury Type Encoded"])

In [9]:
y.value_counts()

Claim Injury Type Encoded
1    290695
3    148370
2     68858
4     48248
0     12414
5      4203
7       457
6        97
Name: count, dtype: int64

In [10]:
y_mapped.value_counts()

Claim Injury Type Encoded
0    439065
1    117106
2     17171
Name: count, dtype: int64

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X,y_mapped, test_size = 0.2, stratify = y, shuffle = True)

In [12]:
X_train, X_val = apply_frequency_encoding(X_train, X_val)

NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [13]:
scaler = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features]  = scaler.transform(X_train[numerical_features])
X_val[numerical_features]  = scaler.transform(X_val[numerical_features]) 

In [14]:
drop_list = ["Average Weekly Wage"]
if feature_selection != []:
    for col in X_train.columns:
        if col not in feature_selection:
            drop_list.append(col)
X_train = X_train.drop(drop_list, axis=1)
X_val = X_val.drop(drop_list, axis=1)

In [15]:
# Model para 0, 1 ou 2

In [16]:
model = CatBoostClassifier(
        iterations=config["iterations"],
        learning_rate=config["learning_rate"],
        depth=config["depth"],
        l2_leaf_reg=config["l2_leaf_reg"],
        bagging_temperature=config["bagging_temperature"],
        # -------------------
        random_state=config["random_state"],
        custom_metric='F1', 
        early_stopping_rounds=50,
        verbose=0
    )

In [17]:
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x242b65db7d0>

In [18]:
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

In [19]:
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89    351252
           1       0.71      0.33      0.45     93684
           2       0.76      0.34      0.47     13737

    accuracy                           0.81    458673
   macro avg       0.76      0.54      0.60    458673
weighted avg       0.80      0.81      0.79    458673



In [20]:
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89     87813
           1       0.70      0.32      0.44     23422
           2       0.75      0.34      0.47      3434

    accuracy                           0.81    114669
   macro avg       0.76      0.54      0.60    114669
weighted avg       0.80      0.81      0.78    114669



In [21]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, stratify = y, shuffle = True)

In [22]:
X_train, X_val = apply_frequency_encoding(X_train, X_val)
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [23]:
scaler = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features]  = scaler.transform(X_train[numerical_features])
X_val[numerical_features]  = scaler.transform(X_val[numerical_features]) 

In [24]:
drop_list = ["Average Weekly Wage"]
if feature_selection != []:
    for col in X_train.columns:
        if col not in feature_selection:
            drop_list.append(col)
X_train = X_train.drop(drop_list, axis=1)
X_val = X_val.drop(drop_list, axis=1)

In [25]:
y_mapped_train = model.predict(X_train)
y_mapped_val = model.predict(X_val)
X_train["rarity"] = y_mapped_train
X_val["rarity"] = y_mapped_val

In [26]:
model = CatBoostClassifier(
        iterations=config["iterations"],
        learning_rate=config["learning_rate"],
        depth=config["depth"],
        l2_leaf_reg=config["l2_leaf_reg"],
        bagging_temperature=config["bagging_temperature"],
        # -------------------
        random_state=config["random_state"],
        custom_metric='F1', 
        early_stopping_rounds=50,
        verbose=0
    )

In [27]:
model.fit(X_train,y_train)

<catboost.core.CatBoostClassifier at 0x242b6506e40>

In [28]:
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

In [29]:
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred))

Confusion Matrix:
[[  4812   4383    126    577     28      0      0      5]
 [  1559 223783    942   5818    432      0      0     22]
 [    51  32541   3750  15778   2956      0      0     10]
 [    46  28489   1919  80530   7656      3      0     53]
 [     4   1538    546  13706  22803      1      0      0]
 [     0      8     36   2796    519      3      0      0]
 [     0      2      0     66     10      0      0      0]
 [     3     51      7    178      8      0      0    119]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.48      0.59      9931
           1       0.77      0.96      0.86    232556
           2       0.51      0.07      0.12     55086
           3       0.67      0.68      0.68    118696
           4       0.66      0.59      0.62     38598
           5       0.43      0.00      0.00      3362
           6       0.00      0.00      0.00        78
           7       0.57      0.33      0.41       36

In [30]:
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Confusion Matrix:
[[  4812   4383    126    577     28      0      0      5]
 [  1559 223783    942   5818    432      0      0     22]
 [    51  32541   3750  15778   2956      0      0     10]
 [    46  28489   1919  80530   7656      3      0     53]
 [     4   1538    546  13706  22803      1      0      0]
 [     0      8     36   2796    519      3      0      0]
 [     0      2      0     66     10      0      0      0]
 [     3     51      7    178      8      0      0    119]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.47      0.58      2483
           1       0.77      0.96      0.85     58139
           2       0.50      0.07      0.12     13772
           3       0.67      0.67      0.67     29674
           4       0.66      0.59      0.62      9650
           5       0.00      0.00      0.00       841
           6       0.00      0.00      0.00        19
           7       0.44      0.19      0.26        9