In [3]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import StratifiedKFold

# Ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, f1_score

from utils import *
from utils_dicts import *

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [4]:
# Do this for CatBoost, XGBoost, Gradient Boosting, LightGBM and one model from the list

In [5]:
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [6]:
import ray

# Start (or reuse) the local Ray runtime. `ignore_reinit_error=True` keeps this
# cell safe to re-run: without it a second call in the same kernel raises
# because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2024-12-11 11:54:20,946	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.12.6
Ray version:,2.39.0
Dashboard:,http://127.0.0.1:8265


In [7]:
# Load the preprocessed training table, indexed by claim identifier
train_df = pd.read_csv(
    "./preprocessed_data/train_data.csv",
    index_col="Claim Identifier",
)

In [8]:
# Sanity check: compare the columns of the loaded frame with the declared feature lists.
# The combined list is built once instead of being re-concatenated on every iteration.
expected_columns = numerical_features + categorical_features

# Declared features that are missing from the data
for col in expected_columns:
    if col not in train_df.columns:
        print(col)
print("-------")
# Columns present in the data that neither feature list mentions
for col in train_df.columns:
    if col not in expected_columns:
        print(col)

-------
Accident Date
Claim Injury Type Encoded


In [9]:
# Install Ray and Cuda

# What to do with NAs in Wage (and Industry Code)

# Model Selection

    # Feature Selection

    # Kfold load (Have a script that will create 3 versions for Kfold for 4-6 folds)
    # Impute

    # Which models (catboosted)
    # Compare validation to 10% test

# Model Gridsearch
    # Find 2-3 models that are good
    # Look at the parameters and create dict with them
    # Run Ray Tune on the models and parameters - how?
    # Compare best for each model to 10%

# Test Predict
    # Train best model
    # Predict
    # Profit

Claim Injury Type
- 2. NON-COMP        291078
- 4. TEMPORARY       148507
- 3. MED ONLY         68906
- 5. PPD SCH LOSS     48280
- 1. CANCELLED        12477
- 6. PPD NSL           4211
- 8. DEATH              470
- 7. PTD                 97

In [10]:
# Separate the encoded target from the predictors
y = train_df["Claim Injury Type Encoded"]
X = train_df.drop(columns=["Claim Injury Type Encoded"])

In [11]:
# Stratified hold-out split. The fixed seed makes the split, and everything
# fitted on it, reproducible across kernel restarts.
# NOTE(review): test_size=0.75 sends 75% of the rows to validation and leaves
# only 25% for training - confirm this is intentional.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.75,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [12]:
# Run the project's NA_imputer helper on the train and validation frames.
# NOTE(review): the return value is discarded, so the helper presumably
# modifies both frames in place - confirm against its definition.
NA_imputer(X_train,X_val)

In [13]:
# Run the project's create_new_features helper on the train and validation frames.
# NOTE(review): the return value is discarded, so the helper presumably adds
# its columns to both frames in place - confirm against its definition.
create_new_features(X_train,X_val)

In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no validation information leaks in.
scaler_train = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features] = scaler_train.transform(X_train[numerical_features])
X_val[numerical_features] = scaler_train.transform(X_val[numerical_features])

In [15]:
# Instantiate the model
# Outlier detector with scikit-learn defaults. IsolationForest is randomised,
# so the seed is fixed to flag the same rows on every run.
iso_forest = IsolationForest(random_state=42)

In [26]:
# Fit the detector on the training split and label each of its rows
# (scikit-learn returns 1 for inliers and -1 for outliers).
y_pred = iso_forest.fit(X_train).predict(X_train)

In [27]:
# Index labels of the training rows flagged as outliers (-1).
# `y_pred` holds one label per row of X_train, so the mask must be applied to
# the training split; applying it to y_val mismatches in length and would
# return class values rather than row labels.
train_remove = X_train.index[y_pred == -1]

In [None]:
#X_train = X_train[features_to_use]
#X_val = X_val[features_to_use]

In [None]:
# Place each split in the Ray object store once, so the trial function can
# fetch it through the returned references.
X_train_ray, y_train_ray, X_val_ray, y_val_ray = map(
    ray.put, (X_train, y_train, X_val, y_val)
)

In [None]:
# Ray Tune search space for the CatBoost trials.
# The keys below are read by CatBoosted_GridSearch via `config[...]`.
search_space = {
    # Model dependent (CatBoost hyper-parameters)
    "iterations": tune.choice([500, 1000, 1500]),
    "learning_rate": tune.loguniform(0.01, 0.3),
    "depth": tune.choice([4, 6, 8, 10]),
    "l2_leaf_reg": tune.uniform(1, 10),
    # Always use (pipeline switches)
    # Each option is a list of column names to keep.
    "use_feature_selection": tune.choice([essential_features, reduced_features, all_features]),
    "use_SMOTE": tune.choice([True, False]),
    "use_RandomUnderSampler": tune.choice([True, False]),
    "use_Isolation_Forests": tune.choice([True, False]),
}

In [22]:
def CatBoosted_GridSearch(config):
    """Ray Tune trainable: fit one CatBoost configuration and report its macro F1.

    Parameters
    ----------
    config : dict
        One sample of the search space. Keys read here:
        ``iterations``, ``learning_rate``, ``depth``, ``l2_leaf_reg``
        (passed to ``CatBoostClassifier``), ``use_feature_selection`` (collection
        of column names to keep), ``use_SMOTE``, ``use_RandomUnderSampler`` and
        ``use_Isolation_Forests`` (booleans).

    Reports
    -------
    ``{"f1_score": <macro F1 on the validation split>}`` through ``session.report``.
    """
    # Fetch the splits from the Ray object store
    X_train_gridsearch = ray.get(X_train_ray)
    y_train_gridsearch = ray.get(y_train_ray)
    X_val_gridsearch = ray.get(X_val_ray)
    y_val_gridsearch = ray.get(y_val_ray)

    # Feature selection: keep only the columns named in the sampled feature list
    selected_features = set(config["use_feature_selection"])
    features_to_drop = [
        feature for feature in X_train_gridsearch.columns if feature not in selected_features
    ]
    X_train_gridsearch = X_train_gridsearch.drop(features_to_drop, axis=1)
    X_val_gridsearch = X_val_gridsearch.drop(features_to_drop, axis=1)

    # Resampling is applied to the training split only, and only when exactly one
    # of the two switches is on (both on or both off leaves the data untouched).
    # NOTE(review): SMOTE / RandomUnderSampler are not imported in the notebook's
    # visible import cell (the imblearn imports there are commented out); unless
    # another import provides them, these branches raise NameError.
    if config["use_SMOTE"] and not config["use_RandomUnderSampler"]:
        smote = SMOTE()
        X_train_gridsearch, y_train_gridsearch = smote.fit_resample(X_train_gridsearch, y_train_gridsearch)
    elif config["use_RandomUnderSampler"] and not config["use_SMOTE"]:
        rus = RandomUnderSampler()
        X_train_gridsearch, y_train_gridsearch = rus.fit_resample(X_train_gridsearch, y_train_gridsearch)

    # Outlier removal with an Isolation Forest.
    # The detector is fitted on the trial's own training frame, so its columns
    # match the frame it scores, and the flagged ROWS are removed from both the
    # features and the labels to keep them aligned.
    if config["use_Isolation_Forests"]:
        outlier_detector = IsolationForest()
        # fit_predict returns 1 for inliers and -1 for outliers
        inlier_mask = outlier_detector.fit_predict(X_train_gridsearch) == 1
        X_train_gridsearch = X_train_gridsearch[inlier_mask]
        y_train_gridsearch = y_train_gridsearch[inlier_mask]

    model = CatBoostClassifier(
        iterations=config["iterations"],
        learning_rate=config["learning_rate"],
        depth=config["depth"],
        l2_leaf_reg=config["l2_leaf_reg"],
        loss_function="MultiClass",
        eval_metric="MultiClass",
        custom_metric=['F1'],
        verbose=0
    )

    model.fit(X_train_gridsearch, y_train_gridsearch)

    # Predict on validation data (never resampled or filtered)
    y_pred = model.predict(X_val_gridsearch)

    # Macro F1 weights every class equally, whatever its support
    f1 = f1_score(y_val_gridsearch, y_pred, average="macro")

    # Report results to Ray Tune
    session.report({"f1_score": f1})

In [23]:
model_name = "CatBoosted"

# Launch the hyper-parameter search; ASHA ranks trials by the reported
# "f1_score" (higher is better).
# NOTE(review): `num_samples` is not set, so Tune draws a single configuration
# from `search_space` (the captured output shows one trial) - raise it to
# actually explore the space.
analysis = tune.run(
    CatBoosted_GridSearch,
    config=search_space,
    resources_per_trial={"cpu": 1},
    scheduler=ASHAScheduler(metric="f1_score", mode="max"),
    trial_dirname_creator=custom_trial_dirname_creator,
    verbose=1,
)

0,1
Current time:,2024-11-27 12:26:42
Running for:,00:01:45.01
Memory:,11.7/15.8 GiB

Trial name,status,loc,depth,iterations,l2_leaf_reg,learning_rate,iter,total time (s),f1_score
CatBoosted_GridSearch_9d2af_00000,TERMINATED,127.0.0.1:7152,6,500,9.9155,0.155753,1,100.11,0.409527


2024-11-27 12:26:42,266	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/ruipb/ray_results/CatBoosted_GridSearch_2024-11-27_12-24-57' in 0.0101s.
2024-11-27 12:26:42,284	INFO tune.py:1041 -- Total run time: 105.05 seconds (105.00 seconds for the tuning loop).


In [24]:
# Pick the trial with the highest reported macro F1 and show its configuration
best_trial = analysis.get_best_trial("f1_score", "max")
print("Best trial config: ", best_trial.config)
print("Best trial final F1 score: ", best_trial.last_result["f1_score"])

Best trial config:  {'iterations': 500, 'learning_rate': 0.1557532043305807, 'depth': 6, 'l2_leaf_reg': 9.91550442068039}
Best trial final F1 score:  0.40952725824329306


In [None]:
# Presumably a deliberate stop so that "Run All" does not continue into the
# final refit below. A bare `break` outside a loop only works by being a
# SyntaxError; raise an explicit, self-explaining error instead.
raise RuntimeError("Stop here: run the cells below manually once the tuning results have been reviewed.")

In [26]:
import joblib

# Hyper-parameters of the winning Ray Tune trial
best_config = best_trial.config

# Rebuild the classifier from the tuned values; the fixed settings mirror the
# ones used inside the trials.
# NOTE(review): only the four CatBoost keys are read here. Any `use_*` keys in
# best_config (feature subset, resampling, outlier removal) are ignored, so the
# refit may not match the pipeline the winning trial was scored with.
tuned_params = {
    name: best_config[name]
    for name in ("iterations", "learning_rate", "depth", "l2_leaf_reg")
}
best_model = CatBoostClassifier(
    **tuned_params,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    custom_metric=['F1'],
    verbose=0,
)

# Refit on the full training split
best_model.fit(X_train, y_train)

# Persist the fitted model to disk
model_path = "./best_catboost_model.joblib"
joblib.dump(best_model, model_path)

print(f"Best model saved to {model_path}")

0:	learn: 1.5023354	total: 411ms	remaining: 3m 25s
1:	learn: 1.2829187	total: 684ms	remaining: 2m 50s
2:	learn: 1.1416639	total: 958ms	remaining: 2m 38s
3:	learn: 1.0396625	total: 1.19s	remaining: 2m 28s
4:	learn: 0.9617694	total: 1.47s	remaining: 2m 25s
5:	learn: 0.9026528	total: 1.71s	remaining: 2m 21s
6:	learn: 0.8552358	total: 1.93s	remaining: 2m 15s
7:	learn: 0.8158666	total: 2.22s	remaining: 2m 16s
8:	learn: 0.7843445	total: 2.44s	remaining: 2m 12s
9:	learn: 0.7585362	total: 2.65s	remaining: 2m 9s
10:	learn: 0.7367144	total: 2.87s	remaining: 2m 7s
11:	learn: 0.7193482	total: 3.08s	remaining: 2m 5s
12:	learn: 0.7044131	total: 3.32s	remaining: 2m 4s
13:	learn: 0.6917866	total: 3.52s	remaining: 2m 2s
14:	learn: 0.6806786	total: 3.74s	remaining: 2m
15:	learn: 0.6717311	total: 3.94s	remaining: 1m 59s
16:	learn: 0.6635316	total: 4.16s	remaining: 1m 58s
17:	learn: 0.6565800	total: 4.37s	remaining: 1m 57s
18:	learn: 0.6508537	total: 4.59s	remaining: 1m 56s
19:	learn: 0.6457028	total: 4.8

In [27]:
# Predict on validation data
# Score the refitted model on the validation split
y_pred = best_model.predict(X_val)

# Per-class precision / recall / F1 plus the macro and weighted averages
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.44      0.56      2417
           1       0.85      0.98      0.91     64807
           2       0.53      0.08      0.13     15439
           3       0.71      0.88      0.79     33281
           4       0.69      0.53      0.60     10858
           5       0.00      0.00      0.00       947
           6       0.00      0.00      0.00        22
           7       0.51      0.20      0.29       105

    accuracy                           0.79    127876
   macro avg       0.51      0.39      0.41    127876
weighted avg       0.75      0.79      0.74    127876

