In [1]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import StratifiedKFold

# Ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import IsolationForest

from sklearn.metrics import classification_report, f1_score

from utils import *
from utils_dicts import *

from functools import partial

import warnings
# Silences every warning category for the rest of the session, including
# deprecation warnings emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Reload edited modules (e.g. utils, utils_dicts) automatically before each
# cell runs, so changes to the helper files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
import os
# Set before any Tune run is launched in this notebook.
# NOTE(review): presumably selects Ray's newer console output for Tune —
# confirm against the installed Ray version.
os.environ["RAY_AIR_NEW_OUTPUT"] = "1"

In [4]:
try:
    import torch
    import ray
except:
    !pip install torch
    import torch
    import ray
device = "GPU" if torch.cuda.is_available() else "CPU"

In [5]:
import time
# Wall-clock reference for the experiment; `start` and `start_time` are two
# names bound to the same timestamp.
start = start_time = time.time()

In [6]:
# TODO: do this for CatBoost, XGBoost, GB, LightGBM and one model from the list

In [7]:
# Start a local Ray instance with the dashboard enabled.
# NOTE(review): Ray documents local_mode=True as a deprecated debugging mode
# that executes work serially in the driver process — confirm this is intended
# for a search of this size.
ray.init(local_mode=True,include_dashboard=True)

2024-12-13 08:24:32,576	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.9
Ray version:,2.37.0
Dashboard:,http://127.0.0.1:8265


In [8]:
# Show the resources (CPU, GPU, memory, ...) Ray reports as available.
print(ray.available_resources())

{'accelerator_type:G': 1.0, 'node:__internal_head__': 1.0, 'CPU': 24.0, 'memory': 13583740110.0, 'node:127.0.0.1': 1.0, 'object_store_memory': 6791870054.0, 'GPU': 1.0}


In [9]:
# Load the preprocessed training data, using "Claim Identifier" as the index.
train_df = pd.read_csv("./preprocessed_data/train_data.csv", index_col="Claim Identifier")

In [10]:
# Cross-check the feature lists against the columns of the loaded frame.
listed_features = numerical_features + categorical_features

# Features named in the lists but absent from the loaded frame.
for col in (name for name in listed_features if name not in train_df.columns):
    print(col)
print("-------")
# Columns of the loaded frame that neither list mentions.
for col in (name for name in train_df.columns if name not in listed_features):
    print(col)

Enc County of Injury
Enc District Name
Enc Industry Code
Enc WCIO Cause of Injury Code
Enc WCIO Nature of Injury Code
Enc WCIO Part Of Body Code
Enc Zip Code
Relative_Wage
Financial Impact Category
Age_Group
-------
Accident Date
County of Injury
District Name
Industry Code
WCIO Cause of Injury Code
WCIO Nature of Injury Code
WCIO Part Of Body Code
Zip Code
Claim Injury Type Encoded


In [11]:
# Install Ray and Cuda

# What to do with NAs in Wage (and Industry Code)

# Model Selection

    # Feature Selection

    # Kfold load (Have a script that will create 3 versions for Kfold for 4-6 folds)
    # Impute

    # Which models (catboosted)
    # Compare validation to 10% test

# Model Gridsearch
    # Find 2-3 models that are good
    # Look at the parameters and create dict with them
    # Run Ray.Tune on models and parameters - how?
    # Compare best for each model to 10%

# Test Predict
    # Train best model
    # Predict
    # Profit

Claim Injury Type
- 2. NON-COMP        291078
- 4. TEMPORARY       148507
- 3. MED ONLY         68906
- 5. PPD SCH LOSS     48280
- 1. CANCELLED        12477
- 6. PPD NSL           4211
- 8. DEATH              470
- 7. PTD                 97

In [12]:
# Separate the encoded target from the predictor columns.
y = train_df["Claim Injury Type Encoded"]
X = train_df.drop(columns=["Claim Injury Type Encoded"])

In [13]:
# Stratified hold-out split. random_state pins the shuffle so the split, and
# every result derived from it, is reproducible across kernel restarts.
# NOTE(review): test_size=0.75 leaves only 25% of the rows for training —
# confirm this is a deliberate speed trade-off and not meant to be 0.25.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.75, stratify=y, shuffle=True, random_state=42
)

In [14]:
# Project helper (star-imported); its two return values replace the train and
# validation frames. NOTE(review): presumably the encoding is learned from the
# training split only — verify in utils.
X_train, X_val = apply_frequency_encoding(X_train, X_val)

In [15]:
# NOTE(review): the return value is discarded, so this cell only has an effect
# if NA_imputer modifies both frames in place — verify in utils.
NA_imputer(X_train,X_val)

In [16]:
# NOTE(review): the return value is discarded, so this cell only has an effect
# if create_new_features adds its columns to both frames in place — verify in utils.
create_new_features(X_train,X_val)

In [17]:
# Standardise the numerical columns; the scaler statistics come from the
# training split only and are then applied to both splits.
scaler_train = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features] = scaler_train.transform(X_train[numerical_features])
X_val[numerical_features] = scaler_train.transform(X_val[numerical_features])

In [18]:
#X_train = X_train[features_to_use]
#X_val = X_val[features_to_use]

In [19]:
# Store the prepared splits in Ray's object store; the trainable fetches them
# through these references with ray.get.
X_train_ray, y_train_ray, X_val_ray, y_val_ray = map(
    ray.put, (X_train, y_train, X_val, y_val)
)

In [20]:
# Exhaustive grid for the CatBoost search. Every entry becomes a
# tune.grid_search axis, so the number of trials is the product of all list
# lengths below: 933,120 combinations.
# NOTE(review): the trainable resamples only when exactly one of the two
# switches is True, so (True, True) and (False, False) train on the same data.
# NOTE(review): CatBoost documents min_data_in_leaf as applying to the
# Lossguide and Depthwise grow policies — check how SymmetricTree trials behave.
search_space = {
    name: tune.grid_search(candidates)
    for name, candidates in {
        # CatBoost hyperparameters
        "iterations": [200, 500, 800, 1000, 1500, 2000],
        "learning_rate": [0.03, 0.05, 0.1, 0.15, 0.2],
        "depth": [6, 8, 10],
        "l2_leaf_reg": [5, 10, 15],
        "bagging_temperature": [0, 0.1, 0.5, 1],
        "border_count": [32, 64, 128],
        "random_strength": [0, 0.1, 0.5, 1],
        "grow_policy": ["SymmetricTree", "Depthwise"],
        "min_data_in_leaf": [1, 5, 10],
        "colsample_bylevel": [0.5, 0.7, 0.9],
        # Resampling switches read by the trainable.
        # Currently disabled axes: use_feature_selection, task_type,
        # use_Isolation_Forests.
        "use_SMOTE": [True, False],
        "use_RandomUnderSampler": [True, False],
    }.items()
}

In [21]:
def CatBoosted_GridSearch(config):
    """Ray Tune trainable: fit one CatBoost model and report validation macro-F1.

    The train/validation splits are fetched from the Ray object store through
    the module-level references ``X_train_ray``, ``y_train_ray``,
    ``X_val_ray`` and ``y_val_ray``.

    Args:
        config: Trial configuration. Must contain the CatBoost hyperparameters
            ``iterations``, ``learning_rate``, ``depth``, ``l2_leaf_reg``,
            ``bagging_temperature``, ``border_count``, ``random_strength``,
            ``grow_policy``, ``min_data_in_leaf`` and ``colsample_bylevel``,
            plus the boolean switches ``use_SMOTE`` and
            ``use_RandomUnderSampler``. An optional ``random_state`` key
            (default 42) seeds the resampler.

    Reports:
        ``{"f1_score": value}`` through ``session.report``, where ``value`` is
        the macro-averaged F1 on the validation split. The function returns
        nothing.
    """
    X_train_gridsearch = ray.get(X_train_ray)
    y_train_gridsearch = ray.get(y_train_ray)
    X_val_gridsearch = ray.get(X_val_ray)
    y_val_gridsearch = ray.get(y_val_ray)

    # Seed the resamplers: without a fixed seed two trials with the same
    # configuration train on different data, which makes scores incomparable.
    seed = config.get("random_state", 42)

    # Resample only when exactly one switch is set. With both set or both
    # unset the training data is used unchanged (same as before).
    if config["use_SMOTE"] and not config["use_RandomUnderSampler"]:
        resampler = SMOTE(random_state=seed)
    elif config["use_RandomUnderSampler"] and not config["use_SMOTE"]:
        resampler = RandomUnderSampler(random_state=seed)
    else:
        resampler = None

    if resampler is not None:
        X_train_gridsearch, y_train_gridsearch = resampler.fit_resample(
            X_train_gridsearch, y_train_gridsearch
        )

    # Feature-subset selection and Isolation Forest filtering are currently
    # disabled; the matching search-space keys are commented out as well.

    # Hyperparameters forwarded unchanged from the trial configuration.
    tuned_params = (
        "iterations",
        "learning_rate",
        "depth",
        "l2_leaf_reg",
        "bagging_temperature",
        "border_count",
        "random_strength",
        "grow_policy",
        "min_data_in_leaf",
        "colsample_bylevel",
    )

    model = CatBoostClassifier(
        **{name: config[name] for name in tuned_params},
        # -------------------
        loss_function="MultiClass",
        eval_metric="MultiClass",
        custom_metric='F1',
        verbose=0,
    )

    model.fit(X_train_gridsearch, y_train_gridsearch)

    # Predict on validation data. CatBoost may return multiclass labels as a
    # column vector, so flatten before scoring.
    y_pred = np.ravel(model.predict(X_val_gridsearch))

    # Compute F1 score
    f1 = f1_score(y_val_gridsearch, y_pred, average="macro")

    # Report results to Ray Tune
    session.report({"f1_score": f1})

In [None]:
# Launch the search over `search_space`.
# The trainable reports a single final f1_score per trial, so an
# early-stopping scheduler has no intermediate results to act on: the former
# ASHAScheduler(grace_period=5) could never stop a trial. The metric and mode
# are therefore given to tune.run directly and trials run under the default
# scheduler.
# NOTE(review): one GPU is reserved per trial although the trainable does not
# set CatBoost's task_type — confirm the reservation is wanted.
analysis = tune.run(
    CatBoosted_GridSearch,
    config=search_space,
    metric="f1_score",
    mode="max",
    resources_per_trial={"cpu": 8, "gpu": 1},
    trial_dirname_creator=custom_trial_dirname,
    verbose=1,
)

In [None]:
# Select the trial with the highest reported f1_score and show its
# configuration together with the score from its last reported result.
best_trial = analysis.get_best_trial(metric="f1_score", mode="max")
print("Best trial config: ", best_trial.config)
print("Best trial final F1 score: ", best_trial.last_result["f1_score"])

In [None]:
# Elapsed wall-clock time since start_time was recorded, converted from
# seconds to hours.
end_time = time.time()
hours_passed = (end_time - start_time) / 3600
print(f"It took {hours_passed:.2f} hours")

In [None]:
# Number of trials recorded by the Tune run.
total_trials = len(analysis.trials)
print(f"Total number of trials: {total_trials}")

In [None]:
# Deliberate stop for "Run All": the cells below retrain and save the final
# model and should be executed manually. A bare `break` outside a loop is a
# SyntaxError; raise an explicit, self-explaining exception instead.
raise SystemExit("Stopping here: run the final-model cells below manually.")

In [None]:
import joblib

# Retrain with the winning trial's full configuration and persist the model.
best_config = best_trial.config

# Repeat the resampling choice of the winning trial so the final model is
# trained the same way as the trial that produced the best score.
X_fit, y_fit = X_train, y_train
if best_config["use_SMOTE"] and not best_config["use_RandomUnderSampler"]:
    X_fit, y_fit = SMOTE(random_state=42).fit_resample(X_train, y_train)
elif best_config["use_RandomUnderSampler"] and not best_config["use_SMOTE"]:
    X_fit, y_fit = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

# Every hyperparameter that was tuned must be forwarded. Previously only four
# of the ten were passed, so the saved model did not match the best trial.
tuned_params = (
    "iterations",
    "learning_rate",
    "depth",
    "l2_leaf_reg",
    "bagging_temperature",
    "border_count",
    "random_strength",
    "grow_policy",
    "min_data_in_leaf",
    "colsample_bylevel",
)

best_model = CatBoostClassifier(
    **{name: best_config[name] for name in tuned_params},
    loss_function="MultiClass",
    eval_metric="MultiClass",
    custom_metric=['F1'],
    verbose=0,
)

# Fit the model
best_model.fit(X_fit, y_fit)

# Save the trained model to a file
model_path = "./best_catboost_model.joblib"
joblib.dump(best_model, model_path)

print(f"Best model saved to {model_path}")

In [None]:
# Predict on validation data
y_pred = best_model.predict(X_val)

print(classification_report(y_val, y_pred))