In [1]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import StratifiedKFold

# Ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.air import session

# embedded methods
from sklearn.linear_model import LassoCV
import scipy.stats as stats
from scipy.stats import chi2_contingency

from sklearn.preprocessing import LabelEncoder

#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import classification_report, f1_score

#from utils_train import *
from utils import *
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
import ray
ray.init()

2024-11-25 15:10:50,929	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.12.6
Ray version:,2.39.0
Dashboard:,http://127.0.0.1:8265


In [3]:
reference_date = pd.to_datetime("2023-01-01")

In [4]:
train_df = pd.read_csv("./preprocessed_data/train_data_after_EDA.csv", index_col="Claim Identifier")

In [5]:
numerical_features = [
    "Age at Injury",
    "WCIO Cause of Injury Code",
    "WCIO Nature of Injury Code",
    "WCIO Part Of Body Code",
    "Number of Dependents",
    "Years Past Accident",
    "Assembly Years past Accident",
    "Industry Code",
    "Birth Year",
    "Average Weekly Wage",
    "IME-4 Count"
]

categorical_features = [
    "Carrier Name",
    "Carrier Type",
    "County of Injury",
    "District Name",
    "Gender",
    "Medical Fee Region",
    "Zip Code"
]

binary_features = [
    "Alternative Dispute Resolution",
    "Attorney/Representative",
    "COVID-19 Indicator",
    "First Hearing Date Occurred",
    "C-2 Date Occurred",
    "C-3 Date Occurred"
]


In [6]:
for col in numerical_features+categorical_features+binary_features:
    if col not in train_df.columns:
        print(col)
print("-------")
for col in train_df.columns:
    if col not in numerical_features+categorical_features+binary_features:
        print(col)

-------
Accident Date
Claim Injury Type


# Remove Some NAs

In [7]:
train_df.dropna(subset = ["Accident Date", "Age at Injury","Birth Year","Years Past Accident","Assembly Years past Accident"], inplace=True)

In [8]:
X = train_df.drop(["Claim Injury Type"], axis = 1)
y = train_df["Claim Injury Type"]

In [9]:
# Install Ray and Cuda

# What to do with NAs in Wage (and Industry Code)

# Model Selection

    # Feature Selection

    # Kfold load (Have a script that will create 3 versions for Kfold for 4-6 folds)
    # Impute

    # Which models (catboosted)
    # Compare validation to 10% test

# Model Gridsearch
    # Find 2-3 models that are good
    # Look at the parameters and create dict with them
    # Run Ray.Tune on models and paramenter - how?
    # Compare best for each model to 10%

# Test Predict
    # Train best model
    # Predict
    # Profit

# Need to create function

# 10% data split

In [10]:
 X, X_test, y, y_test = train_test_split(X,y, test_size = 0.1, random_state = None, stratify = y, shuffle = True)

## Train Val Split

In [11]:
 X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.25, random_state = None, stratify = y, shuffle = True)

## Impute NAs

In [12]:
to_impute = ["Average Weekly Wage","Industry Code"]
percent_missing = X_train[to_impute].isnull().mean()
imputation_value = percent_missing / ((1 / 0.97) - 1)
for col in to_impute:
        X_train[col].fillna(imputation_value[col], inplace=True)
        X_val[col].fillna(imputation_value[col], inplace=True)

# Apply Feature Selection

Claim Injury Type
- 2. NON-COMP        291078
- 4. TEMPORARY       148507
- 3. MED ONLY         68906
- 5. PPD SCH LOSS     48280
- 1. CANCELLED        12477
- 6. PPD NSL           4211
- 8. DEATH              470
- 7. PTD                 97

In [13]:
features_to_drop = ["Number of Dependents","Assembly Years past Accident","Birth Year",
                   "Age at Injury","WCIO Part Of Body Code","WCIO Cause of Injury Code","Accident Date"
                   ]
features_to_scale = ["IME-4 Count","WCIO Nature of Injury Code",
                     "Years Past Accident","Industry Code","Average Weekly Wage"] + categorical_features

In [14]:
scaler = StandardScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_val[features_to_scale] = scaler.transform(X_val[features_to_scale])

In [15]:
X_train.drop(features_to_drop, axis=1, inplace=True)
X_val.drop(features_to_drop, axis=1, inplace=True)

In [16]:
train_df["Claim Injury Type"].value_counts()

Claim Injury Type
1    288028
3    147915
2     68618
4     48257
0     10743
5      4210
7       467
6        97
Name: count, dtype: int64

In [17]:
rarity_mapping = {
    1: -1,  # Extremely Common
    3: -1,  # Extremely Common
    2: 0,   # Common
    4: 0,   # Common
    0: 0,   # Common
    5: 1,   # Rare
    6: 1,   # Rare
    7: 1    # Rare
}

In [18]:
y_rarity = y_train.map(rarity_mapping)

In [19]:
from sklearn.ensemble import IsolationForest

In [20]:
# Train Isolation Forest
iso_forest = IsolationForest(random_state=42)
iso_forest.fit(X_train)

# Get anomaly scores
anomaly_scores = iso_forest.decision_function(X_train)

# Define thresholds for rarity categories
threshold_rare = np.percentile(anomaly_scores, 90)  # Top 10% rare
threshold_common = np.percentile(anomaly_scores, 50)  # Middle range for neutral/common

# Predict rarity categories
prep_rarity = np.where(
    anomaly_scores >= threshold_rare,  # Rare
    1,
    np.where(anomaly_scores < threshold_common, -1, 0)  # Neutral or extremely common
)


In [21]:
print(classification_report(y_rarity, prep_rarity))

              precision    recall  f1-score   support

          -1       0.66      0.43      0.52    294261
           0       0.14      0.24      0.18     86142
           1       0.00      0.00      0.00      3222

    accuracy                           0.39    383625
   macro avg       0.27      0.23      0.23    383625
weighted avg       0.54      0.39      0.44    383625



In [None]:
break

In [24]:
def Iso_Forest_GridSearch(config):
    X_train_rarity = ray.get(X_train_ISO)
    y_train_rarity = ray.get(y_train_ISO)

    model = IsolationForest(
         n_estimators = config["n_estimators"],
        max_samples = config["max_samples"],
        contamination = config["contamination"],
        max_features = config["max_features"],
        random_state = config["random_state"]
    )

    iso_forest.fit(X_train_rarity)
    
    # Get anomaly scores
    anomaly_scores = iso_forest.decision_function(X_train_rarity)
    
    # Define thresholds for rarity categories
    threshold_rare = np.percentile(anomaly_scores, config["threshold_rare_percentile"])  # Top 10% rare
    threshold_common = np.percentile(anomaly_scores, config["threshold_common_percentile"])  # Middle range for neutral/common
    
    # Predict rarity categories
    prep_rarity = np.where(
        anomaly_scores >= threshold_rare,  # Rare
        1,
        np.where(anomaly_scores < threshold_common, -1, 0)  # Neutral or extremely common
    )

    f1 = f1_score(y_train_rarity, prep_rarity, average="macro")

    # Report Results to Ray
    session.report({"f1_score": f1})
    

In [None]:
X_train_ISO = ray.put(X_train)
y_train_ISO = ray.put(y_rarity)

# Define the hyperparameter search space
search_space = {
    
    "n_estimators": tune.grid_search([100, 200, 300]),  
    "max_samples": tune.grid_search([0.1, 0.5, 1.0]),  
    "contamination": tune.grid_search([0.01, 0.05, 0.1, 0.2]), 
    "max_features": tune.grid_search([0.5, 1.0]), 

    # Thresholds for rarity categorization
    "threshold_rare_percentile": tune.grid_search([85, 90, 95]), 
    "threshold_common_percentile": tune.grid_search([45, 50, 55]), 

    "random_state": 42
}

# Execute the grid search
analysis = tune.run(
    Iso_Forest_GridSearch,
    config=search_space,
    resources_per_trial={"cpu": 1},  # Allocate 1 CPU per trial
    scheduler=ASHAScheduler(metric="f1_score", mode="max"),  # Manage trials efficiently
    verbose=1
)

0,1
Current time:,2024-11-25 15:35:13
Running for:,00:18:31.36
Memory:,15.0/15.8 GiB

Trial name,status,loc,contamination,max_features,max_samples,n_estimators,threshold_common_per centile,threshold_rare_perce ntile,iter,total time (s),f1_score
Iso_Forest_GridSearch_468b6_00256,RUNNING,127.0.0.1:4920,0.01,0.5,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00257,RUNNING,127.0.0.1:20208,0.05,0.5,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00258,RUNNING,127.0.0.1:12516,0.1,0.5,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00259,RUNNING,127.0.0.1:22580,0.2,0.5,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00260,RUNNING,127.0.0.1:8284,0.01,1.0,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00261,PENDING,,0.05,1.0,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00262,PENDING,,0.1,1.0,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00263,PENDING,,0.2,1.0,1.0,200,45,90,,,
Iso_Forest_GridSearch_468b6_00264,PENDING,,0.01,0.5,0.1,300,45,90,,,
Iso_Forest_GridSearch_468b6_00265,PENDING,,0.05,0.5,0.1,300,45,90,,,




In [None]:
# Retrieve the best configuration
best_config = analysis.get_best_config(metric="f1_score", mode="max")
print(f"Best hyperparameters: {best_config}")

In [None]:
break

In [None]:
def XGB_GridSearch(config):

    X_train = ray.get(X_train_ref)
    y_train = ray.get(y_train_ref)
    
    # Create and fit the model
    model = xgb.XGBClassifier(
        max_depth=config["max_depth"],
        learning_rate=config["learning_rate"],
        n_estimators=config["n_estimators"]
    )
    
    model.fit(X_train, y_train)
    
    # Validation Predictions and F1 Score
    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average="macro")

    # Report Results to Ray
    session.report({"f1_score": f1})

In [None]:
X_train_ref = ray.put(X_train)
y_train_ref = ray.put(y_train)

# Define the hyperparameter search space
search_space = {
    "max_depth": tune.grid_search([3, 5, 7]),
    "learning_rate": tune.grid_search([0.01, 0.1, 0.2]),
    "n_estimators": tune.grid_search([50, 100, 200])
}

# Execute the grid search
analysis = tune.run(
    XGB_GridSearch,
    config=search_space,
    resources_per_trial={"cpu": 1},  # Allocate 1 CPU per trial
    scheduler=ASHAScheduler(metric="f1_score", mode="max"),  # Manage trials efficiently
    verbose=1
)

In [None]:
# Retrieve the best configuration
best_config = analysis.get_best_config(metric="f1_score", mode="max")
print(f"Best hyperparameters: {best_config}")

In [None]:
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])
X_test.drop(features_to_drop, axis=1, inplace=True)

In [None]:
# Instantiate the model with the best configuration
model = xgb.XGBClassifier(**best_config)

# Train the model on the training dataset
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
#import joblib

#joblib.dump(model, "best_xgb_model.pkl")

# model = joblib.load("best_xgb_model.pkl")