# <center>Machine Learning Project</center>

** **
## <center>*04.2 - XGBoost*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
# Suppress every warning so the cell outputs stay uncluttered.
warnings.filterwarnings("ignore")

# Seed passed to the train/validation split, the cross-validation helper and the models.
random_state = 69

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Load the training data, indexed by claim identifier
train_df = pd.read_csv(
    'preprocessed_data/train_data.csv',
    index_col="Claim Identifier",
)

In [3]:
# Load the test data, indexed by claim identifier
test_df = pd.read_csv(
    './preprocessed_data/test_data.csv',
    index_col='Claim Identifier',
)

In [4]:
# Feature subset used when training the final model: `essential_features`,
# `reduced_features`, or an empty list for no feature selection.
feature_selection = list()

Define `y` as the target column, "Claim Injury Type Encoded", and `X` as all the remaining columns.

In [5]:
# Features: every column except the encoded target
X = train_df.drop(columns=["Claim Injury Type Encoded"])
# Target: the encoded claim injury type
y = train_df["Claim Injury Type Encoded"]

## <span style="color:salmon"> 2. XGBoost </span> 

#### <span style="color:salmon"> 2.1  Model K-fold cross validation </span> 

In [6]:
# Hyperparameters evaluated with K-fold cross-validation in the cells below.
# The same values are used both without feature selection and with only the essential features.
config = dict(
    n_estimators=200,
    learning_rate=0.2,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.3,
    random_state=random_state,
)

In [7]:
# Multi-class XGBoost classifier used for cross-validation.
# Every key in `config` is an XGBClassifier keyword argument, so the dict is unpacked directly.
model = XGBClassifier(
    **config,
    # Fixed settings, kept outside the `config` dict
    objective="multi:softmax",
    num_class=8,
    eval_metric="merror",
    verbosity=0,
)

In [8]:
# 3-fold evaluation using only the essential features
check_performace(
    model,
    X,
    y,
    numerical_features,
    essential_features,
    n_folds=3,
    random_state=random_state,
)

Fold 1 train F1 score: 0.3782
Fold 1 validation F1 score: 0.3157
------------------------------
Fold 2 train F1 score: 0.3730
Fold 2 validation F1 score: 0.3190
------------------------------
Fold 3 train F1 score: 0.3635
Fold 3 validation F1 score: 0.3191
------------------------------
Average Train F1 score: 0.37157415318240733
Average Validation F1 score: 0.3179499968124273


In [9]:
# Using the reduced feature set performed worse than both no feature selection and the essential features, so this run is left disabled.
#check_performace(model,X,y,numerical_features,reduced_features,n_folds = 3, random_state=random_state)

In [10]:
# 3-fold evaluation with no feature selection (empty selection list)
check_performace(
    model,
    X,
    y,
    numerical_features,
    [],
    n_folds=3,
    random_state=random_state,
)

Fold 1 train F1 score: 0.6511
Fold 1 validation F1 score: 0.4079
------------------------------
Fold 2 train F1 score: 0.6484
Fold 2 validation F1 score: 0.4103
------------------------------
Fold 3 train F1 score: 0.6523
Fold 3 validation F1 score: 0.4087
------------------------------
Average Train F1 score: 0.6505748717881059
Average Validation F1 score: 0.4089805548043732


#### <span style="color:salmon"> 2.2  Train the model </span> 

In [11]:
# Hyperparameters of the final model.
# NOTE(review): learning_rate is 0.1 here but 0.2 in the cross-validated `config`,
# so the CV scores reported above were not obtained with these exact settings.
# Confirm this difference is intentional.
selected_config = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.3,
    random_state=random_state,
)

In [12]:
# Final multi-class XGBoost classifier (replaces the cross-validation `model`).
# Every key in `selected_config` is an XGBClassifier keyword argument, so the dict is unpacked directly.
model = XGBClassifier(
    **selected_config,
    # Fixed settings, kept outside the `selected_config` dict
    objective="multi:softmax",
    num_class=8,
    eval_metric="merror",
    verbosity=0,
)

In [13]:
# Stratified, shuffled 80/20 hold-out split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=random_state,
)

# Copy of the training split taken before any preprocessing; it is used later
# as the reference frame when the test set is preprocessed.
X_train_to_preprocess = X_train.copy()

In [14]:
# Preprocess the training and validation splits with the project helpers.
# The return values of remove_outliers, NA_imputer and create_new_features are
# discarded, so they presumably modify the frames in place — TODO confirm in utils.
# Only the training split is passed to remove_outliers.
remove_outliers(X_train)
# apply_frequency_encoding returns new frames, which replace both splits.
X_train, X_val = apply_frequency_encoding(X_train, X_val)
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [15]:
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
for split in (X_train, X_val):
    split[numerical_features] = scaler.transform(split[numerical_features])

In [16]:
# Columns removed before fitting. "Average Weekly Wage" is always dropped,
# whether or not it appears in `feature_selection`.
drop_list = ["Average Weekly Wage"]

if len(feature_selection) > 0:
    # Build the list from the preprocessed frame, not from the raw `X`:
    # columns created during preprocessing must also be filtered, and columns
    # that no longer exist must not be requested (drop would raise KeyError).
    selected = set(feature_selection)
    drop_list += [
        col for col in X_train.columns
        if col not in selected and col not in drop_list
    ]

# `drop_list` is reused later to remove the same columns from the test set.
X_train = X_train.drop(columns=drop_list)
X_val = X_val.drop(columns=drop_list)

In [17]:
# Fit the final classifier on the preprocessed training split
model.fit(X_train, y_train)

In [18]:
# Predictions on the training split, used below to compare against validation performance
y_train_pred = model.predict(X_train)

In [19]:
# Predictions on the held-out validation split
y_val_pred = model.predict(X_val)

#### <span style="color:salmon"> 2.3  Model Results </span> 

In [20]:
# Human-readable label of each class; the position in the list is the encoded class value
target_names = [
    '1. CANCELLED',
    '2. NON-COMP',
    '3. MED ONLY',
    '4. TEMPORARY',
    '5. PPD SCH LOSS',
    '6. PPD NSL',
    '7. PTD',
    '8. DEATH',
]

# Encoded class value -> label, used to decode the model's predictions
class_mapping = dict(enumerate(target_names))

Compute the confusion matrix and the classification report to evaluate the model's predictions.

In [21]:
# Evaluate the model on the training split
train_confusion = confusion_matrix(y_train, y_train_pred)
train_report = classification_report(y_train, y_train_pred, target_names=target_names)
print(
    "Confusion Matrix:",
    train_confusion,
    "\nClassification Report:",
    train_report,
    sep="\n",
)

Confusion Matrix:
[[  5574   3772    129    477     28      0      0      0]
 [  1366 224562    984   5505    441      0      0      4]
 [    37  31600   5696  15384   2405      1      0      2]
 [    60  26608   1791  84419   5905      9      0     13]
 [     1   1356    473  12319  24474      1      0      0]
 [     0      7     29   2670    408    255      0      0]
 [     0      0      0      3      2      0     73      0]
 [     0      7      3     21      1      0      0    344]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.79      0.56      0.66      9980
    2. NON-COMP       0.78      0.96      0.86    232862
    3. MED ONLY       0.63      0.10      0.18     55125
   4. TEMPORARY       0.70      0.71      0.70    118805
5. PPD SCH LOSS       0.73      0.63      0.68     38624
     6. PPD NSL       0.96      0.08      0.14      3369
         7. PTD       1.00      0.94      0.97        78
       8. DEATH       0.95  

In [22]:
# Evaluate the model on the validation split
val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred, target_names=target_names)
print(
    "Confusion Matrix:",
    val_confusion,
    "\nClassification Report:",
    val_report,
    sep="\n",
)


Confusion Matrix:
[[ 1260  1044    42   138    11     0     0     0]
 [  408 55835   306  1539   124     0     0     4]
 [   23  8026  1053  4030   647     2     0     0]
 [   26  6949   630 20373  1702     9     0    13]
 [    1   366   161  3411  5716     1     0     0]
 [    0     3     8   721   107     3     0     0]
 [    0     0     1    16     2     0     0     0]
 [    2     9     1    49     0     0     0    33]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.73      0.51      0.60      2495
    2. NON-COMP       0.77      0.96      0.86     58216
    3. MED ONLY       0.48      0.08      0.13     13781
   4. TEMPORARY       0.67      0.69      0.68     29702
5. PPD SCH LOSS       0.69      0.59      0.64      9656
     6. PPD NSL       0.20      0.00      0.01       842
         7. PTD       0.00      0.00      0.00        19
       8. DEATH       0.66      0.35      0.46        94

       accuracy                   

## <span style="color:salmon"> 3. Test Predictions </span> 

In [23]:
# Preprocess the test set with the same helper sequence used for the
# train/validation splits, this time pairing the unprocessed copy of the
# training split with test_df.
# Presumably the helpers derive their statistics from the first (training)
# argument and apply them to the second — TODO confirm in utils.
# NOTE(review): test_df is reassigned and modified here, so re-running this cell
# would process already-processed data; restart the kernel before re-running.
remove_outliers(X_train_to_preprocess)
X_train_to_preprocess, test_df = apply_frequency_encoding(X_train_to_preprocess, test_df)
NA_imputer(X_train_to_preprocess, test_df)
create_new_features(X_train_to_preprocess, test_df)

In [24]:
# Scale the test set's numerical features with the scaler fitted on the training split
test_numeric_scaled = scaler.transform(test_df[numerical_features])
test_df[numerical_features] = test_numeric_scaled

In [25]:
# Remove the same columns that were dropped from the training and validation splits
test_df = test_df.drop(columns=drop_list)

In [26]:
# Predict the encoded class of every test claim as a flat 1-D array
y_test_pred = model.predict(test_df).ravel()

In [27]:
# Decode each predicted class value into its label
y_test_final = np.array(
    [class_mapping[encoded_class] for encoded_class in y_test_pred]
)

In [28]:
# The test frame is indexed by "Claim Identifier"; keep the index for the submission table
test_id = test_df.index

In [29]:
# Two-column submission table: claim identifier and predicted injury type label
submission_columns = {
    'Claim Identifier': test_id,
    'Claim Injury Type': y_test_final,
}
submission_df = pd.DataFrame(submission_columns)

In [30]:
# Set to True to write a new versioned submission file. It is off by default,
# so running the whole notebook never writes a submission.
SAVE_SUBMISSION = False

if SAVE_SUBMISSION:
    # version_control is a project helper; its result is formatted as a
    # zero-padded, two-digit version number in the file name.
    version = version_control()
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)