# <center>Machine Learning Project</center>

** **
## <center>*03.6 - Voting Emsemble*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
warnings.filterwarnings('ignore')

random_state=68+1

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Import dataset
train_df = pd.read_csv('preprocessed_data/train_data.csv', index_col="Claim Identifier")

In [3]:
# Import dataset
test_df = pd.read_csv('./preprocessed_data/test_data.csv', index_col = 'Claim Identifier')

In [4]:
# Define Feature Selection: essential_features, reduced_features or []
feature_selection = []

## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define y as a target "Claim Injury Type Encoded" and X with all the other columns

In [5]:
X = train_df.drop(["Claim Injury Type Encoded"], axis = 1)
y = train_df["Claim Injury Type Encoded"]

## <span style="color:salmon"> 3. Vote Emsemble </span> 

In [6]:
cb_config ={
        "iterations": 300,
        "learning_rate": 0.7,
        "depth": 6,
        "l2_leaf_reg": 6,
        "bagging_temperature": 0.7,
        "random_state": random_state,
        "custom_metric":"F1",
        "early_stopping_rounds":50,
        "verbose":0
    }
xgb_config = {
        "n_estimators": 200,
        "learning_rate": 0.1,
        "max_depth": 7,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "gamma": 0.3,
        "random_state": random_state,
        "objective":"multi:softmax",                  
        "num_class":8,                                
        "eval_metric":"merror",                                       
        "verbosity":0   
    }
dt_config = {
        "min_samples_split": 10,
        "min_samples_leaf": 4,
        "max_depth": 20,
        "criterion": "entropy",
        "random_state": random_state,
    }

In [7]:
base_models = [
    ("NaiveBayes", GaussianNB()),
    ("CatBoostClassifier", CatBoostClassifier(**cb_config)),
    ("XGBClassifier", XGBClassifier(**xgb_config)),
    ("DecisionTreeClassifier", DecisionTreeClassifier(**dt_config)),
]

In [30]:
model = VotingClassifier(
    estimators=base_models,
    voting='soft',
    weights=[1.0, 2.0, 2.0, 1.0]  
)

In [9]:
check_performace(model,X,y,numerical_features,essential_features,n_folds = 3, random_state=random_state)

Fold 1 train F1 score: 0.3842
Fold 1 validation F1 score: 0.3260
------------------------------
Fold 2 train F1 score: 0.4143
Fold 2 validation F1 score: 0.3376
------------------------------
Fold 3 train F1 score: 0.4148
Fold 3 validation F1 score: 0.3382
------------------------------
Average Train F1 score: 0.40443994525114496
Average Validation F1 score: 0.3339274205792875


In [10]:
check_performace(model,X,y,numerical_features,[],n_folds = 3, random_state=random_state)

Fold 1 train F1 score: 0.5971
Fold 1 validation F1 score: 0.4062
------------------------------
Fold 2 train F1 score: 0.6383
Fold 2 validation F1 score: 0.4176
------------------------------
Fold 3 train F1 score: 0.5853
Fold 3 validation F1 score: 0.4065
------------------------------
Average Train F1 score: 0.6068928065758405
Average Validation F1 score: 0.4100819001773477


#### <span style="color:salmon"> 3.1  Evaluate the model </span> 


In [11]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, stratify = y, shuffle = True, random_state=random_state)
X_train_to_preprocess=X_train.copy()

In [12]:
remove_outliers(X_train)
X_train, X_val = apply_frequency_encoding(X_train, X_val)
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [13]:
scaler = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features]  = scaler.transform(X_train[numerical_features])
X_val[numerical_features]  = scaler.transform(X_val[numerical_features]) 

In [14]:
drop_list = ["Average Weekly Wage"]
if feature_selection != []:
    for col in X.columns:
        if col not in feature_selection:
            drop_list.append(col)
X_train = X_train.drop(drop_list, axis=1)
X_val = X_val.drop(drop_list, axis=1)

In [31]:
model.fit(X_train, y_train)

In [32]:
y_train_pred = model.predict(X_train)

In [33]:
y_val_pred = model.predict(X_val)

In [34]:
class_mapping = {
    0:'1. CANCELLED', 
    1:'2. NON-COMP',
    2:'3. MED ONLY', 
    3:'4. TEMPORARY',
    4:'5. PPD SCH LOSS', 
    5:'6. PPD NSL', 
    6:'7. PTD', 
    7:'8. DEATH'
}

# Use the values from class_mapping as the target names
target_names = list(class_mapping.values())

Compute confusion matrix to evaluate the accuracy of a classification

In [35]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))

Confusion Matrix:
[[  5997   3337    159    445     42      0      0      0]
 [  1582 224303   1924   4319    729      2      0      3]
 [    55  29202  10690  12568   2603      1      0      6]
 [    71  24350   2359  86341   5671      5      0      8]
 [     5   1038    474   9955  27146      6      0      0]
 [     0      8     31   2492    439    399      0      0]
 [     0      0      0      0      1      0     77      0]
 [     0      6      1     18      1      0      0    350]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.78      0.60      0.68      9980
    2. NON-COMP       0.79      0.96      0.87    232862
    3. MED ONLY       0.68      0.19      0.30     55125
   4. TEMPORARY       0.74      0.73      0.73    118805
5. PPD SCH LOSS       0.74      0.70      0.72     38624
     6. PPD NSL       0.97      0.12      0.21      3369
         7. PTD       1.00      0.99      0.99        78
       8. DEATH       0.95  

In [36]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))

Confusion Matrix:
[[ 1324   972    59   123    16     0     0     1]
 [  492 55266   784  1433   236     0     0     5]
 [   24  7707  1460  3834   754     2     0     0]
 [   23  6602  1130 19866  2057    11     0    13]
 [    2   315   183  3234  5921     1     0     0]
 [    0     3     8   727    97     7     0     0]
 [    0     0     1    16     2     0     0     0]
 [    3    10     5    45     2     0     0    29]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.71      0.53      0.61      2495
    2. NON-COMP       0.78      0.95      0.86     58216
    3. MED ONLY       0.40      0.11      0.17     13781
   4. TEMPORARY       0.68      0.67      0.67     29702
5. PPD SCH LOSS       0.65      0.61      0.63      9656
     6. PPD NSL       0.33      0.01      0.02       842
         7. PTD       0.00      0.00      0.00        19
       8. DEATH       0.60      0.31      0.41        94

       accuracy                   

## <span style="color:salmon"> 4. Test Predictions </span> 

Make validation predictions:

In [37]:
remove_outliers(X_train_to_preprocess)
X_train_to_preprocess, test_df = apply_frequency_encoding(X_train_to_preprocess, test_df)
NA_imputer(X_train_to_preprocess, test_df)
create_new_features(X_train_to_preprocess, test_df)

KeyError: 'County of Injury'

In [39]:
test_df[numerical_features]  = scaler.transform(test_df[numerical_features])  

KeyError: "['Average Weekly Wage'] not in index"

In [40]:
test_df = test_df.drop(drop_list, axis=1)

KeyError: "['Average Weekly Wage'] not found in axis"

In [44]:
# Make validation predictions
y_test_pred = model.predict(test_df)
y_test_pred = y_test_pred.ravel()

In [45]:
y_test_final = np.array([class_mapping[i] for i in y_test_pred])

In [46]:
test_id = test_df.index

In [47]:
submission_df = pd.DataFrame({
    'Claim Identifier': test_id,
    'Claim Injury Type': y_test_final
})

In [48]:
if True:
    version = version_control()
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)