# <center>Machine Learning Project</center>

** **
## <center>*03.5 - Stack Ensemble*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
# Silence every warning for the rest of the notebook
warnings.filterwarnings(action='ignore')

# Single seed reused by the models, the train/validation split and the fold checks
random_state = 69

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Load the training data, using the claim identifier column as the index
train_df = pd.read_csv(
    'preprocessed_data/train_data.csv',
    index_col="Claim Identifier",
)

In [3]:
# Load the test data, using the claim identifier column as the index
test_df = pd.read_csv(
    './preprocessed_data/test_data.csv',
    index_col='Claim Identifier',
)

In [4]:
# Feature subset kept for the final fit: essential_features, reduced_features,
# or an empty list (an empty list skips the extra column filtering further down)
feature_selection = list()

## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define `y` as the target, "Claim Injury Type Encoded", and `X` as all the other columns.

In [5]:
# Pull out the target first, then keep every remaining column as the feature matrix
y = train_df["Claim Injury Type Encoded"]
X = train_df.drop(columns=[y.name])

## <span style="color:salmon"> 3. Stack Ensemble </span> 

In [6]:
# Hyperparameters for the CatBoost base learner
cb_config = dict(
    iterations=1000,
    learning_rate=0.11,
    depth=6,
    l2_leaf_reg=5,
    bagging_temperature=0.4,
    random_state=random_state,
    custom_metric="F1",
    early_stopping_rounds=50,
    verbose=0,
)

# Hyperparameters for the XGBoost base learner (8 target classes)
xgb_config = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.3,
    random_state=random_state,
    objective="multi:softmax",
    num_class=8,
    eval_metric="merror",
    verbosity=0,
)

# Hyperparameters for the decision-tree base learner
dt_config = dict(
    min_samples_split=10,
    min_samples_leaf=4,
    max_depth=20,
    criterion="entropy",
    random_state=random_state,
)

In [7]:
# Level-0 learners as (name, estimator) pairs, in the order they are stacked
base_models = list({
    "NaiveBayes": GaussianNB(),
    "CatBoostClassifier": CatBoostClassifier(**cb_config),
    "XGBClassifier": XGBClassifier(**xgb_config),
    "DecisionTreeClassifier": DecisionTreeClassifier(**dt_config),
}.items())

# Level-1 learner that combines the base learners' outputs
meta_model = LogisticRegression(random_state=random_state)

In [8]:
# Stacked ensemble: base learners feed the logistic-regression meta learner
model = StackingClassifier(
    final_estimator=meta_model,
    estimators=base_models,
)

In [9]:
# 5-fold check restricted to `essential_features`; prints train/validation F1 per fold
check_performace(
    model, X, y,
    numerical_features, essential_features,
    n_folds=5, random_state=random_state,
)

Fold 1 train F1 score: 0.3111
Fold 1 validation F1 score: 0.3018
------------------------------
Fold 2 train F1 score: 0.3105
Fold 2 validation F1 score: 0.3066
------------------------------
Fold 3 train F1 score: 0.3116
Fold 3 validation F1 score: 0.3074
------------------------------
Fold 4 train F1 score: 0.3086
Fold 4 validation F1 score: 0.3027
------------------------------
Fold 5 train F1 score: 0.3104
Fold 5 validation F1 score: 0.3070
------------------------------
Average Train F1 score: 0.3104437223928164
Average Validation F1 score: 0.3051176209555525


In [10]:
# 5-fold check restricted to `reduced_features`; prints train/validation F1 per fold
check_performace(
    model, X, y,
    numerical_features, reduced_features,
    n_folds=5, random_state=random_state,
)

Fold 1 train F1 score: 0.2534
Fold 1 validation F1 score: 0.2412
------------------------------
Fold 2 train F1 score: 0.2760
Fold 2 validation F1 score: 0.2676
------------------------------
Fold 3 train F1 score: 0.2737
Fold 3 validation F1 score: 0.2566
------------------------------
Fold 4 train F1 score: 0.2908
Fold 4 validation F1 score: 0.2688
------------------------------
Fold 5 train F1 score: 0.2645
Fold 5 validation F1 score: 0.2479
------------------------------
Average Train F1 score: 0.27170237433324224
Average Validation F1 score: 0.25640741792055616


In [None]:
# 5-fold check with an empty feature list (presumably "no feature selection" — confirm in utils_feature_selection)
check_performace(
    model, X, y,
    numerical_features, [],
    n_folds=5, random_state=random_state,
)

Fold 1 train F1 score: 0.3157
Fold 1 validation F1 score: 0.2653
------------------------------
Fold 2 train F1 score: 0.3352
Fold 2 validation F1 score: 0.2873
------------------------------
Fold 3 train F1 score: 0.3431
Fold 3 validation F1 score: 0.2771
------------------------------


#### <span style="color:salmon"> 3.1  Evaluate the model </span> 


In [None]:
# Hold out 20% of the rows for validation, shuffled and stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=random_state,
)

# Untouched copy of the training split; the test-set preprocessing starts from it later
X_train_to_preprocess = X_train.copy()

In [None]:
# Preprocess the train/validation splits with the project helpers from `utils`.
# The order of these calls is kept exactly as written.
# NOTE(review): the return value is discarded, so this presumably edits X_train in place —
# confirm; if it removes rows, y_train would need the same filtering to stay aligned.
remove_outliers(X_train)
# Returns new frames, which are rebound here (presumably the encoding is learned from X_train — confirm)
X_train, X_val = apply_frequency_encoding(X_train, X_val)
# Return value discarded; presumably fills missing values in both frames in place — confirm
NA_imputer(X_train,X_val)
# Return value discarded; presumably adds engineered columns to both frames in place — confirm
create_new_features(X_train,X_val)

In [None]:
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
for split in (X_train, X_val):
    split[numerical_features] = scaler.transform(split[numerical_features])

In [None]:
# Columns removed from every split before fitting.
# "Average Weekly Wage" is always dropped; when a feature subset is configured,
# every other column of X outside that subset is dropped as well.
drop_list = ["Average Weekly Wage"]
if feature_selection:  # an empty (or missing) selection keeps all remaining columns
    # NOTE(review): candidates come from X, i.e. the columns before preprocessing; columns
    # added later by create_new_features are therefore never dropped here — confirm intended.
    drop_list.extend(
        col
        for col in X.columns
        # skip the always-dropped column so it is not listed twice
        if col not in feature_selection and col != "Average Weekly Wage"
    )
X_train = X_train.drop(columns=drop_list)
X_val = X_val.drop(columns=drop_list)

In [None]:
# Train the stacked ensemble on the preprocessed training split
model.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the training split
y_train_pred = model.predict(X=X_train)

In [None]:
# Predictions on the held-out validation split
y_val_pred = model.predict(X=X_val)

In [None]:
# Original class labels, listed in the order of their encoded value (0-7)
target_names = [
    '1. CANCELLED',
    '2. NON-COMP',
    '3. MED ONLY',
    '4. TEMPORARY',
    '5. PPD SCH LOSS',
    '6. PPD NSL',
    '7. PTD',
    '8. DEATH',
]

# Encoded value -> original label, derived from the position in the list above
class_mapping = dict(enumerate(target_names))

Compute the confusion matrix and the classification report to evaluate the classifier on the training and validation sets.

In [None]:
# Training-split diagnostics: confusion matrix, then the per-class report
print("Confusion Matrix:", confusion_matrix(y_train, y_train_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_train, y_train_pred, target_names=target_names),
    sep="\n",
)

In [None]:
# Validation-split diagnostics: confusion matrix, then the per-class report
print("Confusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_val, y_val_pred, target_names=target_names),
    sep="\n",
)

## <span style="color:salmon"> 4. Test Predictions </span> 

Make test predictions:

In [None]:
# Apply the same helper sequence used for the train/validation splits, this time pairing
# the untouched copy of the training split with the test set.
# NOTE(review): test_df is rebound below, so re-running this cell would presumably
# preprocess an already-preprocessed frame — confirm before re-executing.
# Return value discarded; presumably edits the training copy in place — confirm
remove_outliers(X_train_to_preprocess)
# Returns new frames, which are rebound here
X_train_to_preprocess, test_df = apply_frequency_encoding(X_train_to_preprocess, test_df)
# Return value discarded; presumably fills missing values in both frames in place — confirm
NA_imputer(X_train_to_preprocess, test_df)
# Return value discarded; presumably adds engineered columns to both frames in place — confirm
create_new_features(X_train_to_preprocess, test_df)

In [None]:
# Reuse the scaler fitted on the training split; it is not refitted on the test data
test_df[numerical_features] = scaler.transform(X=test_df[numerical_features])

In [None]:
# Remove the same columns that were dropped from the train/validation splits
test_df = test_df.drop(columns=drop_list)

In [None]:
# Predict the encoded class for every test row and flatten the result to 1-D
y_test_pred = model.predict(test_df).ravel()

In [None]:
# Translate each encoded prediction back to its original class label
y_test_final = np.array(
    [class_mapping[encoded_label] for encoded_label in y_test_pred]
)

In [None]:
# Identifiers of the test rows (test_df was loaded with "Claim Identifier" as its index)
test_id = test_df.index

In [None]:
# Two-column submission frame: claim identifier next to its predicted label
submission_df = pd.DataFrame(
    dict(zip(
        ['Claim Identifier', 'Claim Injury Type'],
        [test_id, y_test_final],
    ))
)

In [None]:
# Manual switch: leave as True to write the submission file, set to False to skip the export
if True:
    # Project helper; presumably returns the next submission version number — confirm in utils
    version = version_control()
    # The version is zero-padded to two digits in the file name; the DataFrame index is not written
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)