# <center>Machine Learning Project</center>

** **
## <center>*03.10 - Gradient Boost*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
warnings.filterwarnings('ignore')

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Load the preprocessed training data, using "Claim Identifier" as the index.
train_df = pd.read_csv('preprocessed_data/train_data.csv', index_col="Claim Identifier")

In [3]:
# Load the preprocessed test data, using "Claim Identifier" as the index
# (the index is needed later to build the submission file).
test_df = pd.read_csv('./preprocessed_data/test_data.csv', index_col = 'Claim Identifier')

In [4]:
# Feature subset used by the hold-out evaluation and the test-prediction cells.
# Set it to essential_features, reduced_features, or [] to keep every column
# except the ones that are always dropped.
feature_selection = []

In [5]:
# Percentage of missing values per column; only columns that actually have
# missing values are reported.
missing_percentage = train_df.isna().sum() / len(train_df) * 100
for col, percent in missing_percentage[missing_percentage != 0].items():
    print(f"{col}: {percent:.2f}% missing values")

Age at Injury: 0.40% missing values
Average Weekly Wage: 63.41% missing values
Birth Year: 0.40% missing values
Industry Code: 1.73% missing values
WCIO Cause of Injury Code: 2.72% missing values
WCIO Nature of Injury Code: 2.72% missing values
WCIO Part Of Body Code: 2.98% missing values
Zip Code: 4.99% missing values


## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define `y` as the target column, "Claim Injury Type Encoded", and `X` as all the remaining columns.

In [5]:
# Target vector first, then the feature matrix with the target column removed.
y = train_df["Claim Injury Type Encoded"]
X = train_df.drop(columns=["Claim Injury Type Encoded"])

## <span style="color:salmon"> 3. Gradient Boosting</span> 

In [6]:
# Example hyperparameter configuration (currently unused; the model below runs with defaults):
#config = {'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200}

In [7]:
# Baseline gradient-boosting classifier with scikit-learn's default
# hyperparameters. The seed is fixed so repeated runs build the same model.
# To tune it, pass e.g. max_depth=5, learning_rate=0.1, n_estimators=200.
model = GradientBoostingClassifier(random_state=42)

In [8]:
# Evaluate the model restricted to the `essential_features` subset, with n_folds=5.
# NOTE(review): check_performace comes from utils_feature_selection; presumably it
# runs k-fold cross-validation and prints the scores — confirm there.
check_performace(model,X,y,numerical_features,essential_features,n_folds = 5)

------------------------------------------------


In [9]:
# Same evaluation, this time restricted to the `reduced_features` subset.
# NOTE(review): presumably k-fold cross-validation — confirm in utils_feature_selection.
check_performace(model,X,y,numerical_features,reduced_features,n_folds = 5)

------------------------------------------------


In [10]:
# Same evaluation with an empty feature list.
# NOTE(review): an empty list presumably means "no feature selection, use all
# columns", as it does in the cells further down — confirm in utils_feature_selection.
check_performace(model,X,y,numerical_features,[],n_folds = 5)

------------------------------------------------


#### <span style="color:salmon"> 3.1  Evaluate the model </span> 


In [11]:
# Hold out 25% of the training data for validation. The split is stratified on
# the target and seeded, so the validation metrics below are reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, shuffle=True, random_state=42
)

In [12]:
# Preprocess the training and validation splits together, always passing the
# training split first.
# NOTE(review): presumably each helper learns its statistics from the first
# argument only and applies them to both frames — confirm in utils.
X_train, X_val = apply_frequency_encoding(X_train, X_val)
# The return values of the next two helpers are discarded, so they are assumed
# to modify both frames in place — TODO confirm.
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [13]:
# Fit the scaler on the training split only, then apply the same scaling to
# the numerical columns of both splits.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
for split in (X_train, X_val):
    split[numerical_features] = scaler.transform(split[numerical_features])

In [14]:
# "Average Weekly Wage" is always dropped. When a feature subset is selected,
# every other column outside that subset is dropped as well.
# The columns are taken from X_train (the frame actually being dropped from),
# not from the raw X, so columns added during preprocessing are filtered too
# and the result matches how the test set is filtered later on.
drop_list = ["Average Weekly Wage"]
if len(feature_selection) > 0:
    drop_list += [
        col for col in X_train.columns
        if col not in feature_selection and col not in drop_list
    ]
X_train = X_train.drop(drop_list, axis=1)
X_val = X_val.drop(drop_list, axis=1)

In [15]:
# Fit the model on the preprocessed training split. Every later cell that
# calls model.predict depends on this cell having run to completion.
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Predictions on the training split, reported below next to the validation results.
y_train_pred = model.predict(X_train)

In [None]:
# Predictions on the held-out validation split.
y_val_pred = model.predict(X_val)

In [None]:
# Claim injury type labels, listed in the order of their encoded value (0-7).
# They are used as the display names in the classification reports.
target_names = [
    '1. CANCELLED',
    '2. NON-COMP',
    '3. MED ONLY',
    '4. TEMPORARY',
    '5. PPD SCH LOSS',
    '6. PPD NSL',
    '7. PTD',
    '8. DEATH',
]

# Encoded class value -> original label, used to decode the model's predictions.
class_mapping = dict(enumerate(target_names))

Compute the confusion matrix and the classification report to evaluate the model, first on the training split and then on the validation split.

In [None]:
# Training-split evaluation: raw confusion counts, then per-class metrics.
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_train, y_pred=y_train_pred))
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_train,
        y_pred=y_train_pred,
        target_names=target_names,
    )
)

In [None]:
# Validation-split evaluation: raw confusion counts, then per-class metrics.
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_val, y_pred=y_val_pred))
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_val,
        y_pred=y_val_pred,
        target_names=target_names,
    )
)

## <span style="color:salmon"> 4. Test Predictions </span> 

Make test predictions:

In [None]:
# Repeat the preprocessing with the full training features as the first frame
# and the test set as the second frame.
# NOTE(review): X and test_df are reassigned/modified here, so running this
# cell twice would preprocess already-processed data — re-run from the top instead.
X, test_df = apply_frequency_encoding(X, test_df)
# Return values are discarded, so both helpers are assumed to work in place — TODO confirm.
NA_imputer(X, test_df)
create_new_features(X, test_df)

In [None]:
# Refit the scaler on the full training features, then scale the numerical
# columns of both the training features and the test set with it.
scaler = StandardScaler()
scaler.fit(X[numerical_features])
for frame in (X, test_df):
    frame[numerical_features] = scaler.transform(frame[numerical_features])

In [None]:
# Remove from the test set "Average Weekly Wage" and, when a feature subset is
# selected, every column of X that is not part of that subset.
drop_list = ["Average Weekly Wage"]
if feature_selection != []:
    drop_list.extend(col for col in X.columns if col not in feature_selection)
test_df = test_df.drop(columns=drop_list)

In [None]:
# Predict the encoded class of every test claim and flatten the result to 1-D.
y_test_pred = model.predict(test_df).ravel()

In [None]:
# Decode each predicted class value back into its original label.
y_test_final = np.array(
    [class_mapping[encoded_class] for encoded_class in y_test_pred]
)

In [None]:
# The claim identifiers are the index of the test set ("Claim Identifier" was
# used as index_col when it was loaded). They must come from the data frame:
# the fitted model has no `index` attribute.
test_id = test_df.index

In [None]:
# Two-column submission table: claim identifier and predicted injury type label.
submission_columns = {
    'Claim Identifier': test_id,
    'Claim Injury Type': y_test_final,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Writing the submission file is switched off by default; set this flag to
# True to export the predictions.
SAVE_SUBMISSION = False

if SAVE_SUBMISSION:
    # NOTE(review): version_control() is assumed to return the next integer
    # version number for the file name — confirm in utils.
    version = version_control()
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)