# <center>Machine Learning Project</center>

** **
## <center>*03.7 - Neural Networks*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
warnings.filterwarnings('ignore')

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Import dataset
train_df = pd.read_csv('preprocessed_data/train_data.csv', index_col="Claim Identifier")

In [3]:
# Import dataset
test_df = pd.read_csv('./preprocessed_data/test_data.csv', index_col = 'Claim Identifier')

In [4]:
# Define Feature Selection: essential_features, reduced_features or []
feature_selection = []

## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define y as a target "Claim Injury Type Encoded" and X with all the other columns

In [5]:
X = train_df.drop(["Claim Injury Type Encoded"], axis = 1)
y = train_df["Claim Injury Type Encoded"]

## <span style="color:salmon"> 3. CatBoosted </span> 

In [6]:
config = {'iterations': 500, 'learning_rate': 0.15, 'depth': 6, 'l2_leaf_reg': 9.5}

In [7]:
model = CatBoostClassifier(
        iterations=config["iterations"],
        learning_rate=config["learning_rate"],
        depth=config["depth"],
        l2_leaf_reg=config["l2_leaf_reg"],
        loss_function="MultiClass", 
        eval_metric="MultiClass",  
        custom_metric=['F1'], 
        verbose=0
    )

In [8]:
# For essential_features

In [9]:
check_performace(model,X,y,numerical_features,essential_features,n_folds = 5)

Fold 1 train F1 score: 0.3197
Fold 1 validation F1 score: 0.3138
------------------------------
Fold 2 train F1 score: 0.3209
Fold 2 validation F1 score: 0.3107
------------------------------
Fold 3 train F1 score: 0.3208
Fold 3 validation F1 score: 0.3133
------------------------------
Fold 4 train F1 score: 0.3202
Fold 4 validation F1 score: 0.3157
------------------------------
Fold 5 train F1 score: 0.3193
Fold 5 validation F1 score: 0.3161
------------------------------
Average Train F1 score: 0.32019046998467127
Average Validation F1 score: 0.3139111523894428


In [10]:
# For reduced_features

In [11]:
check_performace(model,X,y,numerical_features,reduced_features,n_folds = 5)

Fold 1 train F1 score: 0.3982
Fold 1 validation F1 score: 0.3718
------------------------------
Fold 2 train F1 score: 0.4047
Fold 2 validation F1 score: 0.3636
------------------------------
Fold 3 train F1 score: 0.3998
Fold 3 validation F1 score: 0.3644
------------------------------
Fold 4 train F1 score: 0.3982
Fold 4 validation F1 score: 0.3627
------------------------------
Fold 5 train F1 score: 0.3933
Fold 5 validation F1 score: 0.3642
------------------------------
Average Train F1 score: 0.3988685646010806
Average Validation F1 score: 0.36532291183553756


In [12]:
# For no feature selection

In [13]:
check_performace(model,X,y,numerical_features,[],n_folds = 5)

Fold 1 train F1 score: 0.4868
Fold 1 validation F1 score: 0.4436
------------------------------
Fold 2 train F1 score: 0.4889
Fold 2 validation F1 score: 0.4369
------------------------------
Fold 3 train F1 score: 0.4849
Fold 3 validation F1 score: 0.4356
------------------------------
Fold 4 train F1 score: 0.4838
Fold 4 validation F1 score: 0.4314
------------------------------
Fold 5 train F1 score: 0.4875
Fold 5 validation F1 score: 0.4342
------------------------------
Average Train F1 score: 0.4863739253669673
Average Validation F1 score: 0.43633805907702217


#### <span style="color:salmon"> 3.1  Evaluate the model </span> 


In [14]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.75, stratify = y, shuffle = True)

In [15]:
X_train, X_val = apply_frequency_encoding(X_train, X_val)
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [16]:
scaler = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features]  = scaler.transform(X_train[numerical_features])
X_val[numerical_features]  = scaler.transform(X_val[numerical_features])  

In [17]:
drop_list = [] #["Average Weekly Wage"]
if feature_selection != []:
    for col in X_train.columns:
        if col not in feature_selection:
            drop_list.append(col)
X_train = X_train.drop(drop_list, axis=1)
X_val = X_val.drop(drop_list, axis=1)

In [18]:
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1dd8ede2590>

In [19]:
y_train_pred = model.predict(X_train)

In [20]:
y_val_pred = model.predict(X_val)

In [21]:
class_mapping = {
    0:'1. CANCELLED', 
    1:'2. NON-COMP',
    2:'3. MED ONLY', 
    3:'4. TEMPORARY',
    4:'5. PPD SCH LOSS', 
    5:'6. PPD NSL', 
    6:'7. PTD', 
    7:'8. DEATH'
}

# Use the values from class_mapping as the target names
target_names = list(class_mapping.values())

Compute confusion matrix to evaluate the accuracy of a classification

In [22]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))

Confusion Matrix:
[[ 1558  1427    47    68     2     0     0     2]
 [  441 71422   307   479    23     0     0     2]
 [   13  9084  1832  5632   651     0     0     2]
 [   23  1892   469 32990  1710     2     0     6]
 [    0   163   173  4314  7412     0     0     0]
 [    0     2    13   861   152    23     0     0]
 [    0     0     0    19     0     0     5     0]
 [    1     5     0    36     2     0     0    70]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.77      0.50      0.61      3104
    2. NON-COMP       0.85      0.98      0.91     72674
    3. MED ONLY       0.64      0.11      0.18     17214
   4. TEMPORARY       0.74      0.89      0.81     37092
5. PPD SCH LOSS       0.74      0.61      0.67     12062
     6. PPD NSL       0.92      0.02      0.04      1051
         7. PTD       1.00      0.21      0.34        24
       8. DEATH       0.85      0.61      0.71       114

       accuracy                   

In [23]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))


Confusion Matrix:
[[  4459   4440    200    194     13      0      0      4]
 [  1563 213436   1261   1654     88      0      0     19]
 [    53  27314   4146  18049   2071      3      0      8]
 [    75   5813   1833  97001   6498     16      0     42]
 [     0    529    549  14375  20727      6      0      0]
 [     0      3     38   2648    452     11      0      0]
 [     0      0      0     61     12      0      0      0]
 [     7     50     10    185     10      0      0     81]]

Classification Report:
                 precision    recall  f1-score   support

   1. CANCELLED       0.72      0.48      0.58      9310
    2. NON-COMP       0.85      0.98      0.91    218021
    3. MED ONLY       0.52      0.08      0.14     51644
   4. TEMPORARY       0.72      0.87      0.79    111278
5. PPD SCH LOSS       0.69      0.57      0.63     36186
     6. PPD NSL       0.31      0.00      0.01      3152
         7. PTD       0.00      0.00      0.00        73
       8. DEATH       0.53  

## <span style="color:salmon"> 4. Test Predictions </span> 

In [24]:
X, test_df = apply_frequency_encoding(X, test_df)
NA_imputer(X, test_df)
create_new_features(X, test_df)

In [25]:
scaler = StandardScaler().fit(X[numerical_features])
X[numerical_features]  = scaler.transform(X[numerical_features])
test_df[numerical_features]  = scaler.transform(test_df[numerical_features])  

In [26]:
drop_list = [] #["Average Weekly Wage"]
if feature_selection != []:
    for col in X.columns:
        if col not in feature_selection:
            drop_list.append(col)
test_df = test_df.drop(drop_list, axis=1)

In [27]:
# Make validation predictions
y_test_pred = model.predict(test_df)
y_test_pred = y_test_pred.ravel()

In [28]:
y_test_final = np.array([class_mapping[i] for i in y_test_pred])

In [29]:
test_id = test_df.index

In [30]:
submission_df = pd.DataFrame({
    'Claim Identifier': test_id,
    'Claim Injury Type': y_test_final
})

In [31]:
if False:
    version = version_control()
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)