# <center>Machine Learning Project</center>

** **
## <center>*03.7 - CatBoost*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [1]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
import time

from utils import *
from utils_feature_selection import check_performace
from utils_dicts import *

import warnings
# NOTE(review): this silences every warning category for the rest of the notebook,
# so deprecation or data-conversion warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

## <span style="color:salmon"> 1. Import Dataset </span> 

In [2]:
# Load the preprocessed training data, using the claim identifier column as the index
train_df = pd.read_csv(
    'preprocessed_data/train_data.csv',
    index_col="Claim Identifier",
)

In [3]:
# Load the preprocessed test data, using the claim identifier column as the index
test_df = pd.read_csv(
    './preprocessed_data/test_data.csv',
    index_col='Claim Identifier',
)

In [4]:
# Choose the feature subset kept by the evaluation and test sections below:
# essential_features, reduced_features, or [] (used further down as "no feature selection").
# NOTE(review): essential_features is not defined in this notebook — presumably it
# comes from one of the star imports at the top; confirm.
feature_selection = essential_features

## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define `y` as the target column "Claim Injury Type Encoded" and `X` as all the remaining columns.

In [7]:
# Separate the encoded target (y) from the remaining columns, which form the features (X)
y = train_df["Claim Injury Type Encoded"]
X = train_df.drop(columns=["Claim Injury Type Encoded"])

## <span style="color:salmon"> 3. CatBoost </span> 

In [8]:
# Hyperparameters passed to the CatBoost classifier built in the next cell
config = dict(
    iterations=500,
    learning_rate=0.15,
    depth=6,
    l2_leaf_reg=9.5,
)

In [9]:
# Collect every constructor argument in one place: the tunable hyperparameters come
# from `config`, the remaining settings are fixed for this multiclass problem.
catboost_params = {
    "iterations": config["iterations"],
    "learning_rate": config["learning_rate"],
    "depth": config["depth"],
    "l2_leaf_reg": config["l2_leaf_reg"],
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    "custom_metric": ['F1'],
    "verbose": 0,
}
model = CatBoostClassifier(**catboost_params)

In [10]:
# For essential_features

In [11]:
# Run the 5-fold performance check keeping only the essential feature subset
check_performace(
    model, X, y, numerical_features, essential_features, n_folds=5
)

Fold 1 train F1 score: 0.2614
Fold 1 validation F1 score: 0.2590
Fold 2 train F1 score: 0.2550
Fold 2 validation F1 score: 0.2547
Fold 3 train F1 score: 0.2607
Fold 3 validation F1 score: 0.2604


KeyboardInterrupt: 

In [None]:
# For reduced_features

In [None]:
# Run the 5-fold performance check keeping only the reduced feature subset
check_performace(
    model, X, y, numerical_features, reduced_features, n_folds=5
)

In [None]:
# For no feature selection

In [None]:
# Run the 5-fold performance check with an empty selection (no feature selection)
check_performace(
    model, X, y, numerical_features, [], n_folds=5
)

#### <span style="color:salmon"> 3.1  Evaluate the model </span> 


In [None]:
# Hold out a stratified validation set.
# test_size is the share given to the *validation* split: 0.25 keeps 75% of the rows
# for training (the previous 0.75 fitted the model on only a quarter of the data).
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [None]:
# Preprocess the training and validation splits with the project helpers.
# apply_frequency_encoding returns the processed pair, so the result is reassigned.
X_train, X_val = apply_frequency_encoding(X_train, X_val)
# The return values of the next two helpers are not used here — presumably they
# modify both frames in place; verify against their definitions in utils.
NA_imputer(X_train,X_val)
create_new_features(X_train,X_val)

In [None]:
# Standardise the numerical columns. The scaler is fitted on the training split only
# and then applied to both splits, so no validation statistics leak into training.
# The same column list is used for fitting, transforming and writing back: the
# previous version wrote the scaled values into `features_to_scale`, a different
# name that is not defined in this notebook and could put values in the wrong columns.
scaler = StandardScaler().fit(X_train[numerical_features])
X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

In [None]:
# Keep only the columns listed in feature_selection.
# An empty selection ([]) means "no feature selection", so nothing is dropped then;
# previously every column was dropped, because no column is ever "in" an empty list.
drop_list = []
if len(feature_selection) > 0:
    drop_list = [col for col in X_train.columns if col not in feature_selection]

# Drop the same columns from both splits so they keep identical feature sets
X_train = X_train.drop(columns=drop_list)
X_val = X_val.drop(columns=drop_list)

In [None]:
# Fit the CatBoost classifier on the training split only; X_val / y_val stay unseen
model.fit(X_train, y_train)

In [None]:
# Predict on the training split with the fitted classifier.
# The classifier in this notebook is named `model`; `model_cb` was never defined
# and raised a NameError.
y_train_pred = model.predict(X_train)

In [None]:
# Predict on the validation split with the fitted classifier.
# The classifier in this notebook is named `model`; `model_cb` was never defined
# and raised a NameError.
y_val_pred = model.predict(X_val)

In [None]:
# Original class labels, listed in the order of their encoded value (0..7)
target_names = [
    '1. CANCELLED',
    '2. NON-COMP',
    '3. MED ONLY',
    '4. TEMPORARY',
    '5. PPD SCH LOSS',
    '6. PPD NSL',
    '7. PTD',
    '8. DEATH',
]

# Encoded value -> original label, used to decode the model's predictions
class_mapping = dict(enumerate(target_names))

Compute the confusion matrix and the classification report to evaluate the classifier, first on the training set and then on the validation set.

In [None]:
# Training-set evaluation: confusion matrix first, then the per-class report
print("Confusion Matrix:")
train_confusion = confusion_matrix(y_train, y_train_pred)
print(train_confusion)

print("\nClassification Report:")
train_report = classification_report(
    y_train,
    y_train_pred,
    target_names=target_names,
)
print(train_report)

In [None]:
# Validation-set evaluation: confusion matrix first, then the per-class report
print("Confusion Matrix:")
val_confusion = confusion_matrix(y_val, y_val_pred)
print(val_confusion)

print("\nClassification Report:")
val_report = classification_report(
    y_val,
    y_val_pred,
    target_names=target_names,
)
print(val_report)


## <span style="color:salmon"> 4. Test Predictions </span> 

In [None]:
# Build the test feature matrix from the loaded test data: X_test was never defined
# before this cell, so the original code failed with a NameError.
# The target column is removed defensively in case the test file carries it;
# errors="ignore" makes the drop a no-op when the column is absent.
X_test = test_df.drop(columns=["Claim Injury Type Encoded"], errors="ignore")

# Preprocess the full training features together with the test features, using the
# same helper sequence as the train/validation split above.
# NOTE(review): X is overwritten here, so re-running this cell processes already
# processed data — restart the kernel before re-running.
X, X_test = apply_frequency_encoding(X, X_test)
NA_imputer(X, X_test)
create_new_features(X, X_test)

In [None]:
# Standardise the numerical columns of the full training set and the test set.
# The scaler is fitted on the training features only and then applied to both frames.
# The same column list is used for fitting, transforming and writing back: the
# previous version wrote the scaled values into `features_to_scale`, a different
# name that is not defined in this notebook and could put values in the wrong columns.
scaler = StandardScaler().fit(X[numerical_features])
X[numerical_features] = scaler.transform(X[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
# Apply the feature selection to the test set.
# An empty selection ([]) means "no feature selection", so nothing is dropped then;
# previously every column was dropped, because no column is ever "in" an empty list.
# The drop list is built from X_test's own columns, so a column that exists only in
# the training frame can no longer trigger a KeyError here.
drop_list = []
if len(feature_selection) > 0:
    drop_list = [col for col in X_test.columns if col not in feature_selection]
X_test = X_test.drop(columns=drop_list)

In [None]:
# Predict the encoded class of every test row and flatten the result to one dimension
y_test_pred = model.predict(X_test).ravel()

In [None]:
# Translate each encoded prediction back to its original class label
decoded_labels = [class_mapping[code] for code in y_test_pred]
y_test_final = np.array(decoded_labels)

In [None]:
# Claim identifiers for the submission come from the test data index (the test CSV
# is loaded with index_col='Claim Identifier'). The fitted model has no `index`
# attribute, so `model.index` raised an AttributeError.
# NOTE(review): assumes preprocessing neither drops nor reorders test rows, so the
# identifiers stay aligned with the predictions — confirm against the utils helpers.
test_id = test_df.index

In [None]:
# Assemble the submission table: one row per test claim with its predicted label
submission_columns = {
    'Claim Identifier': test_id,
    'Claim Injury Type': y_test_final,
}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Exporting is switched off by default; set the flag to True to write a new
# versioned submission file.
save_submission = False
if save_submission:
    version = version_control()
    submission_df.to_csv(f'./submissions/Group49_Version{version:02}.csv', index=False)