# <center>Machine Learning Project</center>

** **
## <center>*03.4 - Instance Based*</center>

** **

The members of the `team` are:
- Ana Farinha - 20211514
- Francisco Capontes - 20211692
- Sofia Gomes - 20240848
- Rui Lourenço - 2021639



In [None]:
# Import libraries
import pandas as pd
import numpy as np

#make the split here
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer

from utils import *

## <span style="color:salmon"> 1. Import Dataset </span> 

In [None]:
# Import dataset
df = pd.read_csv('preprocessed_data/train_data.csv', index_col="Claim Identifier")
df

In [None]:
# Import dataset
test_df = pd.read_csv('./preprocessed_data/test_data.csv', index_col = 'Claim Identifier')
test_df

## <span style="color:salmon"> 2. Prepare Dataset </span> 

Define y as a target "Claim Injury Type Encoded" and X with all the other columns

In [None]:
X = df.drop(["Claim Injury Type Encoded"], axis = 1)
y = df["Claim Injury Type Encoded"]

Split the data and the target into `X_train`, `X_val`, `y_train` and `y_val`, with `test_size` equal to 0.25, `stratify` equal to the target and `shuffle` equal to True

In [None]:
# Hold out 25% of the rows for validation, stratified on the target.
# A fixed random_state makes the shuffled split reproducible, so the scores
# and the chosen number of neighbors below do not change from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, shuffle=True, random_state=42
)

Define X_test

In [None]:
X_test = test_df.copy()

Check missing values in each dataset:

In [None]:
X_train.isna().sum().sort_values()[X_train.isna().sum() != 0]

In [None]:
X_val.isna().sum().sort_values()[X_val.isna().sum() != 0]

In [None]:
X_test.isna().sum().sort_values()[X_test.isna().sum() != 0]

Impute `Average Weekly Wage` and the variables created from it

In [None]:
# Fill missing 'Average Weekly Wage' values with the median of the training
# split; the same training median is reused for the validation and test frames.
median_avg_week_wage = X_train['Average Weekly Wage'].median()

for frame in (X_train, X_val, X_test):
    frame['Average Weekly Wage'] = frame['Average Weekly Wage'].fillna(median_avg_week_wage)

In [None]:
financial_impact(X_train)
financial_impact(X_val)
financial_impact(X_test)

*Impute `Age at Injury` and the variables that can be calculated from it*

In [None]:
# Fill missing 'Age at Injury' values with the median of the training split;
# the same training median is reused for the validation and test frames.
median_age_at_injury = X_train['Age at Injury'].median()

for frame in (X_train, X_val, X_test):
    frame['Age at Injury'] = frame['Age at Injury'].fillna(median_age_at_injury)

In [None]:
def calculate_birth_year(df):
    """Fill missing 'Birth Year' values in place from the accident year and age.

    The 'Accident Date' column is converted to datetime (values that cannot be
    parsed become NaT). For every row where 'Birth Year' is missing but both
    'Age at Injury' and 'Accident Date' are available, 'Birth Year' is set to
    the accident year minus the age at injury. Rows that already have a
    'Birth Year' are left untouched.

    Args:
        df: DataFrame with 'Accident Date', 'Age at Injury' and 'Birth Year'
            columns. It is modified in place.

    Returns:
        None.
    """
    # Parse the accident dates so the year can be extracted below
    df['Accident Date'] = pd.to_datetime(df['Accident Date'], errors='coerce')

    # Rows that lack a birth year but have everything needed to derive one
    missing_birth_year = df['Birth Year'].isna()
    has_age = df['Age at Injury'].notna()
    has_accident_date = df['Accident Date'].notna()
    fillable = missing_birth_year & has_age & has_accident_date

    # Birth year = year of the accident - age at the time of the accident
    accident_year = df.loc[fillable, 'Accident Date'].dt.year
    age_at_injury = df.loc[fillable, 'Age at Injury']
    df.loc[fillable, 'Birth Year'] = accident_year - age_at_injury

In [None]:
calculate_birth_year(X_train)
calculate_birth_year(X_val)
calculate_birth_year(X_test)

Drop the columns: Accident Date

In [None]:
X_train.drop('Accident Date', axis=1, inplace=True)
X_val.drop('Accident Date', axis=1, inplace=True)
X_test.drop('Accident Date', axis=1, inplace=True)

In [None]:
# Target labels, listed in the order of their integer key in class_mapping
# (position i holds the label for key i).
target_names = [
    '1. CANCELLED',
    '2. NON-COMP',
    '3. MED ONLY',
    '4. TEMPORARY',
    '5. PPD SCH LOSS',
    '6. PPD NSL',
    '7. PTD',
    '8. DEATH',
]

# Integer key -> label lookup, derived from the ordered label list
class_mapping = dict(enumerate(target_names))

## <span style="color:salmon"> 3. Instance Based </span> 

Using KNeighborsClassifier, create a Nearest Neighbor classifier instance:

In [None]:
model_KNN = KNeighborsClassifier()

#### <span style="color:salmon"> 3.1 Methods in KNeighborsClassifier </span> 

Use the `.fit()` method of the model to fit it to the array of points `X_train` and the labels `y_train`

In [None]:
model_KNN.fit(X = X_train, y = y_train)

Use the `.predict()` method of `model_KNN` to perform classification on `X_train` and `X_val`, and assign the results to `y_train_pred` and `y_val_pred`

In [None]:
y_train_pred = model_KNN.predict(X_train)
y_val_pred = model_KNN.predict(X_val)
y_val_pred

Use the `.predict_proba()` method of `model_KNN` to obtain the probability estimates for `X_val`

In [None]:
model_KNN.predict_proba(X_val)

Use the `.score()` method of `model_KNN` to obtain the mean accuracy of the model on the training data and on the validation data.

In [None]:
print(model_KNN.score(X_train, y_train))
print(model_KNN.score(X_val, y_val))

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))


In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))

#### <span style="color:salmon"> 3.2 Methods in KNeighborsClassifier (change the number of neighbors) </span> 

To identify the best number of neighbors:

In [None]:
# Sweep the candidate numbers of neighbors and record the mean accuracy of
# each fitted model on the training and validation splits.
numberK_list = np.arange(1, 21)
score_list_train = []
score_list_val = []
for n in numberK_list:
    model = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    score_list_train.append(model.score(X_train, y_train))
    score_list_val.append(model.score(X_val, y_val))

# Select the winner by its position in the score lists instead of assuming
# that k == index + 1, so the lookup stays correct if the candidate range
# changes. np.argmax returns the first maximum, so ties go to the smallest k.
best_idx = int(np.argmax(score_list_val))
nof = int(numberK_list[best_idx])
high_score = score_list_val[best_idx]

print("Best number of neighbors: %d" % nof)
print("Mean accuracy in train with %d neighbors: %f" % (nof, score_list_train[best_idx]))
print("Mean accuracy in validation with %d neighbors: %f" % (nof, high_score))

Let's use this number of neighbors:

## <span style="color:yellow"> TODO: update n_neighbors with the best value found above </span> 

In [None]:
model_KNN_N = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)

In [None]:
y_train_pred = model_KNN_N.predict(X_train)
y_val_pred = model_KNN_N.predict(X_val)
y_val_pred

In [None]:
print(model_KNN_N.score(X_train, y_train))
print(model_KNN_N.score(X_val, y_val))

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))


In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))

#### <span style="color:salmon"> 3.3 Methods in KNeighborsClassifier (change the algorithm) </span> 

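By default `algorithm='auto'`, which lets scikit-learn choose the most appropriate algorithm (ball tree, k-d tree or brute force) based on the data passed to `.fit()`, so let's force the `'kd_tree'` algorithm instead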

In [None]:
model_KNN_T = KNeighborsClassifier(algorithm='kd_tree').fit(X_train, y_train)

In [None]:
y_train_pred = model_KNN_T.predict(X_train)
y_val_pred = model_KNN_T.predict(X_val)
y_val_pred

In [None]:
print(model_KNN_T.score(X_train, y_train))
print(model_KNN_T.score(X_val, y_val))

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))


In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))

#### <span style="color:salmon"> 3.4 Methods in KNeighborsClassifier (change the metric) </span> 

The default metric is Euclidean (Minkowski with `p=2`), so let's try the Manhattan metric

In [None]:
model_KNN_M = KNeighborsClassifier(metric = 'manhattan').fit(X = X_train, y = y_train)

In [None]:
y_train_pred = model_KNN_M.predict(X_train)
y_val_pred = model_KNN_M.predict(X_val)
y_val_pred

In [None]:
print(model_KNN_M.score(X_train, y_train))
print(model_KNN_M.score(X_val, y_val))

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))


In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))

#### <span style="color:salmon"> 3.5 Methods in KNeighborsClassifier (change the distance weights) </span> 

The default weighting is uniform. Let's try `weights='distance'`

In [None]:
model_KNN_W= KNeighborsClassifier(weights='distance').fit(X = X_train, y = y_train)

In [None]:
y_train_pred = model_KNN_W.predict(X_train)
y_val_pred = model_KNN_W.predict(X_val)
y_val_pred

In [None]:
print(model_KNN_W.score(X_train, y_train))
print(model_KNN_W.score(X_val, y_val))

In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, target_names=target_names))


In [None]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=target_names))