In [32]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, hamming_loss
from src.read_data import read_file
from src.create_train_test_split import split_predictor_and_target

# Load Data

In [13]:
# Load the pre-processed iris train and test CSVs.
# NOTE(review): read_file is a project helper (src.read_data); presumably it
# resolves folder/filename to a path and returns a DataFrame -- confirm.
df_train = read_file(
    folder="processed",
    filename="iris_train_processed.csv",
    delimiter=",",
)
df_test = read_file(
    folder="processed",
    filename="iris_test_processed.csv",
    delimiter=",",
)

# Split Data

In [14]:
# Separate predictors from targets for each split. The project helper returns
# four values, unpacked here as (X, y, X column names, y column names) --
# meanings inferred from the variable names; confirm against
# src.create_train_test_split.

# Processed training set
(
    X_train,
    y_train,
    columns_X_train,
    columns_y_train,
) = split_predictor_and_target(df=df_train)

# Processed test set
(
    X_test,
    y_test,
    columns_X_test,
    columns_y_test,
) = split_predictor_and_target(df=df_test)

# KNN Algorithm

In [34]:
# Base learner: k-nearest-neighbours with library defaults; its
# hyperparameters are tuned later through the grid search.
knn = KNeighborsClassifier()
# MultiOutputClassifier fits one copy of the base learner per target column,
# which lets KNN be used with a multi-column y.
multi_knn = MultiOutputClassifier(estimator=knn)

In [35]:
# Hyperparameter search space for the grid search.
# The `estimator__` prefix routes each setting through the
# MultiOutputClassifier wrapper to the wrapped KNeighborsClassifier.
# NOTE(review): the `algorithm` options are all exact neighbour-search
# back-ends and should affect speed rather than predictions; dropping that
# axis would shrink the grid from 40 to 10 combinations -- confirm before
# changing, since it also alters the reported best_params_.
param_grid = {
    'estimator__n_neighbors': list(range(1, 6)),
    'estimator__weights': ['uniform', 'distance'],
    'estimator__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

In [36]:
# Create 3-fold cross-validation object (n_splits=3).
# Rows are shuffled before splitting; the fixed random_state keeps the fold
# assignment reproducible across runs.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [37]:
# Exhaustive search over `param_grid`, scoring every combination on the folds
# defined by `cv`.
# scoring='accuracy': the classification report for these targets includes a
# "samples avg" row, i.e. y is a multilabel indicator, so per the scikit-learn
# docs this is subset accuracy (every label of a sample must match).
# return_train_score=True additionally stores training-fold scores in
# cv_results_.
grid_search = GridSearchCV(
    multi_knn,
    param_grid,
    scoring='accuracy',
    cv=cv,
    return_train_score=True,
)
# Kept as the cell's last expression so the fitted search object is displayed.
grid_search.fit(X_train, y_train)

In [38]:
# Display the best parameters and best score.
# best_score_ is the mean cross-validated score of the winning parameter
# combination, measured with the `scoring` metric passed to GridSearchCV
# ('accuracy' in this notebook).
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'estimator__algorithm': 'auto', 'estimator__n_neighbors': 3, 'estimator__weights': 'uniform'}
Best cross-validation score: 0.9666666666666667


In [39]:
# Winning model from the grid search. `refit` is left at its default in the
# GridSearchCV call, so (per the scikit-learn docs) this estimator has been
# refit on the full training data with the best parameters.
best_multi_knn_model = grid_search.best_estimator_

In [40]:
# Make predictions on the test set
# (X_test was not passed to grid_search.fit, so this is a held-out evaluation)
y_pred = best_multi_knn_model.predict(X_test)

In [41]:
# Test-set evaluation:
#  - classification report: per-label precision / recall / F1 plus averages
#  - Hamming loss: fraction of individual label assignments that are wrong
# NOTE(review): neither figure is the subset accuracy used as the
# cross-validation score, so they are not directly comparable with
# best_score_.
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Hamming Loss:", hamming_loss(y_true=y_test, y_pred=y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.89      0.80      0.84        10
           2       0.82      0.90      0.86        10

   micro avg       0.90      0.90      0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30
 samples avg       0.90      0.90      0.90        30

Hamming Loss: 0.06666666666666667
