In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib.colors import ListedColormap
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.model_selection import train_test_split

In [None]:
# Load the penguins example dataset through seaborn.
# The dataset name must be lowercase: seaborn matches it verbatim against its
# list of example datasets, so 'Penguins' is rejected on a fresh environment.
frame = sns.load_dataset('penguins')

# Drop every row that has a missing value in any column; `frame` keeps the
# raw data so this cell can be re-run safely.
df = frame.dropna()
print("Cleaned dataset has been loaded! ")

In [None]:
# Features: two numeric measurements; target: the species label.
X = df[['flipper_length_mm', 'bill_length_mm']]
y = df['species']

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling lives inside the pipeline so that it is re-fitted on the training
# part of every CV fold (KNN is distance based, so feature scale matters).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', KNeighborsClassifier()),
])

# Hyper-parameter grid for the KNN step.
# NOTE: the weighting scheme is spelled 'uniform'. The previous value
# 'unifrom' is not a valid `weights` option, so every candidate using it
# failed to fit and uniform weighting was never actually evaluated.
parameters = {
    # Same estimator as the pipeline default; kept so `best_params_` still
    # exposes an 'estimator' entry.
    'estimator': [KNeighborsClassifier()],
    'estimator__n_neighbors': [1, 3, 5, 9],
    'estimator__weights': ['uniform', 'distance'],
}

# 5-fold CV on the training split only; refit=True retrains the best
# candidate on all of X_train so `grid` can be used directly for prediction.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
)
grid.fit(X_train, y_train)
print("Model has been trained !")

In [None]:
# Show the class labels known to the refit best estimator.
print(grid.classes_)            # -> will print something like: ['Adelie' 'Chinstrap' 'Gentoo']

In [None]:

# --- Summary of the selected model -----------------------------------------
best_pipeline = grid.best_estimator_
print(best_pipeline)

print(f"Best CV score: {grid.best_score_:.4f}")
print(f"Accuracy on data (after refit): {grid.score(X_test, y_test):.4f}")

# --- Out-of-fold predictions over the whole dataset ------------------------
# Every sample is predicted by a clone of the best pipeline that did not see
# it during fitting (5-fold cross-validation over X, y).
pred = cross_val_predict(best_pipeline, X, y, cv=5)

# Boolean Series (indexed like y): True where the prediction is wrong.
misclassified = y.ne(pred)
n_misclassified = misclassified.sum()
print(f"Misclassified points: {n_misclassified}")

X_test_idx = X_test.index
# Index labels of the wrongly predicted rows; reused by the plotting cell.
misclassified_idx = df.index[misclassified.to_numpy()]

# Table of the wrong predictions: features, true label, predicted label.
mask = X.loc[misclassified].assign(
    true=y.loc[misclassified],
    predicted=pred[misclassified.to_numpy()],
)
print(mask)

In [None]:
# One colour per species, shared by the decision regions and the scatter
# points. Previously the Chinstrap region was orange while the Chinstrap
# points were yellow, so points and regions did not visually match.
species_colors = {'Adelie': 'blue', 'Chinstrap': 'orange', 'Gentoo': 'green'}

# Order the region colours by the estimator's own class order instead of
# relying on a hand-written list happening to line up with it.
colors = [species_colors[label] for label in grid.classes_]
colors_map = ListedColormap(colors)

# Explicit figure/axes so every layer is drawn on the same axes rather than
# depending on matplotlib's implicit "current axes" state.
fig, ax = plt.subplots()

# Background: predicted class for every point of a dense grid over X.
disp = DecisionBoundaryDisplay.from_estimator(
    grid,
    X,
    grid_resolution=500,
    eps=0.5,
    response_method='predict',
    cmap=colors_map,
    alpha=0.6,
    ax=ax,
)

# Layer 1: all samples, coloured and shaped by their true species.
sns.scatterplot(
    data=df.loc[X.index],
    x='flipper_length_mm',
    y='bill_length_mm',
    hue='species',
    style='species',
    s=100,
    edgecolor='k',
    palette=species_colors,
    ax=ax,
)

# Layer 2: misclassified samples re-drawn in red on top.
# legend=False keeps this layer from adding a second set of species entries
# to the legend created by layer 1.
sns.scatterplot(
    data=df.loc[misclassified_idx],
    x='flipper_length_mm',
    y='bill_length_mm',
    style='species',
    color='red',
    markers={'Adelie': 's', 'Chinstrap': 'X', 'Gentoo': 'o'},
    s=130,
    edgecolor='k',
    legend=False,
    ax=ax,
)

ax.set_title("KNN decision regions (red markers = misclassified samples)")

plt.tight_layout()
plt.show()