# Penguin size - KNN

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("penguins.csv")
df = df.dropna() 

In [4]:
df['species'] = df['species'].astype('category').cat.codes

In [5]:
df['species']

0      0
1      0
2      0
4      0
5      0
      ..
339    1
340    1
341    1
342    1
343    1
Name: species, Length: 333, dtype: int8

In [6]:
features = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']].values
labels = df['species'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [8]:
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

In [9]:
def knn_predict(X_train, y_train, x_test, k=3):
    distances = [euclidean_distance(x_test, x_train) for x_train in X_train]
    k_indices = np.argsort(distances)[:k]
    k_nearest_labels = [y_train[i] for i in k_indices]
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]

In [10]:
k = 5
y_pred = [knn_predict(X_train, y_train, x, k) for x in X_test]

In [11]:
def evaluate(y_true, y_pred):
    labels = np.unique(y_true)
    metrics = {}
    for label in labels:
        TP = sum((y_pred[i] == label) and (y_true[i] == label) for i in range(len(y_true)))
        FP = sum((y_pred[i] == label) and (y_true[i] != label) for i in range(len(y_true)))
        FN = sum((y_pred[i] != label) and (y_true[i] == label) for i in range(len(y_true)))
        precision = TP / (TP + FP) if (TP + FP) != 0 else 0
        recall = TP / (TP + FN) if (TP + FN) != 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
        metrics[label] = {"Precision": precision, "Recall": recall, "F1-Score": f1}

    accuracy = np.mean(np.array(y_pred) == np.array(y_true))
    return accuracy, metrics

In [12]:
accuracy, class_metrics = evaluate(y_test, y_pred)
print("Accuracy:", round(accuracy, 3))
print("Class-wise Metrics:")
for label, m in class_metrics.items():
    print(f"Class {label} -> Precision: {m['Precision']:.2f}, Recall: {m['Recall']:.2f}, F1: {m['F1-Score']:.2f}")

Accuracy: 0.731
Class-wise Metrics:
Class 0 -> Precision: 0.68, Recall: 0.81, F1: 0.74
Class 1 -> Precision: 0.88, Recall: 0.39, F1: 0.54
Class 2 -> Precision: 0.77, Recall: 0.94, F1: 0.85


In [13]:
species_map = dict(enumerate(df['species'].astype('category').cat.categories))

print("\nFirst 10 Predictions:")
for i in range(10):
    actual = species_map[y_test[i]]
    predicted = species_map[y_pred[i]]
    print(f"Sample {i+1}: Predicted = {predicted}, Actual = {actual}")


First 10 Predictions:
Sample 1: Predicted = 1, Actual = 0
Sample 2: Predicted = 1, Actual = 1
Sample 3: Predicted = 0, Actual = 0
Sample 4: Predicted = 2, Actual = 2
Sample 5: Predicted = 0, Actual = 0
Sample 6: Predicted = 0, Actual = 1
Sample 7: Predicted = 0, Actual = 1
Sample 8: Predicted = 2, Actual = 2
Sample 9: Predicted = 2, Actual = 2
Sample 10: Predicted = 2, Actual = 2


In [17]:
print("Evaluating accuracy for different values of k:\n")
for k in range(1, 11):  # Try k from 1 to 10
    y_pred_k = [knn_predict(X_train, y_train, x, k) for x in X_test]
    accuracy_k, _ = evaluate(y_test, y_pred_k)
    print(f"k = {k} --> Accuracy: {accuracy_k:.3f}")


Evaluating accuracy for different values of k:

k = 1 --> Accuracy: 0.776
k = 2 --> Accuracy: 0.776
k = 3 --> Accuracy: 0.746
k = 4 --> Accuracy: 0.731
k = 5 --> Accuracy: 0.731
k = 6 --> Accuracy: 0.746
k = 7 --> Accuracy: 0.761
k = 8 --> Accuracy: 0.716
k = 9 --> Accuracy: 0.687
k = 10 --> Accuracy: 0.701
