# k-Nearest Neighbors (Classification)

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Import dataset
df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [3]:
# Delete sex variable
df = df.drop('sex', axis=1)

# Drop rows containing missing values
df.dropna(inplace=True)

# Convert non-numeric data using one-hot encoding
df = pd.get_dummies(df, columns=['island'], dtype=int)

# Assign X and y variables
X = df.drop('species', axis=1)
y = df['species']

# Standardize the independent variables using StandardScaler
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen
0,-0.884499,0.785449,-1.418347,-0.564142,-0.976875,-0.754193,2.388699
1,-0.811126,0.126188,-1.062250,-0.501703,-0.976875,-0.754193,2.388699
2,-0.664380,0.430462,-0.421277,-1.188532,-0.976875,-0.754193,2.388699
3,-1.324737,1.089724,-0.563715,-0.938776,-0.976875,-0.754193,2.388699
4,-0.847812,1.748985,-0.777373,-0.689020,-0.976875,-0.754193,2.388699
...,...,...,...,...,...,...,...
337,0.601305,-1.750171,0.931890,0.903175,1.023672,-0.754193,-0.418638
338,0.527932,-1.445897,1.003109,0.809516,1.023672,-0.754193,-0.418638
339,1.188289,-0.735923,1.501644,1.933419,1.023672,-0.754193,-0.418638
340,0.234440,-1.192335,0.789451,1.246590,1.023672,-0.754193,-0.418638


In [4]:
# Split data into train/test set and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [5]:
# Assign algorithm
model = KNeighborsClassifier(n_neighbors=4)

# Link algorithm to X and y variables
model.fit(X_train, y_train)

In [6]:
# Run algorithm on test data to make predictions
y_pred = model.predict(X_test)

# Evaluate predictions
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[50  0  0]
 [ 1 19  0]
 [ 0  0 33]]
              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        50
   Chinstrap       1.00      0.95      0.97        20
      Gentoo       1.00      1.00      1.00        33

    accuracy                           0.99       103
   macro avg       0.99      0.98      0.99       103
weighted avg       0.99      0.99      0.99       103



In [7]:
# Data point to predict
penguin = [
    -0.8,  # bill_length_mm
    1.9,  # bill_depth_mm
    -0.77,  # flipper_length_mm
    -0.3,  # body_mass_g
    1.1,  # island_Biscoe
    -0.8,  # island_Dream
    -0.45,  # island_Torgersen
    # 1,  # sex_FEMALE
    # 0,  # sex_MALE
]
# Make prediction
model.predict([penguin])



array(['Adelie'], dtype=object)