# Import the libraries and dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('data/penguins.csv')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# Clean and Transform the dataset

## Drop sex variable

In [4]:
df.drop('sex', inplace=True, axis=1)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
3,Adelie,Torgersen,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0


## Drop missing values

In [5]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0


## One-hot encoding

In [6]:
df = pd.get_dummies(df, columns=['island'])
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,0,0,1
1,Adelie,39.5,17.4,186.0,3800.0,0,0,1
2,Adelie,40.3,18.0,195.0,3250.0,0,0,1
4,Adelie,36.7,19.3,193.0,3450.0,0,0,1
5,Adelie,39.3,20.6,190.0,3650.0,0,0,1


## Standardize variables

In [7]:
scaler = StandardScaler()
scaler.fit(df.drop('species', axis=1))

StandardScaler()

In [8]:
scaled_df = scaler.transform(df.drop('species', axis=1))

## Assign X, y variables

In [11]:
X = scaled_df
y = df['species']

In [12]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

# Training model

In [13]:
model = KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [14]:
model_test = model.predict(X_test)

# Evaluation

In [15]:
print(confusion_matrix(y_test, model_test))

[[43  0  0]
 [ 1 20  0]
 [ 0  0 39]]


In [16]:
print(classification_report(y_test, model_test))

              precision    recall  f1-score   support

      Adelie       0.98      1.00      0.99        43
   Chinstrap       1.00      0.95      0.98        21
      Gentoo       1.00      1.00      1.00        39

    accuracy                           0.99       103
   macro avg       0.99      0.98      0.99       103
weighted avg       0.99      0.99      0.99       103

