In [1]:
import pandas as pd

# Load the raw penguins table from disk.
penguins = pd.read_csv("../datasets/penguins.csv")

# Names of the target column and of the three numeric features used below.
target_name = "Species"
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

# Restrict to the columns of interest and drop every row that has a missing
# value in any of them.
penguins_non_missing = penguins.loc[:, [*columns, target_name]].dropna()

# Split into the feature matrix and the label vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [3]:
# Preview the first rows of the selected feature columns. The row labels are
# the original ones from `penguins`; labels of rows removed by `dropna()` are
# simply absent (the index is not reset).
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [8]:
# Per-column summary statistics of the features. Note how different the scales
# are (body mass in thousands of grams, culmen length in tens of millimetres).
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [6]:
# List the distinct class labels present in the target column.
target.unique()

array(['Adelie Penguin (Pygoscelis adeliae)',
       'Gentoo penguin (Pygoscelis papua)',
       'Chinstrap penguin (Pygoscelis antarctica)'], dtype=object)

In [9]:
# Relative frequency of each class (normalize=True returns fractions instead
# of raw counts); useful to check how balanced the classes are.
target.value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.441520
Gentoo penguin (Pygoscelis papua)            0.359649
Chinstrap penguin (Pygoscelis antarctica)    0.198830
Name: Species, dtype: float64

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.model_selection import cross_validate
# Display estimators as diagrams when they are the output of a cell.
set_config(display="diagram")

# Two-step pipeline: standardize the features, then classify with 5 nearest
# neighbours. The step names ("preprocessor", "classifier") are the prefixes
# used later when parameters are set via `set_params` or a parameter grid.
pipeline_steps = [
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
]
model = Pipeline(pipeline_steps)
model

In [32]:
# 10-fold cross-validation of the `model` pipeline, scored with balanced
# accuracy.
cv_result = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)
# Average test score over the folds.
cv_result["test_score"].mean()

0.9521978021978021

In [34]:
# Change the number of neighbours of the pipeline's classifier step to 51.
# NOTE: this mutates `model` in place, so the setting persists in later cells.
model.set_params(classifier__n_neighbors=51)

# Re-run the same 10-fold, balanced-accuracy evaluation with the new setting.
cv_result = cross_validate(
    estimator=model,
    X=data,
    y=target,
    scoring="balanced_accuracy",
    cv=10,
)
# Average test score over the folds.
cv_result["test_score"].mean()

0.9418803418803419

In [30]:
# Baseline without any preprocessing: evaluate a bare 5-nearest-neighbours
# classifier directly on the raw, unscaled features, using the same 10-fold
# cross-validation and balanced-accuracy scoring as for the pipeline.
#
# The estimator is built explicitly with n_neighbors=5 instead of calling
# `model.set_params(...)`: `model` is not the estimator evaluated in this
# cell, so mutating it here had no effect on the score and only changed
# hidden notebook state.
unscaled_knn = KNeighborsClassifier(n_neighbors=5)
cv_result = cross_validate(unscaled_knn, data, target, cv=10,
                           scoring="balanced_accuracy")
# Average test score over the folds.
cv_result["test_score"].mean()

0.7398382173382173

### Tuning parameters

In [35]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the pipeline's "preprocessor" step, compared in the
# grid search.
all_preprocessors = [
    # No preprocessing step (presumably: the classifier sees the raw features).
    None,
    StandardScaler(),
    MinMaxScaler(),
    # n_quantiles=100 stays well below the 342 rows available in `data`.
    QuantileTransformer(n_quantiles=100),
    # Box-Cox is only defined for strictly positive inputs; the feature minima
    # reported by `data.describe()` are all positive.
    PowerTransformer(method="box-cox"),
]

In [41]:
# Inspect the pipeline's parameters; the `<step>__<param>` names shown here
# are the keys accepted by `set_params` and by a grid-search parameter grid.
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier(n_neighbors=51))],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(n_neighbors=51),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 51,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [55]:
from sklearn.model_selection import GridSearchCV

# Search space: every candidate preprocessor combined with three
# neighbourhood sizes. The bare step name "preprocessor" replaces the whole
# step; "classifier__n_neighbors" sets a parameter of the classifier step.
parameters = [
    {
        "preprocessor": all_preprocessors,
        "classifier__n_neighbors": [5, 51, 101],
    }
]

# Exhaustive search over the grid with 10-fold cross-validation, ranking the
# candidates by balanced accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring="balanced_accuracy",
    cv=10,
)
# `fit` returns the fitted search object itself.
model_search = grid_search.fit(data, target)

In [56]:
# Parameter combination selected by the grid search.
model_search.best_params_


{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

In [57]:
# Tabulate every candidate evaluated by the grid search, ordered from the
# highest to the lowest mean test score.
cv_results = (
    pd.DataFrame(model_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.014208,0.000485,0.012776,0.00065,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.014977,0.001364,0.013384,0.000643,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.017396,0.000786,0.013722,0.001328,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.028489,0.003621,0.013979,0.001446,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.014395,0.00013,0.013811,0.001854,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5
8,0.017807,0.000358,0.014348,0.00021,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381,0.927277,0.043759,6
9,0.029197,0.003532,0.015303,0.002155,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.922833,0.047883,7
7,0.013991,0.000491,0.01334,0.000403,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.920293,0.045516,8
11,0.014299,0.000223,0.013564,0.000199,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.952381,0.944444,0.863248,0.834921,0.857143,0.834921,0.88254,0.834921,0.904762,0.876642,0.041618,9
12,0.014107,0.000505,0.013915,0.000263,101,MinMaxScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.857143,0.944444,0.863248,0.834921,0.857143,0.765079,0.904762,0.834921,0.904762,0.862357,0.046244,10
