In [1]:
import pandas as pd

# Feature columns used for the classification and the column holding the label.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

penguins = pd.read_csv("../datasets/penguins.csv")

# Keep only the columns of interest, then drop every row that has a missing
# value in any of them.
penguins_non_missing = penguins[[*columns, target_name]].dropna()

# Split the cleaned table into the feature frame and the label series.
data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [3]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [4]:
# Preview the first entries of the target series.
target.head()

0    Adelie Penguin (Pygoscelis adeliae)
1    Adelie Penguin (Pygoscelis adeliae)
2    Adelie Penguin (Pygoscelis adeliae)
4    Adelie Penguin (Pygoscelis adeliae)
5    Adelie Penguin (Pygoscelis adeliae)
Name: Species, dtype: object

In [5]:
# Number of samples per species; the counts shown below are not equal across
# classes, which is why balanced accuracy is used as the metric later on.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Two-step pipeline: standardize the features, then classify with
# k-nearest neighbors using its default hyperparameters.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier()),
    ]
)

In [15]:
# List every parameter of the pipeline; parameters of a step show up as
# "<step name>__<parameter name>" (e.g. classifier__n_neighbors).
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [16]:
from sklearn.model_selection import cross_validate

# Compare two neighborhood sizes with 10-fold cross-validation.
# Note: the printed label says "Accuracy" but the metric reported is the
# balanced accuracy requested through `scoring`.
for n in (5, 51):
    # In-place update of the classifier step of the pipeline.
    model.set_params(classifier__n_neighbors=n)
    cv_results = cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
    )
    scores = cv_results["test_score"]
    mean_score, std_score = scores.mean(), scores.std()
    print(
        f"Accuracy score via cross-validation with n={n}:\n"
        f"{mean_score:.3f} +/- {std_score:.3f}"
    )

Accuracy score via cross-validation with n=5:
0.952 +/- 0.040
Accuracy score via cross-validation with n=51:
0.942 +/- 0.039


In [17]:
# Mean + one standard deviation for n=51, using the values printed by the
# previous cell (0.942 +/- 0.039).
0.942 + 0.039

0.981

In [18]:
# Mean - one standard deviation for n=51, using the values printed by the
# loop over n_neighbors (0.942 +/- 0.039).
0.942 - 0.039

0.9029999999999999

In [19]:
# Bare k-nearest-neighbors classifier (no scaling step) evaluated with the
# same 10-fold cross-validation and metric as the pipeline.
model = KNeighborsClassifier(n_neighbors=5)

cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    cv=10,
    scoring="balanced_accuracy",
)
scores = cv_results["test_score"]
print(
    f"Accuracy score via cross-validation with n={5}:\n"
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)

Accuracy score via cross-validation with n=5:
0.740 +/- 0.087


In [23]:
from sklearn.model_selection import cross_validate

# Fresh scaling + k-nearest-neighbors pipeline with default hyperparameters.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier()),
    ]
)

# 10-fold cross-validation scored with balanced accuracy; the per-fold test
# scores are displayed as the cell output.
cv_results = cross_validate(
    estimator=model, X=data, y=target, cv=10, scoring="balanced_accuracy"
)
cv_results["test_score"]

array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
       0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])

which gives values between 0.88 and 1.0 with an average close to 0.95.

It is possible to change the pipeline parameters and re-run a cross-validation with:

In [24]:
# Switch the classifier step to 51 neighbors (in-place update of the pipeline)
# and evaluate again with the same cross-validation setup.
model.set_params(classifier__n_neighbors=51)
cv_results = cross_validate(
    estimator=model, X=data, y=target, cv=10, scoring="balanced_accuracy"
)
cv_results["test_score"]

array([0.95238095, 0.97777778, 1.        , 0.86324786, 0.88253968,
       0.95238095, 0.95555556, 0.95238095, 0.93015873, 0.95238095])

which gives slightly worse test scores, but the difference is not necessarily significant: the per-fold scores obtained with 5 and with 51 neighbors overlap a lot.

We can disable the preprocessor by setting the preprocessor parameter to None (while resetting the number of neighbors to 5) as follows:

In [25]:
# Disable the preprocessing step and go back to 5 neighbors (in-place update
# of the pipeline), then evaluate with the same cross-validation setup.
model.set_params(preprocessor=None, classifier__n_neighbors=5)
cv_results = cross_validate(
    estimator=model, X=data, y=target, cv=10, scoring="balanced_accuracy"
)
cv_results["test_score"]

array([0.66468254, 0.73601954, 0.74102564, 0.7042735 , 0.58412698,
       0.66984127, 0.83492063, 0.74285714, 0.88253968, 0.83809524])

In [26]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the "preprocessor" step of the pipeline, searched over
# in the grid searches that follow.
all_preprocessors = [
    None,  # no preprocessing: the step is disabled
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [27]:
# Rebuild the pipeline from scratch: an earlier cell set its preprocessor
# step to None in place.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier()),
    ]
)

In [33]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every preprocessing candidate crossed with three
# neighborhood sizes.
param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': (5, 51, 101)}

# Inner loop of the nested cross-validation (2 folds, 4 parallel jobs).
# `scoring` is set explicitly so that hyperparameters are selected with the
# same metric the outer loop reports; without it GridSearchCV falls back to
# the estimator's default score (plain accuracy for a classifier).
model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 scoring="balanced_accuracy",
                                 n_jobs=4, cv=2)

# Outer loop: 10-fold evaluation of the whole tuning procedure. The fitted
# search object of each fold is kept so its best parameters can be inspected.
cv_results = cross_validate(
    model_grid_search, data, target, cv=10, return_estimator=True, scoring="balanced_accuracy")

In [34]:
# Show which hyperparameter combination the inner search selected on each
# outer fold (folds are numbered from 1 in the printout).
for fold_number, fitted_search in enumerate(cv_results["estimator"], start=1):
    print(f"Best parameter found on fold #{fold_number}")
    print(fitted_search.best_params_)

Best parameter found on fold #1
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Best parameter found on fold #2
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Best parameter found on fold #3
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #4
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #5
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #6
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #7
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #8
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}
Best parameter found on fold #9
{'classifier__n_neighbors': 5, 'preprocessor': PowerTransformer(method='box-cox')}

In [35]:
# Summarize the outer-fold test scores of the nested cross-validation.
# Note: the printed label says "Accuracy" but the scores are balanced accuracy.
scores = cv_results["test_score"]
mean_score, std_score = scores.mean(), scores.std()
print(
    "Accuracy score by cross-validation combined with hyperparameters "
    f"search:\n{mean_score:.3f} +/- {std_score:.3f}"
)

Accuracy score by cross-validation combined with hyperparameters search:
0.940 +/- 0.043


In [36]:
# Display the individual outer-fold test scores behind the summary above.
scores

array([0.95238095, 1.        , 1.        , 0.86324786, 0.88253968,
       0.95238095, 0.95555556, 0.93015873, 0.90793651, 0.95238095])

In [38]:
from sklearn.model_selection import cross_validate

# Fresh scaling + k-nearest-neighbors pipeline.
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier()),
])

# Restrict the search to the two larger neighborhood sizes only.
param_grid = {
    'classifier__n_neighbors': (51, 101)}

# Inner loop of the nested cross-validation (2 folds, 4 parallel jobs).
# `scoring` is set explicitly so that hyperparameters are selected with the
# same metric the outer loop reports; without it GridSearchCV falls back to
# the estimator's default score (plain accuracy for a classifier).
model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 scoring="balanced_accuracy",
                                 n_jobs=4, cv=2)

# Outer loop: 10-fold evaluation of the whole tuning procedure, keeping the
# fitted search object of each fold.
cv_results = cross_validate(
    model_grid_search, data, target, cv=10, return_estimator=True, scoring="balanced_accuracy")

# Report the value selected by the inner search on each outer fold.
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f"Best parameter found on fold #{fold_idx + 1}")
    print(f"{estimator.best_params_}")

Best parameter found on fold #1
{'classifier__n_neighbors': 51}
Best parameter found on fold #2
{'classifier__n_neighbors': 51}
Best parameter found on fold #3
{'classifier__n_neighbors': 51}
Best parameter found on fold #4
{'classifier__n_neighbors': 51}
Best parameter found on fold #5
{'classifier__n_neighbors': 51}
Best parameter found on fold #6
{'classifier__n_neighbors': 51}
Best parameter found on fold #7
{'classifier__n_neighbors': 51}
Best parameter found on fold #8
{'classifier__n_neighbors': 51}
Best parameter found on fold #9
{'classifier__n_neighbors': 51}
Best parameter found on fold #10
{'classifier__n_neighbors': 51}


Let's do the grid search with:

In [40]:
from sklearn.model_selection import GridSearchCV
# Full grid: every preprocessing candidate crossed with three neighborhood
# sizes.
param_grid = {
    "preprocessor": all_preprocessors,
    "classifier__n_neighbors": [5, 51, 101],
}

# Single (non-nested) grid search: each combination is scored with 10-fold
# cross-validation on balanced accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
)
grid_search = grid_search.fit(data, target)


We can sort the results and focus on the columns of interest with:

In [41]:
# Turn the grid-search results into a table, best mean test score first.
results = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Keep only the searched parameters and the aggregated test scores.
param_columns = [name for name in results.columns if name.startswith("param_")]
results = results[param_columns + ["mean_test_score", "std_test_score"]]

In [42]:
# Display the grid-search results table (sorted by mean test score, best first).
results

Unnamed: 0,param_classifier__n_neighbors,param_preprocessor,mean_test_score,std_test_score
1,5,StandardScaler(),0.952198,0.039902
2,5,MinMaxScaler(),0.947778,0.034268
3,5,QuantileTransformer(n_quantiles=100),0.947094,0.033797
4,5,PowerTransformer(method='box-cox'),0.94696,0.047387
6,51,StandardScaler(),0.94188,0.038905
8,51,QuantileTransformer(n_quantiles=100),0.927277,0.043759
9,51,PowerTransformer(method='box-cox'),0.922833,0.047883
7,51,MinMaxScaler(),0.920293,0.045516
11,101,StandardScaler(),0.876642,0.041618
12,101,MinMaxScaler(),0.862357,0.046244
