In [27]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [28]:
print(target.nunique())

pd.DataFrame(target.value_counts())

3


Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
Adelie Penguin (Pygoscelis adeliae),151
Gentoo penguin (Pygoscelis papua),123
Chinstrap penguin (Pygoscelis antarctica),68


In [29]:
pd.DataFrame(data.describe())

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

accuracy = cross_val_score(
    estimator=model,
    X= data,
    y= target,
    cv=10,
    scoring='balanced_accuracy'
)

accuracy.mean()

np.float64(0.9521978021978021)

In [31]:
for parameter in model.get_params():
    print(parameter)

memory
steps
transform_input
verbose
preprocessor
classifier
preprocessor__copy
preprocessor__with_mean
preprocessor__with_std
classifier__algorithm
classifier__leaf_size
classifier__metric
classifier__metric_params
classifier__n_jobs
classifier__n_neighbors
classifier__p
classifier__weights


In [32]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    "classifier__n_neighbors": [5, 51, 101],
}

# model = Pipeline(steps=[
#     ("classifier", KNeighborsClassifier(n_neighbors=5)),
# ])

grid_search_model = GridSearchCV(
    estimator= model,
    param_grid= parameter_grid,
    n_jobs= 2,
    cv= 10
)

grid_search_model.fit(data, target)

pd.DataFrame(grid_search_model.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003647,0.001167,0.0028,0.000392,5,{'classifier__n_neighbors': 5},1.0,1.0,1.0,0.941176,0.911765,0.970588,0.970588,0.941176,0.911765,0.970588,0.961765,0.032353,1
1,0.003993,0.000557,0.003881,0.0009,51,{'classifier__n_neighbors': 51},0.971429,0.971429,1.0,0.911765,0.911765,0.970588,0.941176,0.970588,0.941176,0.970588,0.95605,0.027209,2
2,0.003833,0.001071,0.003462,0.000843,101,{'classifier__n_neighbors': 101},0.914286,0.971429,0.970588,0.911765,0.882353,0.911765,0.882353,0.911765,0.882353,0.941176,0.917983,0.03178,3


In [93]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

parameter_grid = {
    "classifier__n_neighbors": [5, 51, 101],
    "preprocessor": all_preprocessors
}

grid_search_model = GridSearchCV(
    estimator= model,
    param_grid= parameter_grid,
    n_jobs= 2,
    cv= 10,
    scoring='balanced_accuracy'
)

grid_search_model.fit(data, target)

cv_results= pd.DataFrame(grid_search_model.cv_results_)

cv_results= cv_results.rename(
    mapper={
        "param_classifier__n_neighbors": "n_neighbors",
        "param_preprocessor": "preprocessor"
    },
    axis=1
)

cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,n_neighbors,preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002716,0.001018,0.003906,0.000905,5,,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.664683,0.73602,0.741026,0.704274,0.584127,0.669841,0.834921,0.742857,0.88254,0.838095,0.739838,0.086685,13
1,0.003838,0.00151,0.003869,0.000454,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.003383,0.000427,0.003633,0.000568,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.004266,0.000704,0.003462,0.000414,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.041841,0.007747,0.003551,0.000492,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
5,0.002204,0.000392,0.003203,0.000509,51,,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.618056,0.567521,0.596581,0.564103,0.533333,0.644444,0.622222,0.622222,0.644444,0.638889,0.605182,0.03648,15
6,0.003235,0.000379,0.003159,0.000243,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5
7,0.002723,0.000298,0.003468,0.00021,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.920293,0.045516,8
8,0.003668,0.000431,0.003293,0.000235,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381,0.927277,0.043759,6
9,0.044763,0.008228,0.003831,0.000786,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.922833,0.047883,7


In [105]:
split_score_columns = [
    column
    for column in cv_results.columns
    if column.startswith("split") and column.endswith("_test_score")
]

scaler_col = cv_results[cv_results["preprocessor"].apply(lambda x: isinstance(x, StandardScaler))][split_score_columns]
non_scaler_cols = cv_results[~cv_results["preprocessor"].apply(lambda x: isinstance(x, StandardScaler))][split_score_columns]

none_col = cv_results[cv_results["preprocessor"].apply(lambda x: x == None)][split_score_columns]
non_none_col = cv_results[cv_results["preprocessor"].apply(lambda x: x != None)][split_score_columns]

# results_01 = scaler_col[split_score_columns] > other_cols[split_score_columns]\
for i in range(len(none_col)):
    isBetter = True
    for j in range(len(non_none_col)):
        target_row = none_col.iloc[i]
        other_row = non_none_col.iloc[j]
        if (target_row > other_row).sum() < 7:
            isBetter = False
            break
    print(isBetter)

scaler_temp = cv_results[cv_results["preprocessor"].apply(lambda x: isinstance(x, StandardScaler))]
(scaler_temp[scaler_temp["n_neighbors"] == 5][split_score_columns].values > scaler_temp[scaler_temp["n_neighbors"] == 51][split_score_columns].values).sum()

(scaler_temp[scaler_temp["n_neighbors"] == 51][split_score_columns].values > scaler_temp[scaler_temp["n_neighbors"] == 101][split_score_columns].values).sum()

False
False
False


np.int64(9)

In [108]:
from sklearn.model_selection import cross_validate

cv = cross_validate(
    estimator=grid_search_model,
    X= data,
    y= target,
    scoring='balanced_accuracy',
    cv=10,
    return_estimator=True
)

cv['test_score'].mean()

np.float64(0.9426495726495727)

In [115]:
for estimator in cv['estimator']:
    print(estimator.best_params_)

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
