In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's example "penguins" dataset into a DataFrame and preview
# the first rows.
dataset_name = 'penguins'
df = sns.load_dataset(dataset_name)
df.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(344, 7)

In [4]:
# Separate the label column from the predictor columns.
target = 'species'
y = df[target]
X = df.drop(columns=[target])

In [5]:
# Number of missing values in the target column.
y.isna().sum()

0

# Select numerical and categorical columns

In [6]:
from sklearn.compose import make_column_selector

In [7]:
# Selectors that pick columns by dtype: object-dtype columns are handled as
# categorical, every other dtype as numerical.
cat_col = make_column_selector(dtype_include=object)
num_col = make_column_selector(dtype_exclude=object)

# Impute missing values

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [9]:
# Imputer for the numerical columns: fills missing entries with the column mean.
imp_mean = SimpleImputer(strategy='mean')

In [10]:
# Building blocks for the categorical columns: fill missing entries with the
# most frequent category, then one-hot encode.
onehot = OneHotEncoder()
imp_cat = SimpleImputer(strategy='most_frequent')

In [11]:
# Preprocessing: one branch per column kind.
# The numeric branch is deliberately wrapped in a (single-step) pipeline so
# that its imputer is addressable as 'pipeline-1__simpleimputer' when tuning.
# To scale only the numeric columns, a scaler could be appended as a second
# step of this numeric pipeline.
numeric_branch = make_pipeline(imp_mean)
categorical_branch = make_pipeline(imp_cat, onehot)

col_transform = make_column_transformer(
    (numeric_branch, num_col),
    (categorical_branch, cat_col),
    remainder='passthrough',
)

# Build Model

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [42]:
# Standard scaler with default settings; used as the scaling step of the model pipeline.
std_scaler = StandardScaler()

In [43]:
# Min-max scaler with default settings; offered as an alternative scaling step
# in the hyper-parameter search.
mm_scaler = MinMaxScaler()

In [17]:
# k-nearest-neighbours classifier with default settings; n_neighbors and p are tuned later.
knn_model = KNeighborsClassifier()

In [18]:
# End-to-end model: preprocessing -> scaling -> k-NN classifier.
# make_pipeline names each step after its class ('columntransformer',
# 'standardscaler', 'kneighborsclassifier'); the search grid keys rely on
# these names.
model_steps = (col_transform, std_scaler, knn_model)
pipe = make_pipeline(*model_steps)

In [19]:
from sklearn.model_selection import KFold

In [20]:
# 10-fold splitter with shuffling; the fixed random_state makes the folds
# identical every time the splitter is reused.
K_fold = KFold(n_splits=10,shuffle=True,random_state=985)

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
# Per-fold scores of the untuned pipeline on the shared 10-fold split.
cross_val_score(estimator=pipe, X=X, y=y, cv=K_fold)

array([0.97142857, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.97058824, 1.        , 0.94117647])

In [23]:
# Mean cross-validated score of the untuned pipeline.
fold_scores = cross_val_score(pipe, X, y, cv=K_fold)
fold_scores.mean()

0.9883193277310924

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# List every tunable parameter of the pipeline; the double-underscore names
# shown here are the keys to use in a search grid.
pipe.get_params()

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639650>),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639610>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 'columntransformer__pipeline-1__m

In [58]:
# Search space for tuning the pipeline.
# The 'standardscaler' key replaces the whole scaling step, so the two
# scaler objects are compared against each other.
scaler_candidates = [std_scaler, mm_scaler]
imputer_strategies = ['mean', 'median']

params1 = {
    'columntransformer__pipeline-1__simpleimputer__strategy': imputer_strategies,
    'standardscaler': scaler_candidates,
    'kneighborsclassifier__n_neighbors': range(2, 30),
    'kneighborsclassifier__p': [1, 2, 3],
}

In [59]:
# Exhaustive search over params1, evaluated on the shared 10-fold split and
# run on all available cores.
grdcv1 = GridSearchCV(
    estimator=pipe,
    param_grid=params1,
    cv=K_fold,
    n_jobs=-1,
)

In [60]:
# Run the grid search: cross-validate every parameter combination on (X, y).
grdcv1.fit(X,y)

GridSearchCV(cv=KFold(n_splits=10, random_state=985, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639650>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('si...
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 

In [61]:
# Tabulate the grid-search results: one row per parameter combination.
cv_results = grdcv1.cv_results_
grd_df1 = pd.DataFrame(cv_results)
grd_df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__pipeline-1__simpleimputer__strategy,param_kneighborsclassifier__n_neighbors,param_kneighborsclassifier__p,param_standardscaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023251,0.007240,0.011629,0.003170,mean,2,1,StandardScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,0.971429,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.988403,0.014205,285
1,0.021577,0.006989,0.010323,0.001547,mean,2,1,MinMaxScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,0.971429,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.988403,0.014205,285
2,0.028443,0.009363,0.011916,0.002283,mean,2,2,StandardScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,0.971429,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.988403,0.014205,285
3,0.021881,0.003207,0.012403,0.004204,mean,2,2,MinMaxScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,0.971429,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.988403,0.014205,285
4,0.023697,0.010955,0.014145,0.002054,mean,2,3,StandardScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,0.971429,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.988403,0.014205,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.019508,0.001327,0.011064,0.000697,median,29,1,MinMaxScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,1.000000,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,1.000000,0.994202,0.011598,1
332,0.027887,0.009071,0.012397,0.003522,median,29,2,StandardScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,1.000000,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.991261,0.013352,145
333,0.019605,0.001017,0.010704,0.000243,median,29,2,MinMaxScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,1.000000,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,1.000000,0.994202,0.011598,1
334,0.020489,0.002287,0.015491,0.000472,median,29,3,StandardScaler(),{'columntransformer__pipeline-1__simpleimputer...,0.971429,1.000000,1.0,1.0,1.0,1.0,1.0,0.970588,1.0,0.970588,0.991261,0.013352,145


In [62]:
# Show the best-ranked parameter combinations with their mean CV score.
summary_columns = ['params', 'mean_test_score', 'rank_test_score']
grd_df1.loc[:, summary_columns].sort_values(by='rank_test_score').head(5)

Unnamed: 0,params,mean_test_score,rank_test_score
167,{'columntransformer__pipeline-1__simpleimputer...,0.994202,1
187,{'columntransformer__pipeline-1__simpleimputer...,0.994202,1
204,{'columntransformer__pipeline-1__simpleimputer...,0.994202,1
205,{'columntransformer__pipeline-1__simpleimputer...,0.994202,1
206,{'columntransformer__pipeline-1__simpleimputer...,0.994202,1


In [63]:
# Best mean cross-validated score found by the grid search.
grdcv1.best_score_

0.9942016806722689

In [64]:
# Parameter combination reported as best by the grid search. Several
# combinations tie at rank 1 in the results table, so this is one of them.
grdcv1.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'kneighborsclassifier__n_neighbors': 3,
 'kneighborsclassifier__p': 3,
 'standardscaler': StandardScaler()}

In [65]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
# Randomised search: evaluates 40 sampled combinations from the same search
# space instead of the full grid, on the same 10-fold split.
rmdcv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params1,
    n_iter=40,
    cv=K_fold,
    n_jobs=-1,
    random_state=895,
)

In [67]:
# Run the randomised search on (X, y).
rmdcv.fit(X,y)

RandomizedSearchCV(cv=KFold(n_splits=10, random_state=985, shuffle=True),
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639650>),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps...
                                             ('standardscaler',
                             

In [68]:
# Best mean cross-validated score found by the randomised search.
rmdcv.best_score_

0.9942016806722689

In [69]:
# Parameter combination reported as best by the randomised search.
rmdcv.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'kneighborsclassifier__n_neighbors': 24,
 'kneighborsclassifier__p': 2,
 'standardscaler': MinMaxScaler()}

In [72]:
# Keep a handle on the pipeline selected by the randomised search and display it.
best = rmdcv.best_estimator_
best

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aea0f2dd0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())]),
                                                  <sklearn.compose._column_transformer

In [73]:
# Fit the standalone preprocessing transformer on the full data and show the
# transformed array (imputed numeric columns followed by one-hot columns).
col_transform.fit_transform(X,y)

array([[ 39.1,  18.7, 181. , ...,   1. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   1. ,   1. ,   0. ],
       [ 40.3,  18. , 195. , ...,   1. ,   1. ,   0. ],
       ...,
       [ 50.4,  15.7, 222. , ...,   0. ,   0. ,   1. ],
       [ 45.2,  14.8, 212. , ...,   0. ,   1. ,   0. ],
       [ 49.9,  16.1, 213. , ...,   0. ,   0. ,   1. ]])

In [74]:
# Display the preprocessing transformer and its two branches.
col_transform

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639650>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f5aef639610>)])

In [78]:
# First fitted branch as a (name, transformer, columns) tuple.
col_transform.transformers_[0]

('pipeline-1',
 Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'])

In [79]:
# The fitted pipeline object of the first branch.
col_transform.transformers_[0][1]

Pipeline(steps=[('simpleimputer', SimpleImputer())])

In [86]:
# Apply the already-fitted numeric imputation pipeline to its own columns.
# `transform` is used rather than `fit_transform` so that inspecting the
# fitted ColumnTransformer does not refit (and overwrite) one of its
# components. The column list is read from the fitted branch instead of being
# typed out a second time, so it cannot drift from what was actually fitted.
num_name, num_pipeline, num_columns = col_transform.transformers_[0]
num_pipeline.transform(X[num_columns])

array([[  39.1,   18.7,  181. , 3750. ],
       [  39.5,   17.4,  186. , 3800. ],
       [  40.3,   18. ,  195. , 3250. ],
       ...,
       [  50.4,   15.7,  222. , 5750. ],
       [  45.2,   14.8,  212. , 5200. ],
       [  49.9,   16.1,  213. , 5400. ]])

In [89]:
# Fit the preprocessing transformer on the full data again, this time keeping
# the transformed array in X_trans, and display it.
X_trans = col_transform.fit_transform(X,y)
X_trans

array([[ 39.1,  18.7, 181. , ...,   1. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   1. ,   1. ,   0. ],
       [ 40.3,  18. , 195. , ...,   1. ,   1. ,   0. ],
       ...,
       [ 50.4,  15.7, 222. , ...,   0. ,   0. ,   1. ],
       [ 45.2,  14.8, 212. , ...,   0. ,   1. ,   0. ],
       [ 49.9,  16.1, 213. , ...,   0. ,   0. ,   1. ]])

In [93]:
# Second fitted branch as a (name, transformer, columns) tuple.
col_transform.transformers_[1]

('pipeline-2',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder', OneHotEncoder())]),
 ['island', 'sex'])

In [94]:
# The fitted pipeline object of the second branch.
col_transform.transformers_[1][1]

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder())])

In [95]:
# Step at index 1 of the second branch's pipeline: the one-hot encoder.
col_transform.transformers_[1][1][1]

OneHotEncoder()

In [96]:
# Names of the columns produced by the fitted one-hot encoder. No input
# feature names are passed, so the encoder reports generic x0_/x1_ prefixes.
cat_name, cat_pipeline, cat_columns = col_transform.transformers_[1]
fitted_encoder = cat_pipeline[1]
fitted_encoder.get_feature_names_out()

array(['x0_Biscoe', 'x0_Dream', 'x0_Torgersen', 'x1_Female', 'x1_Male'],
      dtype=object)