In [1]:
import pandas as pd
import seaborn as sns

In [2]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
df.shape

(344, 7)

In [4]:
target = 'body_mass_g'
X=df.drop(target,axis=1)
y=df.loc[:,target]

In [5]:
y.isna().sum()

2

In [6]:
y.fillna(y.median(),inplace=True)

#Select Numerical and categorical columns

In [7]:
from sklearn.compose import make_column_selector

In [8]:
num_col = make_column_selector(dtype_exclude=object)
cat_col = make_column_selector(dtype_include=object)

#Impute Missing values

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [10]:
imp_mean = SimpleImputer(strategy='mean')

In [11]:
imp_cat = SimpleImputer(strategy='most_frequent')
onehot = OneHotEncoder()

In [12]:
col_transform = make_column_transformer(
    (make_pipeline(imp_mean),num_col),
    (make_pipeline(imp_cat,onehot),cat_col),
    remainder='passthrough'
)

# Build Model

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [14]:
std_scaler = StandardScaler()

In [15]:
knn_regressor = KNeighborsRegressor()

In [16]:
pipe = make_pipeline(col_transform,std_scaler,knn_regressor)

In [17]:
KNeighborsRegressor?

In [18]:
from sklearn.model_selection import KFold

In [19]:
K_fold = KFold(n_splits=10,shuffle=True,random_state=985)

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
cross_val_score(pipe,X,y,cv=K_fold)

array([0.79425101, 0.86941835, 0.85862446, 0.81320532, 0.84658953,
       0.85935887, 0.76061421, 0.87097963, 0.8513418 , 0.84485039])

In [22]:
cross_val_score(pipe,X,y,cv=K_fold).mean()

0.8369233571894721

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
pipe.get_params()

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc096373e90>),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fc096373e50>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 'columntransformer__pipeline-1__m

In [25]:
params = {'columntransformer__pipeline-1__simpleimputer__strategy':  ['mean','median']}

In [26]:
grdcv = GridSearchCV(pipe,param_grid=params,cv=K_fold)

In [27]:
grdcv.fit(X,y)

GridSearchCV(cv=KFold(n_splits=10, random_state=985, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fc096373e90>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),

In [28]:
grd_df= pd.DataFrame(grdcv.cv_results_)
grd_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__pipeline-1__simpleimputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021391,0.00391,0.009867,0.001689,mean,{'columntransformer__pipeline-1__simpleimputer...,0.794251,0.869418,0.858624,0.813205,0.84659,0.859359,0.760614,0.87098,0.851342,0.84485,0.836923,0.034294,1
1,0.019652,0.002885,0.008957,0.001348,median,{'columntransformer__pipeline-1__simpleimputer...,0.781926,0.869418,0.858624,0.813205,0.84659,0.859359,0.760614,0.87098,0.851342,0.844115,0.835617,0.035967,2


In [29]:
grd_df[['params','mean_test_score','rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,{'columntransformer__pipeline-1__simpleimputer...,0.836923,1
1,{'columntransformer__pipeline-1__simpleimputer...,0.835617,2


In [30]:
grdcv.best_score_

0.8369233571894721

In [31]:
params1 = {'columntransformer__pipeline-1__simpleimputer__strategy':  ['mean','median'],
          'kneighborsregressor__n_neighbors': range(2,30),
          'kneighborsregressor__p': [1,2,3]
          }

In [32]:
grdcv1 = GridSearchCV(pipe,param_grid=params1,cv=K_fold)

In [33]:
grdcv1.fit(X,y)

GridSearchCV(cv=KFold(n_splits=10, random_state=985, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fc096373e90>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('si...
                                                                                          OneHotEncoder())]),
                               

In [34]:
grd_df1= pd.DataFrame(grdcv1.cv_results_)
grd_df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__pipeline-1__simpleimputer__strategy,param_kneighborsregressor__n_neighbors,param_kneighborsregressor__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020956,0.005433,0.010173,0.004172,mean,2,1,{'columntransformer__pipeline-1__simpleimputer...,0.766700,0.861593,0.815564,0.746139,0.819126,0.850708,0.710268,0.851121,0.808120,0.810048,0.803939,0.046540,163
1,0.017277,0.002279,0.007170,0.000355,mean,2,2,{'columntransformer__pipeline-1__simpleimputer...,0.744203,0.837447,0.820412,0.732112,0.810546,0.873273,0.707433,0.837634,0.852574,0.782047,0.799768,0.053002,166
2,0.015445,0.000992,0.008809,0.000892,mean,2,3,{'columntransformer__pipeline-1__simpleimputer...,0.741894,0.825797,0.817971,0.713962,0.829528,0.870197,0.716338,0.838141,0.840599,0.804666,0.799909,0.052622,165
3,0.015964,0.001131,0.007632,0.001531,mean,3,1,{'columntransformer__pipeline-1__simpleimputer...,0.795079,0.870902,0.871941,0.789904,0.838697,0.862459,0.722291,0.886110,0.816152,0.818293,0.827183,0.047315,151
4,0.016608,0.001682,0.007403,0.000382,mean,3,2,{'columntransformer__pipeline-1__simpleimputer...,0.781182,0.844910,0.848769,0.788983,0.833537,0.874566,0.747071,0.852214,0.839802,0.825478,0.823651,0.037044,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,0.015725,0.001340,0.007381,0.000471,median,28,2,{'columntransformer__pipeline-1__simpleimputer...,0.799398,0.837408,0.882625,0.808623,0.824210,0.848951,0.798011,0.833134,0.851573,0.796560,0.828049,0.026812,144
164,0.015802,0.000409,0.010545,0.000313,median,28,3,{'columntransformer__pipeline-1__simpleimputer...,0.799955,0.829311,0.882065,0.796761,0.815244,0.842628,0.802679,0.836804,0.847453,0.790122,0.824302,0.027311,156
165,0.016930,0.002735,0.007395,0.000558,median,29,1,{'columntransformer__pipeline-1__simpleimputer...,0.798017,0.835968,0.882891,0.805087,0.842600,0.850119,0.796201,0.837017,0.853553,0.799931,0.830138,0.027732,134
166,0.016927,0.001187,0.007920,0.000579,median,29,2,{'columntransformer__pipeline-1__simpleimputer...,0.798963,0.834386,0.881305,0.806774,0.817865,0.849315,0.798139,0.832375,0.856328,0.799780,0.827523,0.026908,149


In [36]:
grd_df1[['params','mean_test_score','rank_test_score']].sort_values('rank_test_score').head()

Unnamed: 0,params,mean_test_score,rank_test_score
117,{'columntransformer__pipeline-1__simpleimputer...,0.843221,1
114,{'columntransformer__pipeline-1__simpleimputer...,0.843176,2
30,{'columntransformer__pipeline-1__simpleimputer...,0.843176,2
33,{'columntransformer__pipeline-1__simpleimputer...,0.842926,4
120,{'columntransformer__pipeline-1__simpleimputer...,0.842358,5


In [37]:
grdcv1.best_score_

0.8432213447515273

In [38]:
grdcv1.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'kneighborsregressor__n_neighbors': 13,
 'kneighborsregressor__p': 1}

In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [42]:
rmdcv = RandomizedSearchCV(pipe,params1,n_iter=40,random_state=895,n_jobs=-1,cv=K_fold)

In [43]:
rmdcv.fit(X,y)

RandomizedSearchCV(cv=KFold(n_splits=10, random_state=985, shuffle=True),
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7fc096373e90>),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps...
                                                                               <sklearn.compo

In [44]:
rmdcv.best_score_

0.8432213447515273

In [45]:
rmdcv.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'kneighborsregressor__n_neighbors': 13,
 'kneighborsregressor__p': 1}