In [1]:
import pandas as pd
import seaborn as sns

In [2]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
df.shape

(344, 7)

In [4]:
target = 'body_mass_g'
X= df.drop(target,axis=1)
y= df.loc[:,target]

In [5]:
X.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
sex                  11
dtype: int64

In [6]:
y.isna().sum()

2

- To avoid hassles of missing target values lets forcefully impute target with its mean value 
- This is only for learning purpose and should never be done in new world

In [7]:
y.fillna(y.mean(),inplace=True)

# Selecting numerical and categorical columns

In [8]:
from sklearn.compose import make_column_selector

In [9]:
num_cols = make_column_selector(dtype_exclude=object)
cat_cols = make_column_selector(dtype_include=object)

# Impute missing values

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [11]:
imp_mean = SimpleImputer(strategy='mean')

In [12]:
imp_mediean = SimpleImputer(strategy='median')

In [13]:
imp_mode = SimpleImputer(strategy='most_frequent')
one_hot = OneHotEncoder()

In [14]:
col_trans = make_column_transformer(
    (make_pipeline(imp_mean), num_cols),
    (make_pipeline(imp_mode,one_hot), cat_cols),
    remainder='passthrough'
)

# Build the model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [16]:
std_scaler = StandardScaler()

In [17]:
lr_model = LinearRegression()

In [18]:
pipe = make_pipeline(col_trans,std_scaler,lr_model)

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
cross_val_score(pipe,X,y,cv=10)

array([ 0.51077642,  0.62570803,  0.50280351,  0.44776977, -0.24278342,
        0.59714462,  0.83938864,  0.54953302,  0.76757603,  0.61147749])

In [21]:
cross_val_score(pipe,X,y,cv=10).mean()

0.5209394121142845

In [22]:
from sklearn.model_selection import KFold

In [23]:
kf_10 = KFold(n_splits=10,shuffle=True,random_state=789)

In [24]:
cross_val_score(pipe,X,y,cv=kf_10)

array([0.91100749, 0.79217984, 0.79087381, 0.87753466, 0.86886507,
       0.81848668, 0.83963883, 0.89092751, 0.89573672, 0.88136538])

In [25]:
cross_val_score(pipe,X,y,cv=kf_10).mean()

0.8566616003510619

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
pipe.get_params()

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9daf98d0>),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9daf9890>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 'columntransformer__pipeline-1__m

In [28]:
params={'columntransformer__pipeline-1__simpleimputer__strategy': ['mean','median']}

In [29]:
grdcv = GridSearchCV(pipe,param_grid=params,cv=kf_10)

In [30]:
grdcv.fit(X,y)

GridSearchCV(cv=KFold(n_splits=10, random_state=789, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9daf98d0>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),

In [31]:
grdcv.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean'}

In [32]:
grd_df= pd.DataFrame(grdcv.cv_results_)
grd_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_columntransformer__pipeline-1__simpleimputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014591,0.000839,0.006014,0.000509,mean,{'columntransformer__pipeline-1__simpleimputer...,0.911007,0.79218,0.790874,0.877535,0.868865,0.818487,0.839639,0.890928,0.895737,0.881365,0.856662,0.041331,1
1,0.016286,0.003606,0.005988,0.000186,median,{'columntransformer__pipeline-1__simpleimputer...,0.911702,0.792145,0.791088,0.876848,0.870163,0.815031,0.840014,0.89116,0.895766,0.881579,0.85655,0.041751,2


In [33]:
grd_df[['params','mean_test_score','rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,{'columntransformer__pipeline-1__simpleimputer...,0.856662,1
1,{'columntransformer__pipeline-1__simpleimputer...,0.85655,2


In [34]:
grdcv.best_score_

0.8566616003510619

In [35]:
col_trans

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9daf98d0>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9daf9890>)])

In [36]:
col_trans.transformers[1]

('pipeline-2',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder', OneHotEncoder())]),
 <sklearn.compose._column_transformer.make_column_selector at 0x7f0e9daf9890>)

In [37]:
col_trans.transformers[1][1]

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder())])

In [38]:
col_trans.transformers[1][1][1]

OneHotEncoder()

In [39]:
grdcv.best_estimator_['columntransformer']

ColumnTransformer(remainder='passthrough',
                  transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9b360b50>),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder())]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0e9b3606d0>)])

In [41]:
col_trans.fit_transform(X,y)

array([[ 39.1,  18.7, 181. , ...,   1. ,   0. ,   1. ],
       [ 39.5,  17.4, 186. , ...,   1. ,   1. ,   0. ],
       [ 40.3,  18. , 195. , ...,   1. ,   1. ,   0. ],
       ...,
       [ 50.4,  15.7, 222. , ...,   0. ,   0. ,   1. ],
       [ 45.2,  14.8, 212. , ...,   0. ,   1. ,   0. ],
       [ 49.9,  16.1, 213. , ...,   0. ,   0. ,   1. ]])

In [45]:
col_trans.fit_transform(X,y).shape

(344, 11)

In [43]:
col_trans.transformers_[1][1][1].get_feature_names_out()

array(['x0_Adelie', 'x0_Chinstrap', 'x0_Gentoo', 'x1_Biscoe', 'x1_Dream',
       'x1_Torgersen', 'x2_Female', 'x2_Male'], dtype=object)

In [47]:
grdcv.best_estimator_['linearregression']

LinearRegression()

In [48]:
grdcv.best_estimator_['linearregression'].coef_

array([ 1.43594467e+02,  1.37121613e+02,  2.62210340e+02, -3.40186834e+14,
       -2.72806109e+14, -3.28902577e+14, -2.42269550e+14, -2.32707318e+14,
       -1.73612486e+14,  2.84052222e+11,  2.84052222e+11])