In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Load the 'penguins' example dataset through seaborn and preview its first rows.
# NOTE(review): sns.load_dataset fetches from seaborn's online data repository
# unless a cached copy exists — confirm network access before Restart & Run All.
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
# (n_rows, n_columns) of the raw frame, before anything is dropped or imputed.
df.shape

(344, 7)

In [5]:
# Name of the column the regression will predict.
target = 'body_mass_g'
# Response vector: the target column on its own.
y = df.loc[:, target]
# Feature matrix: every column except the target.
X = df.drop(columns=[target])

In [6]:
# Count missing values per feature column.
X.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
sex                  11
dtype: int64

In [7]:
# Number of rows whose target value is missing.
y.isna().sum()

2

- To avoid the hassle of missing target values, let's forcibly impute the target with its mean value.
- This is only for learning purposes and should never be done in the real world.

In [8]:
# Replace missing targets with the target mean (teaching shortcut only).
# Reassign instead of using inplace=True: `y` was sliced out of `df`, so an
# in-place fill can write through to `df` or trigger copy/chained-assignment
# warnings, depending on the pandas version.
# NOTE: the mean is computed over all rows before the train/test split, so
# information from the future test rows leaks into the imputed values.
y = y.fillna(y.mean())

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=345,
)
# Show the shapes of the two feature partitions.
X_train.shape, X_test.shape

((258, 6), (86, 6))

In [11]:
# Missing values per feature in the training partition.
X_train.isna().sum()

species              0
island               0
bill_length_mm       1
bill_depth_mm        1
flipper_length_mm    1
sex                  7
dtype: int64

In [12]:
# Missing values per feature in the test partition.
X_test.isna().sum()

species              0
island               0
bill_length_mm       1
bill_depth_mm        1
flipper_length_mm    1
sex                  4
dtype: int64

# Selecting numerical and categorical columns

In [13]:
from sklearn.compose import make_column_selector

In [14]:
# Route columns by dtype. Numeric columns are selected explicitly (rather than
# "everything that is not object") so that bool, category or pandas string
# columns are never sent to the mean imputer. Every non-numeric column is
# treated as categorical so none slips into the 'passthrough' remainder
# un-encoded. On the penguins frame (float64 + object) the selection is the
# same as before.
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

# Impute missing values

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [16]:
# Imputer that fills missing entries with the column mean (used for the numeric columns).
imp_mean = SimpleImputer(strategy='mean')

In [17]:
# Imputer that fills missing entries with the column median.
imp_median = SimpleImputer(strategy='median')
# Backward-compatible alias for the original, misspelled name.
imp_mediean = imp_median

In [18]:
# Imputer that fills missing entries with the most frequent value of the column.
imp_mode = SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore': a category that appears only at test/predict time is
# encoded as all zeros instead of making transform() raise.
one_hot = OneHotEncoder(handle_unknown='ignore')

In [19]:
# Numeric branch: mean-impute only.
numeric_steps = make_pipeline(imp_mean)
# Categorical branch: fill with the mode, then one-hot encode.
categorical_steps = make_pipeline(imp_mode, one_hot)

# Apply each branch to the columns its selector picks; columns matched by
# neither selector are passed through untouched.
col_trans = make_column_transformer(
    (numeric_steps, num_cols),
    (categorical_steps, cat_cols),
    remainder='passthrough',
)

# Build the model

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [21]:
# Feature scaler with default settings; applied after the column transformer in the pipeline.
std_scaler = StandardScaler()

In [22]:
# Linear regression estimator with default settings; final step of the pipeline.
lr_model = LinearRegression()

In [23]:
# Full modelling pipeline: impute/encode -> scale -> linear regression.
# The scaler sees the whole output of col_trans, so the one-hot indicator
# columns are standardised together with the numeric ones.
# NOTE(review): StandardScaler's default centering needs dense input; this
# presumably works because col_trans returns a dense array here — confirm
# if more categorical columns are added.
pipe = make_pipeline(col_trans,std_scaler,lr_model)

In [24]:
# Fit every step on the training partition only; the cell displays the fitted pipeline.
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f40629f1c90>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder())]),
                                                  <sklearn.compose._column_transformer.make_column_sele

In [25]:
# Evaluate on the held-out partition; for a regressor, score() is the R^2
# coefficient of determination.
pipe.score(X_test,y_test)

0.8776148173594256

In [26]:
# Preview the training features; the index keeps the original (shuffled) row labels.
X_train.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,sex
160,Chinstrap,Dream,46.0,18.9,195.0,Female
263,Gentoo,Biscoe,49.6,15.0,216.0,Male
100,Adelie,Biscoe,35.0,17.9,192.0,Female
65,Adelie,Biscoe,41.6,18.0,192.0,Male
158,Chinstrap,Dream,46.1,18.2,178.0,Female


In [27]:
from sklearn import set_config

In [28]:
# Switch scikit-learn's estimator repr to the HTML diagram, then display the fitted pipeline.
set_config(display='diagram')
pipe