In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Insurance-charges dataset, fetched from the raw GitHub URL (requires network
# access; the URL follows the `main` branch rather than a pinned commit).
DATA_URL = 'https://raw.githubusercontent.com/Manju410/MLPractice/main/data/insurance.csv'
df = pd.read_csv(DATA_URL)

# Preview the first five rows.
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1338, 7)

In [4]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Count of missing entries per column (`isnull` is pandas' alias for `isna`).
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Separate the regression target from the feature matrix.
target = 'charges'
y = df[target]                 # Series holding the value to predict
X = df.drop(columns=[target])  # every remaining column is a feature

In [7]:
from sklearn.compose import make_column_selector

In [8]:
# Column selectors are callables that the ColumnTransformer evaluates against
# the frame it is fitted on, so no column names are hard-coded here.
#
# Numeric features are selected positively by `np.number` instead of
# "everything that is not object": with `dtype_exclude=object`, any bool,
# datetime or category column would be routed into the mean/median imputer.
num_col = make_column_selector(dtype_include=np.number)

# Categorical features: string-valued columns stored with object dtype.
cat_col = make_column_selector(dtype_include=object)

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [10]:
# Imputer for the numeric branch: NaN entries are replaced by the column mean.
# `missing_values=np.nan` is SimpleImputer's default, written out explicitly.
imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")

In [11]:
# Categorical branch, step 1: fill missing entries with the most frequent value.
imp_cat = SimpleImputer(strategy='most_frequent')

# Categorical branch, step 2: one-hot encode.
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted (e.g. absent from a training fold, or new at prediction
# time) as an all-zero row instead of raising a ValueError.
onehot = OneHotEncoder(handle_unknown='ignore')

In [12]:
# Preprocessing: numeric columns are imputed, categorical columns are imputed
# and then one-hot encoded; the two results are concatenated column-wise.
#
# The numeric imputer is wrapped in a one-step pipeline so that it is reachable
# for tuning under the auto-generated name `pipeline-1__simpleimputer`.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse matrix
# and, with the default threshold of 0.3, ColumnTransformer would return sparse
# output whenever the stacked data is mostly zeros -- which the MinMaxScaler
# placed after this transformer cannot accept.
col_transform = make_column_transformer(
    (make_pipeline(imp_mean), num_col),
    (make_pipeline(imp_cat, onehot), cat_col),
    remainder='passthrough',  # columns matched by neither selector are kept as-is
    sparse_threshold=0,
)

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Scaler: rescales each feature to the [0, 1] range (MinMaxScaler's default,
# written out explicitly).
mm_scaler = MinMaxScaler(feature_range=(0, 1))

# Estimator: ordinary least-squares linear regression with default settings.
linear_model = LinearRegression()

In [16]:
# End-to-end model: preprocess -> scale -> regress.
# make_pipeline names each step after its lower-cased class
# ('columntransformer', 'minmaxscaler', 'linearregression'); the grid-search
# parameter key used later relies on the first of those names.
pipe = make_pipeline(
    col_transform,
    mm_scaler,
    linear_model,
)

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
# Per-fold scores from 10-fold cross-validation, using the pipeline's default
# scorer.
cross_val_score(estimator=pipe, X=X, y=y, cv=10)

array([0.78599896, 0.73180103, 0.73208748, 0.67549652, 0.77174242,
       0.78592701, 0.79314562, 0.6681603 , 0.74281604, 0.76261003])

In [19]:
# Average of the ten fold scores (re-runs the 10-fold cross-validation).
cross_val_score(estimator=pipe, X=X, y=y, cv=10).mean()

0.744978541828283

In [20]:
from sklearn.model_selection import KFold

In [21]:
# Shuffled 10-fold splitter; the fixed seed makes the fold assignment
# repeatable between runs.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=456,
)

In [22]:
# Mean cross-validated score using the shuffled splitter instead of an integer
# fold count.
cross_val_score(estimator=pipe, X=X, y=y, cv=kfold).mean()

0.7393879589928123

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# List every tunable parameter of the pipeline, including those of nested
# steps (deep=True is the default); nested names use the `step__param` form
# needed for a grid-search parameter grid.
pipe.get_params(deep=True)

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline-1',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe0f58bd090>),
                                 ('pipeline-2',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder())]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe0f58bd050>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer())]),
 'columntransformer__pipeline-1__m

In [25]:
# Search space: imputation strategy of the numeric branch, addressed as
# <pipeline step>__<transformer name>__<inner step>__<parameter>.
# NOTE(review): the isna() check above reports no missing values, so both
# strategies are expected to score the same on this data -- confirm whether
# the search is meant only to demonstrate the parameter-path syntax.
params = {
    'columntransformer__pipeline-1__simpleimputer__strategy': [
        'mean',
        'median',
    ],
}

In [26]:
# Exhaustive search over `params`, scored with the shuffled 10-fold splitter;
# n_jobs=-1 runs the candidate fits on all available CPU cores.
grdcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=kfold,
    n_jobs=-1,
)

In [27]:
# Run the search: every candidate is cross-validated on (X, y).
grdcv.fit(X, y=y)

GridSearchCV(cv=KFold(n_splits=10, random_state=456, shuffle=True),
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fe0f58bd090>),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='most_frequent')),

In [28]:
# Parameter combination with the highest mean cross-validated score.
grdcv.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean'}

In [29]:
# Mean cross-validated score achieved by the best parameter combination.
grdcv.best_score_

0.7406498175799765