In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn import set_config; set_config(display='diagram')
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp
from shutil import rmtree
import pickle

## 2.1 Preprocessing Pipelines

We are going to predict the charges of a health insurance contract based on various features using the following dataset.

In [4]:
data = pd.read_csv("../data/workflow/data_workflow.csv")
data.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [5]:
data.shape

(1338, 6)

In [6]:
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1334.0,1338.0,1338.0,1338.0
mean,39.183658,30.684932,1.094918,13270.422265
std,14.038208,6.070713,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,26.25,26.33625,0.0,4740.28715
50%,39.0,30.495,1.0,9382.033
75%,51.0,34.6,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1334 non-null   float64
 1   bmi       1338 non-null   float64
 2   children  1338 non-null   int64  
 3   smoker    1338 non-null   bool   
 4   region    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: bool(1), float64(3), int64(1), object(1)
memory usage: 53.7+ KB


In [8]:
# Defining the features and the target
X = data.drop(columns=['charges'])
y = data['charges']

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1070, 5), (268, 5), (1070,), (268,))

### (a) Pipeline -> -> ->

In [9]:
# Build the pipeline with the different steps
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler())
])
pipeline.fit(X_train[['age']])
pipeline.transform(X_train[['age']])

array([[ 0.98762065],
       [-0.15610951],
       [-1.08539026],
       ...,
       [-1.51428907],
       [-1.01390713],
       [-0.65649145]], shape=(1070, 1))

In [10]:
# Show the different steps of the pipeline
pipeline

0,1,2
,steps,"[('imputer', ...), ('standard_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


### (b) Column Transformer

In [11]:
# Impute then scale numerical values
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore') # categoria desconhecida vira vetor de zeros (nenhuma coluna ativa)

# Parallelize "num_transformer" and "cat_transformer"
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age', 'bmi']),
    ('cat_transformer', cat_transformer, ['smoker', 'region'])
])

preprocessor

0,1,2
,transformers,"[('num_transformer', ...), ('cat_transformer', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [12]:
X_train_transformed = preprocessor.fit_transform(X_train)

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed).head(3))

Original training set


Unnamed: 0,age,bmi,children,smoker,region
721,53.0,36.6,3,False,southwest
7,37.0,27.74,3,False,northwest
1060,24.0,32.01,0,False,southeast


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.987621,0.972681,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.15611,-0.504544,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.08539,0.207392,1.0,0.0,0.0,0.0,1.0,0.0


In [13]:
 # Get your features' names
preprocessor.get_feature_names_out()

array(['num_transformer__age', 'num_transformer__bmi',
       'cat_transformer__smoker_False', 'cat_transformer__smoker_True',
       'cat_transformer__region_northeast',
       'cat_transformer__region_northwest',
       'cat_transformer__region_southeast',
       'cat_transformer__region_southwest'], dtype=object)

In [14]:
pd.DataFrame(
    X_train_transformed,
    columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__smoker_False,cat_transformer__smoker_True,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest
0,0.987621,0.972681,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.15611,-0.504544,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.08539,0.207392,1.0,0.0,0.0,0.0,1.0,0.0
3,-1.08539,-0.694616,1.0,0.0,1.0,0.0,0.0,0.0
4,0.344272,-1.486583,1.0,0.0,1.0,0.0,0.0,0.0


In [15]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age','bmi']),
    ('cat_transformer', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough'
)
preprocessor

0,1,2
,transformers,"[('num_transformer', ...), ('cat_transformer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [16]:
pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out()).head(3)

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest,cat_transformer__smoker_False,cat_transformer__smoker_True,remainder__children
0,0.987621,0.972681,0.0,0.0,0.0,1.0,1.0,0.0,3.0
1,-0.15611,-0.504544,0.0,1.0,0.0,0.0,1.0,0.0,3.0
2,-1.08539,0.207392,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### (c) Custom: Function Transformer

In [17]:
# Create a transformer that compresses data to 2 digits (for instance!)
# rounder = FunctionTransformer(np.round)

# We can use a lambda function for more customizable functions
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [18]:
# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)])

# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['bmi', 'age']),
    ('cat_transformer', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough')
preprocessor

0,1,2
,transformers,"[('num_transformer', ...), ('cat_transformer', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<function <la...002859656DB20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [19]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.97,0.99,0.0,0.0,0.0,1.0,0.0,3.0
1,-0.5,-0.16,0.0,1.0,0.0,0.0,0.0,3.0
2,0.21,-1.09,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
# Transformers under the hood
class MyCustomTranformer(TransformerMixin, BaseEstimator):
    # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
    # TransformerMixin creates the fit_transform() method from fit() and transform()

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Here you store what needs to be stored/learned during .fit(X_train) as instance attributes
        # Return "self" to allow chaining .fit().transform()
        pass
    
    def transform(self, X, y=None):
        # Return the result as a DataFrame for an integration into the ColumnTransformer
        pass

my_transformer = MyCustomTranformer()
my_transformer.fit(X_train)
my_transformer.transform(X_train)
my_transformer.transform(X_test)


### (d) FeatureUnion

In [21]:
# Example: let's build and add a new feature called bmi_age_ratio
X_train.head(3)

Unnamed: 0,age,bmi,children,smoker,region
721,53.0,36.6,3,False,southwest
7,37.0,27.74,3,False,northwest
1060,24.0,32.01,0,False,southeast


In [22]:
# Create a custom transformer that multiplies/divides two columns
# Notice that we are creating this new feature completely randomly just as an example
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

union = FeatureUnion([
    ('preprocess', preprocessor),
    ('bmi_age_ratio', bmi_age_ratio_constructor)
])

union

0,1,2
,transformer_list,"[('preprocess', ...), ('bmi_age_ratio', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<function <la...002859656DB20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function <la...002859656E8E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,


In [23]:
pd.DataFrame(union.fit_transform(X_train)).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.97,0.99,0.0,0.0,0.0,1.0,0.0,3.0,0.690566


In [24]:
# make_column_selector selects features automatically based on dtype

num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object', 'bool'])

X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [25]:
# Complete preprocessing pipeline

num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough'
)

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

0,1,2
,transformer_list,"[('columntransformer', ...), ('functiontransformer', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function <la...002859656E8E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,


## 2.2 Including models in Pipelines

### (a) Full pipeline

In [26]:
# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough'
)

# Add estimator
pipeline = make_pipeline(preproc, Ridge())
pipeline

0,1,2
,steps,"[('columntransformer', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline', ...), ('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [27]:
# Train Pipeline
pipeline.fit(X_train, y_train)

# Make predictions
pipeline.predict(X_test.iloc[0:1])

# Score model
pipeline.score(X_test, y_test)

0.6755966859575218

### (b) Cross-validate a Pipeline

In [28]:
# Cross-validate Pipeline
cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2').mean()

np.float64(0.7591966908615821)

### (c) Grid Search a Pipeline

In [29]:
# Which parameters of the pipeline are GridSearch-able?
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000002859645FA80>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000002859654E9F0>)])),
  ('ridge', Ridge())],
 'transform_input': None,
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer'

In [30]:
grid_search = GridSearchCV(
    pipeline,
    param_grid={
        # Access any component of the Pipeline
        # and any available hyperparamater you want to optimize
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]
    },
    cv=5,
    scoring="r2"
)

grid_search.fit(X_train, y_train)
grid_search.best_params_


{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [31]:
# Let's save the pipelined model with the best hyperparameters
pipeline_tuned = grid_search.best_estimator_
pipeline_tuned

0,1,2
,steps,"[('columntransformer', ...), ('ridge', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline', ...), ('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [32]:
# We can use this "best" model for predictions without re-training it
pipeline_tuned.predict(X_test[0:1])

array([9761.99126148])

### (d) Caching to avoid repeated computations

In [33]:
# Create a temp folder
cachedir = mkdtemp()

# Instantiate the Pipeline with the cache parameter
# pipeline = Pipeline(steps, memory=cachedir)

# Clear the cache directory after the cross-validation
rmtree(cachedir)

### (e) Debug your pipe

In [34]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [35]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

Before preprocessing, X_train.shape = 
(1070, 5)
After preprocessing, X_train_preprocessed.shape = 


(1070, 9)

### (f) Exporting models/Pipelines

In [36]:
# You can export your final model/pipeline as a pickle file

# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipeline_tuned, file)

# Load Pipeline from pickle file
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

my_pipeline.score(X_test, y_test)

0.6755935812788032

# TPOT

In [37]:
import os
from tpot import TPOTRegressor

X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

  import pkg_resources
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Instantiate TPOTClassifier
tpot = TPOTRegressor(generations=4, population_size=20, verbose=2, n_jobs=-1, cv=2)

# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# Print score
print(tpot.score(X_test_preproc, y_test))

In [None]:
# Export TPOT Pipeline to a Python file
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'