<a href="https://colab.research.google.com/github/HCGrit/PipelineFoundation/blob/master/Pipeline_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [0]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LinearRegression

# Pipeline Experiment

In [0]:
# Toy dataset: two features and a target that depends on them non-linearly.
df = pd.DataFrame(
    data=[
        [1, 16, 9],
        [4, 36, 16],
        [1, 16, 9],
        [2, 9, 8],
        [3, 36, 15],
        [2, 49, 16],
        [4, 25, 14],
        [5, 36, 17],
    ],
    columns=['X1', 'X2', 'y'],
)

### y = X1 + 2 * sqrt(X2)

# the first six rows are used for training, the last two are held out for testing
train, test = df.iloc[:6], df.iloc[6:]

# split each part into the feature frame and the target series
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [29]:
# Baseline: fit a plain linear regression on the untransformed features and
# measure its error on the two held-out rows.
m1 = LinearRegression()
fit1 = m1.fit(train_X, train_y)
preds = fit1.predict(test_X)
rmse = np.sqrt(mean_squared_error(test_y, preds))
print(f"\n{preds}")
print(f"RMSE: {rmse}\n")


[13.72113586 16.93334467]
RMSE: 0.20274138822160603



In [30]:
# what if we square-root X2 and multiply by 2?
# The transformed features go into new frames rather than overwriting train_X / test_X:
# overwriting would make this cell non-idempotent (re-running it would apply the
# transformation a second time and silently change the result).
train_X_sqrt = train_X.assign(X2=2 * np.sqrt(train_X.X2))
test_X_sqrt = test_X.assign(X2=2 * np.sqrt(test_X.X2))
print(test_X_sqrt)
m2 = LinearRegression()
fit2 = m2.fit(train_X_sqrt, train_y)
preds = fit2.predict(test_X_sqrt)
print(f"\n{preds}")
print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds))}\n")

   X1    X2
6   4  10.0
7   5  12.0

[14. 17.]
RMSE: 1.2560739669470201e-15



In [0]:
# The transformed features follow an exact linear trend, hence the perfect prediction above.
# Rebuild the untouched train/test splits from df; next we redo the same transformation
# with a custom transformer inside a pipeline.
train, test = df.iloc[:6], df.iloc[6:]

train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [0]:
# references: 
# https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
# https://machinelearningmastery.com/how-to-transform-target-variables-for-regression-with-scikit-learn/
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/43308042/transformer-initialize-twice-in-pipeline

class ExperimentalTransformer(BaseEstimator, TransformerMixin):
  """Feature transformer that replaces column X2 with 2 * sqrt(X2).

  Every method announces itself on stdout so the order in which a
  pipeline invokes them can be observed.
  """

  def __init__(self):
    print('\n>>>>>>>init() called.\n')

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns the transformer itself."""
    print('\n>>>>>>>fit() called.\n')
    return self

  def transform(self, X, y=None):
    """Return a copy of X whose X2 column holds 2 * sqrt(X2)."""
    print('\n>>>>>>>transform() called.\n')
    transformed = X.copy()  # work on a copy so the caller's data stays unchanged
    transformed.X2 = np.sqrt(transformed.X2) * 2
    return transformed

In [33]:
# Sanity check: a pipeline whose only step is the regressor (no input transformation)
# should reproduce the plain LinearRegression result obtained earlier.
print("create pipeline 1")
pipe1 = Pipeline(steps=[('linear_model', LinearRegression())])
print("fit pipeline 1")
pipe1.fit(train_X, train_y)
print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)
rmse1 = np.sqrt(mean_squared_error(test_y, preds1))
print(f"\n{preds1}")  # expected: [13.72113586 16.93334467]
print(f"RMSE: {rmse1}\n")

create pipeline 1
fit pipeline 1
predict via pipeline 1

[13.72113586 16.93334467]
RMSE: 0.20274138822160603



In [34]:
# Same regressor, now preceded by the custom feature transformer.
print("create pipeline 2")
pipe2 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer()),  # building the step runs __init__
    ('linear_model', LinearRegression()),
])

# equivalent shorthand that does not require naming each step:
#pipe2 = make_pipeline(ExperimentalTransformer(), LinearRegression())

print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
rmse2 = np.sqrt(mean_squared_error(test_y, preds2))
print(f"\n{preds2}")  # expected: [14. 17.]
print(f"RMSE: {rmse2}\n")

create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


[14. 17.]
RMSE: 1.2560739669470201e-15



In [0]:
# we've assumed in the transform() function of our ExperimentalTransformer that the column name is X2. Let's not do so and
# pass the column name via the constructor, __init__()

In [0]:
class ExperimentalTransformer_2(BaseEstimator, TransformerMixin):
  """Feature transformer that replaces one configurable column with 2 * sqrt(column).

  Parameters
  ----------
  feature_name : column label to transform.
  additional_param : extra demo parameter; it is only echoed by fit().

  Both parameters are stored under attributes named exactly like the
  constructor arguments. Every method announces itself on stdout so the
  call order inside a pipeline can be observed.
  """

  def __init__(self, feature_name, additional_param="Himanshu"):
    print('\n>>>>>>>init() called.\n')
    self.feature_name = feature_name
    self.additional_param = additional_param

  def fit(self, X, y=None):
    """Nothing is learned from the data; prints the extra parameter and returns self."""
    print('\n>>>>>>>fit() called.\n')
    print(f'\nadditional param ~~~~~ {self.additional_param}\n')
    return self

  def transform(self, X, y=None):
    """Return a copy of X in which the configured column holds 2 * sqrt(column)."""
    print('\n>>>>>>>transform() called.\n')
    transformed = X.copy()  # work on a copy so the caller's data stays unchanged
    column = self.feature_name
    transformed[column] = np.sqrt(transformed[column]) * 2
    return transformed

In [0]:
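# take care to keep the parameter name exactly the same in the function argument as well as 
# the class' variable (feature_name). scikit-learn copies estimators with clone(), which reads
# the parameters through get_params() (an attribute lookup by constructor-argument name) and
# then calls __init__ again with those values. We will see that extra call to __init__ later,
# when we also transform the target feature (y). If the names differ, the clone cannot recover
# the parameter value and that step causes problems.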

In [38]:
# Same two-step pipeline, with the column to transform passed to the transformer's constructor.
print("create pipeline 2")
pipe2 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer_2(feature_name='X2')),
    ('linear_model', LinearRegression()),
])
print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
rmse2 = np.sqrt(mean_squared_error(test_y, preds2))
print(f"\n{preds2}")  # expected: [14. 17.]
print(f"RMSE: {rmse2}\n")

create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


[14. 17.]
RMSE: 1.2560739669470201e-15



In [0]:
# let's take this a step further by modifying the dataframe to have target as squares of current values:

In [0]:
# Same features as before, but the target column now holds the squares of the earlier targets.
df = pd.DataFrame(
    data=[
        [1, 16, 81],
        [4, 36, 256],
        [1, 16, 81],
        [2, 9, 64],
        [3, 36, 225],
        [2, 49, 256],
        [4, 25, 196],
        [5, 36, 289],
    ],
    columns=['X1', 'X2', 'y'],
)

### sqrt(y) = X1 + 2 * sqrt(X2)

# the first six rows are used for training, the last two are held out for testing
train, test = df.iloc[:6], df.iloc[6:]

# split each part into the feature frame and the target series
train_X, train_y = train.drop(columns='y'), train['y']
test_X, test_y = test.drop(columns='y'), test['y']

In [41]:
# Baseline on the new data: regressor only, with neither input nor target transformation.
print("create pipeline 1")
pipe1 = Pipeline(steps=[('linear_model', LinearRegression())])
print("fit pipeline 1")
pipe1.fit(train_X, train_y)
print("predict via pipeline 1")
preds1 = pipe1.predict(test_X)
rmse1 = np.sqrt(mean_squared_error(test_y, preds1))
print(f"\n{preds1}")
print(f"RMSE: {rmse1}\n")

create pipeline 1
fit pipeline 1
predict via pipeline 1

[200.34790002 279.04738423]
RMSE: 7.679804528409077



In [42]:
# Input transformation only: X2 is transformed, the target is still used as-is.
print("create pipeline 2")
pipe2 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer_2(feature_name='X2')),
    ('linear_model', LinearRegression()),
])
print("fit pipeline 2")
pipe2.fit(train_X, train_y)
print("predict via pipeline 2")
preds2 = pipe2.predict(test_X)
rmse2 = np.sqrt(mean_squared_error(test_y, preds2))
print(f"\n{preds2}")
print(f"RMSE: {rmse2}\n")

create pipeline 2

>>>>>>>init() called.

fit pipeline 2

>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 2

>>>>>>>transform() called.


[207.42690058 280.94152047]
RMSE: 9.887192456534303



In [0]:
# we'll now write a custom target transformer.
# this needs 2 functions, one to transform and another to inverse-transform

In [0]:
def target_transform(target):
  """Forward target transformation: element-wise square root.

  Accepts anything np.sqrt accepts (ndarray, pandas Series, list, tuple,
  scalar). The input is never modified; a new object is returned.
  Announces each call on stdout so the call order can be observed.
  """
  print('\n*****************target_transform() called.\n')
  # np.sqrt already allocates a fresh result, so no explicit copy of the input is needed
  return np.sqrt(target)

def inverse_target_transform(target):
  """Inverse target transformation: element-wise square.

  Accepts anything np.square accepts (ndarray, pandas Series, list, tuple,
  scalar). The input is never modified; a new object is returned.
  Announces each call on stdout so the call order can be observed.
  """
  print('\n*****************inverse_target_transform() called.\n')
  # np.square returns a fresh result and, unlike `** 2`, also works on plain Python sequences
  return np.square(target)

In [45]:
# Input transformation AND target transformation.
print("create pipeline 3")
# the feature pipeline is the same as before
pipe3 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer_2(feature_name='X2')),
    ('linear_model', LinearRegression()),
])

# wrap the pipeline in a TransformedTargetRegressor, handing it the forward
# and inverse target functions defined above
model = TransformedTargetRegressor(
    regressor=pipe3,
    func=target_transform,
    inverse_func=inverse_target_transform,
)

print("fit pipeline 3 [fit Model]")
# from here on we fit and predict through 'model' (the wrapper) rather than 'pipe3'
model.fit(train_X, train_y)
print("predict via pipeline 3 [Model]")
preds3 = model.predict(test_X)
rmse3 = np.sqrt(mean_squared_error(test_y, preds3))
print(f"\n{preds3}")  # expected: [196. 289.]
print(f"RMSE: {rmse3}\n")

create pipeline 3

>>>>>>>init() called.

fit pipeline 3 [fit Model]

*****************target_transform() called.


*****************inverse_target_transform() called.


*****************target_transform() called.


*****************inverse_target_transform() called.


*****************target_transform() called.


>>>>>>>init() called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 3 [Model]

>>>>>>>transform() called.


*****************inverse_target_transform() called.


[196. 289.]
RMSE: 4.0194366942304644e-14



In [0]:
# perfect predictions!

In [0]:
# we can even use in-built Transformers instead of user-defined functions. Example-
# model = TransformedTargetRegressor(regressor=pipe3, transformer=PowerTransformer())
# or
# model = TransformedTargetRegressor(regressor=pipe3, transformer=StandardScaler())
# using a built-in transformer does not require us to specify an inverse function, as the transformer's own inverse_transform() is used internally.

In [0]:
# in case you want to have a custom transformer inside TransformedTargetRegressor, you can do that too. The only additional 
# function you'll have to implement would be inverse_transform(). Here's an example:

In [0]:
class CustomTargetTransformer(BaseEstimator, TransformerMixin):
  """Target transformer: square root going forward, square on the way back.

  Intended to be passed as `transformer=` to TransformedTargetRegressor.
  transform() and inverse_transform() announce themselves on stdout so the
  call order can be observed.
  """
  # no need to implement __init__ in this particular case

  def fit(self, target, y=None):
    """Nothing is learned from the data; returns the transformer itself.

    `y` is accepted and ignored so that the usual scikit-learn calling
    conventions (fit(X, y), fit_transform(X, y)) do not raise a TypeError.
    """
    return self

  def transform(self, target):
    """Return the element-wise square root of target (input is not modified)."""
    print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
    # np.sqrt already returns a new object, so no explicit copy is required
    return np.sqrt(target)

  # need to implement this too
  def inverse_transform(self, target):
    """Return the element-wise square of target (input is not modified)."""
    print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
    # `**` builds a new object, leaving the caller's data untouched
    return target ** 2

In [50]:
# Input transformation AND target transformation, this time with a transformer object for the target.
print("create pipeline 3.1")
# the feature pipeline is the same as before
pipe3_1 = Pipeline(steps=[
    ('experimental_trans', ExperimentalTransformer_2(feature_name='X2')),
    ('linear_model', LinearRegression()),
])

# wrap the pipeline in a TransformedTargetRegressor.
# By default each fit checks that the forward and inverse transformations really are
# inverses of each other; check_inverse=False skips that check.
model = TransformedTargetRegressor(
    regressor=pipe3_1,
    transformer=CustomTargetTransformer(),
    check_inverse=False,  # avoid repeated calls
)

print("fit pipeline 3.1 [fit Model]")
model.fit(train_X, train_y)
print("predict via pipeline 3.1 [Model]")
preds3_1 = model.predict(test_X)
rmse3_1 = np.sqrt(mean_squared_error(test_y, preds3_1))
print(f"\n{preds3_1}")  # expected: [196. 289.]
print(f"RMSE: {rmse3_1}\n")

create pipeline 3.1

>>>>>>>init() called.

fit pipeline 3.1 [fit Model]

%%%%%%%%%%%%%%%custom_target_transform() called.


>>>>>>>init() called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 3.1 [Model]

>>>>>>>transform() called.


%%%%%%%%%%%%%%%custom_inverse_target_transform() called.


[196. 289.]
RMSE: 4.0194366942304644e-14



In [0]:
# let's now see how to get and set parameters of the model. We'll also cache the transformer to 
# avoid repeated computation and make it more efficient.

In [52]:
# list every parameter of the model, nested estimators included
# (nested names are joined with double underscores, e.g. regressor__linear_model__copy_X)
model.get_params(deep=True)

{'check_inverse': False,
 'func': None,
 'inverse_func': None,
 'regressor': Pipeline(memory=None,
          steps=[('experimental_trans',
                  ExperimentalTransformer_2(additional_param='Himanshu',
                                            feature_name='X2')),
                 ('linear_model',
                  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                   normalize=False))],
          verbose=False),
 'regressor__experimental_trans': ExperimentalTransformer_2(additional_param='Himanshu', feature_name='X2'),
 'regressor__experimental_trans__additional_param': 'Himanshu',
 'regressor__experimental_trans__feature_name': 'X2',
 'regressor__linear_model': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'regressor__linear_model__copy_X': True,
 'regressor__linear_model__fit_intercept': True,
 'regressor__linear_model__n_jobs': None,
 'regressor__linear_model__normalize': False,
 'regressor_

In [0]:
from tempfile import mkdtemp
from shutil import rmtree
# read about caching and side effect at: https://scikit-learn.org/stable/modules/compose.html?highlight=transformedtargetregressor#pipeline-chaining-estimators

In [54]:
# Temporary directory handed to the pipeline as its `memory` (transformer cache).
cachedir = mkdtemp()
try:
  print("create pipeline 4")
  pipe4 = Pipeline(steps=[
                          # incorrect column name passed
                         ('experimental_trans', ExperimentalTransformer_2('X1')),
                         ('linear_model', LinearRegression())
  ], memory=cachedir)
  # wrap the pipeline in a TransformedTargetRegressor
  model = TransformedTargetRegressor(regressor=pipe4,
                                     func=target_transform,
                                     inverse_func=inverse_target_transform,
                                     check_inverse=False)
  # correcting the column name using set_params()
  model.set_params(regressor__experimental_trans__feature_name = 'X2')

  print("fit pipeline 4 [fit Model]")
  model.fit(train_X, train_y)
  print("predict via pipeline 4 [Model]")
  preds4 = model.predict(test_X)
  print(f"\n{preds4}")  # should be [196. 289.]
  print(f"RMSE: {np.sqrt(mean_squared_error(test_y, preds4))}\n")
finally:
  # Clear the cache directory when you don't need it anymore.
  # Doing it in `finally` guarantees the directory is removed even if
  # building, fitting or predicting raised an exception.
  rmtree(cachedir)

create pipeline 4

>>>>>>>init() called.

fit pipeline 4 [fit Model]

*****************target_transform() called.


>>>>>>>init() called.


>>>>>>>init() called.


>>>>>>>fit() called.


additional param ~~~~~ Himanshu


>>>>>>>transform() called.

predict via pipeline 4 [Model]

>>>>>>>transform() called.


*****************inverse_target_transform() called.


[196. 289.]
RMSE: 4.0194366942304644e-14



In [0]:
# NEXT STEPS:

# 1. FeatureUnion and ColumnTransformer
# Some great examples:
# https://scikit-learn.org/stable/modules/compose.html#featureunion-composite-feature-spaces  
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer


# 2. Using GridSearch with Pipelines
# Example: https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html?highlight=pipeline
