In [1]:
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

# Dataset: Bike Sharing Daily
* Source: https://www.kaggle.com/contactprad/bike-share-daily-data
* Licence: [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.


@article{
    year={2013},
    issn={2192-6352},
    journal={Progress in Artificial Intelligence},
    doi={10.1007/s13748-013-0040-3},
    title={Event labeling combining ensemble detectors and background knowledge},
    url={http://dx.doi.org/10.1007/s13748-013-0040-3},
    publisher={Springer Berlin Heidelberg},
    keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
    author={Fanaee-T, Hadi and Gama, Joao},
    pages={1-15}
}

In [2]:
# Read the bike-sharing CSV; the `cnt` column is used as the prediction target.
data = pd.read_csv('datasets/bike_sharing_daily.csv')

# Separate the target from the predictors, then hold out 20% of the rows for testing.
y = data['cnt']
X = data.drop(columns='cnt')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,  # fixed seed so the split is the same on every run
)

In [3]:
# Display the loaded frame to inspect its columns and a sample of its rows.
data

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


# Create a custom transformer class to add more features to the pipeline output

Source of inspiration: http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

A transformer is just an object that responds to `fit`, `transform`, and `fit_transform`. This includes built-in transformers (like MinMaxScaler), Pipelines, FeatureUnions, and of course, plain old Python objects that implement those methods. Inheriting from TransformerMixin is not required, but helps to communicate intent, and gets you `fit_transform` for free.

A transformer can be thought of as a data in, data out black box. Generally, they accept a matrix as input and return a matrix of the same shape as output. That makes it easy to reorder and remix them at will. However, I often use Pandas DataFrames, and expect one as input to a transformer. For example, the ColumnExtractor is for extracting columns from a DataFrame.

Sometimes transformers are very simple, like HourOfDayTransformer, which just extracts the hour components out of a vector of datetime objects. Such transformers are “stateless”–they don’t need to be fitted, so fit is a no-op.

However, sometimes transformers do need to be fitted. Let’s take a look at my ModelTransformer. I use this one to wrap a scikit-learn model and make it behave like a transformer. I find these useful when I want to use something like a KMeans clustering model to generate features for another model. It needs to be fitted in order to train the model it wraps.


In [4]:
# Example. We will create a new feature that is the extraction of date features from the date column
class DateExtractionTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that adds a ``day`` column derived from ``dteday``.

    ``TransformerMixin`` supplies ``fit_transform``; ``BaseEstimator`` supplies
    ``get_params``/``set_params`` so the step can be cloned when the enclosing
    pipeline is used with cross-validation or hyper-parameter search.
    """

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` with the day of the month added as ``day``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``dteday`` column that ``pd.to_datetime`` can parse.
        **transform_params
            Accepted for signature compatibility; ignored.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is left untouched.
        """
        # `assign` builds a new frame, so the frame passed in (e.g. X_train)
        # is not modified and re-running the cell gives the same result.
        return X.assign(day=pd.to_datetime(X['dteday']).dt.day)

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

In [5]:
# Example. Create new features from the sine and cosine of the weekday column
# (the existing columns, including `weekday`, are kept).
class SineCosineWeekday(TransformerMixin, BaseEstimator):
    """Stateless transformer that adds ``sine_weekday`` and ``cosine_weekday``.

    ``TransformerMixin`` supplies ``fit_transform``; ``BaseEstimator`` supplies
    ``get_params``/``set_params`` so the step can be cloned when the enclosing
    pipeline is used with cross-validation or hyper-parameter search.
    """

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` with the two trigonometric columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a numeric ``weekday`` column.
        **transform_params
            Accepted for signature compatibility; ignored.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is left untouched.
        """
        # NOTE(review): `weekday` is fed to sin/cos as raw radians. A proper
        # cyclical encoding would scale it first (2 * pi * weekday / 7, assuming
        # values 0-6 -- confirm). Kept as-is so existing results do not change.
        weekday = X['weekday']
        # `assign` builds a new frame, so the frame passed in is not modified.
        return X.assign(
            sine_weekday=np.sin(weekday),
            cosine_weekday=np.cos(weekday),
        )

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

# Create the pipeline

In [6]:
# COLUMN GROUPS
# Each list names the columns handed to one branch of the ColumnTransformer below.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_ordinal_encode_features = ['season', 'holiday', 'workingday', 'weathersit']
categorical_one_hot_encode_features = ['mnth']
# Columns created by the feature-engineering step further down.
f_eng_features = ['sine_weekday', 'cosine_weekday', 'day']

# NUMERIC FEATURES
# Mean-impute, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Engineered features are only mean-imputed (no scaling).
simple_numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# CATEGORICAL FEATURES
# Constant-impute, then encode either as ordinal codes or as one-hot columns.
categorical_impute_ordenc = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('ord_encoder', OrdinalEncoder()),
])
categorical_impute_onehot = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot_encoder', OneHotEncoder()),
])

# FEATURE ENGINEERING
# Runs before the preprocessor so that the columns in `f_eng_features` exist
# by the time the ColumnTransformer selects them.
feature_engineering = Pipeline([
    ('date_features', DateExtractionTransformer()),
    ('sine_cosine_weekday', SineCosineWeekday()),
])

# PREPROCESSING STEPS
# One (name, transformer, columns) entry per column group; the order here is
# the order of the column blocks in the transformed output.
preprocessor = ColumnTransformer([
    ('f_eng', simple_numeric_transformer, f_eng_features),
    ('numeric', numeric_transformer, numeric_features),
    ('categorical_ordenc', categorical_impute_ordenc, categorical_ordinal_encode_features),
    ('categorical_onehot', categorical_impute_onehot, categorical_one_hot_encode_features),
])

# PIPELINE INCLUDING A MODEL
pipeline = Pipeline([
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
pipeline

Pipeline(steps=[('feature_engineering',
                 Pipeline(steps=[('date_features',
                                  <__main__.DateExtractionTransformer object at 0x1468bf040>),
                                 ('sine_cosine_weekday',
                                  <__main__.SineCosineWeekday object at 0x1468bf070>)])),
                ('preprocessor',
                 ColumnTransformer(transformers=[('f_eng',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['sine_weekday',
                                                   'cosine_weekday', 'day']),
                                                 ('...
                                                   'windspeed']),
                                                 ('categorical_ordenc',
                                                  Pipeline(steps=[('impu

# Execute preprocessing steps

In [7]:
# Fit and apply only the first two steps (feature engineering + preprocessor);
# the regressor is left out of this slice.
pipeline_array = pipeline[0:2].fit_transform(X_train)

# Fitted (name, transformer, columns) entries of the preprocessor, read back
# through `pipeline` after the slice above has been fitted.
fitted_transformers = pipeline.named_steps.preprocessor.transformers_

# For the first three branches the column list (last tuple element) is used
# directly as the output column names.
f_eng_names = fitted_transformers[0][-1]            # engineered feature names
numeric_feature_names = fitted_transformers[1][-1]  # numeric feature names
cat_ord_feature_names = fitted_transformers[2][-1]  # ordinal-encoded feature names

# The one-hot branch expands its input, so ask the fitted encoder for the
# generated column names.
onehot_encoder = fitted_transformers[3][1]['onehot_encoder']
cat_onehot_feature_names = list(
    onehot_encoder.get_feature_names_out(categorical_one_hot_encode_features)
)

# Concatenate the names in the same order as the ColumnTransformer branches.
pipeline_colnames = [
    *f_eng_names,
    *numeric_feature_names,
    *cat_ord_feature_names,
    *cat_onehot_feature_names,
]
pipeline_df = pd.DataFrame(data=pipeline_array, columns=pipeline_colnames)
pipeline_df

Unnamed: 0,sine_weekday,cosine_weekday,day,temp,atemp,hum,windspeed,season,holiday,workingday,...,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12
0,-0.756802,-0.653644,4.0,0.900346,0.837047,0.684351,-0.942055,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.756802,-0.653644,6.0,-1.292066,-1.309675,-0.824082,-0.206565,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.958924,0.283662,20.0,0.945830,0.868041,1.538345,0.233291,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.909297,-0.416147,16.0,1.136870,1.069425,-0.333991,0.585068,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.141120,-0.989992,28.0,0.777535,0.635632,1.570624,-0.542439,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,-0.279415,0.960170,9.0,-0.819013,-0.798267,1.773118,-0.742305,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
580,-0.279415,0.960170,19.0,-0.891788,-0.902887,-0.871039,0.432939,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
581,0.141120,-0.989992,18.0,-1.032797,-1.205031,-1.284830,2.887653,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
582,0.000000,1.000000,1.0,-0.668910,-0.589027,0.470116,0.017303,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# display estimators as diagrams
# (`set_config` is imported with the other dependencies in the first cell)
set_config(display='diagram')
pipeline