Most data science projects need some sort of data cleaning and preprocessing. These steps can easily be performed one at a time, or even manually. However, for reproducibility and deployment, we want them to be stable, standardized, and repeatable across multiple users and across offline and online environments. A good way to start is to use scikit-learn's `Pipeline`.

The purpose of this notebook is to show the syntax of a simple pipeline, not to train a perfect model.

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

# Dataset: Bike Sharing Daily
* Source: https://www.kaggle.com/contactprad/bike-share-daily-data
* Licence: [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.

@article{
year={2013},
issn={2192-6352},
journal={Progress in Artificial Intelligence},
doi={10.1007/s13748-013-0040-3},
title={Event labeling combining ensemble detectors and background knowledge},
url={http://dx.doi.org/10.1007/s13748-013-0040-3},
publisher={Springer Berlin Heidelberg},
keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
author={Fanaee-T, Hadi and Gama, Joao},
pages={1-15}
}

In [2]:
# Read the daily bike-sharing CSV into a DataFrame, then show each column's
# dtype as the cell output.
data = pd.read_csv(filepath_or_buffer='datasets/bike_sharing_daily.csv')
data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

# Splitting the data
This is important to highlight:
1. Preprocessing is fitted only on the train set, so no information from the test set leaks into it
2. The same fitted pipeline can then be applied to the test set

In [4]:
# Target is the 'cnt' column; every other column stays in the feature frame.
y = data['cnt']
X = data.drop(columns='cnt')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Create the pipeline

In [8]:
# NUMERIC FEATURES
# Float-valued columns: fill missing values with the column mean learned
# during fit, then standardize.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# CATEGORICAL FEATURES
# Integer-coded columns. With strategy='constant' and no explicit fill_value,
# SimpleImputer fills missing numeric entries with 0.
# NOTE: OrdinalEncoder uses handle_unknown='error' by default, so a category
# value that was not present during fit raises an error at transform time.
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder())])

# PREPROCESSING STEPS
# Each column group goes through its own transformer. Columns of X that are
# not listed in either group (e.g. 'instant', 'dteday', 'casual',
# 'registered') are dropped, because ColumnTransformer defaults to
# remainder='drop'.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features)])

# PIPELINE INCLUDING A MODEL
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the reported scores, are identical on every re-run.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=123))])

## Fit the pipeline to the train data

In [9]:
# Fit the preprocessing steps and the regressor on the training split only.
# Pipeline.fit returns the pipeline itself, so rf_model refers to the same
# (now fitted) object as `pipeline`.
pipeline.fit(X_train, y_train)
rf_model = pipeline
print(rf_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                        

## Predict on train and test data

In [10]:
# Run the fitted pipeline on both splits; the same preprocessing learned from
# the training data is applied before the regressor predicts.
train_prediction = rf_model.predict(X_train)
test_predictions = rf_model.predict(X_test)

# R^2 on each split: a large gap between the two indicates overfitting.
train_r2 = r2_score(y_train, train_prediction)
test_r2 = r2_score(y_test, test_predictions)
print(f'Train: {train_r2}')
print(f'Test: {test_r2}')

Train: 0.9450886675550338
Test: 0.5494215707004846
