In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

# Dataset: Bike Sharing Daily
* Source: https://www.kaggle.com/contactprad/bike-share-daily-data
* Licence: [1]
  Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge",
  Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg,
  doi:10.1007/s13748-013-0040-3.


```bibtex
@article{
    year={2013},
    issn={2192-6352},
    journal={Progress in Artificial Intelligence},
    doi={10.1007/s13748-013-0040-3},
    title={Event labeling combining ensemble detectors and background knowledge},
    url={http://dx.doi.org/10.1007/s13748-013-0040-3},
    publisher={Springer Berlin Heidelberg},
    keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
    author={Fanaee-T, Hadi and Gama, Joao},
    pages={1-15}
}
```

In [2]:
# Load the daily bike-sharing records from the CSV file.
data = pd.read_csv('datasets/bike_sharing_daily.csv')

# Target is the 'cnt' column; every other column stays in the feature frame.
y = data['cnt']
X = data.drop(columns='cnt')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Create the pipeline

In [12]:
# FEATURE GROUPS
# Continuous columns: mean-imputed, then standardised.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
# Categorical columns encoded as a single ordinal column each.
categorical_ordinal_encode_features = [
    'season',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]
# Categorical columns expanded into one indicator column per category.
categorical_one_hot_encode_features = ['mnth']

# PER-GROUP TRANSFORMERS
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)
categorical_impute_ordenc = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('ord_encoder', OrdinalEncoder()),
    ]
)
categorical_impute_onehot = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot_encoder', OneHotEncoder()),
    ]
)

# PREPROCESSING STEPS
# Each entry is (name, transformer, columns); columns not listed are not passed on.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical_ordenc', categorical_impute_ordenc, categorical_ordinal_encode_features),
        ('categorical_onehot', categorical_impute_onehot, categorical_one_hot_encode_features),
    ]
)

# PIPELINE INCLUDING A MODEL
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical_ordenc',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('ord_encoder',
                                                                   OrdinalEncoder())]),
             

# Execute preprocessing steps

In [16]:
# Fit the preprocessing step on the training split and transform it.
# The step is looked up by name so the cell does not depend on its position
# inside the pipeline.
fitted_preprocessor = pipeline.named_steps['preprocessor']
pipeline_array = fitted_preprocessor.fit_transform(X_train)

# ColumnTransformer may return a SciPy sparse matrix when a sub-transformer
# emits sparse output (OneHotEncoder does by default); pd.DataFrame below
# needs a dense 2-D array, so densify in that case.
if hasattr(pipeline_array, 'toarray'):
    pipeline_array = pipeline_array.toarray()

# Input columns of every fitted transformer, keyed by transformer name, so the
# labels stay correct even if the transformers are reordered.
columns_by_transformer = {
    name: columns for name, _, columns in fitted_preprocessor.transformers_
}

# numeric feature names
numeric_feature_names = list(columns_by_transformer['numeric'])

# categorical ordinal encoder feature names
cat_ord_feature_names = list(columns_by_transformer['categorical_ordenc'])

# categorical one hot encoder feature names (one per category seen during fit)
fitted_onehot_encoder = fitted_preprocessor.named_transformers_['categorical_onehot']['onehot_encoder']
cat_onehot_feature_names = list(
    fitted_onehot_encoder.get_feature_names_out(categorical_one_hot_encode_features)
)

# Labels follow the order in which the transformers are declared in the ColumnTransformer.
pipeline_colnames = numeric_feature_names + cat_ord_feature_names + cat_onehot_feature_names
assert pipeline_array.shape[1] == len(pipeline_colnames), (
    "number of transformed columns does not match the number of column labels"
)

pipeline_df = pd.DataFrame(data=pipeline_array, columns=pipeline_colnames)
pipeline_df

Unnamed: 0,temp,atemp,hum,windspeed,season,holiday,weekday,workingday,weathersit,mnth_1,...,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12
0,0.900346,0.837047,0.684351,-0.942055,3.0,0.0,4.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.292066,-1.309675,-0.824082,-0.206565,3.0,0.0,4.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.945830,0.868041,1.538345,0.233291,2.0,0.0,5.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.136870,1.069425,-0.333991,0.585068,2.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.777535,0.635632,1.570624,-0.542439,3.0,0.0,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,-0.819013,-0.798267,1.773118,-0.742305,1.0,0.0,6.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
580,-0.891788,-0.902887,-0.871039,0.432939,3.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
581,-1.032797,-1.205031,-1.284830,2.887653,0.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
582,-0.668910,-0.589027,0.470116,0.017303,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
