In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_regression

from sklearn.metrics import r2_score

# Dataset: Bike Sharing Daily
* Source: https://www.kaggle.com/contactprad/bike-share-daily-data
* Licence: [1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.


```bibtex
@article{
    year={2013},
    issn={2192-6352},
    journal={Progress in Artificial Intelligence},
    doi={10.1007/s13748-013-0040-3},
    title={Event labeling combining ensemble detectors and background knowledge},
    url={http://dx.doi.org/10.1007/s13748-013-0040-3},
    publisher={Springer Berlin Heidelberg},
    keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
    author={Fanaee-T, Hadi and Gama, Joao},
    pages={1-15}
}
```

In [3]:
# Read the daily bike-sharing table; the path is relative to the working directory.
data = pd.read_csv('datasets/bike_sharing_daily.csv')

# 'cnt' is the regression target; every other column stays in the feature frame.
y = data['cnt']
X = data.drop(columns='cnt')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Create the pipeline

In [31]:
# NUMERIC FEATURES: impute missing values with the column mean, then standardise.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# CATEGORICAL FEATURES: impute with a constant (fill_value left at its default),
# then map each category to an ordinal code.
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder()),
])

# PREPROCESSING STEPS: route each column list through its own sub-pipeline.
# Only the columns named above are passed to a transformer.
preprocessor = ColumnTransformer([
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

# SELECTING FEATURES: score features with f_regression, using percentile=50.
fs = SelectPercentile(score_func=f_regression, percentile=50)

# PIPELINE INCLUDING A MODEL: preprocessing -> feature selection -> linear regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', fs),
    ('regressor', LinearRegression()),
])
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                        

# Execute preprocessing steps

In [32]:
# Fit and apply only the first pipeline step on the training features.
pipeline_array = pipeline[0].fit_transform(X_train)

# Every fitted `transformers_` entry ends with the column list it was given.
# Concatenating the lists of the first two entries gives the labels for the
# transformed array.
pipeline_colnames = [
    column
    for *_, columns in pipeline.named_steps.preprocessor.transformers_[:2]
    for column in columns
]

pipeline_df = pd.DataFrame(pipeline_array, columns=pipeline_colnames)
pipeline_df

Unnamed: 0,temp,atemp,hum,windspeed,season,mnth,holiday,weekday,workingday,weathersit
0,0.900346,0.837047,0.684351,-0.942055,3.0,9.0,0.0,4.0,1.0,1.0
1,-1.292066,-1.309675,-0.824082,-0.206565,3.0,11.0,0.0,4.0,1.0,0.0
2,0.945830,0.868041,1.538345,0.233291,2.0,6.0,0.0,5.0,1.0,1.0
3,1.136870,1.069425,-0.333991,0.585068,2.0,7.0,0.0,2.0,1.0,0.0
4,0.777535,0.635632,1.570624,-0.542439,3.0,8.0,0.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
579,-0.819013,-0.798267,1.773118,-0.742305,1.0,3.0,0.0,6.0,0.0,1.0
580,-0.891788,-0.902887,-0.871039,0.432939,3.0,10.0,0.0,6.0,0.0,0.0
581,-1.032797,-1.205031,-1.284830,2.887653,0.0,0.0,0.0,3.0,1.0,0.0
582,-0.668910,-0.589027,0.470116,0.017303,0.0,0.0,0.0,0.0,0.0,0.0


# Execute preprocessing steps + feature selection

In [46]:
# Fit and apply the first two pipeline steps; y_train is passed through because
# the sub-pipeline is fitted with a target.
pipeline_array = pipeline[0:2].fit_transform(X_train, y_train)

# 1. Gather the column names handed to the first two preprocessor entries, then
# 2. keep only those flagged by the feature selector's get_support() boolean mask.
pipeline_colnames = np.array([
    column
    for *_, columns in pipeline.named_steps.preprocessor.transformers_[:2]
    for column in columns
])[pipeline.named_steps.feature_selector.get_support()]

pipeline_df = pd.DataFrame(pipeline_array, columns=pipeline_colnames)
pipeline_df

Unnamed: 0,temp,atemp,season,mnth,weathersit
0,0.900346,0.837047,3.0,9.0,1.0
1,-1.292066,-1.309675,3.0,11.0,0.0
2,0.945830,0.868041,2.0,6.0,1.0
3,1.136870,1.069425,2.0,7.0,0.0
4,0.777535,0.635632,3.0,8.0,1.0
...,...,...,...,...,...
579,-0.819013,-0.798267,1.0,3.0,1.0
580,-0.891788,-0.902887,3.0,10.0,0.0
581,-1.032797,-1.205031,0.0,0.0,0.0
582,-0.668910,-0.589027,0.0,0.0,0.0
