In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# LOADING THE DATA

In [4]:
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/SeoulBikeData.csv', encoding= 'unicode_escape')
display(data)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,30/11/2018,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
8756,30/11/2018,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8757,30/11/2018,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Autumn,No Holiday,Yes
8758,30/11/2018,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Autumn,No Holiday,Yes


The variable to be predicted in Rented Bike Count which is continuous. This problem is a supervised learning problem and the target variable is Rented Bike Count.

The relevant metrics are variance explained, root mean square error (RMSE), RMSE normalized by standard deviation of the target variable, mean absolute error. For this problem, I'll use root mean square error (RMSE) and mean absolute error (MAE)

The relevant variables which capture the information are Hour, Temperature, Humidity, Windspeed, Visibility, Dew point temperature, Solar radiation, Rainfall, Snowfall, Seasons, Holiday, Functional Day

All of the above variables are usable

In [5]:
# Dropping the date column
data = data.drop('Date', axis = 1)

# Dividing the dataframe into features and labels
X, Y = data.drop('Rented Bike Count', axis = 1), data['Rented Bike Count'] 

print(X.shape, Y.shape)

(8760, 12) (8760,)


In [6]:
print(X, Y)

      Hour  Temperature(°C)  Humidity(%)  ...  Seasons     Holiday  Functioning Day
0        0             -5.2           37  ...   Winter  No Holiday              Yes
1        1             -5.5           38  ...   Winter  No Holiday              Yes
2        2             -6.0           39  ...   Winter  No Holiday              Yes
3        3             -6.2           40  ...   Winter  No Holiday              Yes
4        4             -6.0           36  ...   Winter  No Holiday              Yes
...    ...              ...          ...  ...      ...         ...              ...
8755    19              4.2           34  ...   Autumn  No Holiday              Yes
8756    20              3.4           37  ...   Autumn  No Holiday              Yes
8757    21              2.6           39  ...   Autumn  No Holiday              Yes
8758    22              2.1           41  ...   Autumn  No Holiday              Yes
8759    23              1.9           43  ...   Autumn  No Holiday          

# TRANSFORING DATA

In [7]:
# Segregating the numerical and categorical data types
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

# Converting the categorical variables into one-hot dummy variables and scaling the numerical variables
transforms = [('cat', OneHotEncoder(), categorical_ix), ('num', StandardScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=transforms)
X = col_transform.fit_transform(X)

In [8]:
# Sanity check of shapes
print(X.shape)
print(Y.shape)

(8760, 17)
(8760,)


# PREPROCESSING AND TRAINING SCHEMES

For this regression task, we have a large number of samples and a fewer number of features. The ML frameworks I will be using for this are - SVM's, and Random Forest

I will not be explicitly using a validation set, and only divide the training set into a training and test set (in 85:15 ratio). I will be using 5-fold CV later with the algorithms to have a 70:15:15 split of the data

In [9]:
from sklearn.model_selection import train_test_split

# Splliting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, shuffle = True)

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(7446, 17)
(7446,)
(1314, 17)
(1314,)


# SVM-R

In [10]:
# Setting up a parameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ('linear', 'rbf'), 'epsilon': [0.01, 0.1, 1]}

# Carrying out a grid search with 5-fold cross validation
gsc = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=5)

# Fitting the training data
grid_result = gsc.fit(X_train, Y_train)

In [13]:
best_params = grid_result.best_params_

# Creating the best optimizer on the basis of performance on the CV set
best_svr = SVR(kernel=best_params['kernel'], C=best_params['C'], epsilon=best_params['epsilon'])

scoring = {'abs_error': 'neg_mean_absolute_error',
           'squared_error': 'neg_mean_squared_error'}

# Evaluating the RMSE and MAE on the test set
scores = cross_validate(best_svr, X_test, Y_test, scoring=scoring)
print("MAE :", abs(scores['test_abs_error'].mean()), "| RMSE :", math.sqrt(abs(scores['test_squared_error'].mean())))

MAE : 330.254820670193 | RMSE : 486.2653292302483


# RANDOM FOREST

In [14]:
# Setting up a parameter grid
param_grid = {'n_estimators': [10, 50, 100, 500, 1000]}

# Carrying out a grid search with 5-fold cross validation
gsc = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5)

# Fitting the training data
grid_result = gsc.fit(X_train, Y_train)

In [15]:
best_params = grid_result.best_params_

# Creating the best optimizer on the basis of performance on the CV set
best_rf = RandomForestRegressor(n_estimators=best_params['n_estimators'])

scoring = {'abs_error': 'neg_mean_absolute_error',
           'squared_error': 'neg_mean_squared_error'}

# Evaluating the RMSE and MAE on the test set
scores = cross_validate(best_rf, X_test, Y_test, scoring=scoring)
print("MAE :", abs(scores['test_abs_error'].mean()), "| RMSE :", math.sqrt(abs(scores['test_squared_error'].mean())))

MAE : 170.97432011145617 | RMSE : 265.88285852331774
