# **Bike Sharing Assignment**

Import Modules

In [1237]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy as sp
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson

Import Dataset and check for sample of data

In [1238]:
rides = pd.read_csv('day.csv')
rides.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


Check the shape, datatype info of the dataset

In [1239]:
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    int64  
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.4+ KB


In [1240]:
rides.shape

(730, 16)

There are no null values in the dataset, but we will confirm this explicitly.

In [1241]:
rides.isnull().sum().sum()

0

Checking the range of values

In [1242]:
rides.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,365.5,2.49863,0.5,6.526027,0.028767,2.99726,0.683562,1.394521,20.319259,23.726322,62.765175,12.76362,849.249315,3658.757534,4508.006849
std,210.877136,1.110184,0.500343,3.450215,0.167266,2.006161,0.465405,0.544807,7.506729,8.150308,14.237589,5.195841,686.479875,1559.758728,1936.011647
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.424346,3.95348,0.0,1.500244,2.0,20.0,22.0
25%,183.25,2.0,0.0,4.0,0.0,1.0,0.0,1.0,13.811885,16.889713,52.0,9.04165,316.25,2502.25,3169.75
50%,365.5,3.0,0.5,7.0,0.0,3.0,1.0,1.0,20.465826,24.368225,62.625,12.125325,717.0,3664.5,4548.5
75%,547.75,3.0,1.0,10.0,0.0,5.0,1.0,2.0,26.880615,30.445775,72.989575,15.625589,1096.5,4783.25,5966.0
max,730.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,35.328347,42.0448,97.25,34.000021,3410.0,6946.0,8714.0



**Cleaning the data**

Check if column is useful in the analysis, if not then drop the column

In [1243]:
rides.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')


*   Instant is a serial number so we can drop the column.
*   We will drop casual and registered as we already have count (cnt)
*   Dropping dteday as we have month and year




In [1244]:
# Remove the serial number, the raw date string, and the casual/registered
# counts, which the notebook treats as redundant with cnt.
unused_columns = ['instant', 'dteday', 'casual', 'registered']
rides.drop(columns=unused_columns, inplace=True)

In [1245]:
rides.columns

Index(['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

Renaming column names to proper names

In [1246]:
# Expand the abbreviated column names, then display the resulting header.
readable_names = {'yr': 'year', 'mnth': 'month', 'hum': 'humidity'}
rides.rename(columns=readable_names, inplace=True)
rides.columns

Index(['season', 'year', 'month', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'humidity', 'windspeed', 'cnt'],
      dtype='object')

In [1247]:
rides['weathersit'].value_counts()

1    463
2    246
3     21
Name: weathersit, dtype: int64

We will change the category values to sub category names

In [1248]:
# Replace the integer category codes with readable labels so that plots and
# the dummy variables created later carry meaningful names.
category_labels = {
    'season': {1:'spring', 2:'summer', 3:'fall', 4:'winter'},
    'month': {1: 'Jan', 2: 'Feb', 3:'Mar',4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8: 'Aug', 9:'Sep', 10: 'Oct',11: 'Nov', 12:'Dec'},
    'weathersit': {1: 'Clear', 2:'Mist + Cloudy', 3: 'Light Snow', 4: 'Heavy Rain'},
    'weekday': {0: 'Sun', 1: 'Mon', 2:'Tue', 3:'Wed', 4:'Thu', 5: 'Fri', 6:'Sat'},
}
for column_name, labels in category_labels.items():
    rides[column_name] = rides[column_name].map(labels)

In [1249]:
rides.head(10)

Unnamed: 0,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,cnt
0,spring,0,Jan,0,Sat,0,Mist + Cloudy,14.110847,18.18125,80.5833,10.749882,985
1,spring,0,Jan,0,Sun,0,Mist + Cloudy,14.902598,17.68695,69.6087,16.652113,801
2,spring,0,Jan,0,Mon,1,Clear,8.050924,9.47025,43.7273,16.636703,1349
3,spring,0,Jan,0,Tue,1,Clear,8.2,10.6061,59.0435,10.739832,1562
4,spring,0,Jan,0,Wed,1,Clear,9.305237,11.4635,43.6957,12.5223,1600
5,spring,0,Jan,0,Thu,1,Clear,8.378268,11.66045,51.8261,6.000868,1606
6,spring,0,Jan,0,Fri,1,Mist + Cloudy,8.057402,10.44195,49.8696,11.304642,1510
7,spring,0,Jan,0,Sat,0,Mist + Cloudy,6.765,8.1127,53.5833,17.875868,959
8,spring,0,Jan,0,Sun,0,Clear,5.671653,5.80875,43.4167,24.25065,822
9,spring,0,Jan,0,Mon,1,Clear,6.184153,7.5444,48.2917,14.958889,1321


We will next check for outliers using a box plot

In [1250]:
box_count = px.box(data_frame= rides, y='cnt')
box_count.update_layout(bargap=0.2, title="Distribution of bike sharing count")
box_count.show()

There are no visible outliers in the count.

We will next visualize the data and check which variables show correlation with the target variable.

In [1251]:
hist_year = px.box(data_frame= rides, x='year', y='cnt')
hist_year.update_layout(bargap=0.2, title="Distribution of count by year")
hist_year.show()

The number of rides have increased in 2019 (1) compared to 2018 (0), showing growth

In [1252]:
box_month = px.box(rides, x='month', y='cnt')
box_month.update_layout(title="Distribution of count by month")
box_month .show()

There is an increase in count for the months from August to October. Next, we will look at the same pattern by season.

In [1253]:
box_season = px.box(rides, x='season', y='cnt')
box_season.update_layout(title="Distribution of count by season")
box_season .show()

For the summer and fall seasons there is an increase in cnt.

In [1254]:
# Box plot of daily count per weather situation. Uses its own variable name so
# the season figure created in the previous cell is not overwritten.
box_weather = px.box(rides, x='weathersit', y='cnt')
box_weather.update_layout(title="Distribution of count by weather")
box_weather.show()

bike rental count is more in Clear weather days

In [1255]:
box_week = px.histogram(rides, x='weekday', y='cnt')
box_week.update_layout(title="Distribution of count by weekdays")
box_week .show()

On saturday, bike rentals are high


In [1256]:
box_temp = px.histogram(rides, x='temp', y='cnt')
box_temp.update_layout(bargap=0.2,title="Distribution of count by Temperature")
box_temp .show()

Between temperatures of 26 to 30 there are more number of rides

In [1257]:
hist_hum = px.histogram(rides, x='humidity', y='cnt')
hist_hum.update_layout(bargap=0.2,title="Distribution of count by Humidity")
hist_hum .show()

During humidity of 47 to 72,bikesharing count is more

In [1258]:
hist_wind = px.histogram(rides, x='windspeed', y='cnt')
hist_wind.update_layout(bargap=0.2,title="Distribution of count by Windspeed")
hist_wind .show()

During Windspeed of 7.5 to 12.5,bikesharing count is more

In [1259]:
hist_holiday = px.histogram(rides, x='holiday', y='cnt')
hist_holiday.update_layout(bargap=0.2,title="Distribution of count by Holiday")
hist_holiday.show()

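Total bike rentals are much lower on holidays than on non-holidays, which is expected because only 21 of the 730 days are holidays; this summed histogram does not show whether a typical holiday has more or fewer rentals than a typical non-holiday.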

In [1260]:
# Histogram of cnt aggregated per value of the workingday flag. The figure gets
# its own name and a title that matches the plotted column (the original reused
# the holiday figure's name and title).
hist_workingday = px.histogram(rides, x='workingday', y='cnt')
hist_workingday.update_layout(bargap=0.2,title="Distribution of count by Working day")
hist_workingday.show()

Bike rental is more in weekend and holiday

**Observations made from above visualizations:**

1.   The count has increased in 2019 compared to 2018.
2.   More bikes are rented in the months of August to October.
3.   More bikes are rented in summer, in fall, and on clear days.
4.   Bike rentals are higher at moderate temperatures (26 - 30).
5.   Bike rentals are higher on Thursday, Friday and Saturday.
6.   Total rentals on holidays are far lower than on non-holidays, because only 21 of the 730 days are holidays.




In [1261]:
# season, month, weekday and weathersit hold string labels at this point, so
# the correlation matrix is computed on the numeric columns only. Calling
# DataFrame.corr() on a frame with string columns raises in pandas >= 2.0
# (older versions silently dropped them), so select the numeric columns first.
numeric_rides = rides.select_dtypes(include='number')
fig_heatmap = px.imshow(numeric_rides.corr(), text_auto = True, aspect = "auto",height=500)
fig_heatmap.update_layout(title="Heatmap for corelation")
fig_heatmap.show()

The correlation between temp and atemp is very high, so we will drop one of them (atemp).

In [1262]:
rides.drop(['atemp'], axis=1, inplace=True)
rides.columns

Index(['season', 'year', 'month', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'humidity', 'windspeed', 'cnt'],
      dtype='object')

We will now convert all categorical variables into dummy variables

In [1263]:
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      730 non-null    object 
 1   year        730 non-null    int64  
 2   month       730 non-null    object 
 3   holiday     730 non-null    int64  
 4   weekday     730 non-null    object 
 5   workingday  730 non-null    int64  
 6   weathersit  730 non-null    object 
 7   temp        730 non-null    float64
 8   humidity    730 non-null    float64
 9   windspeed   730 non-null    float64
 10  cnt         730 non-null    int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 62.9+ KB


In [1264]:
# One-hot encode each categorical column, dropping the first level of each to
# avoid the dummy-variable trap. The dtype is pinned to uint8 (the dtype shown
# in the recorded output) because pandas >= 2.0 returns bool columns by
# default, which turn the design matrix into an object array in the VIF and
# OLS steps further down.
seasons = pd.get_dummies(rides['season'], drop_first=True, dtype=np.uint8)
months = pd.get_dummies(rides['month'], drop_first=True, dtype=np.uint8)
weekdays = pd.get_dummies(rides['weekday'], drop_first=True, dtype=np.uint8)
weathersits = pd.get_dummies(rides['weathersit'], drop_first=True, dtype=np.uint8)


Now we will concat all the dummy variables

In [1265]:
rides=pd.concat([rides,seasons,months,weekdays,weathersits], axis=1)
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   season         730 non-null    object 
 1   year           730 non-null    int64  
 2   month          730 non-null    object 
 3   holiday        730 non-null    int64  
 4   weekday        730 non-null    object 
 5   workingday     730 non-null    int64  
 6   weathersit     730 non-null    object 
 7   temp           730 non-null    float64
 8   humidity       730 non-null    float64
 9   windspeed      730 non-null    float64
 10  cnt            730 non-null    int64  
 11  spring         730 non-null    uint8  
 12  summer         730 non-null    uint8  
 13  winter         730 non-null    uint8  
 14  Aug            730 non-null    uint8  
 15  Dec            730 non-null    uint8  
 16  Feb            730 non-null    uint8  
 17  Jan            730 non-null    uint8  
 18  Jul       

we will now prepare the data to train the model

We will drop the categorical variables which were converted to dummy variables

In [1266]:
rides.drop(['season', 'month', 'weathersit', 'weekday'], axis=1, inplace=True)
rides.columns

Index(['year', 'holiday', 'workingday', 'temp', 'humidity', 'windspeed', 'cnt',
       'spring', 'summer', 'winter', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun',
       'Mar', 'May', 'Nov', 'Oct', 'Sep', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue',
       'Wed', 'Light Snow', 'Mist + Cloudy'],
      dtype='object')

In [1267]:
rides.head()

Unnamed: 0,year,holiday,workingday,temp,humidity,windspeed,cnt,spring,summer,winter,...,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed,Light Snow,Mist + Cloudy
0,0,0,0,14.110847,80.5833,10.749882,985,1,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,14.902598,69.6087,16.652113,801,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,1,8.050924,43.7273,16.636703,1349,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,1,8.2,59.0435,10.739832,1562,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,9.305237,43.6957,12.5223,1600,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [1268]:
fig_heatmap2 = px.imshow(rides.corr(),text_auto = True, aspect="auto", height=800)
fig_heatmap2.update_layout(title="Heatmap for corelation")
fig_heatmap2.show()

Splitting the dataset into Train and Test

In [1269]:
ride_train, ride_test = train_test_split(rides, train_size=0.7, random_state=100)
print(ride_train.shape)
print(ride_test.shape)

(510, 29)
(220, 29)


In [1270]:
ride_train.head()

Unnamed: 0,year,holiday,workingday,temp,humidity,windspeed,cnt,spring,summer,winter,...,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed,Light Snow,Mist + Cloudy
576,1,0,1,29.246653,70.4167,11.083475,7216,0,0,0,...,0,0,0,0,0,0,1,0,0,0
426,1,0,0,16.980847,62.125,10.792293,4066,1,0,0,...,0,0,0,1,0,0,0,0,0,1
728,1,0,0,10.489153,48.3333,23.500518,1796,1,0,0,...,0,0,0,0,1,0,0,0,0,0
482,1,0,0,15.443347,48.9583,8.708325,4220,0,1,0,...,0,0,0,1,0,0,0,0,0,1
111,0,0,1,13.803347,72.9583,14.707907,1683,0,1,0,...,0,0,0,0,0,0,0,0,0,1


We will now scale the variables

In [1271]:
# Min-max scale the continuous columns (including the target). The scaler is
# fitted on the training split only and reused for the test split later.
scaler = MinMaxScaler()
scal_var = ['temp','humidity','windspeed','cnt']
# train_test_split returns a selection of `rides`; take an explicit copy so the
# column assignment below writes to an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
ride_train = ride_train.copy()
ride_train[scal_var] = scaler.fit_transform(ride_train[scal_var])

In [1272]:
ride_train.head()

Unnamed: 0,year,holiday,workingday,temp,humidity,windspeed,cnt,spring,summer,winter,...,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed,Light Snow,Mist + Cloudy
576,1,0,1,0.815169,0.725633,0.264686,0.827658,0,0,0,...,0,0,0,0,0,0,1,0,0,0
426,1,0,0,0.442393,0.640189,0.255342,0.465255,1,0,0,...,0,0,0,1,0,0,0,0,0,1
728,1,0,0,0.245101,0.498067,0.663106,0.204096,1,0,0,...,0,0,0,0,1,0,0,0,0,0
482,1,0,0,0.395666,0.504508,0.188475,0.482973,0,1,0,...,0,0,0,1,0,0,0,0,0,1
111,0,0,1,0.345824,0.751824,0.380981,0.191095,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [1273]:
ride_train.describe()

Unnamed: 0,year,holiday,workingday,temp,humidity,windspeed,cnt,spring,summer,winter,...,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed,Light Snow,Mist + Cloudy
count,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,...,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0
mean,0.507843,0.02549,0.676471,0.53744,0.65048,0.320883,0.513499,0.243137,0.247059,0.247059,...,0.084314,0.080392,0.15098,0.154902,0.143137,0.133333,0.131373,0.158824,0.029412,0.345098
std,0.500429,0.157763,0.468282,0.225858,0.145846,0.169803,0.224421,0.429398,0.431725,0.431725,...,0.278131,0.272166,0.358381,0.362166,0.350557,0.340268,0.338139,0.36587,0.169124,0.475867
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.339853,0.538643,0.199179,0.35642,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.542596,0.653714,0.296763,0.518638,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.735215,0.75483,0.414447,0.68471,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We will see which variables are highly correlated.

In [1274]:
model_heatmap = px.imshow(ride_train.corr(),text_auto = True, aspect="auto", height=800)
model_heatmap.update_layout(title="Heatmap for corelation")
model_heatmap.show()

year and temp show good correlation with the count variable.

We will divide the data into dependent and independent variables.

In [1275]:
y_train = ride_train.pop('cnt')
X_train = ride_train

We will use RFE to select top features to build the model.

In [1276]:
# Recursive feature elimination down to 15 predictors. RFE clones the estimator
# and fits it internally on each elimination round, so the estimator does not
# need to be fitted beforehand.
lmr = LinearRegression()

rfe = RFE(lmr, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [1277]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

[('year', True, 1),
 ('holiday', True, 1),
 ('workingday', True, 1),
 ('temp', True, 1),
 ('humidity', True, 1),
 ('windspeed', True, 1),
 ('spring', True, 1),
 ('summer', True, 1),
 ('winter', True, 1),
 ('Aug', False, 6),
 ('Dec', False, 3),
 ('Feb', False, 4),
 ('Jan', True, 1),
 ('Jul', True, 1),
 ('Jun', False, 13),
 ('Mar', False, 14),
 ('May', False, 5),
 ('Nov', False, 2),
 ('Oct', False, 12),
 ('Sep', True, 1),
 ('Mon', False, 8),
 ('Sat', True, 1),
 ('Sun', False, 7),
 ('Thu', False, 10),
 ('Tue', False, 9),
 ('Wed', False, 11),
 ('Light Snow', True, 1),
 ('Mist + Cloudy', True, 1)]

In [1278]:
columns = X_train.columns[rfe.support_]
print(columns)

Index(['year', 'holiday', 'workingday', 'temp', 'humidity', 'windspeed',
       'spring', 'summer', 'winter', 'Jan', 'Jul', 'Sep', 'Sat', 'Light Snow',
       'Mist + Cloudy'],
      dtype='object')


We will calculate VIF to decide which variable to remove

In [1279]:
def check_vif(train_df):
  """Print the variance inflation factor (VIF) of every column of train_df.

  Parameters
  ----------
  train_df : pandas.DataFrame
      Predictor columns only; every column must be convertible to float.

  Returns
  -------
  None
      The table (columns 'Features' and 'VIF', rounded to 2 decimals and
      sorted by VIF in descending order) is printed, not returned.
  """
  # Cast to float so boolean or small-integer dummy columns cannot turn the
  # matrix into an object array, which the VIF computation cannot handle.
  design_matrix = train_df.to_numpy(dtype=float)
  vif = pd.DataFrame()
  vif['Features'] = train_df.columns
  vif['VIF'] = [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])]
  vif['VIF'] = vif['VIF'].round(2)
  vif = vif.sort_values(by='VIF', ascending=False)
  print(vif)

We will select the columns we got from RFE

In [1280]:
X_train_sel = X_train[columns]

In [1281]:
check_vif(X_train_sel)

         Features    VIF
4        humidity  29.37
3            temp  17.78
2      workingday   5.31
5       windspeed   4.73
6          spring   4.53
8          winter   3.46
7          summer   2.85
14  Mist + Cloudy   2.29
0            year   2.09
12            Sat   1.98
9             Jan   1.67
10            Jul   1.59
11            Sep   1.39
13     Light Snow   1.24
1         holiday   1.18


Here Humidity has high VIF, we will check and drop this

In [1282]:
X_train_sel1 = X_train_sel.drop(['humidity'], axis = 1)
check_vif(X_train_sel1)

         Features   VIF
3            temp  7.07
2      workingday  5.24
4       windspeed  4.68
5          spring  3.07
6          summer  2.34
0            year  2.08
7          winter  1.98
11            Sat  1.97
8             Jan  1.62
9             Jul  1.59
13  Mist + Cloudy  1.58
10            Sep  1.35
1         holiday  1.17
12     Light Snow  1.09


VIF Value seems to be ok, we will build the model with rest of the variables

In [1283]:
X_train_lmr1 = sm.add_constant(X_train_sel1)
lr_1 = sm.OLS(y_train, X_train_lmr1).fit()
print(lr_1.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.843
Model:                            OLS   Adj. R-squared:                  0.838
Method:                 Least Squares   F-statistic:                     189.6
Date:                Wed, 11 May 2022   Prob (F-statistic):          1.60e-188
Time:                        18:06:10   Log-Likelihood:                 510.75
No. Observations:                 510   AIC:                            -991.5
Df Residuals:                     495   BIC:                            -928.0
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.1737      0.031      5.520


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



We will drop Jan, as it has a higher VIF and p-value compared to holiday.

In [1284]:
X_train_sel2 = X_train_sel1.drop(['Jan'], axis = 1)
check_vif(X_train_sel2)

         Features   VIF
3            temp  6.97
2      workingday  5.20
4       windspeed  4.65
5          spring  2.49
6          summer  2.34
0            year  2.07
7          winter  1.98
10            Sat  1.96
8             Jul  1.58
12  Mist + Cloudy  1.57
9             Sep  1.35
1         holiday  1.17
11     Light Snow  1.09


We will see how the model has changed

In [1285]:
X_train_lmr2 = sm.add_constant(X_train_sel2)
lr_2 = sm.OLS(y_train, X_train_lmr2).fit()
print(lr_2.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.841
Model:                            OLS   Adj. R-squared:                  0.837
Method:                 Least Squares   F-statistic:                     202.2
Date:                Wed, 11 May 2022   Prob (F-statistic):          1.28e-188
Time:                        18:06:10   Log-Likelihood:                 508.20
No. Observations:                 510   AIC:                            -988.4
Df Residuals:                     496   BIC:                            -929.1
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.1576      0.031      5.126


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



The VIF values and the model have changed only slightly, so we will proceed to check the other variables.

We will try dropping holiday and build the model

In [1286]:
X_train_sel3 = X_train_sel2.drop(['holiday'], axis = 1)
check_vif(X_train_sel3)

         Features   VIF
2            temp  6.73
1      workingday  4.66
3       windspeed  4.64
4          spring  2.38
5          summer  2.32
0            year  2.07
6          winter  1.91
9             Sat  1.83
7             Jul  1.58
11  Mist + Cloudy  1.57
8             Sep  1.34
10     Light Snow  1.08


In [1287]:
X_train_lmr3 = sm.add_constant(X_train_sel3)
lr_3 = sm.OLS(y_train, X_train_lmr3).fit()
print(lr_3.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.840
Model:                            OLS   Adj. R-squared:                  0.836
Method:                 Least Squares   F-statistic:                     217.2
Date:                Wed, 11 May 2022   Prob (F-statistic):          7.12e-189
Time:                        18:06:10   Log-Likelihood:                 506.01
No. Observations:                 510   AIC:                            -986.0
Df Residuals:                     497   BIC:                            -931.0
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.1492      0.031      4.881


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



Dropping Holiday has made little difference in VIF, we will continue with other variables and observe

In [1288]:
X_train_sel4 = X_train_sel3.drop(['Jul'], axis = 1)
check_vif(X_train_sel4)

         Features   VIF
2            temp  5.48
1      workingday  4.66
3       windspeed  4.64
4          spring  2.30
0            year  2.07
5          summer  2.00
8             Sat  1.83
6          winter  1.76
10  Mist + Cloudy  1.57
7             Sep  1.23
9      Light Snow  1.08


In [1289]:
X_train_lmr4 = sm.add_constant(X_train_sel4)
lr_4 = sm.OLS(y_train, X_train_lmr4).fit()
print(lr_4.summary())


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.838
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     233.6
Date:                Wed, 11 May 2022   Prob (F-statistic):          1.42e-188
Time:                        18:06:10   Log-Likelihood:                 502.47
No. Observations:                 510   AIC:                            -980.9
Df Residuals:                     498   BIC:                            -930.1
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.1406      0.031      4.597

The VIF values have dropped; finally, to bring them to an acceptable level, we will drop spring.

In [1290]:
X_train_sel5 = X_train_sel4.drop(['spring'], axis = 1)
check_vif(X_train_sel5)

        Features   VIF
2           temp  4.76
1     workingday  4.04
3      windspeed  3.44
0           year  2.02
7            Sat  1.69
4         summer  1.57
9  Mist + Cloudy  1.53
5         winter  1.40
6            Sep  1.20
8     Light Snow  1.08


In [1291]:
X_train_lmr5 = sm.add_constant(X_train_sel5)
lr_5 = sm.OLS(y_train, X_train_lmr5).fit()
print(lr_5.summary())

                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     253.0
Date:                Wed, 11 May 2022   Prob (F-statistic):          3.13e-188
Time:                        18:06:11   Log-Likelihood:                 498.79
No. Observations:                 510   AIC:                            -975.6
Df Residuals:                     499   BIC:                            -929.0
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.0750      0.019      4.031


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



VIF values are less than 5 so we can proceed further

**Residual analysis**

In [1292]:
y_train_pred = lr_5.predict(X_train_lmr5)

In [1293]:
residual = y_train - y_train_pred
res_hist= px.histogram(residual, nbins=20)
res_hist.update_layout(title="Error terms")
res_hist.show()

As we see above, the error terms are following a normal distribution.

In [1294]:
model_heatmap = px.imshow(X_train_sel5.corr(),text_auto = True, aspect="auto", height=800)
model_heatmap.update_layout(title="Heatmap for corelation")
model_heatmap.show()

There is insignificant multicollinearity, as seen in the heatmap.

In [1295]:
scatter_homoscedasticity = px.scatter(x=y_train_pred, y=residual, labels={'x':'y_pred','y':'residual'}, trendline="ols", trendline_color_override="red")
scatter_homoscedasticity.update_layout(title="y_pred vs residual")
scatter_homoscedasticity.show()

We can see the Homoscedasticity in the above graph

In [1296]:
# Durbin-Watson statistic of the final model's residuals (durbin_watson is
# imported with the other dependencies at the top of the notebook).
durbin_watson(lr_5.resid)

2.0885340299289736

**Making predictions using final model**

Lets apply scaling on test dataset

In [1297]:
# Apply the scaler fitted on the training split (transform only, no refit).
# Copy first so the assignment writes to an independent frame and does not
# trigger pandas' SettingWithCopyWarning on the train_test_split result.
ride_test = ride_test.copy()
ride_test[scal_var] = scaler.transform(ride_test[scal_var])

In [1298]:
ride_test.describe()

Unnamed: 0,year,holiday,workingday,temp,humidity,windspeed,cnt,spring,summer,winter,...,Oct,Sep,Mon,Sat,Sun,Thu,Tue,Wed,Light Snow,Mist + Cloudy
count,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,...,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0,220.0
mean,0.481818,0.036364,0.7,0.558718,0.638221,0.313293,0.522154,0.254545,0.263636,0.236364,...,0.086364,0.086364,0.127273,0.118182,0.145455,0.163636,0.168182,0.1,0.027273,0.318182
std,0.500809,0.18762,0.459303,0.233187,0.148694,0.159584,0.219163,0.436599,0.441609,0.425817,...,0.281541,0.281541,0.334038,0.323559,0.353363,0.370789,0.374881,0.300684,0.163248,0.466833
min,0.0,0.0,0.0,0.046591,0.261915,-0.042808,0.048205,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.355429,0.529197,0.198843,0.378595,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.558172,0.62559,0.300126,0.524678,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.755981,0.743798,0.402718,0.67775,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,0.984424,1.002146,0.807474,0.9633,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We will divide data into X_test and y_test

In [1299]:
y_test = ride_test.pop('cnt')
X_test = ride_test

In [1300]:
# Keep only the predictors used by the final model, in the same column order.
X_test = X_test[X_train_sel5.columns]
# Add the intercept column. has_constant='add' forces it even if a test column
# happens to be constant; the default ('skip') would then leave the intercept
# out and misalign the columns with the fitted parameters.
X_test_lm6 = sm.add_constant(X_test, has_constant='add')
# Prediction for test data
y_pred = lr_5.predict(X_test_lm6)


In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only



Coefficient of determination(R^2) for test data

In [1301]:
# r2_score is already imported in the notebook's import cell.
r2_score(y_test, y_pred)

0.7961390438459766

**Model evaluation**

In [1302]:
# Actual (x) versus predicted (y) scaled counts on the test split, with an OLS
# trend line; the title now names both plotted quantities.
scatter_pred = px.scatter(x=y_test, y=y_pred, labels={'x':'y_test','y':'y_pred'}, trendline="ols", trendline_color_override="red")
scatter_pred.update_layout(title="y_test vs y_pred")
scatter_pred.show()

In [1303]:
lr_5.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.835
Model:,OLS,Adj. R-squared:,0.832
Method:,Least Squares,F-statistic:,253.0
Date:,"Wed, 11 May 2022",Prob (F-statistic):,3.13e-188
Time:,18:06:11,Log-Likelihood:,498.79
No. Observations:,510,AIC:,-975.6
Df Residuals:,499,BIC:,-929.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0750,0.019,4.031,0.000,0.038,0.112
year,0.2331,0.008,28.370,0.000,0.217,0.249
workingday,0.0561,0.011,5.024,0.000,0.034,0.078
temp,0.5499,0.020,27.861,0.000,0.511,0.589
windspeed,-0.1552,0.025,-6.195,0.000,-0.204,-0.106
summer,0.0886,0.010,8.608,0.000,0.068,0.109
winter,0.1307,0.010,12.600,0.000,0.110,0.151
Sep,0.0974,0.016,6.184,0.000,0.066,0.128
Sat,0.0675,0.014,4.693,0.000,0.039,0.096

0,1,2,3
Omnibus:,68.639,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.839
Skew:,-0.731,Prob(JB):,1.0700000000000001e-33
Kurtosis:,5.238,Cond. No.,11.6


The equation of our best fitted line is:

*cnt = 0.0750 + 0.2331 * year + 0.0561 * workingday + 0.5499 * temp - 0.1552 * windspeed + 0.0886 * summer + 0.1307 * winter + 0.0974 * Sep + 0.0675 * Sat - 0.2871 * (Light Snow) - 0.0800 * (Mist + Cloudy)*

In [1304]:

r2_score(y_train, y_train_pred)

0.8352749595695672

Comparison between train and test data

In [1305]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of observations and p the number of predictors. p is taken from the final
# design matrix (10 columns, matching "Df Model: 10" in the OLS summary)
# instead of the hard-coded 5 used previously.
n_predictors = X_train_sel5.shape[1]
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_pred)
train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - n_predictors - 1)
test = 1 - (1 - r2_test) * (len(X_test) - 1) / (len(X_test) - n_predictors - 1)
print('Coefficient of determination(R^2) for Train dataset : %.2f'% r2_train)
print('Coefficient of determination(R^2) for Test dataset : %.2f'% r2_test)
print('Adjusted R^2 for Train dataset : %.2f'% train)
print('Adjusted R^2 for Test dataset : %.2f'% test)

Coefficient of determination(R^2) for Train dataset : 0.84
Coefficient of determination(R^2) for Test dataset : 0.80
Adjusted R^2 for Train dataset : 0.83
Adjusted R^2 for Test dataset : 0.79
