# Machine learning for predicitng Bike availability and Bike stand availabilty

##### Linear Regression Model for predicting Bike Availability: 


- Here we will first be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [2]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [3]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [4]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-04-02 12:28:38,061 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-02 12:28:38,077 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:38,200 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-02 12:28:38,200 INFO sqlalchemy.engine.Engine [generated in 0.00206s] ()
2022-04-02 12:28:38,443 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-02 12:28:38,443 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:38,809 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    
2022-04-02 12:28:38,812 INFO sqlalchemy.engine.Engine [raw sql] ()


In [5]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18
...,...,...,...,...,...,...,...
811,802,scattered clouds,281.17,74,4.63,2022-04-02 19:00:38,2022-04-02 11:07:34
812,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:12:34
813,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:17:35
814,802,scattered clouds,281.30,71,5.14,2022-04-02 19:00:39,2022-04-02 11:22:35


In [6]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
811,802,scattered clouds,281.17,74,4.63,2022-04-02 19:00:38,2022-04-02 11:07:34,2022-04-02
812,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:12:34,2022-04-02
813,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:17:35,2022-04-02
814,802,scattered clouds,281.30,71,5.14,2022-04-02 19:00:39,2022-04-02 11:22:35,2022-04-02


In [7]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
TIME            datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [8]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [9]:
df_avail = availability()

2022-04-02 12:28:40,337 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-02 12:28:40,339 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:40,456 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-02 12:28:40,456 INFO sqlalchemy.engine.Engine [generated in 0.00155s] ()
2022-04-02 12:28:40,692 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-02 12:28:40,692 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:41,158 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-02 12:28:41,169 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:41,289 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-02 12:28:41,289 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:41,437 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-04-02 12:28:41,437 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-02 12:28:41,708 INFO sqlalchemy.engine.Engine SELECT availabili

In [10]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
940294,39,0,20,2022-04-02 11:27:16
940295,83,17,23,2022-04-02 11:18:59
940296,92,34,6,2022-04-02 11:22:32
940297,21,16,14,2022-04-02 11:22:31


In [11]:
df_avail['day'] = df_avail['last_update'].dt.day_of_week
df_avail['hour'] = df_avail['last_update'].dt.hour
df_avail['just_date'] = df_avail['last_update'].dt.date




In [12]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23
...,...,...,...,...,...,...,...
940294,39,0,20,2022-04-02 11:27:16,5,11,2022-04-02
940295,83,17,23,2022-04-02 11:18:59,5,11,2022-04-02
940296,92,34,6,2022-04-02 11:22:32,5,11,2022-04-02
940297,21,16,14,2022-04-02 11:22:31,5,11,2022-04-02


In [13]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes,day,hour
number,1.0,0.08037,0.339339,0.000103,-0.000514
available_bike_stands,0.08037,1.0,-0.675145,0.007987,-0.014578
available_bikes,0.339339,-0.675145,1.0,-0.008934,0.013556
day,0.000103,0.007987,-0.008934,1.0,-0.039804
hour,-0.000514,-0.014578,0.013556,-0.039804,1.0


In [14]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,hour
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,60.274707,12.590078,19.282253,11.582122
1,60.371049,12.570528,19.421478,12.347305
2,60.325617,12.704824,18.97155,11.481926
3,60.323107,12.590797,18.825266,12.605026
4,60.331106,12.634722,19.086406,11.506133
5,60.328925,12.816983,19.030988,10.839889
6,60.325019,12.809722,18.988247,11.508759


In [15]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                               int64
hour                              int64
just_date                        object
dtype: object

In [16]:
df_avail["number"] = df_avail["number"].astype('category')  


In [17]:
df_avail.shape

(940299, 7)

In [18]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,940299.0,12.673927,9.074021,0.0,5.0,12.0,19.0,40.0
available_bikes,940299.0,19.067403,10.619963,0.0,11.0,19.0,27.0,40.0
day,940299.0,3.200847,1.818387,0.0,2.0,3.0,5.0,6.0
hour,940299.0,11.683407,6.912788,0.0,6.0,12.0,18.0,23.0


In [19]:
df_avail["number"].describe().T

count     940299
unique       110
top           61
freq        8550
Name: number, dtype: int64

#### Combining the two data frames 

In [20]:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

df_combine = pd.merge(df_avail, df_weather, on='just_date', how="outer")

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')
                                      
# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [21]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
811,802,scattered clouds,281.17,74,4.63,2022-04-02 19:00:38,2022-04-02 11:07:34,2022-04-02
812,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:12:34,2022-04-02
813,802,scattered clouds,281.30,73,4.63,2022-04-02 19:00:39,2022-04-02 11:17:35,2022-04-02
814,802,scattered clouds,281.30,71,5.14,2022-04-02 19:00:39,2022-04-02 11:22:35,2022-04-02


In [22]:
df_combine
# inner_merged

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23,,,,,,NaT,NaT
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23,,,,,,NaT,NaT
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23,,,,,,NaT,NaT
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23,,,,,,NaT,NaT
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23,,,,,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21569442,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,802.0,scattered clouds,281.17,74.0,4.63,2022-04-02 19:00:38,2022-04-02 11:07:34
21569443,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,802.0,scattered clouds,281.30,73.0,4.63,2022-04-02 19:00:39,2022-04-02 11:12:34
21569444,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,802.0,scattered clouds,281.30,73.0,4.63,2022-04-02 19:00:39,2022-04-02 11:17:35
21569445,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,802.0,scattered clouds,281.30,71.0,5.14,2022-04-02 19:00:39,2022-04-02 11:22:35


In [23]:
df_combine.tail(40)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
21569407,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,500.0,light rain,277.89,91.0,5.14,2022-04-02 19:00:39,2022-04-02 08:12:28
21569408,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,500.0,light rain,277.89,91.0,5.14,2022-04-02 19:00:39,2022-04-02 08:17:28
21569409,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,501.0,moderate rain,278.17,90.0,5.14,2022-04-02 19:00:39,2022-04-02 08:22:28
21569410,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,501.0,moderate rain,278.17,90.0,5.14,2022-04-02 19:00:39,2022-04-02 08:27:29
21569411,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,501.0,moderate rain,278.17,90.0,5.14,2022-04-02 19:00:39,2022-04-02 08:32:29
21569412,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,501.0,moderate rain,278.17,90.0,5.14,2022-04-02 19:00:39,2022-04-02 08:37:29
21569413,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,803.0,broken clouds,278.17,89.0,5.14,2022-04-02 19:00:39,2022-04-02 08:42:29
21569414,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,803.0,broken clouds,278.44,91.0,5.14,2022-04-02 19:00:39,2022-04-02 08:47:29
21569415,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,803.0,broken clouds,278.44,91.0,5.14,2022-04-02 19:00:39,2022-04-02 08:52:29
21569416,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02,520.0,light intensity shower rain,278.52,90.0,5.14,2022-04-02 19:00:39,2022-04-02 08:57:30


### Looking at correlations

In [24]:
df_combine.corr()

Unnamed: 0,available_bike_stands,available_bikes,day,hour,id,temperature,humidity,windspeed
available_bike_stands,1.0,-0.699078,0.007653,-0.01093,-0.001818,0.000322,0.006957,-0.003168
available_bikes,-0.699078,1.0,-0.007893,0.008579,0.002179,0.000188,-0.006521,0.003707
day,0.007653,-0.007893,1.0,-0.219145,-0.286627,-0.110482,0.587194,-0.498689
hour,-0.01093,0.008579,-0.219145,1.0,0.024781,-0.008252,-0.112165,0.114402
id,-0.001818,0.002179,-0.286627,0.024781,1.0,-0.161291,-0.354999,0.067661
temperature,0.000322,0.000188,-0.110482,-0.008252,-0.161291,1.0,-0.638923,0.066327
humidity,0.006957,-0.006521,0.587194,-0.112165,-0.354999,-0.638923,1.0,-0.238525
windspeed,-0.003168,0.003707,-0.498689,0.114402,0.067661,0.066327,-0.238525,1.0


In [25]:
df_avail.head(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23


In [26]:
df_avail.tail(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
940294,39,0,20,2022-04-02 11:27:16,5,11,2022-04-02
940295,83,17,23,2022-04-02 11:18:59,5,11,2022-04-02
940296,92,34,6,2022-04-02 11:22:32,5,11,2022-04-02
940297,21,16,14,2022-04-02 11:22:31,5,11,2022-04-02
940298,88,5,25,2022-04-02 11:27:17,5,11,2022-04-02


In [27]:
df_avail.shape

(940299, 7)

### Training Model for available bikes 

In [28]:
train_feature = ["day","hour","number"]
target_feature1 = ['available_bikes']

train = df_avail[train_feature]
target = df_avail[target_feature1]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
152768,5,18,64
549930,6,13,108
498747,4,22,2
593203,0,22,102
569255,0,3,34
...,...,...,...
185423,1,23,50
720827,4,23,94
302576,5,16,34
226249,3,6,109


In [29]:
LR = LinearRegression()
LR.fit(Xtrain,Ytrain)




LinearRegression()

In [30]:
LR.score(Xtest,Ytest)



0.1151354534115735

In [31]:
y_prediction =  LR.predict(Xtest)
y_prediction



array([[18.09487367],
       [17.17409692],
       [20.8980549 ],
       ...,
       [14.89751148],
       [21.11515078],
       [19.02441023]])

### Example of Prediction

- on Monday, 2pm, station: 100

In [32]:
LR.predict([[1, 14, 100]])

array([[23.45652732]])

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [33]:
score=r2_score(Ytest,y_prediction)
print('r2 score is ',score)

r2 score is  0.1151354534115735


#### Mean Square Error(MSE)

In [34]:
print('MSE is ',mean_squared_error(Ytest,y_prediction))

MSE is  99.8464185384106


#### Mean Absolute Error

In [35]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction))

Mean Absolute Error is  8.14744732112836


## Using a Random Forest Model for bike availabilty

In [36]:
random_forest = RandomForestRegressor(random_state=0)
random_forest = random_forest.fit(Xtrain,Ytrain.available_bikes)
score = random_forest.score(Xtest,Ytest.available_bikes)
score



0.6621386405427082

In [37]:
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF



array([ 5.93132507, 21.3551664 , 16.33295807, ...,  9.17542223,
       20.03831061, 24.11719146])

### Metrics for model evaluation in random forrest:  

#### R squared

In [38]:
score=r2_score(Ytest,y_prediction_RF)
print('r2 score is ',score)

r2 score is  0.6621386405427082


#### Mean squared error

In [39]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF))

MSE is  38.123627886766066


#### Mean absolute error

In [40]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF))

Mean Absolute Error is  4.732082608387411


### Example of prediction

In [54]:
random_forest.predict([[1, 14, 100]])

array([23.37261813])

### Saving model to disk with Pickle:

In [42]:
pickle.dump(random_forest, open('model.pkl', 'wb'))

In [43]:
#testing

In [44]:
model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availabilty:

In [45]:
target_feature2 = ['available_bike_stands']

train = df_avail[train_feature]
target = df_avail[target_feature2]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
460233,3,16,39
259391,4,7,98
4548,3,9,24
676500,3,13,93
27023,3,18,50
...,...,...,...
564203,0,0,86
125623,5,7,6
771218,6,13,75
514290,5,9,108


In [46]:
random_forest_2 = RandomForestRegressor(random_state=1)
random_forest_2 = random_forest_2.fit(Xtrain,Ytrain.available_bike_stands)
score_2 = random_forest_2.score(Xtest,Ytest.available_bike_stands)
score_2



0.5642794627166502

In [47]:
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2



array([2.86004618, 5.35167612, 5.46779068, ..., 2.59603503, 3.14150247,
       7.729122  ])

### R2 score for RF bike stands

In [48]:
score =r2_score(Ytest,y_prediction_RF_2)
print('r2 score is ',score)

r2 score is  0.5642794627166502


### Mean squared error for RF bike stands

In [50]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF_2))

MSE is  35.76278747855483


### Mean absolute error for RF bike stands

In [49]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF_2))

Mean Absolute Error is  4.6322200717854525


### Example prediction for RF bike stands

In [51]:
random_forest_2.predict([[1, 14, 100]])

array([1.95112286])

### Saving model to pickle file

In [55]:
pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))