# Machine learning for predicitng Bike availability and Bike stand availabilty

##### Linear Regression Model for predicting Bike Availability: 


- Here we will first be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [2]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [3]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-04-04 12:32:02,106 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-04 12:32:02,106 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:02,395 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-04 12:32:02,395 INFO sqlalchemy.engine.Engine [generated in 0.00140s] ()
2022-04-04 12:32:03,012 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-04 12:32:03,012 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:03,745 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    
2022-04-04 12:32:03,745 INFO sqlalchemy.engine.Engine [raw sql] ()


In [4]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18
...,...,...,...,...,...,...,...
1387,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:09:31
1388,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:14:32
1389,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:19:32
1390,803,broken clouds,285.98,86,10.80,2022-04-04 19:04:15,2022-04-04 11:24:32


In [5]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
1387,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:09:31,2022-04-04
1388,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:14:32,2022-04-04
1389,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:19:32,2022-04-04
1390,803,broken clouds,285.98,86,10.80,2022-04-04 19:04:15,2022-04-04 11:24:32,2022-04-04


In [6]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
TIME            datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [7]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [8]:
df_avail = availability()

2022-04-04 12:32:07,210 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-04 12:32:07,210 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:07,528 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-04 12:32:07,528 INFO sqlalchemy.engine.Engine [generated in 0.00523s] ()
2022-04-04 12:32:08,139 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-04 12:32:08,139 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:08,849 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-04 12:32:08,849 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:09,152 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-04 12:32:09,152 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:09,487 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-04-04 12:32:09,487 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-04 12:32:09,915 INFO sqlalchemy.engine.Engine SELECT availabili

In [9]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
1003324,39,15,5,2022-04-04 11:30:15
1003325,83,11,29,2022-04-04 11:29:05
1003326,92,1,39,2022-04-04 11:28:41
1003327,21,22,7,2022-04-04 11:28:32


In [10]:
df_avail['day'] = df_avail['last_update'].dt.day_of_week
df_avail['hour'] = df_avail['last_update'].dt.hour
df_avail['just_date'] = df_avail['last_update'].dt.date




In [11]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23
...,...,...,...,...,...,...,...
1003324,39,15,5,2022-04-04 11:30:15,0,11,2022-04-04
1003325,83,11,29,2022-04-04 11:29:05,0,11,2022-04-04
1003326,92,1,39,2022-04-04 11:28:41,0,11,2022-04-04
1003327,21,22,7,2022-04-04 11:28:32,0,11,2022-04-04


In [12]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes,day,hour
number,1.0,0.070305,0.34513,8.6e-05,-0.0005
available_bike_stands,0.070305,1.0,-0.680891,0.008012,-0.014322
available_bikes,0.34513,-0.680891,1.0,-0.008035,0.013174
day,8.6e-05,0.008012,-0.008035,1.0,-0.001249
hour,-0.0005,-0.014322,0.013174,-0.001249,1.0


In [13]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,hour
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,60.282627,12.602622,19.257334,10.709649
1,60.371049,12.570528,19.421478,12.347305
2,60.325617,12.704824,18.97155,11.481926
3,60.323107,12.590797,18.825266,12.605026
4,60.331106,12.634722,19.086406,11.506133
5,60.328567,12.802821,19.047929,11.395372
6,60.325291,12.817453,18.999444,11.489567


In [14]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                               int64
hour                              int64
just_date                        object
dtype: object

In [15]:
df_avail["number"] = df_avail["number"].astype('category')  


In [16]:
df_avail.shape

(1003329, 7)

In [17]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,1003329.0,12.678964,9.071469,0.0,5.0,12.0,19.0,40.0
available_bikes,1003329.0,19.069401,10.694748,0.0,11.0,19.0,27.0,40.0
day,1003329.0,3.269668,1.883493,0.0,2.0,4.0,5.0,6.0
hour,1003329.0,11.667962,6.912526,0.0,6.0,12.0,18.0,23.0


In [18]:
df_avail["number"].describe().T

count     1003329
unique        110
top            61
freq         9123
Name: number, dtype: int64

#### Combining the two data frames 

In [19]:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

df_combine = pd.merge(df_avail, df_weather, on='just_date', how="outer")

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')
                                      
# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [20]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
1387,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:09:31,2022-04-04
1388,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:14:32,2022-04-04
1389,803,broken clouds,285.98,86,11.83,2022-04-04 19:04:15,2022-04-04 11:19:32,2022-04-04
1390,803,broken clouds,285.98,86,10.80,2022-04-04 19:04:15,2022-04-04 11:24:32,2022-04-04


In [21]:
df_combine
# inner_merged

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23,,,,,,NaT,NaT
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23,,,,,,NaT,NaT
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23,,,,,,NaT,NaT
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23,,,,,,NaT,NaT
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23,,,,,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39722928,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,285.98,86.0,11.83,2022-04-04 19:04:15,2022-04-04 11:09:31
39722929,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,285.98,86.0,11.83,2022-04-04 19:04:15,2022-04-04 11:14:32
39722930,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,285.98,86.0,11.83,2022-04-04 19:04:15,2022-04-04 11:19:32
39722931,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,285.98,86.0,10.80,2022-04-04 19:04:15,2022-04-04 11:24:32


In [22]:
df_combine.tail(40)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
39722893,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.04,90.0,8.75,2022-04-04 19:04:15,2022-04-04 08:14:25
39722894,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,88.0,9.26,2022-04-04 19:04:15,2022-04-04 08:19:25
39722895,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.3,88.0,9.26,2022-04-04 19:04:14,2022-04-04 08:24:25
39722896,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,88.0,9.26,2022-04-04 19:04:15,2022-04-04 08:29:25
39722897,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,88.0,9.26,2022-04-04 19:04:15,2022-04-04 08:34:25
39722898,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.3,88.0,9.26,2022-04-04 19:04:14,2022-04-04 08:39:25
39722899,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,88.0,9.77,2022-04-04 19:04:15,2022-04-04 08:44:26
39722900,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,88.0,9.26,2022-04-04 19:04:15,2022-04-04 08:49:26
39722901,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,89.0,9.77,2022-04-04 19:04:15,2022-04-04 08:54:26
39722902,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04,803.0,broken clouds,284.28,89.0,9.77,2022-04-04 19:04:15,2022-04-04 08:59:26


### Looking at correlations

In [23]:
df_combine.corr()

Unnamed: 0,available_bike_stands,available_bikes,day,hour,id,temperature,humidity,windspeed
available_bike_stands,1.0,-0.724832,0.007549,-0.011663,-4.6e-05,0.001577,0.003525,-0.003522
available_bikes,-0.724832,1.0,-0.00488,0.00904,0.000493,-0.001674,-0.003519,0.00214
day,0.007549,-0.00488,1.0,0.116812,0.004597,-0.12095,0.049174,-0.566204
hour,-0.011663,0.00904,0.116812,1.0,0.012764,-0.082078,-0.058757,-0.088927
id,-4.6e-05,0.000493,0.004597,0.012764,1.0,-0.098088,-0.28507,-0.152483
temperature,0.001577,-0.001674,-0.12095,-0.082078,-0.098088,1.0,-0.458609,0.508266
humidity,0.003525,-0.003519,0.049174,-0.058757,-0.28507,-0.458609,1.0,-0.199316
windspeed,-0.003522,0.00214,-0.566204,-0.088927,-0.152483,0.508266,-0.199316,1.0


In [24]:
df_avail.head(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23


In [25]:
df_avail.tail(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
1003324,39,15,5,2022-04-04 11:30:15,0,11,2022-04-04
1003325,83,11,29,2022-04-04 11:29:05,0,11,2022-04-04
1003326,92,1,39,2022-04-04 11:28:41,0,11,2022-04-04
1003327,21,22,7,2022-04-04 11:28:32,0,11,2022-04-04
1003328,88,2,28,2022-04-04 11:22:32,0,11,2022-04-04


In [26]:
df_avail.shape

(1003329, 7)

### Training Model for available bikes 

In [27]:
train_feature = ["day","hour","number"]
target_feature1 = ['available_bikes']

train = df_avail[train_feature]
target = df_avail[target_feature1]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
782923,6,22,112
283990,5,2,111
784245,6,23,74
916441,4,17,31
491249,4,16,71
...,...,...,...
638745,2,8,53
442227,3,3,91
359166,0,11,74
975665,6,14,8


In [28]:
LR = LinearRegression()
LR.fit(Xtrain,Ytrain)




LinearRegression()

In [29]:
LR.score(Xtest,Ytest)



0.11792819745258509

In [30]:
y_prediction =  LR.predict(Xtest)
y_prediction



array([[18.44573062],
       [18.27199607],
       [18.41534545],
       ...,
       [20.81690211],
       [17.12303216],
       [14.2651784 ]])

### Example of Prediction

- on Monday, 2pm, station: 100

In [31]:
LR.predict([[1, 14, 100]])

array([[23.56501831]])

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [32]:
score=r2_score(Ytest,y_prediction)
print('r2 score is ',score)

r2 score is  0.11792819745258509


#### Mean Square Error(MSE)

In [33]:
print('MSE is ',mean_squared_error(Ytest,y_prediction))

MSE is  100.99869128980254


#### Mean Absolute Error

In [34]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction))

Mean Absolute Error is  8.212297405529121


## Using a Random Forest Model for bike availabilty

In [35]:
random_forest = RandomForestRegressor(random_state=0)
random_forest = random_forest.fit(Xtrain,Ytrain.available_bikes)
score = random_forest.score(Xtest,Ytest.available_bikes)
score



0.6634254311558244

In [36]:
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF



array([20.40329884, 19.50772665, 27.31047896, ..., 14.05759438,
       11.71095896,  7.58742589])

### Metrics for model evaluation in random forrest:  

#### R squared

In [37]:
score=r2_score(Ytest,y_prediction_RF)
print('r2 score is ',score)

r2 score is  0.6634254311558244


#### Mean squared error

In [38]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF))

MSE is  38.53834900573639


#### Mean absolute error

In [39]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF))

Mean Absolute Error is  4.778008989851113


### Example of prediction

In [40]:
random_forest.predict([[1, 14, 100]])

array([23.11743188])

### Saving model to disk with Pickle:

In [41]:
pickle.dump(random_forest, open('model.pkl', 'wb'))

In [42]:
#testing

In [43]:
model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availabilty:

In [44]:
target_feature2 = ['available_bike_stands']

train = df_avail[train_feature]
target = df_avail[target_feature2]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
916993,4,17,23
679470,3,15,93
11819,3,12,104
762516,6,7,95
512223,5,8,72
...,...,...,...
854732,2,5,39
735387,5,10,42
184231,1,22,107
383871,1,6,26


In [45]:
random_forest_2 = RandomForestRegressor(random_state=1)
random_forest_2 = random_forest_2.fit(Xtrain,Ytrain.available_bike_stands)
score_2 = random_forest_2.score(Xtest,Ytest.available_bike_stands)
score_2



0.5580686394220701

In [46]:
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2



array([16.91688307, 13.68389973,  8.23945723, ...,  8.60684634,
       27.38518852, 11.13295779])

### R2 score for RF bike stands

In [47]:
score =r2_score(Ytest,y_prediction_RF_2)
print('r2 score is ',score)

r2 score is  0.5580686394220701


### Mean squared error for RF bike stands

In [48]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF_2))

MSE is  36.388159387532525


### Mean absolute error for RF bike stands

In [49]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF_2))

Mean Absolute Error is  4.6842563663885795


### Example prediction for RF bike stands

In [50]:
random_forest_2.predict([[1, 14, 100]])

array([1.91627095])

### Saving model to pickle file

In [51]:
pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))