# Machine learning for predicitng Bike availability and Bike stand availabilty

##### Linear Regression Model for predicting Bike Availability: 


- Here we will first be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [2]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [3]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [4]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-04-15 12:14:28,338 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-15 12:14:28,359 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:28,506 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-15 12:14:28,507 INFO sqlalchemy.engine.Engine [generated in 0.00133s] ()
2022-04-15 12:14:28,746 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-15 12:14:28,746 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:29,108 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    
2022-04-15 12:14:29,108 INFO sqlalchemy.engine.Engine [raw sql] ()


In [5]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME
0,803,broken clouds,284.17,84,6.69,2022-04-11 19:16:53,2022-04-11 21:14:23
1,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:19:23
2,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:24:23
3,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:29:23
4,803,broken clouds,284.02,84,4.12,2022-04-11 19:16:54,2022-04-11 21:34:23
...,...,...,...,...,...,...,...
1027,803,broken clouds,288.12,76,6.17,2022-04-15 19:24:09,2022-04-15 10:53:23
1028,803,broken clouds,288.06,77,6.17,2022-04-15 19:24:09,2022-04-15 10:58:23
1029,803,broken clouds,288.06,77,6.17,2022-04-15 19:24:09,2022-04-15 11:03:24
1030,803,broken clouds,288.28,77,4.63,2022-04-15 19:24:09,2022-04-15 11:08:24


In [6]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,284.17,84,6.69,2022-04-11 19:16:53,2022-04-11 21:14:23,2022-04-11
1,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:19:23,2022-04-11
2,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:24:23,2022-04-11
3,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:29:23,2022-04-11
4,803,broken clouds,284.02,84,4.12,2022-04-11 19:16:54,2022-04-11 21:34:23,2022-04-11
...,...,...,...,...,...,...,...,...
1027,803,broken clouds,288.12,76,6.17,2022-04-15 19:24:09,2022-04-15 10:53:23,2022-04-15
1028,803,broken clouds,288.06,77,6.17,2022-04-15 19:24:09,2022-04-15 10:58:23,2022-04-15
1029,803,broken clouds,288.06,77,6.17,2022-04-15 19:24:09,2022-04-15 11:03:24,2022-04-15
1030,803,broken clouds,288.28,77,4.63,2022-04-15 19:24:09,2022-04-15 11:08:24,2022-04-15


In [7]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
TIME            datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [8]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [9]:
df_avail = availability()

2022-04-15 12:14:31,642 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-15 12:14:31,642 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:31,774 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-15 12:14:31,776 INFO sqlalchemy.engine.Engine [generated in 0.00125s] ()
2022-04-15 12:14:32,017 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-15 12:14:32,017 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:32,476 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-15 12:14:32,476 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:32,649 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-15 12:14:32,649 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:32,807 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-04-15 12:14:32,807 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-15 12:14:33,107 INFO sqlalchemy.engine.Engine SELECT availabili

In [10]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,52,1,31,2022-04-07 15:21:18
1,16,11,9,2022-04-07 15:21:17
2,111,18,22,2022-04-07 15:21:01
3,15,8,8,2022-04-07 15:21:07
4,10,16,0,2022-04-07 15:21:10
...,...,...,...,...
246829,39,12,8,2022-04-15 11:10:53
246830,83,29,11,2022-04-15 11:10:04
246831,92,29,11,2022-04-15 11:11:41
246832,21,18,12,2022-04-15 11:11:31


In [11]:
df_avail['day'] = df_avail['last_update'].dt.day_of_week
df_avail['hour'] = df_avail['last_update'].dt.hour
df_avail['just_date'] = df_avail['last_update'].dt.date




In [12]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,52,1,31,2022-04-07 15:21:18,3,15,2022-04-07
1,16,11,9,2022-04-07 15:21:17,3,15,2022-04-07
2,111,18,22,2022-04-07 15:21:01,3,15,2022-04-07
3,15,8,8,2022-04-07 15:21:07,3,15,2022-04-07
4,10,16,0,2022-04-07 15:21:10,3,15,2022-04-07
...,...,...,...,...,...,...,...
246829,39,12,8,2022-04-15 11:10:53,4,11,2022-04-15
246830,83,29,11,2022-04-15 11:10:04,4,11,2022-04-15
246831,92,29,11,2022-04-15 11:11:41,4,11,2022-04-15
246832,21,18,12,2022-04-15 11:11:31,4,11,2022-04-15


In [13]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes,day,hour
number,1.0,0.115992,0.235326,-0.012717,0.004209
available_bike_stands,0.115992,1.0,-0.718933,0.006964,-0.011338
available_bikes,0.235326,-0.718933,1.0,-0.00195,0.010084
day,-0.012717,0.006964,-0.00195,1.0,-0.028887
hour,0.004209,-0.011338,0.010084,-0.028887,1.0


In [14]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,hour
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,60.322943,12.604739,19.287434,11.483892
1,63.682795,12.686169,18.994007,11.484908
2,60.33025,12.568917,19.249698,11.446564
3,60.31574,12.550214,19.316953,13.496993
4,60.324441,12.743964,19.149477,9.444533
5,60.325645,12.755719,19.178981,11.462233
6,60.334711,12.803992,19.097956,11.472873


In [15]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                               int64
hour                              int64
just_date                        object
dtype: object

In [16]:
df_avail["number"] = df_avail["number"].astype('category')  


In [17]:
df_avail.shape

(246834, 7)

In [18]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,246834.0,12.671909,9.087127,0.0,5.0,12.0,18.0,40.0
available_bikes,246834.0,19.186234,10.763253,0.0,11.0,19.0,28.0,40.0
day,246834.0,3.0572,1.906514,0.0,1.0,3.0,5.0,6.0
hour,246834.0,11.443387,6.981405,0.0,5.0,11.0,18.0,23.0


### why is there 111 stations there should only 110?

In [19]:
df_avail["number"].describe().T

count     246834
unique       111
top           16
freq        2243
Name: number, dtype: int64

### Removing illogical data from dataset

In [20]:
df_avail.loc[df_avail['number']==507]


Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
220498,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
220609,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
220720,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
220831,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
220942,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
...,...,...,...,...,...,...,...
246361,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
246472,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
246583,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
246694,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05


In [21]:
df_avail = df_avail[df_avail.number != 507]


In [22]:
df_avail["number"].describe().T

count     246596
unique       110
top           16
freq        2243
Name: number, dtype: int64

#### Attempt at combining the two data frames 

In [23]:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

# df_combine = pd.merge(df_avail, df_weather, on='just_date', how="outer")

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')
                                      
# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [23]:
# df_weather

In [24]:
# df_combine
# inner_merged

In [25]:
# df_combine.tail(40)

### Looking at correlations

In [26]:
# df_combine.corr()

In [27]:
df_avail.head(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,52,1,31,2022-04-07 15:21:18,3,15,2022-04-07
1,16,11,9,2022-04-07 15:21:17,3,15,2022-04-07
2,111,18,22,2022-04-07 15:21:01,3,15,2022-04-07
3,15,8,8,2022-04-07 15:21:07,3,15,2022-04-07
4,10,16,0,2022-04-07 15:21:10,3,15,2022-04-07


In [28]:
df_avail.tail(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
246829,39,12,8,2022-04-15 11:10:53,4,11,2022-04-15
246830,83,29,11,2022-04-15 11:10:04,4,11,2022-04-15
246831,92,29,11,2022-04-15 11:11:41,4,11,2022-04-15
246832,21,18,12,2022-04-15 11:11:31,4,11,2022-04-15
246833,88,19,11,2022-04-15 11:10:26,4,11,2022-04-15


In [29]:
df_avail.shape

(246596, 7)

### Training Model for available bikes 

In [30]:
train_feature = ["day","hour","number"]
target_feature1 = ['available_bikes']

train = df_avail[train_feature]
target = df_avail[target_feature1]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
60949,5,13,116
87208,6,9,54
196384,2,20,16
153960,1,12,33
142624,1,4,100
...,...,...,...
138534,1,0,59
33746,4,17,42
37541,4,19,117
96608,6,16,101


In [31]:
LR = LinearRegression()
LR.fit(Xtrain,Ytrain)




LinearRegression()

In [32]:
LR.score(Xtest,Ytest)



0.07934055566245246

In [33]:
y_prediction =  LR.predict(Xtest)
y_prediction



array([[22.14745732],
       [20.33824527],
       [23.13866632],
       ...,
       [15.33746608],
       [20.32345468],
       [14.1697975 ]])

### Example of Prediction

- on Monday, 2pm, station: 100

In [34]:
LR.predict([[1, 14, 100]])

array([[22.78577506]])

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [35]:
score=r2_score(Ytest,y_prediction)
print('r2 score is ',score)

r2 score is  0.07934055566245246


#### Mean Square Error(MSE)

In [36]:
print('MSE is ',mean_squared_error(Ytest,y_prediction))

MSE is  106.65240246600848


#### Mean Absolute Error

In [37]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction))

Mean Absolute Error is  8.52738798196904


## Using a Random Forest Model for bike availabilty

In [38]:
random_forest = RandomForestRegressor(random_state=0)
random_forest = random_forest.fit(Xtrain,Ytrain.available_bikes)
score = random_forest.score(Xtest,Ytest.available_bikes)
score



0.9374696517835333

In [39]:
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF



array([ 0.15684973,  8.        , 25.88247511, ..., 19.        ,
       18.27144236,  7.40171248])

### Metrics for model evaluation in random forrest:  

#### R squared

In [40]:
score=r2_score(Ytest,y_prediction_RF)
print('r2 score is ',score)

r2 score is  0.9374696517835333


#### Mean squared error

In [41]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF))

MSE is  7.243733722973859


#### Mean absolute error

In [42]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF))

Mean Absolute Error is  1.399686009571031


### Example of prediction

In [43]:
random_forest.predict([[1, 14, 100]])

array([18.89594808])

### Feature Importance

In [44]:
random_forest.feature_importances_


array([0.19485481, 0.179389  , 0.62575618])

### Saving model to disk with Pickle:

In [45]:
pickle.dump(random_forest, open('model.pkl', 'wb'))

In [46]:
#testing

In [47]:
model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availabilty:

In [48]:
target_feature2 = ['available_bike_stands']

train = df_avail[train_feature]
target = df_avail[target_feature2]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
155297,1,13,30
246817,4,11,107
7741,3,21,25
219982,3,14,18
132315,0,20,48
...,...,...,...
151508,1,10,71
146799,1,7,26
194626,2,19,82
30575,4,14,87


In [49]:
random_forest_2 = RandomForestRegressor(random_state=1)
random_forest_2 = random_forest_2.fit(Xtrain,Ytrain.available_bike_stands)
score_2 = random_forest_2.score(Xtest,Ytest.available_bike_stands)
score_2



0.913491971655391

In [50]:
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2



array([ 0.15967813, 11.46916313,  8.        , ...,  4.57684671,
       13.        , 12.33835156])

### R2 score for RF bike stands

In [51]:
score =r2_score(Ytest,y_prediction_RF_2)
print('r2 score is ',score)

r2 score is  0.913491971655391


### Mean squared error for RF bike stands

In [52]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF_2))

MSE is  7.120550672764191


### Mean absolute error for RF bike stands

In [53]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF_2))

Mean Absolute Error is  1.3893073937279552


### Example prediction for RF bike stands

In [54]:
random_forest_2.predict([[1, 14, 100]])

array([6.23589852])

### Feature Importance

In [55]:
random_forest_2.feature_importances_


array([0.26408113, 0.22713814, 0.50878073])

### Saving model to pickle file

In [65]:
pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))