# Machine learning for predicitng Bike availability and Bike stand availabilty

##### Linear Regression Model for predicting Bike Availability: 


- Here we will first be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [2]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [3]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [4]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-04-10 16:57:11,560 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-10 16:57:11,566 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:11,696 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-10 16:57:11,697 INFO sqlalchemy.engine.Engine [generated in 0.00180s] ()
2022-04-10 16:57:11,896 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-10 16:57:11,898 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:12,191 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    
2022-04-10 16:57:12,193 INFO sqlalchemy.engine.Engine [raw sql] ()


In [5]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18
...,...,...,...,...,...,...,...
3167,803,broken clouds,283.05,74,7.72,2022-04-10 19:15:05,2022-04-10 15:36:12
3168,803,broken clouds,282.81,76,7.72,2022-04-10 19:15:05,2022-04-10 15:41:12
3169,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:46:13
3170,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:51:13


In [6]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
3167,803,broken clouds,283.05,74,7.72,2022-04-10 19:15:05,2022-04-10 15:36:12,2022-04-10
3168,803,broken clouds,282.81,76,7.72,2022-04-10 19:15:05,2022-04-10 15:41:12,2022-04-10
3169,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:46:13,2022-04-10
3170,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:51:13,2022-04-10


In [7]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
TIME            datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [8]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [9]:
df_avail = availability()

2022-04-10 16:57:13,587 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-10 16:57:13,589 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:13,689 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-10 16:57:13,691 INFO sqlalchemy.engine.Engine [generated in 0.00205s] ()
2022-04-10 16:57:13,886 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-10 16:57:13,888 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:14,272 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-10 16:57:14,274 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:14,381 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-10 16:57:14,383 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:14,505 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-04-10 16:57:14,506 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-10 16:57:14,740 INFO sqlalchemy.engine.Engine SELECT availabili

In [10]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
1199637,39,9,11,2022-04-10 15:49:38
1199638,83,19,21,2022-04-10 15:55:01
1199639,92,37,3,2022-04-10 15:47:09
1199640,21,27,2,2022-04-10 15:47:57


In [11]:
df_avail['day'] = df_avail['last_update'].dt.day_of_week
df_avail['hour'] = df_avail['last_update'].dt.hour
df_avail['just_date'] = df_avail['last_update'].dt.date




In [12]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23
...,...,...,...,...,...,...,...
1199637,39,9,11,2022-04-10 15:49:38,6,15,2022-04-10
1199638,83,19,21,2022-04-10 15:55:01,6,15,2022-04-10
1199639,92,37,3,2022-04-10 15:47:09,6,15,2022-04-10
1199640,21,27,2,2022-04-10 15:47:57,6,15,2022-04-10


In [13]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes,day,hour
number,1.0,0.048988,0.280319,-0.017848,0.004585
available_bike_stands,0.048988,1.0,-0.681418,0.009368,-0.013996
available_bikes,0.280319,-0.681418,1.0,-0.004443,0.011732
day,-0.017848,0.009368,-0.004443,1.0,-0.035585
hour,0.004585,-0.013996,0.011732,-0.035585,1.0


In [14]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,hour
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,60.290357,12.602416,19.260036,11.551435
1,64.937048,12.46236,19.174136,12.16592
2,60.328196,12.718958,18.993246,11.480344
3,60.322006,12.58763,18.897048,12.421711
4,60.330264,12.634216,19.108924,11.501691
5,60.328145,12.796028,19.06683,11.405015
6,60.32636,12.811906,19.015972,10.915389


In [15]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                               int64
hour                              int64
just_date                        object
dtype: object

In [16]:
df_avail["number"] = df_avail["number"].astype('category')  


In [17]:
df_avail.shape

(1199642, 7)

In [18]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,1199642.0,12.663722,9.061992,0.0,5.0,12.0,19.0,40.0
available_bikes,1199642.0,19.065333,10.700477,0.0,11.0,19.0,27.0,40.0
day,1199642.0,3.234305,1.876332,0.0,2.0,3.0,5.0,6.0
hour,1199642.0,11.646167,6.890825,0.0,6.0,12.0,18.0,23.0


### why is there 111 stations there should only 110?

In [60]:
df_avail["number"].describe().T

count     1199642
unique        111
top            61
freq        10894
Name: number, dtype: int64

### Removing illogical data from dataset

In [59]:
df_avail.loc[df_avail['number']==507]


Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
1032891,507,0,1,2022-04-05 09:56:28,1,9,2022-04-05
1033002,507,0,1,2022-04-05 10:06:34,1,10,2022-04-05
1033113,507,0,1,2022-04-05 10:06:34,1,10,2022-04-05
1033224,507,0,1,2022-04-05 10:16:40,1,10,2022-04-05
1033335,507,0,1,2022-04-05 10:16:40,1,10,2022-04-05
...,...,...,...,...,...,...,...
1199169,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1199280,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1199391,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1199502,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05


In [61]:
df_avail = df_avail[df_avail.number != 507]


In [62]:
df_avail["number"].describe().T

count     1198139
unique        110
top            61
freq        10894
Name: number, dtype: int64

#### Attempt at combining the two data frames 

In [20]:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

df_combine = pd.merge(df_avail, df_weather, on='just_date', how="outer")

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')
                                      
# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [21]:
# df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:29:16,2022-03-30
1,803,broken clouds,279.49,78,8.23,2022-03-30 18:55:15,2022-03-30 15:34:17,2022-03-30
2,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:39:17,2022-03-30
3,803,broken clouds,279.61,78,8.23,2022-03-30 18:55:15,2022-03-30 15:44:17,2022-03-30
4,803,broken clouds,279.45,78,7.20,2022-03-30 18:55:15,2022-03-30 15:49:18,2022-03-30
...,...,...,...,...,...,...,...,...
3167,803,broken clouds,283.05,74,7.72,2022-04-10 19:15:05,2022-04-10 15:36:12,2022-04-10
3168,803,broken clouds,282.81,76,7.72,2022-04-10 19:15:05,2022-04-10 15:41:12,2022-04-10
3169,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:46:13,2022-04-10
3170,803,broken clouds,282.49,77,7.20,2022-04-10 19:15:05,2022-04-10 15:51:13,2022-04-10


In [22]:
# df_combine
# inner_merged

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23,,,,,,NaT,NaT
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23,,,,,,NaT,NaT
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23,,,,,,NaT,NaT
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23,,,,,,NaT,NaT
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23,,,,,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96521308,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.05,74.0,7.72,2022-04-10 19:15:05,2022-04-10 15:36:12
96521309,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,282.81,76.0,7.72,2022-04-10 19:15:05,2022-04-10 15:41:12
96521310,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,282.49,77.0,7.20,2022-04-10 19:15:05,2022-04-10 15:46:13
96521311,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,282.49,77.0,7.20,2022-04-10 19:15:05,2022-04-10 15:51:13


In [23]:
# df_combine.tail(40)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date,id,description1,temperature,humidity,windspeed,sunset,TIME
96521273,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.21,69.0,6.69,2022-04-10 19:15:05,2022-04-10 12:41:05
96521274,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.09,70.0,6.71,2022-04-10 19:15:05,2022-04-10 12:46:05
96521275,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.09,70.0,6.71,2022-04-10 19:15:05,2022-04-10 12:51:05
96521276,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.09,70.0,6.71,2022-04-10 19:15:05,2022-04-10 12:56:06
96521277,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.16,71.0,6.71,2022-04-10 19:15:04,2022-04-10 13:01:06
96521278,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.54,70.0,5.66,2022-04-10 19:15:04,2022-04-10 13:06:06
96521279,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.54,70.0,5.66,2022-04-10 19:15:04,2022-04-10 13:11:06
96521280,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.61,70.0,6.69,2022-04-10 19:15:05,2022-04-10 13:16:06
96521281,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.52,70.0,5.66,2022-04-10 19:15:05,2022-04-10 13:21:06
96521282,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10,803.0,broken clouds,283.52,70.0,5.66,2022-04-10 19:15:05,2022-04-10 13:26:07


### Looking at correlations

In [25]:
# df_combine.corr()

In [26]:
df_avail.head(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23


In [27]:
df_avail.tail(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
1199637,39,9,11,2022-04-10 15:49:38,6,15,2022-04-10
1199638,83,19,21,2022-04-10 15:55:01,6,15,2022-04-10
1199639,92,37,3,2022-04-10 15:47:09,6,15,2022-04-10
1199640,21,27,2,2022-04-10 15:47:57,6,15,2022-04-10
1199641,88,4,26,2022-04-10 15:53:21,6,15,2022-04-10


In [28]:
df_avail.shape

(1199642, 7)

### Training Model for available bikes 

In [29]:
train_feature = ["day","hour","number"]
target_feature1 = ['available_bikes']

train = df_avail[train_feature]
target = df_avail[target_feature1]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
732647,5,8,13
1084773,3,1,112
412951,2,4,98
987946,6,23,68
505008,5,3,105
...,...,...,...
994070,0,4,109
1027719,1,6,2
897911,4,3,36
601961,1,4,56


In [30]:
LR = LinearRegression()
LR.fit(Xtrain,Ytrain)




LinearRegression()

In [31]:
LR.score(Xtest,Ytest)



0.07740671442061542

In [32]:
y_prediction =  LR.predict(Xtest)
y_prediction



array([[23.30136018],
       [23.06076752],
       [16.51853188],
       ...,
       [19.62391581],
       [21.69947684],
       [16.85380042]])

### Example of Prediction

- on Monday, 2pm, station: 100

In [33]:
LR.predict([[1, 14, 100]])

array([[22.26255491]])

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [34]:
score=r2_score(Ytest,y_prediction)
print('r2 score is ',score)

r2 score is  0.07740671442061542


#### Mean Square Error(MSE)

In [35]:
print('MSE is ',mean_squared_error(Ytest,y_prediction))

MSE is  105.45568896571297


#### Mean Absolute Error

In [36]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction))

Mean Absolute Error is  8.340155664804332


## Using a Random Forest Model for bike availabilty

In [37]:
random_forest = RandomForestRegressor(random_state=0)
random_forest = random_forest.fit(Xtrain,Ytrain.available_bikes)
score = random_forest.score(Xtest,Ytest.available_bikes)
score



0.6543620268229524

In [38]:
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF



array([ 2.86038619, 20.13492949, 13.81667905, ..., 22.15977969,
       22.25888979, 23.59690245])

### Metrics for model evaluation in random forrest:  

#### R squared

In [39]:
score=r2_score(Ytest,y_prediction_RF)
print('r2 score is ',score)

r2 score is  0.6543620268229524


#### Mean squared error

In [40]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF))

MSE is  39.50764780518432


#### Mean absolute error

In [41]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF))

Mean Absolute Error is  4.862389647866393


### Example of prediction

In [42]:
random_forest.predict([[1, 14, 100]])

array([22.35225734])

### Feature Importance

In [46]:
random_forest.feature_importances_


array([0.1413276 , 0.15097742, 0.70769497])

### Saving model to disk with Pickle:

In [47]:
# pickle.dump(random_forest, open('model.pkl', 'wb'))

In [48]:
#testing

In [49]:
# model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availabilty:

In [50]:
target_feature2 = ['available_bike_stands']

train = df_avail[train_feature]
target = df_avail[target_feature2]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
511128,5,7,9
205827,2,15,87
213081,2,20,98
995046,0,5,12
926282,5,0,79
...,...,...,...
717190,4,20,79
745378,5,17,109
415782,2,7,33
674815,3,12,61


In [51]:
random_forest_2 = RandomForestRegressor(random_state=1)
random_forest_2 = random_forest_2.fit(Xtrain,Ytrain.available_bike_stands)
score_2 = random_forest_2.score(Xtest,Ytest.available_bike_stands)
score_2



0.5421200032433784

In [52]:
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2



array([21.58022481, 16.49512351,  9.08645231, ..., 14.08416936,
       13.85269262,  3.06311305])

### R2 score for RF bike stands

In [53]:
score =r2_score(Ytest,y_prediction_RF_2)
print('r2 score is ',score)

r2 score is  0.5421200032433784


### Mean squared error for RF bike stands

In [54]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF_2))

MSE is  37.52983166368792


### Mean absolute error for RF bike stands

In [55]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF_2))

Mean Absolute Error is  4.784530371837436


### Example prediction for RF bike stands

In [56]:
random_forest_2.predict([[1, 14, 100]])

array([2.96496221])

### Feature Importance

In [57]:
random_forest_2.feature_importances_


array([0.21863274, 0.20083739, 0.58052987])

### Saving model to pickle file

In [58]:
# pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))