# Machine learning for predicitng Bike availability and Bike stand availabilty

##### Linear Regression Model for predicting Bike Availability: 


- Here we will first be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [2]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [3]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-04-14 09:11:06,287 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-14 09:11:06,293 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:06,389 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-14 09:11:06,392 INFO sqlalchemy.engine.Engine [generated in 0.00221s] ()
2022-04-14 09:11:06,578 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-14 09:11:06,581 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:06,855 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    
2022-04-14 09:11:06,857 INFO sqlalchemy.engine.Engine [raw sql] ()


In [4]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME
0,803,broken clouds,284.17,84,6.69,2022-04-11 19:16:53,2022-04-11 21:14:23
1,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:19:23
2,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:24:23
3,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:29:23
4,803,broken clouds,284.02,84,4.12,2022-04-11 19:16:54,2022-04-11 21:34:23
...,...,...,...,...,...,...,...
702,803,broken clouds,282.82,90,2.06,2022-04-14 19:22:20,2022-04-14 07:47:05
703,803,broken clouds,282.91,90,2.06,2022-04-14 19:22:20,2022-04-14 07:52:05
704,803,broken clouds,282.89,90,2.06,2022-04-14 19:22:19,2022-04-14 07:57:05
705,803,broken clouds,283.02,89,2.06,2022-04-14 19:22:19,2022-04-14 08:02:05


In [5]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,TIME,just_date
0,803,broken clouds,284.17,84,6.69,2022-04-11 19:16:53,2022-04-11 21:14:23,2022-04-11
1,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:19:23,2022-04-11
2,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:24:23,2022-04-11
3,803,broken clouds,284.12,84,4.12,2022-04-11 19:16:53,2022-04-11 21:29:23,2022-04-11
4,803,broken clouds,284.02,84,4.12,2022-04-11 19:16:54,2022-04-11 21:34:23,2022-04-11
...,...,...,...,...,...,...,...,...
702,803,broken clouds,282.82,90,2.06,2022-04-14 19:22:20,2022-04-14 07:47:05,2022-04-14
703,803,broken clouds,282.91,90,2.06,2022-04-14 19:22:20,2022-04-14 07:52:05,2022-04-14
704,803,broken clouds,282.89,90,2.06,2022-04-14 19:22:19,2022-04-14 07:57:05,2022-04-14
705,803,broken clouds,283.02,89,2.06,2022-04-14 19:22:19,2022-04-14 08:02:05,2022-04-14


In [6]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
TIME            datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [7]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [8]:
df_avail = availability()

2022-04-14 09:11:21,125 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-04-14 09:11:21,127 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:21,220 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-04-14 09:11:21,221 INFO sqlalchemy.engine.Engine [generated in 0.00201s] ()
2022-04-14 09:11:21,402 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-04-14 09:11:21,404 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:21,774 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-14 09:11:21,776 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:21,869 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-04-14 09:11:21,871 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:21,995 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-04-14 09:11:21,996 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-04-14 09:11:22,301 INFO sqlalchemy.engine.Engine SELECT availabili

In [9]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
1316520,39,8,12,2022-04-14 08:08:31
1316521,83,26,14,2022-04-14 08:00:31
1316522,92,24,15,2022-04-14 08:03:19
1316523,21,21,9,2022-04-14 08:02:57


In [10]:
df_avail['day'] = df_avail['last_update'].dt.day_of_week
df_avail['hour'] = df_avail['last_update'].dt.hour
df_avail['just_date'] = df_avail['last_update'].dt.date




In [11]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23
...,...,...,...,...,...,...,...
1316520,39,8,12,2022-04-14 08:08:31,3,8,2022-04-14
1316521,83,26,14,2022-04-14 08:00:31,3,8,2022-04-14
1316522,92,24,15,2022-04-14 08:03:19,3,8,2022-04-14
1316523,21,21,9,2022-04-14 08:02:57,3,8,2022-04-14


In [12]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes,day,hour
number,1.0,0.04422,0.246424,-0.024263,0.007145
available_bike_stands,0.04422,1.0,-0.681802,0.009805,-0.013674
available_bikes,0.246424,-0.681802,1.0,-0.003627,0.010891
day,-0.024263,0.009805,-0.003627,1.0,-0.015827
hour,0.007145,-0.013674,0.010891,-0.015827,1.0


In [13]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,hour
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,60.296856,12.60288,19.265501,11.537964
1,66.725767,12.449826,19.055054,12.053771
2,60.328563,12.692112,19.039132,11.4743
3,60.321533,12.599037,18.903449,11.959876
4,60.330264,12.634216,19.108924,11.501691
5,60.328145,12.796028,19.06683,11.405015
6,60.327173,12.814763,19.019129,11.486231


In [14]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                               int64
hour                              int64
just_date                        object
dtype: object

In [15]:
df_avail["number"] = df_avail["number"].astype('category')  


In [16]:
df_avail.shape

(1316525, 7)

In [17]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,1316525.0,12.65606,9.079112,0.0,5.0,12.0,19.0,40.0
available_bikes,1316525.0,19.061153,10.705276,0.0,11.0,19.0,27.0,40.0
day,1316525.0,3.092364,1.913203,0.0,1.0,3.0,5.0,6.0
hour,1316525.0,11.631434,6.914607,0.0,6.0,12.0,18.0,23.0


### why is there 111 stations there should only 110?

In [18]:
df_avail["number"].describe().T

count     1316525
unique        111
top            61
freq        11947
Name: number, dtype: int64

### Removing illogical data from dataset

In [19]:
df_avail.loc[df_avail['number']==507]


Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
1032891,507,0,1,2022-04-05 09:56:28,1,9,2022-04-05
1033002,507,0,1,2022-04-05 10:06:34,1,10,2022-04-05
1033113,507,0,1,2022-04-05 10:06:34,1,10,2022-04-05
1033224,507,0,1,2022-04-05 10:16:40,1,10,2022-04-05
1033335,507,0,1,2022-04-05 10:16:40,1,10,2022-04-05
...,...,...,...,...,...,...,...
1316052,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1316163,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1316274,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05
1316385,507,1,0,2022-04-05 14:15:05,1,14,2022-04-05


In [20]:
df_avail = df_avail[df_avail.number != 507]


In [21]:
df_avail["number"].describe().T

count     1313969
unique        110
top            61
freq        11947
Name: number, dtype: int64

#### Attempt at combining the two data frames 

In [23]:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

# df_combine = pd.merge(df_avail, df_weather, on='just_date', how="outer")

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')
                                      
# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [24]:
# df_weather

In [25]:
# df_combine
# inner_merged

In [26]:
# df_combine.tail(40)

### Looking at correlations

In [27]:
# df_combine.corr()

In [28]:
df_avail.head(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23


In [29]:
df_avail.tail(5)

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
1316520,39,8,12,2022-04-14 08:08:31,3,8,2022-04-14
1316521,83,26,14,2022-04-14 08:00:31,3,8,2022-04-14
1316522,92,24,15,2022-04-14 08:03:19,3,8,2022-04-14
1316523,21,21,9,2022-04-14 08:02:57,3,8,2022-04-14
1316524,88,16,14,2022-04-14 08:07:01,3,8,2022-04-14


In [30]:
df_avail.shape

(1313969, 7)

In [33]:
df_new = df_avail.head(len(df_avail)-1300000)
df_new

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,hour,just_date
0,42,16,14,2022-02-23 19:50:20,2,19,2022-02-23
1,30,0,20,2022-02-23 19:41:25,2,19,2022-02-23
2,54,11,22,2022-02-23 19:48:38,2,19,2022-02-23
3,108,16,19,2022-02-23 19:51:13,2,19,2022-02-23
4,56,2,38,2022-02-23 19:45:20,2,19,2022-02-23
...,...,...,...,...,...,...,...
13964,83,8,32,2022-02-24 13:14:56,3,13,2022-02-24
13965,92,1,39,2022-02-24 13:20:41,3,13,2022-02-24
13966,21,24,6,2022-02-24 13:21:10,3,13,2022-02-24
13967,88,6,24,2022-02-24 13:18:28,3,13,2022-02-24


### Training Model for available bikes 

In [34]:
train_feature = ["day","hour","number"]
target_feature1 = ['available_bikes']

train = df_new[train_feature]
target = df_new[target_feature1]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
13008,3,13,99
4608,3,9,77
4855,3,9,51
5490,3,10,29
12042,3,12,47
...,...,...,...
5269,3,10,40
5877,3,10,58
411,2,20,11
10862,3,12,65


In [35]:
LR = LinearRegression()
LR.fit(Xtrain,Ytrain)




LinearRegression()

In [36]:
LR.score(Xtest,Ytest)



0.09216225852060433

In [37]:
y_prediction =  LR.predict(Xtest)
y_prediction



array([[18.69791705],
       [23.34384975],
       [19.76260996],
       ...,
       [20.83560759],
       [15.78289266],
       [19.85940023]])

### Example of Prediction

- on Monday, 2pm, station: 100

In [38]:
LR.predict([[1, 14, 100]])

array([[20.77718377]])

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [39]:
score=r2_score(Ytest,y_prediction)
print('r2 score is ',score)

r2 score is  0.09216225852060433


#### Mean Square Error(MSE)

In [40]:
print('MSE is ',mean_squared_error(Ytest,y_prediction))

MSE is  89.80461453510334


#### Mean Absolute Error

In [41]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction))

Mean Absolute Error is  7.703008306334672


## Using a Random Forest Model for bike availabilty

In [42]:
random_forest = RandomForestRegressor(random_state=0)
random_forest = random_forest.fit(Xtrain,Ytrain.available_bikes)
score = random_forest.score(Xtest,Ytest.available_bikes)
score



0.9811793304150203

In [43]:
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF



array([28.60204636, 25.        , 25.        , ..., 31.39889732,
       17.76357693, 26.25623263])

### Metrics for model evaluation in random forrest:  

#### R squared

In [44]:
score=r2_score(Ytest,y_prediction_RF)
print('r2 score is ',score)

r2 score is  0.9811793304150203


#### Mean squared error

In [45]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF))

MSE is  1.8617676927788371


#### Mean absolute error

In [46]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF))

Mean Absolute Error is  0.7101426773830077


### Example of prediction

In [47]:
random_forest.predict([[1, 14, 100]])

array([10.38])

### Feature Importance

In [48]:
random_forest.feature_importances_


array([0.04673448, 0.10811427, 0.84515125])

### Saving model to disk with Pickle:

In [50]:
pickle.dump(random_forest, open('model.pkl', 'wb'))

In [51]:
#testing

In [53]:
model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availabilty:

In [56]:
target_feature2 = ['available_bike_stands']

train = df_new[train_feature]
target = df_new[target_feature2]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3)
Xtrain

Unnamed: 0,day,hour,number
955,2,21,112
10146,3,11,113
13487,3,13,95
5731,3,10,98
182,2,20,98
...,...,...,...
10266,3,11,61
10558,3,12,42
13883,3,13,114
10120,3,11,54


In [57]:
random_forest_2 = RandomForestRegressor(random_state=1)
random_forest_2 = random_forest_2.fit(Xtrain,Ytrain.available_bike_stands)
score_2 = random_forest_2.score(Xtest,Ytest.available_bike_stands)
score_2



0.9716916650705482

In [58]:
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2



array([ 8.        ,  6.19798131,  2.3963433 , ...,  5.14453341,
       11.91703414, 12.02      ])

### R2 score for RF bike stands

In [59]:
score =r2_score(Ytest,y_prediction_RF_2)
print('r2 score is ',score)

r2 score is  0.9716916650705482


### Mean squared error for RF bike stands

In [60]:
print('MSE is ',mean_squared_error(Ytest,y_prediction_RF_2))

MSE is  2.183514858172077


### Mean absolute error for RF bike stands

In [62]:
print('Mean Absolute Error is ',mean_absolute_error(Ytest,y_prediction_RF_2))

Mean Absolute Error is  0.7385325042302513


### Example prediction for RF bike stands

In [63]:
random_forest_2.predict([[1, 14, 100]])

array([12.70677381])

### Feature Importance

In [64]:
random_forest_2.feature_importances_


array([0.07371562, 0.16285749, 0.7634269 ])

### Saving model to pickle file

In [65]:
pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))