### Linear Regression Model for predicting Bike Availability: 


- Here we will be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model



### Connect to database:

In [2]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [3]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-03-20 17:48:37,841 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-03-20 17:48:37,846 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:37,966 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-03-20 17:48:37,968 INFO sqlalchemy.engine.Engine [generated in 0.00179s] ()
2022-03-20 17:48:38,209 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-03-20 17:48:38,212 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:38,573 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset FROM weather;
    
2022-03-20 17:48:38,576 INFO sqlalchemy.engine.Engine [raw sql] ()


In [4]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset
0,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
1,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33
2,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
3,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
4,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
...,...,...,...,...,...,...
2331,802,scattered clouds,282.57,67,4.63,2022-03-20 18:37:11
2332,803,broken clouds,282.40,66,5.66,2022-03-20 18:37:11
2333,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11
2334,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11


In [5]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,just_date
0,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33,2022-03-12
1,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33,2022-03-12
2,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33,2022-03-12
3,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33,2022-03-12
4,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33,2022-03-12
...,...,...,...,...,...,...,...
2331,802,scattered clouds,282.57,67,4.63,2022-03-20 18:37:11,2022-03-20
2332,803,broken clouds,282.40,66,5.66,2022-03-20 18:37:11,2022-03-20
2333,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11,2022-03-20
2334,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11,2022-03-20


In [6]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [7]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [8]:
df_avail = availability()

2022-03-20 17:48:40,258 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-03-20 17:48:40,260 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:40,384 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-03-20 17:48:40,385 INFO sqlalchemy.engine.Engine [generated in 0.00151s] ()
2022-03-20 17:48:40,628 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-03-20 17:48:40,631 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:41,121 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-03-20 17:48:41,125 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:41,261 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-03-20 17:48:41,264 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:41,439 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-03-20 17:48:41,441 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-20 17:48:41,721 INFO sqlalchemy.engine.Engine SELECT availabili

In [9]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
776522,39,1,19,2022-03-20 17:45:09
776523,83,14,26,2022-03-20 17:42:51
776524,92,38,2,2022-03-20 17:42:05
776525,21,14,16,2022-03-20 17:44:13


In [10]:
df_avail['day'] = df_avail['last_update'].dt.day_name()
df_avail['just_date'] = df_avail['last_update'].dt.date




In [11]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,just_date
0,42,16,14,2022-02-23 19:50:20,Wednesday,2022-02-23
1,30,0,20,2022-02-23 19:41:25,Wednesday,2022-02-23
2,54,11,22,2022-02-23 19:48:38,Wednesday,2022-02-23
3,108,16,19,2022-02-23 19:51:13,Wednesday,2022-02-23
4,56,2,38,2022-02-23 19:45:20,Wednesday,2022-02-23
...,...,...,...,...,...,...
776522,39,1,19,2022-03-20 17:45:09,Sunday,2022-03-20
776523,83,14,26,2022-03-20 17:42:51,Sunday,2022-03-20
776524,92,38,2,2022-03-20 17:42:05,Sunday,2022-03-20
776525,21,14,16,2022-03-20 17:44:13,Sunday,2022-03-20


In [12]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes
number,1.0,0.082009,0.339029
available_bike_stands,0.082009,1.0,-0.668989
available_bikes,0.339029,-0.668989,1.0


In [13]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Friday,60.334596,12.631224,19.067499
Monday,60.24776,12.648688,19.307449
Saturday,60.326612,12.796052,19.048369
Sunday,60.323301,12.814401,19.022784
Thursday,60.323026,12.581764,18.723763
Tuesday,60.391042,12.590967,19.407145
Wednesday,60.322694,12.667739,18.961902


In [14]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                              object
just_date                        object
dtype: object

In [15]:
df_avail["number"] = df_avail["number"].astype('category')  


In [16]:
df_avail.shape

(776527, 6)

In [17]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,776527.0,12.678027,9.029941,0.0,5.0,12.0,19.0,40.0
available_bikes,776527.0,19.042221,10.624696,0.0,11.0,19.0,27.0,40.0


In [18]:
df_avail["number"].describe().T

count     776527
unique       110
top           61
freq        7061
Name: number, dtype: int64

#### Combining the two data frames 

In [19]:
df_combine= df_avail.merge(df_weather,on='just_date')


In [20]:
df_combine

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,just_date,id,description1,temperature,humidity,windspeed,sunset
0,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
1,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33
2,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
3,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
4,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
...,...,...,...,...,...,...,...,...,...,...,...,...
71868405,88,17,13,2022-03-20 17:41:47,Sunday,2022-03-20,802,scattered clouds,282.57,67,4.63,2022-03-20 18:37:11
71868406,88,17,13,2022-03-20 17:41:47,Sunday,2022-03-20,803,broken clouds,282.40,66,5.66,2022-03-20 18:37:11
71868407,88,17,13,2022-03-20 17:41:47,Sunday,2022-03-20,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11
71868408,88,17,13,2022-03-20 17:41:47,Sunday,2022-03-20,803,broken clouds,282.29,66,5.66,2022-03-20 18:37:11


### Looking at correlations

In [21]:
df_combine.corr()

Unnamed: 0,available_bike_stands,available_bikes,id,temperature,humidity,windspeed
available_bike_stands,1.0,-0.634947,-0.002354,-7.5e-05,0.004296,0.010958
available_bikes,-0.634947,1.0,-0.004442,-0.003738,0.00135,0.009577
id,-0.002354,-0.004442,1.0,0.130238,-0.306042,-0.098931
temperature,-7.5e-05,-0.003738,0.130238,1.0,-0.680108,0.408121
humidity,0.004296,0.00135,-0.306042,-0.680108,1.0,-0.104485
windspeed,0.010958,0.009577,-0.098931,0.408121,-0.104485,1.0


### Regression Model 

In [22]:
df_avail_humidity = df_combine[['available_bikes', 'humidity']].copy()


In [23]:
df_avail_humidity

Unnamed: 0,available_bikes,humidity
0,20,77
1,20,77
2,20,77
3,20,78
4,20,78
...,...,...
71868405,13,67
71868406,13,66
71868407,13,66
71868408,13,66


In [24]:
new_df = df_avail_humidity.drop('available_bikes',axis='columns')
new_df


Unnamed: 0,humidity
0,77
1,77
2,77
3,78
4,78
...,...
71868405,67
71868406,66
71868407,66
71868408,66


In [25]:
available_bikes = df_avail_humidity.available_bikes
available_bikes


0           20
1           20
2           20
3           20
4           20
            ..
71868405    13
71868406    13
71868407    13
71868408    13
71868409    13
Name: available_bikes, Length: 71868410, dtype: int64

In [26]:
reg = linear_model.LinearRegression()
reg.fit(new_df, available_bikes)

LinearRegression()

In [27]:
reg.coef_


array([0.00157936])

In [28]:
reg.predict([[77]])




array([18.54998])