### Linear Regression Model for predicting Bike Availability: 


- Here we will be implementing a linear regression model in order to predict the number of bikes available and the number of bike stands available at a give bike stand. 
- Linear regression is a statistical method for modeling relationships between a dependent variable with a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [25]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model



### Connect to database:

In [2]:
URL = "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com"
PORT = "3306"
DB = "dbikes"
USER = "admin"
PASSWORD = "Dbikes123"


### Weather Data 

In [3]:
def weather():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset FROM weather;
    """
    df_weather = pd.read_sql_query(sql_query_weather, engine)

    return df_weather


df_weather = weather()

2022-03-19 12:49:16,708 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-03-19 12:49:16,714 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:16,835 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-03-19 12:49:16,837 INFO sqlalchemy.engine.Engine [generated in 0.00212s] ()
2022-03-19 12:49:17,072 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-03-19 12:49:17,074 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:17,430 INFO sqlalchemy.engine.Engine 
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset FROM weather;
    
2022-03-19 12:49:17,432 INFO sqlalchemy.engine.Engine [raw sql] ()


In [4]:
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset
0,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
1,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33
2,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
3,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
4,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
...,...,...,...,...,...,...
1984,801,few clouds,285.99,54,10.29,2022-03-19 18:35:22
1985,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22
1986,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22
1987,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22


In [5]:
df_weather['just_date'] = df_weather['sunset'].dt.date
df_weather

Unnamed: 0,id,description1,temperature,humidity,windspeed,sunset,just_date
0,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33,2022-03-12
1,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33,2022-03-12
2,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33,2022-03-12
3,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33,2022-03-12
4,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33,2022-03-12
...,...,...,...,...,...,...,...
1984,801,few clouds,285.99,54,10.29,2022-03-19 18:35:22,2022-03-19
1985,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22,2022-03-19
1986,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22,2022-03-19
1987,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22,2022-03-19


In [6]:
df_weather.dtypes


id                       int64
description1            object
temperature            float64
humidity                 int64
windspeed              float64
sunset          datetime64[ns]
just_date               object
dtype: object

### Availablity Data

In [7]:
def availability():
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    df_avail = pd.read_sql_table("availability", engine)
    return df_avail

In [8]:
df_avail = availability()

2022-03-19 12:49:18,936 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'sql_mode'
2022-03-19 12:49:18,939 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:19,061 INFO sqlalchemy.engine.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2022-03-19 12:49:19,063 INFO sqlalchemy.engine.Engine [generated in 0.00231s] ()
2022-03-19 12:49:19,303 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-03-19 12:49:19,305 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:19,791 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-03-19 12:49:19,794 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:19,914 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `dbikes`
2022-03-19 12:49:19,916 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:20,074 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `availability`
2022-03-19 12:49:20,076 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-19 12:49:20,321 INFO sqlalchemy.engine.Engine SELECT availabili

In [9]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update
0,42,16,14,2022-02-23 19:50:20
1,30,0,20,2022-02-23 19:41:25
2,54,11,22,2022-02-23 19:48:38
3,108,16,19,2022-02-23 19:51:13
4,56,2,38,2022-02-23 19:45:20
...,...,...,...,...
738462,39,3,17,2022-03-19 12:40:06
738463,83,12,28,2022-03-19 12:38:38
738464,92,29,11,2022-03-19 12:41:50
738465,21,30,0,2022-03-19 12:41:58


In [10]:
df_avail['day'] = df_avail['last_update'].dt.day_name()
df_avail['just_date'] = df_avail['last_update'].dt.date




In [11]:
df_avail

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,just_date
0,42,16,14,2022-02-23 19:50:20,Wednesday,2022-02-23
1,30,0,20,2022-02-23 19:41:25,Wednesday,2022-02-23
2,54,11,22,2022-02-23 19:48:38,Wednesday,2022-02-23
3,108,16,19,2022-02-23 19:51:13,Wednesday,2022-02-23
4,56,2,38,2022-02-23 19:45:20,Wednesday,2022-02-23
...,...,...,...,...,...,...
738462,39,3,17,2022-03-19 12:40:06,Saturday,2022-03-19
738463,83,12,28,2022-03-19 12:38:38,Saturday,2022-03-19
738464,92,29,11,2022-03-19 12:41:50,Saturday,2022-03-19
738465,21,30,0,2022-03-19 12:41:58,Saturday,2022-03-19


In [12]:
df_avail.corr()

Unnamed: 0,number,available_bike_stands,available_bikes
number,1.0,0.093731,0.333753
available_bike_stands,0.093731,1.0,-0.663657
available_bikes,0.333753,-0.663657,1.0


In [13]:
df_avail.groupby(['day']).mean()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Friday,60.334596,12.631224,19.067499
Monday,60.24776,12.648688,19.307449
Saturday,60.32339,12.846055,19.051525
Sunday,60.32889,12.869888,19.143331
Thursday,60.323026,12.581764,18.723763
Tuesday,60.391042,12.590967,19.407145
Wednesday,60.322694,12.667739,18.961902


In [14]:
df_avail.dtypes


number                            int64
available_bike_stands             int64
available_bikes                   int64
last_update              datetime64[ns]
day                              object
just_date                        object
dtype: object

In [15]:
df_avail["number"] = df_avail["number"].astype('category')  


In [16]:
df_avail.shape

(738467, 6)

In [17]:
df_avail.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
available_bike_stands,738467.0,12.68565,8.928861,0.0,5.0,12.0,19.0,40.0
available_bikes,738467.0,19.0536,10.499396,0.0,11.0,19.0,27.0,40.0


In [18]:
df_avail["number"].describe().T

count     738467
unique       110
top           61
freq        6715
Name: number, dtype: int64

#### Combining the two data frames 

In [19]:
df_combine= df_avail.merge(df_weather,on='just_date')


In [20]:
df_combine

Unnamed: 0,number,available_bike_stands,available_bikes,last_update,day,just_date,id,description1,temperature,humidity,windspeed,sunset
0,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
1,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.88,77,6.69,2022-03-12 18:22:33
2,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.82,77,6.69,2022-03-12 18:22:33
3,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
4,30,0,20,2022-03-12 00:01:03,Saturday,2022-03-12,803,broken clouds,281.72,78,6.69,2022-03-12 18:22:33
...,...,...,...,...,...,...,...,...,...,...,...,...
60411316,88,4,26,2022-03-19 12:42:18,Saturday,2022-03-19,801,few clouds,285.99,54,10.29,2022-03-19 18:35:22
60411317,88,4,26,2022-03-19 12:42:18,Saturday,2022-03-19,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22
60411318,88,4,26,2022-03-19 12:42:18,Saturday,2022-03-19,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22
60411319,88,4,26,2022-03-19 12:42:18,Saturday,2022-03-19,801,few clouds,286.08,54,10.29,2022-03-19 18:35:22


### Looking at correlations

In [21]:
df_combine.corr()

Unnamed: 0,available_bike_stands,available_bikes,id,temperature,humidity,windspeed
available_bike_stands,1.0,-0.609249,-0.002219,0.001015,0.004886,0.013463
available_bikes,-0.609249,1.0,-0.006477,-0.007569,0.006459,0.009588
id,-0.002219,-0.006477,1.0,0.100417,-0.298404,-0.124181
temperature,0.001015,-0.007569,0.100417,1.0,-0.619426,0.379845
humidity,0.004886,0.006459,-0.298404,-0.619426,1.0,-0.003094
windspeed,0.013463,0.009588,-0.124181,0.379845,-0.003094,1.0


### Regression Model 

In [22]:
df_avail_humidity = df_combine[['available_bikes', 'humidity']].copy()


In [23]:
df_avail_humidity

Unnamed: 0,available_bikes,humidity
0,20,77
1,20,77
2,20,77
3,20,78
4,20,78
...,...,...
60411316,26,54
60411317,26,54
60411318,26,54
60411319,26,54


In [28]:
new_df = df_avail_humidity.drop('available_bikes',axis='columns')
new_df


Unnamed: 0,humidity
0,77
1,77
2,77
3,78
4,78
...,...
60411316,54
60411317,54
60411318,54
60411319,54


In [30]:
available_bikes = df_avail_humidity.available_bikes
available_bikes


0           20
1           20
2           20
3           20
4           20
            ..
60411316    26
60411317    26
60411318    26
60411319    26
60411320    26
Name: available_bikes, Length: 60411321, dtype: int64

In [31]:
reg = linear_model.LinearRegression()
reg.fit(new_df, available_bikes)

LinearRegression()

In [32]:
reg.coef_


array([0.00822576])

In [33]:
reg.predict([[77]])




array([18.49505909])