# Machine learning for predicting Bike availability and Bike stand availability

##### Linear Regression Model for predicting Bike Availability: 


- Here we first implement a linear regression model to predict the number of bikes available and the number of bike stands available at a given bike station. 
- Linear regression is a statistical method for modeling the relationship between a dependent variable and a given set of independent variables.
- In our model the dependent variable will be number of bikes/bike stands and the independent variables will be time of day, day of the week, area, and weather.  

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine

import pickle

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.ensemble import RandomForestRegressor





### Connect to database:

In [None]:
import os

# Connection settings for the MySQL instance holding the bike and weather tables.
# Every value can be overridden through an environment variable so credentials do
# not have to live in the notebook; the literals remain only as backward-compatible
# fallbacks.
# NOTE(review): the fallback password is committed in plain text - rotate it and
# drop the default once DBIKES_DB_PASSWORD is provisioned.
URL = os.environ.get("DBIKES_DB_HOST", "dublin-bikesdb.cmd8vuwgew1e.us-east-1.rds.amazonaws.com")
PORT = os.environ.get("DBIKES_DB_PORT", "3306")
DB = os.environ.get("DBIKES_DB_NAME", "dbikes")
USER = os.environ.get("DBIKES_DB_USER", "admin")
PASSWORD = os.environ.get("DBIKES_DB_PASSWORD", "Dbikes123")


### Weather Data 

In [None]:
def weather():
    """Load the weather observations from the ``weather`` table.

    Returns:
        pandas.DataFrame: one row per stored observation with the columns
        ``id``, ``description1``, ``temperature``, ``humidity``,
        ``windspeed``, ``sunset`` and ``TIME``.
    """
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    sql_query_weather= """
    SELECT weather.id, weather.description1, weather.temperature, weather.humidity, weather.windspeed, weather.sunset, weather.TIME FROM weather;
    """
    try:
        df_weather = pd.read_sql_query(sql_query_weather, engine)
    finally:
        # The engine is created per call, so release its pooled connections
        # instead of leaving them open against the database.
        engine.dispose()

    return df_weather


df_weather = weather()

In [None]:
# Preview the weather frame returned by weather().
df_weather

In [None]:
# Calendar date (time component dropped) of each row's `sunset` timestamp;
# `just_date` is the key later used to join weather onto availability.
# NOTE(review): the date comes from `sunset`, not from `TIME` - confirm intended.
sunset_dates = df_weather['sunset'].dt.date
df_weather['just_date'] = sunset_dates
df_weather

In [None]:
# Column dtypes of the weather frame (now including the derived `just_date`).
df_weather.dtypes


### Availability Data

In [None]:
def availability():
    """Load the complete ``availability`` table.

    Returns:
        pandas.DataFrame: every row and column of the ``availability`` table.
    """
    engine = create_engine("mysql+mysqldb://{}:{}@{}:{}/{}".format(USER, PASSWORD, URL, PORT, DB), echo=True)
    try:
        df_avail = pd.read_sql_table("availability", engine)
    finally:
        # The engine is created per call, so release its pooled connections
        # instead of leaving them open against the database.
        engine.dispose()
    return df_avail

In [None]:
# Pull the whole availability table into memory; every later cell works on this frame.
df_avail = availability()

In [None]:
# Preview the raw availability frame.
df_avail

In [None]:
# Derive the calendar features from each row's `last_update` timestamp.
update_ts = df_avail['last_update']
df_avail['day'] = update_ts.dt.day_of_week    # pandas convention: Monday = 0 ... Sunday = 6
df_avail['hour'] = update_ts.dt.hour          # hour of day, 0-23
df_avail['just_date'] = update_ts.dt.date     # calendar date, join key for the weather frame




In [None]:
# Preview the frame with the derived `day`, `hour` and `just_date` columns.
df_avail

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# The frame also holds timestamp and date-object columns; pandas >= 2.0 no longer
# drops those silently in corr() and raises instead, so select the numeric
# columns explicitly (this form works on older pandas as well).
df_avail.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Average of every numeric column per weekday (0 = Monday ... 6 = Sunday).
# numeric_only=True keeps the non-numeric columns (e.g. `just_date`) out of the
# aggregation; pandas >= 2.0 raises on them instead of dropping them.
df_avail.groupby(['day']).mean(numeric_only=True)

In [None]:
# Column dtypes of the availability frame before `number` is made categorical.
df_avail.dtypes


In [None]:
# `number` is a station identifier rather than a quantity, so store it as a categorical.
df_avail = df_avail.astype({"number": "category"})


In [None]:
# (rows, columns) of the availability frame.
df_avail.shape

In [None]:
# Summary statistics, transposed so that each column becomes one row.
df_avail.describe().transpose()

### Why are there 111 stations? There should only be 110.

In [None]:
# Categorical summary of the station ids (count / unique / top / freq).
# Transposing a Series returns the Series itself, so `.T` is not needed.
df_avail["number"].describe()

### Removing illogical data from dataset

In [None]:
# Show the rows recorded for station number 507.
is_station_507 = df_avail['number'] == 507
df_avail.loc[is_station_507]


In [None]:
# Keep every row except those belonging to station number 507.
df_avail = df_avail.loc[df_avail["number"] != 507]


In [None]:
# Re-check the station-id summary after removing station 507.
df_avail["number"].describe()

#### Attempt at combining the two data frames 

In [None]:
# Join availability and weather on the shared calendar date. The outer join keeps
# dates that appear in only one of the two frames.
# NOTE(review): `just_date` is presumably not unique in either frame, which makes
# this a many-to-many join (rows multiply per date) - confirm before relying on it.
df_combine = df_avail.merge(df_weather, on='just_date', how="outer")

# Earlier experiments, kept for reference:
# df_combine= df_avail.merge(df_weather,join='inner', on=['just_date'])

# df_weather = df_weather.just_date.map(df_avail.set_index('just_date')

# pd.merge_asof(df_weather, df_avail, left_on='just_date', right_on='just_date')


# df_combine= df_avail.merge(df_weather,on=['just_date'])

# inner_merged = pd.concat([df_avail, df_weather],on=['just_date'])

# v = df1.merge(df2[['Date', 'exp']])\
#        .groupby(df1.columns.tolist())\
#        .exp\
#        .apply(pd.Series.tolist)


In [None]:
# df_weather

In [None]:
# df_combine
# inner_merged

In [None]:
# df_combine.tail(40)

### Looking at correlations

In [None]:
# df_combine.corr()

In [None]:
# First five rows (five is the default for head()).
df_avail.head()

In [None]:
# Last five rows (five is the default for tail()).
df_avail.tail()

In [None]:
# (rows, columns) after station 507 has been removed.
df_avail.shape

### Training Model for available bikes 

In [None]:
# Model inputs: weekday (0 = Monday), hour of day and station number.
train_feature = ["day","hour","number"]
# Prediction target: number of bikes available at the station.
target_feature1 = ['available_bikes']

train = df_avail[train_feature]
target = df_avail[target_feature1]
# Hold out 30% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric reported below, is reproducible between runs.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3,random_state=42)
Xtrain

In [None]:
# Ordinary least-squares fit of available_bikes on day, hour and station number.
# NOTE(review): the station number enters as a plain numeric feature, so the model
# treats station ids as ordered quantities - confirm this is intended.
LR = LinearRegression()
LR.fit(X=Xtrain, y=Ytrain)


In [None]:
# Coefficient of determination (R^2) on the held-out rows.
LR.score(X=Xtest, y=Ytest)

In [None]:
# Linear-regression predictions for the held-out rows; reused by the metric cells below.
y_prediction =  LR.predict(Xtest)
y_prediction

### Example of Prediction

- on Monday, 2pm, station: 100

In [None]:
# Monday at 14:00 for station 100. `day` comes from `dt.day_of_week`, where
# Monday is 0 (1 would be Tuesday). The sample is passed as a DataFrame with the
# training column names so it matches the features the model was fitted on.
LR.predict(pd.DataFrame([[0, 14, 100]], columns=train_feature))

### Metrics for model evaluation in linear regression:  

#### R Square/Adjusted R Square

In [None]:
# R^2 of the linear model on the held-out rows.
score = r2_score(y_true=Ytest, y_pred=y_prediction)
print('r2 score is ', score)

#### Mean Square Error(MSE)

In [None]:
# Mean squared error of the linear model on the held-out rows.
print('MSE is ', mean_squared_error(y_true=Ytest, y_pred=y_prediction))

#### Mean Absolute Error

In [None]:
# Mean absolute error of the linear model on the held-out rows.
print('Mean Absolute Error is ', mean_absolute_error(y_true=Ytest, y_pred=y_prediction))

## Using a Random Forest Model for bike availability

In [None]:
# Random forest for available_bikes. The target is passed as a 1-D Series;
# random_state=0 fixes the forest's own randomness.
random_forest = RandomForestRegressor(random_state=0).fit(Xtrain, Ytrain["available_bikes"])
# R^2 on the held-out rows.
score = random_forest.score(Xtest, Ytest["available_bikes"])
score

In [None]:
# Random-forest predictions for the held-out rows; reused by the metric cells below.
y_prediction_RF =  random_forest.predict(Xtest)
y_prediction_RF

### Metrics for model evaluation in random forest:  

#### R squared

In [None]:
# R^2 of the random forest on the held-out rows.
score = r2_score(y_true=Ytest, y_pred=y_prediction_RF)
print('r2 score is ', score)

#### Mean squared error

In [None]:
# Mean squared error of the random forest on the held-out rows.
print('MSE is ', mean_squared_error(y_true=Ytest, y_pred=y_prediction_RF))

#### Mean absolute error

In [None]:
# Mean absolute error of the random forest on the held-out rows.
print('Mean Absolute Error is ', mean_absolute_error(y_true=Ytest, y_pred=y_prediction_RF))

### Example of prediction

In [None]:
# Single-sample prediction; feature order is [day, hour, number].
# day=1 is Tuesday, because `day` comes from `dt.day_of_week` (Monday = 0).
random_forest.predict([[1, 14, 100]])

### Feature Importance

In [None]:
# Importance of each input, in the order of `train_feature` (day, hour, number).
random_forest.feature_importances_


### Saving model to disk with Pickle:

In [None]:
# pickle.dump(random_forest, open('model.pkl', 'wb'))

In [None]:
#testing

In [None]:
# model= pickle.load(open('model.pkl', 'rb'))

### Making model for bike stand availability:

In [None]:
# Prediction target for the second model: number of free stands at the station.
target_feature2 = ['available_bike_stands']

# Same inputs as the bikes model (`train_feature`: day, hour, number).
train = df_avail[train_feature]
target = df_avail[target_feature2]
# Hold out 30% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric reported below, is reproducible between runs.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(train,target,test_size=0.3,random_state=42)
Xtrain

In [None]:
# Random forest for available_bike_stands. The target is passed as a 1-D Series;
# random_state=1 fixes the forest's own randomness.
random_forest_2 = RandomForestRegressor(random_state=1).fit(Xtrain, Ytrain["available_bike_stands"])
# R^2 on the held-out rows.
score_2 = random_forest_2.score(Xtest, Ytest["available_bike_stands"])
score_2

In [None]:
# Bike-stand forest predictions for the held-out rows; reused by the metric cells below.
y_prediction_RF_2 =  random_forest_2.predict(Xtest)
y_prediction_RF_2

### R2 score for RF bike stands

In [None]:
# R^2 of the bike-stand forest on the held-out rows.
score = r2_score(y_true=Ytest, y_pred=y_prediction_RF_2)
print('r2 score is ', score)

### Mean squared error for RF bike stands

In [None]:
# Mean squared error of the bike-stand forest on the held-out rows.
print('MSE is ', mean_squared_error(y_true=Ytest, y_pred=y_prediction_RF_2))

### Mean absolute error for RF bike stands

In [None]:
# Mean absolute error of the bike-stand forest on the held-out rows.
print('Mean Absolute Error is ', mean_absolute_error(y_true=Ytest, y_pred=y_prediction_RF_2))

### Example prediction for RF bike stands

In [None]:
# Single-sample prediction; feature order is [day, hour, number].
# day=1 is Tuesday, because `day` comes from `dt.day_of_week` (Monday = 0).
random_forest_2.predict([[1, 14, 100]])

### Feature Importance

In [None]:
# Importance of each input, in the order of `train_feature` (day, hour, number).
random_forest_2.feature_importances_


### Saving model to pickle file

In [None]:
# pickle.dump(random_forest_2, open('model_stands.pkl', 'wb'))