In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import scipy.stats as stats
import datetime

This notebook was used to train the bike availability models, using historic availability and weather data

## Data prep

In [3]:
# connect to database
engine = create_engine("mysql://admin:jcdgroup21@jcdecaux-bikes.cti0lbnfidpl.us-east-1.rds.amazonaws.com:3306/jcdecaux-bikes")

In [4]:
# get tables
available = pd.read_sql_table("available", engine)
stations = pd.read_sql_table("stations", engine)
weather = pd.read_sql_table("weather", engine)

In [5]:
# resample weather to average over 30 min intervals
weather_resampled = weather.set_index("time").resample("30min").agg({'temp': np.mean, 'wind_speed': np.mean, 'humidity': np.mean, 'type': lambda x: (stats.mode(x)[0])})

In [6]:
# create dictionary to hold prepared per station data
station = {}

# group available table by station number
grouped = available.groupby(available.number)

# get a list of all station numbers
stationNumbers = stations.number.unique()

# for each station
for i in stationNumbers:
    
    # collect all data entries for this station into new dataframe
    df_new = grouped.get_group(i)
    
    # drop duplicates from the dataframe
    df_new = df_new.drop_duplicates()
    
    # resample availability to average over 30 min intervals
    df_new = df_new.set_index("last_update").resample("30min").mean()
    
    # merge with weather dataframe
    merge = df_new.merge(weather_resampled, left_index = True, right_index = True)
    
    # create day, hour and minute categorical columns
    merge["day"] = merge.index.dayofweek
    merge["hour"] = merge.index.hour
    merge["Minute"] = merge.index.minute
    
    # remove nan rows, time periods when stations are closed
    merge = merge.dropna()
    
    # store in dictionary with station number as key
    station[i] = merge

#### cleaned data is now stored in a dictionary called 'station'

access the required station using the station number as key value, as shown below:

In [14]:
station[59].head()

Unnamed: 0_level_0,number,available_bike_stands,available_bikes,temp,wind_speed,humidity,type,day,hour,Minute
last_update,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-03-03 00:00:00,59.0,9.0,11.0,277.93,3.09,87.0,Clouds,2,0,0
2021-03-03 05:00:00,59.0,9.0,11.0,277.84,2.57,89.0,Clouds,2,5,0
2021-03-03 05:30:00,59.0,9.0,11.0,277.81,2.57,93.0,Clouds,2,5,30
2021-03-03 06:00:00,59.0,9.0,11.0,277.685,2.57,93.0,Clouds,2,6,0
2021-03-03 06:30:00,59.0,9.25,10.75,277.66,2.57,93.0,Clouds,2,6,30


## Data encoding

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
encodedStation = {}
for i in stationNumbers:
    type_encoder = OneHotEncoder()
    type_encoded = type_encoder.fit_transform(np.array(station[i]["type"]).reshape(-1,1))
    type_encoded = pd.DataFrame(type_encoded.toarray(), columns = [category for category in type_encoder.categories_[0]])
    temp = station[i].reset_index(drop=True)
    encodedStation[i] = pd.concat([type_encoded, temp[["day", "hour","Minute", "temp", "humidity", "wind_speed"]]], axis = 1)

In [17]:
yDataStation = {}
for i in stationNumbers:
    yDataStation[i] = station[i]["available_bikes"]

## Model

#### Random Forest Regressor (testing on one station)

In [18]:
X = encodedStation[90]
Y = yDataStation[90]

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [21]:
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(max_depth = 30)
reg.fit(X_train, Y_train)
Y_pred = reg.predict(X_test)

In [22]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
print("RMSE: ", mean_squared_error(Y_test, Y_pred, squared = False))
print("R2: ", r2_score(Y_test, Y_pred))
print("MAE: ", mean_absolute_error(Y_test, Y_pred))

RMSE:  3.4473012109804833
R2:  0.6242498254319916
MAE:  2.5211926392380026


These errors may not be the best but we have decided that with the limited time we were given for model training and the envirnoment around the pandemic messing with network usage, we will not be getting much better.

#### Training a model for each station

In [23]:
# create a dictionary to hold the models
station_models = {}

In [None]:
from sklearn.model_selection import GridSearchCV

# for each station, train model
for number in list(encodedStation.keys()):
    X_data = encodedStation[number]
    y_data = yDataStation[number]
    reg = RandomForestRegressor()
    # the hyper prarmeters i used
    params = {"criterion": ["mse", "mae"], "max_depth": [3,4,5,6], "min_samples_split" : [2,3,4,5], "bootstrap":[True, False]}
    # grid search
    clf = GridSearchCV(estimator=reg,
                 param_grid=params,
                 cv = 5,
                 verbose = True,
                 n_jobs = -1)

    # training
    clf.fit(X_data, y_data)

    # the model to be saved
    model = clf.best_estimator_
    station_models[number] = model

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 5 folds for each of 64 candidates, totalling 320 fits
Fitting 

#### Exporting model files

In [None]:
import pickle

In [None]:
for number in station_models.keys():
    model = station_models[number]
    # Here is how you save using pickle
    filename = 'models/station{id}_model.sav'.format(id = number)
    # model is the best estimator from the gridsearch
    pickle.dump(model, open(filename, 'wb'))