In [None]:
import random

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tensorflow as tf
from tensorflow import keras

# Data Formatting and Preprocessing

In [None]:
# Load the aggregated CSV (zipcode is read as a string), index it by the
# 'datetime' column, discard the columns that are not used downstream and
# order the rows by index.
data = (
    pd.read_csv('aggregated_dataset_updated_with_time_v2.csv', dtype={'zipcode': 'str'})
    .set_index('datetime')
    .drop(columns=['zipcode', 'datetime.1', 'change', 'date', 'parking_size', 'num_of_muni'])
    .sort_index()
)

In [None]:
# Display the first rows of the indexed, sorted frame as a quick sanity check.
data.head()

In [None]:
# Per-column aggregation rule used when collapsing rows that share a datetime:
#   'rental'                      -> sum
#   columns at positions 1..7     -> mean
#   columns at positions 8..end   -> max
# The rules are positional, so they depend on the column order of `data`.
agg_dict = {
    'rental': 'sum',
    **{data.columns[pos]: 'mean' for pos in range(1, 8)},
    **{data.columns[pos]: 'max' for pos in range(8, len(data.columns))},
}

In [None]:
# Collapse all rows sharing a datetime into one row using the per-column rules.
agg_data = data.groupby('datetime').agg(agg_dict)

# Keep rows up to the '2019-07-01' label. The explicit copy makes the result an
# independent frame, so the edits below never act on a view of the full table.
agg_data = agg_data.loc[:'2019-07-01'].copy()

# Forward-fill gaps in 'pres'. Assign the result back instead of calling
# fillna(..., inplace=True) on the selected column: the chained in-place form
# does not update the frame under pandas Copy-on-Write, and the `method=`
# keyword of fillna is deprecated.
agg_data['pres'] = agg_data['pres'].ffill()

agg_data = agg_data.drop(columns='Days Passed')

# Move 'Hour' to the last column position (remove it, then append it again).
hours = agg_data.pop('Hour')
agg_data['Hour'] = hours

In [None]:
# Positional split of the columns into two groups:
#   cont_*: the first nine columns (these are the ones standardised later)
#   clas_*: columns 9-10 plus columns 14-19, joined back un-scaled later
#           (presumably categorical/indicator features -- confirm against the CSV)
cont_features = agg_data.columns[0:9]
clas_features = agg_data.columns[9:11].append(agg_data.columns[14:20])

cont_data = agg_data.loc[:, cont_features]
clas_data = agg_data.loc[:, clas_features]

In [None]:
# Standardise the continuous columns with statistics taken only from rows
# before 2019-05-01, then re-attach the un-scaled columns.
# NOTE: train_mean, train_std and feature_data are assigned again further down
# with a 2019-03-20 cutoff, before any dataset is built from them.
before_may = cont_data.index < '2019-05-01 00:00:00'
train_mean = cont_data[before_may].mean()
train_std = cont_data[before_may].std()
feature_data = cont_data.sub(train_mean).div(train_std).join(clas_data)

In [None]:
# Chronological split: train < 2019-05-01 <= validation < 2019-06-01 <= test.
# NOTE: train_data, val_data and test_data are assigned again further down with
# different boundaries, before any dataset is built from them.
stamps = feature_data.index
train_data = feature_data[stamps < '2019-05-01 00:00:00']
val_data = feature_data[(stamps >= '2019-05-01 00:00:00') & (stamps < '2019-06-01 00:00:00')]
test_data = feature_data[stamps >= '2019-06-01 00:00:00']

In [None]:
# --- windowing -------------------------------------------------------------
past = 120    # history length; divided by `step` to get the sequence length
future = 1    # offset added to `past` when locating each window's label
step = 1

# --- optimisation ----------------------------------------------------------
learning_rate = 0.001
batch_size = 64
epochs = 40

# --- bookkeeping -----------------------------------------------------------
n = len(agg_data)
split = 0.8   # not referenced by the cells shown in this part of the notebook

# Row positions of the split boundaries in agg_data's index (labels are unique
# after the groupby, so get_loc is expected to return a single integer).
train_split = agg_data.index.get_loc('2019-03-20 00:00:00')
val_split = agg_data.index.get_loc('2019-05-06 00:00:00')

In [None]:
# Standardise the continuous columns using mean/std computed only on rows
# before 2019-03-20 (the training period), then re-attach the un-scaled
# columns. Using training-only statistics keeps later rows out of the scaling.
train_rows = cont_data.index < '2019-03-20 00:00:00'
train_mean = cont_data[train_rows].mean()
train_std = cont_data[train_rows].std()
feature_data = cont_data.sub(train_mean).div(train_std).join(clas_data)

In [None]:
# Chronological split:
#   train:      index <  2019-03-20
#   validation: 2019-03-20 <= index < 2019-05-06
#   test:       2019-05-06 <= index < 2019-06-01
row_labels = feature_data.index
train_data = feature_data[row_labels < '2019-03-20 00:00:00']
val_data = feature_data[(row_labels >= '2019-03-20 00:00:00') & (row_labels < '2019-05-06 00:00:00')]
test_data = feature_data[(row_labels >= '2019-05-06 00:00:00') & (row_labels < '2019-06-01 00:00:00')]

# Training Data

In [None]:
# Number of rows in each input window.
sequence_length = int(past / step)

# Labels are the standardised 'rental' values from row `past + future` up to
# (not including) the training boundary.
# NOTE(review): if label k is paired with the window starting at row k, the
# label sits `future + 1` rows after the window's last row -- confirm that this
# offset is intended.
start, end = past + future, train_split

x_train = train_data
y_train = feature_data['rental'].iloc[start:end]

In [None]:
# Batched dataset of (window, label) pairs built from the training rows.
dataset_train = keras.preprocessing.timeseries_dataset_from_array(
    data=x_train,
    targets=y_train,
    sequence_length=sequence_length,
    batch_size=batch_size,
)

# Validation and Test Data

In [None]:
# Position in feature_data of the first validation label.
label_start = train_split + past + future
# Drop the last `future` validation rows from the inputs.
x_end = len(val_data) - future

x_val = val_data.iloc[:x_end].to_numpy()
# Open-ended slice: it may hold more labels than there are validation windows.
y_val = feature_data['rental'].iloc[label_start:]

dataset_val = keras.preprocessing.timeseries_dataset_from_array(
    data=x_val,
    targets=y_val,
    sequence_length=sequence_length,
    batch_size=batch_size,
)

# Pull one training batch to inspect its shapes; `inputs` is reused below to
# size the model's input layer.
for batch in dataset_train.take(1):
    inputs, targets = batch

print("Input shape:", inputs.numpy().shape)
print("Target shape:", targets.numpy().shape)


In [None]:
# Position in feature_data of the first test label.
label_test_start = val_split + past + future
# Drop the last `future` test rows from the inputs.
x_test_end = len(test_data) - future

x_test = test_data.iloc[:x_test_end].to_numpy()
# Open-ended slice: y_test runs to the last row of feature_data, so it may hold
# more labels than there are test windows.
y_test = feature_data['rental'].iloc[label_test_start:]

dataset_test = keras.preprocessing.timeseries_dataset_from_array(
    data=x_test,
    targets=y_test,
    sequence_length=sequence_length,
    batch_size=batch_size,
)

# Training

In [None]:
# Seed every generator that can influence training. Seeding NumPy alone is not
# enough: Keras draws initial weights from TensorFlow's global generator or
# (depending on the Keras version) from Python's `random` module.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# On the right-hand side `inputs` is still the sample batch taken from
# dataset_train; its 2nd and 3rd dimensions size the input layer. The
# assignment then rebinds `inputs` to the symbolic Keras input tensor.
inputs = keras.layers.Input(shape=(inputs.shape[1], inputs.shape[2]))

# Two stacked GRU layers (the first returns the full sequence so the second can
# consume it) followed by a single-unit dense output.
gru_1 = keras.layers.GRU(32, return_sequences=True)(inputs)
gru_out = keras.layers.GRU(16)(gru_1)
outputs = keras.layers.Dense(1)(gru_out)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")
model.summary()

In [None]:
path_checkpoint = "model_checkpoint_1.h5"

# Write the weights to disk whenever the validation loss improves.
modelckpt_callback = keras.callbacks.ModelCheckpoint(
    filepath=path_checkpoint,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop once the validation loss has not improved for 4 consecutive epochs.
# NOTE(review): restore_best_weights is not set and the checkpoint is not
# reloaded in the cells shown here, so later evaluation uses the weights from
# the final epoch -- confirm that this is intended.
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", patience=4, min_delta=0)

training_callbacks = [es_callback, modelckpt_callback]
history = model.fit(
    dataset_train,
    validation_data=dataset_val,
    epochs=epochs,
    callbacks=training_callbacks,
)

In [None]:
# Accumulators for the loss of each evaluated model, one list per data split.
mse_val_loss, mse_test_loss, mse_train_loss = [], [], []

In [None]:
# Record the model's loss on each split, in the order validation, test, train.
for loss_log, split_dataset in (
    (mse_val_loss, dataset_val),
    (mse_test_loss, dataset_test),
    (mse_train_loss, dataset_train),
):
    loss_log.append(model.evaluate(split_dataset))

In [None]:
# Model outputs for every window in the test dataset (still in standardised units).
predictions = model.predict(dataset_test)

In [None]:
# Prediction k belongs to the k-th test window, whose label is y_test.iloc[k].
# Take exactly as many labels as there are predictions, so the two curves stay
# aligned even if the hourly index has gaps. A hard-coded date range only
# matches the number of predictions by coincidence.
y_data = y_test.iloc[:len(predictions)]
dates = pd.DatetimeIndex(y_data.index)

# Undo the standardisation so both curves are in original rental counts.
rental_std = train_std['rental']
rental_mean = train_mean['rental']

plt.figure(figsize=(24,6))
plt.plot(dates, y_data*rental_std+rental_mean, label='True')
plt.plot(dates, predictions.ravel()*rental_std+rental_mean, label='Pred')
plt.title('RNN Predictions of Rentals per hour', fontsize=24)
plt.xlabel('Date and hour', fontsize=20)
plt.ylabel('Rentals', fontsize=20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
plt.legend(prop={'size': 16})
plt.savefig('rnn_pred.png', format='png', bbox_inches='tight')

plt.show()

In [None]:
def Rsquared_from_mse(mse, test=True, y_true=None, y_baseline=None):
    """Convert a mean squared error into an R-squared value.

    Computes ``1 - n * mse / sum((y_true - mean(y_baseline)) ** 2)``, where
    ``n`` is the number of entries in ``y_true``. The result is only meaningful
    when ``mse`` was measured on exactly the targets in ``y_true``.

    Parameters
    ----------
    mse : float
        Mean squared error of the model on the targets being scored.
    test : bool, default True
        Used only when ``y_true`` is not given. If True, score against the
        notebook-level ``y_test``, restricted to the labels that have a window
        in the test dataset; if False, score against ``y_train``.
    y_true : pandas.Series or numpy.ndarray, optional
        Targets the ``mse`` was computed on. Overrides ``test``.
    y_baseline : pandas.Series or numpy.ndarray, optional
        Values whose mean is the naive baseline prediction. Defaults to the
        notebook-level ``y_train``.

    Returns
    -------
    float
        The R-squared value (out-of-sample when scored on test targets).
    """
    if y_baseline is None:
        y_baseline = y_train

    if y_true is None:
        if test:
            # y_test is sliced open-ended and can extend past the labels the
            # test dataset actually used. Keep only one label per window so n
            # and the baseline cover the same samples as the reported mse.
            n_windows = max(len(x_test) - sequence_length + 1, 0)
            y_true = y_test.iloc[:n_windows]
        else:
            y_true = y_train

    n = len(y_true)
    baseline = np.sum((y_true - y_baseline.mean()) ** 2)

    Rsquared = 1 - n * mse / baseline

    return Rsquared

In [None]:
# One-row summary table: in-sample and out-of-sample R-squared of the most
# recently evaluated model, labelled 'RNN'.
train_r2 = Rsquared_from_mse(mse_train_loss[-1], test=False)
test_r2 = Rsquared_from_mse(mse_test_loss[-1])
r2_df = pd.DataFrame(
    {'R2 rental': train_r2, 'OSR2 rental': test_r2},
    index=['RNN'],
)