In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from utils import StandardScaler

tf.__version__

'1.15.2'

In [4]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU device not found')
else:
  print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [5]:
path = "training_data.csv"
df_demand = pd.read_csv(path)
df_demand.columns = ["timebin", "cell", "lon", "lat", "count"]

In [6]:
timebins = pd.unique(df_demand['timebin'])
stations = pd.unique(df_demand['cell'])
num_timebins, num_cells, num_lon, num_lat, num_counts = df_demand.nunique(axis=0)

demand_count = df_demand['count'].values.reshape((num_cells, num_timebins)).T
df_demand_count = pd.DataFrame(demand_count, columns=stations).add_prefix('station_')

In [7]:
datebins = pd.to_datetime(timebins, unit='s', utc=True, infer_datetime_format=True).tz_convert(tz='US/Eastern')

onehot = True
if onehot:
    df_month = pd.get_dummies(datebins.month, prefix='month')
    df_hour = pd.get_dummies(datebins.hour, prefix='hour')
    df_dayofweek = pd.get_dummies(datebins.dayofweek, prefix='dayofweek')
    df_weekend = pd.get_dummies(np.in1d(datebins.dayofweek, [5,6]), prefix='weekend')
else:
    df_month = pd.DataFrame(datebins.month, columns=['month'])
    df_hour = pd.DataFrame(datebins.hour, columns=['hour'])
    df_dayofweek = pd.DataFrame(datebins.dayofweek, columns=['dayofweek'])
    df_weekend = pd.DataFrame(np.in1d(datebins.dayofweek, [5,6]), columns=['weekend'])

df = pd.concat([df_demand_count, df_month, df_hour, df_dayofweek, df_weekend], axis=1)
names_stations = df_demand_count.columns.tolist()
names_month = df_month.columns.tolist()
names_hour = df_hour.columns.tolist()
names_dayofweek = df_dayofweek.columns.tolist()
names_weekend = df_weekend.columns.tolist()
names_all = df.columns.tolist()

train_filter = (datebins >= '2018-01-01') & (datebins < '2019-09-30')
val_filter = (datebins >= '2019-10-01') & (datebins < '2019-12-31')
test_filter = (datebins >= '2019-10-01') & (datebins < '2019-12-31')

df_train = df[train_filter]
df_val = df[val_filter]
df_test = df[test_filter]

num_input = df.shape[1]
num_output = num_cells
num_output = df.shape[1]

In [8]:

def get_dataset_XY(df, batch_size=None, batch_index=0, num_feature=24, num_horizon=1, num_input = num_input, num_output=num_output):
    x_offsets = np.sort(np.concatenate((np.arange(-num_feature+1, 1, 1),)))
    y_offsets = np.sort(np.arange(1, 1+ num_horizon, 1))

    min_t = abs(min(x_offsets))
    max_t = abs(df.shape[0] - abs(max(y_offsets)))  # Exclusive

    X, Y = [], []
    if batch_size is None:
        batch_size = max_t - min_t + 1
        batch_index = 0
    count = 0
    for t in range(min_t, max_t):
        t = t + batch_size * batch_index
        xt = df.iloc[t + x_offsets, 0:num_input].values.flatten('F')
        yt = df.iloc[t + y_offsets, 0:num_output].values.flatten('F')
        X.append(xt)
        Y.append(yt)

        count += 1
        if count == batch_size:
            break

    X = np.stack(X).reshape([-1, num_input, num_feature])
    Y = np.stack(Y)#.reshape([-1, num_input, num_feature])

    return X, Y


In [9]:
batch_size = 100
num_feature = 24
num_horizon = 1

X_train, Y_train = get_dataset_XY(df_train, batch_size, 0, num_feature, num_horizon)
X_train.shape, Y_train.shape

X_val, Y_val = get_dataset_XY(df_val, None, 0, num_feature, num_horizon)
# # X_test, Y_test = get_dataset_XY(df_test)
X_test, Y_test = X_val, Y_val

X_train.shape, Y_train.shape, X_val.shape, Y_val.shape

((100, 225, 24), (100, 225), (8477, 225, 24), (8477, 225))

In [10]:
scaler = StandardScaler(mean=X_train.mean(), std=X_train.std())

In [12]:
import gcn
import importlib
importlib.reload(gcn)

# Hyperparameters
learning_rate = 3e-4 # learning rate
decay = 0.9
batchsize = 100 # batch size 

hidden_num_layer = [10, 20, 20] # determine the number of hidden layers and the vector length at each node of each hidden layer
reg_weight = [0, 0, 0] # regularization weights for adjacency matrices L1 loss

keep = 1 # drop out probability

early_stop_th = 200 # early stopping threshold, if validation RMSE not dropping in continuous 20 steps, break
training_epochs = 10 # 10 total training epochs

# Training
start_time = datetime.datetime.now()

val_error, predic_res, Y_test_pred, test_error, bestWeightA = gcn.gcnn_ddgf(
    hidden_num_layer, reg_weight, 
    num_input, num_output, num_feature, num_horizon, 
    learning_rate, decay, batchsize, 
    keep, early_stop_th, training_epochs, 
    get_dataset_XY, df_train, 
    X_val, Y_val, 
    X_test, Y_test, 
    scaler, 'RMSE')

end_time = datetime.datetime.now()
print('Total training time: ', end_time-start_time)

Loss 2.261: 100%|██████████| 581/581 [01:37<00:00,  5.98it/s]


Epoch: 0001 Training RMSE = 2.543203625
Validation RMSE: 2.6655146


Loss 2.253: 100%|██████████| 581/581 [01:27<00:00,  6.62it/s]


Epoch: 0002 Training RMSE = 2.267094396
Validation RMSE: 2.5791233


Loss 2.250: 100%|██████████| 581/581 [01:25<00:00,  6.83it/s]


Epoch: 0003 Training RMSE = 2.263231476
Validation RMSE: 2.5447612


Loss 2.249: 100%|██████████| 581/581 [01:25<00:00,  6.80it/s]


Epoch: 0004 Training RMSE = 2.262106659
Validation RMSE: 2.5365103


Loss 2.249: 100%|██████████| 581/581 [01:25<00:00,  6.79it/s]


Epoch: 0005 Training RMSE = 2.261640288
Validation RMSE: 2.518071


Loss 2.249: 100%|██████████| 581/581 [01:24<00:00,  6.90it/s]


Epoch: 0006 Training RMSE = 2.261348302
Validation RMSE: 2.5083712


Loss 2.249: 100%|██████████| 581/581 [01:26<00:00,  6.75it/s]


Epoch: 0007 Training RMSE = 2.261114688
Validation RMSE: 2.5034465


Loss 2.249: 100%|██████████| 581/581 [01:23<00:00,  6.92it/s]


Epoch: 0008 Training RMSE = 2.260926744
Validation RMSE: 2.5009067


Loss 2.248: 100%|██████████| 581/581 [01:23<00:00,  6.93it/s]


Epoch: 0009 Training RMSE = 2.260777614
Validation RMSE: 2.4993204


Loss 2.248: 100%|██████████| 581/581 [01:24<00:00,  6.91it/s]


Epoch: 0010 Training RMSE = 2.260657198
Validation RMSE: 2.4987284
Training RMSE =  2.2606571981812915
Validation RMSE =  2.4987284
Test RMSE =  2.4987284
Total training time:  0:14:27.974901


In [14]:
count_pred = np.round(Y_test_pred[:,:180]).flatten()

datebins = pd.to_datetime(df_demand["timebin"].values, unit='s', utc=True, infer_datetime_format=True).tz_convert(tz='US/Eastern')
test_filter = (datebins >= '2019-10-01') & (datebins < '2019-12-31')
df_output = df_demand[test_filter][:count_pred.shape[0]]
df_output["count_pred"] = count_pred
df_output.to_csv("testing_data.csv", index=False)