Author: **Mathis Konarski** <br>
Date: **22/06/2022**

This notebook implements a GNN model on NYC bike and NYC taxi data.

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from torch import nn
import torch
import torch_geometric as tog
from torch_geometric import nn as tog_nn

import Training_functions as t_func

import gc
gc.enable()

In [None]:
# Grid dimensions. /!\ Must be identical to the grid used inside Data_Preparation.ipynb
GRID_SIZE = (10, 20)

# One time period is 30 minutes (defined inside Data_Preparation.ipynb), i.e. 24 * 2 periods per day.
TRAIN_PERIOD = 70 * 24 * 2  # number of periods used for training (70 days)
WINDOW_SIZE = 3 * 24 * 2    # number of past periods in one sample (3 days)

# Batch size for the deep learning model
BATCH_SIZE = 4

# Minimal demand for one area volume to be considered during evaluation
MIN_VOL_METRICS = 10

# Flow and Volume creation

In [None]:
# Read the data transformed using Data_Preparation.ipynb (first CSV column is used as the index).
data_ytaxi_df, data_gtaxi_df, data_bike_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "Datasets/ytaxi_prepared.csv",
        "Datasets/gtaxi_prepared.csv",
        "Datasets/bike_prepared.csv",
    )
)

In [None]:
# Build the flow graphs of every transport mode with the shared grid size.
bike_graph_flow_ser, ytaxi_graph_flow_ser, gtaxi_graph_flow_ser = (
    t_func.flow_graphs(prepared_df, GRID_SIZE)
    for prepared_df in (data_bike_df, data_ytaxi_df, data_gtaxi_df)
)

In [None]:
# Extract the volume data of every transport mode.
bike_vol_np, ytaxi_vol_np, gtaxi_vol_np = (
    t_func.volume_data(prepared_df)
    for prepared_df in (data_bike_df, data_ytaxi_df, data_gtaxi_df)
)

In [None]:
class TimeSeriesDataset(torch.utils.data.Dataset):
    '''
    Custom PyTorch dataset serving sliding windows over aligned volume and flow series.

    Item ``index`` is the tuple ``(v, f, y)`` where
    v : volumes of the steps [index, index + window)
    f : flow graphs of the same steps
    y : volumes of the steps [index + 1, index + window + 1) divided by norm_y,
        i.e. the volume series shifted one step ahead

    Parameters
    ----------
    data_vol_ten : torch tensor with start and end volume for each period and area
    data_flow_lst : list of flow graphs
    window : length of previous data considered for LSTM
    norm_y : maximal volume value for the training set
    '''
    def __init__(self, data_vol_ten, data_flow_lst, window, norm_y):
        self.data_vol, self.data_flow = data_vol_ten, data_flow_lst
        self.window, self.norm_y = window, norm_y
        # Both values are computed once, at construction time.
        self.shape = self.__getshape__()
        self.size = self.__getsize__()

    def __getitem__(self, index):
        stop = index + self.window
        input_steps = slice(index, stop)
        target_steps = slice(index + 1, stop + 1)
        volumes = self.data_vol[input_steps]
        flows = self.data_flow[input_steps]
        targets = self.data_vol[target_steps] / self.norm_y
        return volumes, flows, targets

    def __len__(self):
        # One sample per start position that still leaves room for the shifted target.
        n_steps = len(self.data_vol)
        return n_steps - self.window

    def __getshape__(self):
        # (number of samples, *shape of one volume window)
        n_samples = self.__len__()
        first_volumes = self.__getitem__(0)[0]
        return (n_samples,) + tuple(first_volumes.shape)

    def __getsize__(self):
        return self.__len__()
    

def create_dataloader(flow_ser, vol_np, train_period, batch_size, window_size):
    '''
    Build the train and test DataLoaders, both backed by TimeSeriesDataset.

    The first ``train_period`` steps form the training set. The test set starts
    ``window_size`` steps before the split, so it overlaps the end of the training data.

    Parameters
    ----------
    flow_ser : flow informations based on flow_graphs function
    vol_np : volume informations based on volume_data function
    train_period : length of the training period
    batch_size : batch size of the training loader (the test loader keeps the DataLoader default)
    window_size : length of previous data considered for LSTM

    Returns
    -------
    train_dataloader : torch_geometric.loader.DataLoader, shuffled
    test_dataloader : torch_geometric.loader.DataLoader, not shuffled
    norm_y : maximal volume value for the training set
    '''
    # The normalisation constant only looks at the training slice.
    train_volumes = vol_np[:train_period]
    norm_y = train_volumes.max()

    def build_dataset(start, stop):
        # Same [start:stop] slice applied to the volume array and to the flow series.
        return TimeSeriesDataset(torch.Tensor(vol_np[start:stop]),
                                 list(flow_ser[start:stop]), window_size, norm_y)

    train_dataset = build_dataset(None, train_period)
    train_dataloader = tog.loader.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_dataset = build_dataset(train_period - window_size, None)
    test_dataloader = tog.loader.DataLoader(test_dataset)

    return train_dataloader, test_dataloader, norm_y

In [None]:
# One (train loader, test loader, normalisation constant) triple per transport mode,
# all built with the same train period, batch size and window size.
(
    (bike_train_loader, bike_test_loader, bike_norm_y),
    (gtaxi_train_loader, gtaxi_test_loader, gtaxi_norm_y),
    (ytaxi_train_loader, ytaxi_test_loader, ytaxi_norm_y),
) = (
    create_dataloader(mode_flow_ser, mode_vol_np, TRAIN_PERIOD, BATCH_SIZE, WINDOW_SIZE)
    for mode_flow_ser, mode_vol_np in (
        (bike_graph_flow_ser, bike_vol_np),
        (gtaxi_graph_flow_ser, gtaxi_vol_np),
        (ytaxi_graph_flow_ser, ytaxi_vol_np),
    )
)

# Model

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
class GnnNeuralNetwork(nn.Module):
    '''
    Single-modality model combining a volume branch (3D convolutions over the grid),
    a flow branch (graph convolutions over the flow graphs) and an LSTM over the window.

    Parameters
    ----------
    grid_size : pair of grid dimensions; defaults to the notebook constant GRID_SIZE.
        Both dimensions must be larger than 6 because the three unpadded 3x3
        convolutions remove 6 cells along each of them.
    window_size : number of time steps per sample; defaults to the notebook constant WINDOW_SIZE.

    Forward inputs
    --------------
    v : volume tensor, expected as (batch, window, grid[0], grid[1], 2)
        (layout inferred from the normalisation / permutation / convolution layers below)
    f : sequence with one batched graph object per time step, each exposing
        ``x``, ``edge_index``, ``edge_weight`` and ``ptr``; every graph is expected
        to hold grid[0] * grid[1] nodes with a single feature per node

    Forward output
    --------------
    Tensor of shape (batch, window, grid[0], grid[1], 2) with values in [-1, 1] (Tanh).
    '''
    def __init__(self, grid_size=None, window_size=None):
        super(GnnNeuralNetwork, self).__init__()
        # Defaults are resolved at construction time from the notebook configuration.
        grid_d0, grid_d1 = GRID_SIZE if grid_size is None else grid_size
        if grid_d0 <= 6 or grid_d1 <= 6:
            raise ValueError("grid_size must be larger than 6 in both dimensions")
        self.grid_size = (grid_d0, grid_d1)
        self.window_size = WINDOW_SIZE if window_size is None else window_size
        self.n_nodes = grid_d0 * grid_d1  # one graph node per grid cell

        self.dropout_rate = 0.5
        self.conv_vol_size = 2
        self.conv_flow_size = 4
        self.out_flow_vol_size = 4
        self.lstm_input_size = 4
        self.lstm_output_size = 32

        # --- Volume branch ---
        self.norm_vol = nn.BatchNorm3d(self.window_size)
        self.conv_vol = nn.Sequential(
            nn.Conv3d(2, self.conv_vol_size, (1,3,3)), nn.ReLU(),
            nn.Conv3d(self.conv_vol_size, self.conv_vol_size, (1,3,3)), nn.ReLU(),
            nn.Conv3d(self.conv_vol_size, self.conv_vol_size, (1,3,3)), nn.ReLU() )
        # Spatial size left after the three unpadded 3x3 convolutions (4 x 14 for a 10 x 20 grid).
        reduced_cells = (grid_d0 - 6) * (grid_d1 - 6)
        self.dense_vol = nn.Sequential(
            nn.Flatten(2), nn.Dropout(self.dropout_rate),
            nn.Linear(self.conv_vol_size * reduced_cells, self.out_flow_vol_size), nn.ReLU() )

        # --- Flow branch ---
        self.conv_flow = tog_nn.GCN(1, hidden_channels = self.conv_flow_size , num_layers=3, act='relu')
        self.dense_flow = nn.Sequential(
            nn.BatchNorm1d(self.window_size),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.n_nodes * self.conv_flow_size, self.out_flow_vol_size), nn.ReLU() )

        # --- Temporal part and output head ---
        self.pre_lstm = nn.Sequential(
            nn.Dropout(self.dropout_rate), nn.Linear(self.out_flow_vol_size,self.lstm_input_size), nn.ReLU() )
        self.lstm = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.lstm_output_size, num_layers=1, dropout=0, batch_first=True)
        self.end = nn.Sequential(
            nn.Dropout(self.dropout_rate), nn.Linear(self.lstm_output_size, 2 * self.n_nodes), nn.Tanh(),
            nn.Unflatten(2, (grid_d0, grid_d1, 2)) )

    def forward(self, v, f):
        # Volume branch: normalise per time step, move the 2 volume channels to the
        # channel axis for Conv3d, then back to channels-last before the dense layer.
        v = self.norm_vol(v)
        v = self.conv_vol(v.permute(0,4,1,2,3))
        v = self.dense_vol(v.permute(0,2,3,4,1))

        # Flow branch: one graph convolution per time step.
        n_batches = len(f[0].ptr)-1
        flow_features = []
        for graph in f:
            # The node embeddings are kept in a local variable so the input graphs
            # are left untouched and the same batch can be forwarded again.
            node_emb = self.conv_flow(graph.x, graph.edge_index, graph.edge_weight)
            flow_features.append(node_emb.reshape(n_batches, self.n_nodes * self.conv_flow_size))
        # Stacking keeps the result on the device of the graph features
        # and directly yields the (batch, window, features) layout.
        f = torch.stack(flow_features, dim=1)
        f = self.dense_flow(f)

        # Element-wise fusion of both branches, then the temporal model.
        x = torch.mul(v,f)
        x = self.pre_lstm(x)
        # No initial state is passed: nn.LSTM then starts from zero hidden and cell states.
        x, _ = self.lstm(x)
        logits = self.end(x)
        return logits

GNNmodel = GnnNeuralNetwork().to(device)

In [None]:
# Single-modality training of the GNN on the bike data.
model = GNNmodel
loader_tuple = bike_train_loader, bike_test_loader, bike_norm_y
loss_mse = torch.nn.MSELoss(reduction='none')  # reduction='none' keeps the element-wise losses
learning_rate = 6e-4
epochs = 100
patience = 5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-7)

# `patience` is forwarded from the variable above so the value is defined in a single place.
t_func.model_training(loader_tuple, model, loss_mse, optimizer, epochs, MIN_VOL_METRICS, patience=patience)

# Combined training

In [None]:
class InterModalityCombiner(nn.Module):
    '''
    Dense network mixing the outputs of three single-modality models.

    The three inputs are concatenated along dimension 2, flattened from dimension 2
    onwards into 3*400 features, passed through one hidden layer and expanded back
    to 3*400 Tanh outputs reshaped as (3, 10, 20, 2).

    forward(m1, m2, m3) returns a tensor whose first axis indexes the three
    modalities: (3, d0, d1, 10, 20, 2), where d0 and d1 are the two leading
    dimensions of the inputs.
    '''
    def __init__(self):
        super().__init__()
        self.dropout_rate = 0.5
        self.hid_layer_size = 64
        n_features = 3*400  # three modalities with 400 values each
        layers = [
            nn.Flatten(2),
            nn.Dropout(self.dropout_rate),
            nn.Linear(n_features, self.hid_layer_size),
            nn.ReLU(),
            nn.Linear(self.hid_layer_size, n_features),
            nn.Tanh(),
            nn.Unflatten(2, (3, 10, 20, 2)),
        ]
        self.dense_layer = nn.Sequential(*layers)

    def forward(self, m1, m2, m3):
        stacked = torch.cat([m1, m2, m3], dim=2)
        mixed = self.dense_layer(stacked)
        # Bring the modality axis to the front.
        return mixed.permute(2, 0, 1, 3, 4, 5)
    
# Combiner instance moved to the selected device.
IMCNmodel = InterModalityCombiner().to(device)

In [None]:
# Loaders and normalisation constants, always ordered (bike, gtaxi, ytaxi).
train_loaders_tuple = (bike_train_loader, gtaxi_train_loader, ytaxi_train_loader)
test_loaders_tuple = (bike_test_loader, gtaxi_test_loader, ytaxi_test_loader)
norms_tuple = (bike_norm_y, gtaxi_norm_y, ytaxi_norm_y)

# One freshly initialised GNN per modality, plus the network combining their outputs.
GNN_models_tuple = tuple(GnnNeuralNetwork().to(device) for _ in range(3))
combined_model = InterModalityCombiner().to(device)

loss_mse = torch.nn.MSELoss(reduction='none')
epochs = 30
patience = 30

# One Adam optimizer per network; learning rates follow the order (bike, gtaxi, ytaxi, combiner).
optimizers_tuple = tuple(
    torch.optim.Adam(network.parameters(), lr=rate, eps=1e-7)
    for network, rate in zip((*GNN_models_tuple, combined_model), (5e-4, 2e-3, 4e-3, 1e-3))
)

t_func.combined_training(
    train_loaders_tuple, test_loaders_tuple, GNN_models_tuple, combined_model,
    loss_mse, optimizers_tuple, epochs, norms_tuple, MIN_VOL_METRICS, patience,
)

In [None]:
Scoring

Epoch 15:

100%|████████████████████████████████████████████████████████████████████████████████| 804/804 [07:13<00:00,  1.85it/s]

Train bike : Avg loss: 0.000222 | Start RMSE: 8.30 | Start MAPE: 27.62 % | Stop RMSE: 7.92 | Stop MAPE: 26.77 %
Train bike combination : Avg loss: 0.000297 | Start RMSE: 9.51 | Start MAPE: 31.14 % | Stop RMSE: 9.17 | Stop MAPE: 30.61 %
Train gtaxi : Avg loss: 0.000132 | Start RMSE: 5.77 | Start MAPE: 27.38 % | Stop RMSE: 3.17 | Stop MAPE: 18.89 %
Train gtaxi combination : Avg loss: 0.000297 | Start RMSE: 5.80 | Start MAPE: 27.40 % | Stop RMSE: 3.07 | Stop MAPE: 19.28 %
Train ytaxi : Avg loss: 0.000379 | Start RMSE: 28.07 | Start MAPE: 32.86 % | Stop RMSE: 24.09 | Stop MAPE: 31.42 %
Train ytaxi combination : Avg loss: 0.000297 | Start RMSE: 31.12 | Start MAPE: 33.18 % | Stop RMSE: 26.26 | Stop MAPE: 31.57 %

100%|██████████████████████████████████████████████████████████████████████████████| 1008/1008 [01:29<00:00, 11.26it/s]

Test bike : Avg loss: 0.000350 | Start RMSE: 9.17 | Start MAPE: 25.64 % | Stop RMSE: 9.05 | Stop MAPE: 24.94 %
Test bike combination : Avg loss: 0.000318 | Start RMSE: 10.25 | Start MAPE: 28.28 % | Stop RMSE: 10.20 | Stop MAPE: 27.91 %
Test gtaxi : Avg loss: 0.000080 | Start RMSE: 6.02 | Start MAPE: 27.49 % | Stop RMSE: 2.98 | Stop MAPE: 19.89 %
Test gtaxi combination : Avg loss: 0.000318 | Start RMSE: 6.00 | Start MAPE: 26.85 % | Stop RMSE: 2.98 | Stop MAPE: 19.66 %
Test ytaxi : Avg loss: 0.000366 | Start RMSE: 30.79 | Start MAPE: 38.88 % | Stop RMSE: 27.54 | Stop MAPE: 37.96 %
Test ytaxi combination : Avg loss: 0.000318 | Start RMSE: 30.75 | Start MAPE: 38.13 % | Stop RMSE: 29.38 | Stop MAPE: 37.09 %