In [1]:
import numpy as np
import pandas as pd
# from joblib import * 
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from sklearn.preprocessing import MinMaxScaler,StandardScaler
# import geopandas as gpd
# from geopy.distance import distance,geodesic
from joblib import Parallel, delayed
import warnings
import matplotlib.pyplot as plt
import scipy
import networkx as nx
from tqdm import tqdm
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# different methods, all tested on 09-26 to 10-02

In [2]:
def masked_MAPE(v, v_, axis=None):
    '''
    Mean absolute percentage error.
    :param v: np.ndarray or int, ground truth.
    :param v_: np.ndarray or int, prediction.
    :param axis: axis to do calculation.
    :return: int, MAPE averages on all elements of input.
    '''
    mask = (v == 0)
    percentage = np.abs(v_ - v) / np.abs(v)
    if np.any(mask):
        masked_array = np.ma.masked_array(percentage, mask=mask)  # mask the dividing-zero as invalid
        result = masked_array.mean(axis=axis)
        if isinstance(result, np.ma.MaskedArray):
            return result.filled(np.nan)
        else:
            return result
    return np.mean(percentage, axis).astype(np.float64)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.__version__

'1.13.0'

In [4]:
class relevance_VNN(nn.Module):
    def __init__(self,node_feat,input_size,n_feature=128):
        super(relevance_VNN, self).__init__()
        
        self.relevance = nn.Sequential(
        nn.Linear(node_feat.shape[1], n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,input_size),
        )
        
        self.weight = nn.Sequential(
        nn.Linear(input_size, n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,2),
        )
 
    def forward(self,x,node_feat):
#         x = torch.tensor(x, dtype=torch.float32).to(device)
        att = self.relevance(node_feat)
        
#         print(att.shape,x.shape)
#         relevance * flows
        flow = att.mul(x)

        flow = self.weight(x)
        return flow


def get_loss_and_metrics(model,node_feat, data, target,criterion, device):
  # Implement forward pass and loss calculation for one batch.
  # Remember to move the batch to device.
  # 
  # Return a tuple:
  # - loss for the batch (Tensor)
  # - number of correctly classified examples in the batch (Tensor)
#     data, target = batch[0], batch[1]
#     print(data.shape)
    data = torch.tensor(data, dtype=torch.float32)
    target = torch.tensor(target, dtype=torch.float32)
    node_feat = torch.tensor(node_feat, dtype=torch.float32)
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    pred = model(data,node_feat)
#     print(pred,target)
    loss = criterion(pred, target)
    
    
    return (pred,target,loss)
    
def step(loss, optimizer):

    loss.backward()
    optimizer.step()


In [22]:
nodes_dist = pd.read_csv('nodes_dist.csv')
G = nx.read_gpickle('graph.pickle')
od = pd.read_csv('inoutwide.csv')
pbar = tqdm(list(G.nodes))
count = 0

for station in pbar:
    
#     neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    neighbors = nodes_dist.loc[(nodes_dist['o']==station)&\
                               (nodes_dist['dist']<1)&\
                               (nodes_dist['o']!=nodes_dist['d'])]['d'].values.tolist()
    flows = ['incoming_flow-'+station]+\
            ['incoming_flow-'+i for i in neighbors]+['outgoing_flow-'+station]+\
                      ['outgoing_flow-'+i for i in neighbors]

    fts = od[['Date','Hour']+flows]
    lags = list(range(1,24))+list(np.array(list(range(2,8)))*24)+list(np.array(list(range(2,8)))*24*7)
    for lag in lags:
        temp = fts[flows].shift(lag)
        temp.columns = [i+'-lag-'+str(lag) for i in flows]
        fts = pd.concat([fts,temp],axis=1)
    fts = fts.dropna()
    fts = fts.drop(columns=['Date','Hour'])
#     print('here')
    
    node_feat_raw = pd.DataFrame()
    node_feat_raw['neightbor'] = [station]+neighbors
    node_feat_raw['degree'] = node_feat_raw.apply(lambda x:nx.shortest_path_length(G,station,x['neightbor']),axis=1)
    node_feat_raw['path_distance'] = node_feat_raw.apply(lambda x:nx.dijkstra_path_length(G,station,x['neightbor']),axis=1)
    
    node_feat_raw = node_feat_raw.append([node_feat_raw]*(len(lags)*2*(len(neighbors)+1)-1),ignore_index=True)
    del node_feat_raw['neightbor']
    node_feat_raw = node_feat_raw.to_numpy().flatten()
    node_feat_raw = node_feat_raw.reshape(1,node_feat_raw.shape[0],)
    
    # len(lags) is number of lags, 2 is bidirectional
    # remove flows at time t
    fts_train = fts.iloc[:-24*7,(len(neighbors)+1)*2:].values
    scaler = StandardScaler()
    scaler.fit(fts_train)
    fts_train = torch.tensor(scaler.transform(fts_train))

    y = fts[['incoming_flow-'+station,'outgoing_flow-'+station]]
    y_train = y.iloc[:-24*7,:].values
    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)
    y_train = torch.tensor(y_train)
    train_dataset = torch.utils.data.TensorDataset(fts_train,y_train)
 

    fts_val = torch.tensor(scaler.transform(fts.iloc[-24*7:,(len(neighbors)+1)*2:].values)) 
    y_val = y.iloc[-24*7:,:].values
    y_val = torch.tensor(y_val)    
    device = torch.device("mps")

    N_EPOCHS = 501
    BATCH_SIZE = 64
#     print('here')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                                   num_workers=0)

    node_feat = np.repeat(node_feat_raw,BATCH_SIZE,axis=0)
    node_feat = torch.tensor(node_feat, dtype=torch.float32).to(device)
    model = relevance_VNN(node_feat,input_size=(1+len(neighbors))*len(lags)*2)
    model = model.to(device)
    criterion = nn.SmoothL1Loss()
    
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 
#     qbar = tqdm(range(N_EPOCHS), leave=False)
    validation_mae,validation_r2,validation_mape = 999,999,999
    for i in range(N_EPOCHS): 

        total_train_loss = 0.0
        model.train()
        training_batch_count = 1
        for batch in train_dataloader:
            
            node_feat = np.repeat(node_feat_raw,len(batch[0]),axis=0)
            node_feat = torch.tensor(node_feat, dtype=torch.float32).to(device)
            y_train,y_true,loss = get_loss_and_metrics(model,node_feat, batch[0],batch[1], criterion, device)
            
            total_train_loss += loss.item()
            mean_train_loss = total_train_loss / training_batch_count
            pbar.set_description('epoch:' + str(i) +\
                ' training batch:' + str(training_batch_count) +\
                ' total train loss:'+ str(round(mean_train_loss,3))+ \
                 ' validation_mae:'+ str(round(validation_mae,3)) +\
                ' validation R2: '+ str(round(validation_r2,3)) +\
                ' validation MAPE: '+ str(round(validation_mape,3)))
            training_batch_count += 1
            
            step(loss,optimizer)
        
        if i%50 == 0 and i >=50:
#         if i >= 0:
            with torch.no_grad(): 
                node_feat = np.repeat(node_feat_raw,len(fts_val),axis=0)
                node_feat = torch.tensor(node_feat, dtype=torch.float32).to(device)
                y_pred,y_true,loss = get_loss_and_metrics(model,node_feat, fts_val, y_val, criterion, device)
                y_pred_val,y_true_val = y_pred.cpu().numpy(),y_true.cpu().numpy()
                y_pred_val = y_scaler.inverse_transform(y_pred_val)

                validation_mae = mean_absolute_error(y_true_val,y_pred_val)
                validation_r2 = r2_score(y_true_val,y_pred_val)
                validation_mape = masked_MAPE(y_true_val,y_pred_val)
                pbar.set_description('epoch:' + str(i) +\
                ' training batch:' + str(training_batch_count) +\
                ' total train loss:'+ str(round(mean_train_loss,3))+ \
                 ' validation_mae:'+ str(round(validation_mae,3)) +\
                ' validation R2: '+ str(round(validation_r2,3)) +\
                ' validation MAPE: '+ str(round(validation_mape,3)))
                        
                
                
    if count == 0: 
        y_true_total = y_true_val
        y_pred_total = y_pred_val
    else:
        y_true_total = np.concatenate([y_true_total,y_true_val])
        y_pred_total = np.concatenate([y_pred_total,y_pred_val])
    count += 1


epoch:500 training batch:84 total train loss:0.035 validation_mae:10.332 validation R2: 0.942 validation MAPE: 0.352: 100%|█| 50/5


In [24]:
# y_true,y_pred = y_true.cpu().numpy(),y_pred.cpu().numpy()
print(r2_score(y_true_total,y_pred_total))
print(mean_absolute_error(y_true_total,y_pred_total))
print(mean_squared_error(y_true_total,y_pred_total,squared=False))
print(mean_squared_error(y_true_total,y_pred_total,squared=False)/np.mean(np.std(y_pred_total)))

print(masked_MAPE(y_true_total,y_pred_total))



0.973310894198095
17.328087
36.184677
0.17374575
0.4805081237336012
