In [1]:
import numpy as np
import pandas as pd
# from joblib import * 
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
# import geopandas as gpd
# from geopy.distance import distance,geodesic
from joblib import Parallel, delayed
import warnings
import matplotlib.pyplot as plt
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf,adfuller, kpss,range_unit_root_test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import networkx as nx
from tqdm import tqdm
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# different methods, all tested on 09-26 to 10-02

In [2]:
def masked_MAPE(v, v_, axis=None):
    '''
    Mean absolute percentage error.
    :param v: np.ndarray or int, ground truth.
    :param v_: np.ndarray or int, prediction.
    :param axis: axis to do calculation.
    :return: int, MAPE averages on all elements of input.
    '''
    mask = (v == 0)
    percentage = np.abs(v_ - v) / np.abs(v)
    if np.any(mask):
        masked_array = np.ma.masked_array(percentage, mask=mask)  # mask the dividing-zero as invalid
        result = masked_array.mean(axis=axis)
        if isinstance(result, np.ma.MaskedArray):
            return result.filled(np.nan)
        else:
            return result
    return np.mean(percentage, axis).astype(np.float64)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.__version__

'1.13.0'

In [209]:
class relevance_VNN(nn.Module):
    def __init__(self,input_size, n_feature, output_size):
        super(relevance_VNN, self).__init__()
        
        self.relevance = nn.Sequential(
        nn.Linear(input_size, n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,input_size),
        )
        
        self.weight = nn.Sequential(
        nn.Linear(input_size, n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,2),
        )
 
    def forward(self, x,verbose=False):
        att = self.relevance(x)
        # relevance * flows
        flow = att.mul(x)
#         print(att.shape,x.shape,flow.shape)
        # reshape to get weighted avaerge from all neighbors at each time lag
        flow = self.weight(x)
        flow = torch.mean(flow,0)   
        return flow

def get_loss_and_metrics(model, batch,criterion, device):
  # Implement forward pass and loss calculation for one batch.
  # Remember to move the batch to device.
  # 
  # Return a tuple:
  # - loss for the batch (Tensor)
  # - number of correctly classified examples in the batch (Tensor)
    data, target = batch[0], batch[1]
#     print(data.shape)
    target = torch.mean(target,0)
    data = torch.tensor(data, dtype=torch.float32)
    target = torch.tensor(target, dtype=torch.float32)
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    pred = model(data)
#     print(pred,target)
    loss = criterion(pred, target)
    
    
    return (pred,target,loss)
    
def step(loss, optimizer):
  # Implement backward pass and update.

  # TODO
    loss = loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [212]:
nodes_dist = pd.read_csv('nodes_dist.csv')
G = nx.read_gpickle('graph.pickle')
od = pd.read_csv('inoutwide.csv')
pbar = tqdm(list(G.nodes))
count = 0

for station in pbar:
    
#     neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    neighbors = nodes_dist.loc[(nodes_dist['o']==station)&\
                               (nodes_dist['dist']<1)&\
                               (nodes_dist['o']!=nodes_dist['d'])]['d'].values.tolist()
    flows = ['incoming_flow-'+station,'outgoing_flow-'+station]+\
            ['incoming_flow-'+i for i in neighbors]+\
                      ['outgoing_flow-'+i for i in neighbors]

    subod = od[['Date','Hour']+flows]
    lags = list(range(1,24))+list(np.array(list(range(2,8)))*24)+list(np.array(list(range(2,8)))*24*7)
    for lag in lags:
        temp = subod[flows].shift(lag)
        temp.columns = [i+'-lag-'+str(lag) for i in flows]
        subod = pd.concat([subod,temp],axis=1)
    subod = subod.dropna()
#     print('here')
    subod_melt = subod.drop(columns=flows).melt(id_vars=['Date','Hour'])
    subod_melt = subod_melt.sort_values(by=['Date','Hour'])
    subod_melt['nearby'] = subod_melt['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
    subod_melt['degree'] = subod_melt.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)
    subod_melt['path_distance'] = subod_melt.apply(lambda x:nx.dijkstra_path_length(G,station,x['nearby']),axis=1)
    subod_melt['lag'] = subod_melt['variable'].apply(lambda x: int(x.split('-')[-1]))
    subod_melt['variable_test'] = subod_melt['variable'].apply(lambda x: x.split('-lag')[0] if '-lag' in x else x)
    subod_melt.sort_values(by=['Date','Hour','variable_test','lag'])
    fts = subod_melt[['value','degree','path_distance']]
    # len(lags) is number of lags, 2 is bidirectional
    fts_train = fts.iloc[:-24*7*2*len(lags)*(len(neighbors)+1),:].values
    scaler = StandardScaler()
    scaler.fit(fts_train)
    fts_train = torch.tensor(scaler.transform(fts_train))

    y = subod[['incoming_flow-'+station,'outgoing_flow-'+station]]
    y_train = y.iloc[:-24*7,:].values
    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)
    y_train = np.repeat(y_train,len(lags)*2*(len(neighbors)+1),0)
    y_train = torch.tensor(y_train)
    train_dataset = torch.utils.data.TensorDataset(fts_train,y_train)
 

    fts_val = torch.tensor(scaler.transform(fts.iloc[-24*7*2*len(lags)*(len(neighbors)+1):,:].values)) 
    y_val = y.iloc[-24*7:,:].values
    y_val = np.repeat(y_val,len(lags)*2*(len(neighbors)+1),0)
    y_val = torch.tensor(y_val)

                      
    validation_dataset = torch.utils.data.TensorDataset(fts_val, y_val)
    
    device = torch.device("mps")

    N_EPOCHS = 200
    BATCH_SIZE = int(1*(len(neighbors)+1)*len(lags))*2
#     print('here')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                                   num_workers=0)
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset,
                                                        batch_size=BATCH_SIZE,
                                                        num_workers=0)
    model = relevance_VNN(input_size=fts.shape[1],n_feature=64,output_size=2)
    model = model.to(device)
    criterion = nn.SmoothL1Loss()
    
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 
#     qbar = tqdm(range(N_EPOCHS), leave=False)
    validation_mae,validation_r2,validation_mape = 999,999,999
    for i in range(N_EPOCHS): 

        total_train_loss = 0.0
        model.train()
        training_batch_count = 1
        for batch in train_dataloader:

#             print('here')
            y_train,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)
            step(loss.float(), optimizer)
            total_train_loss += loss.item()
            mean_train_loss = total_train_loss / training_batch_count
            pbar.set_description('epoch:' + str(i) +\
                ' training batch:' + str(training_batch_count) +\
                ' total train loss:'+ str(round(mean_train_loss,3))+ \
                 ' validation_mae:'+ str(round(validation_mae,3)) +\
                ' validation R2: '+ str(round(validation_r2,3)) +\
                ' validation MAPE: '+ str(round(validation_mape,3)))
            training_batch_count += 1

        batch_count = 0
        
#         if i%50 == 0 and i >=50:
        if i >= 0:
            for batch in validation_dataloader:
    #             print(batch_count)
                with torch.no_grad(): 
                    y_pred,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)
                    y_pred,y_true = y_pred.cpu().numpy(),y_true.cpu().numpy()
                    y_pred = y_scaler.inverse_transform(y_pred.reshape(1, 2))
                    y_true = y_true.reshape(1,2)

                    if batch_count == 0: 
                        y_true_batch = y_true
                        y_pred_batch = y_pred
                    else:
                        y_true_batch = np.concatenate([y_true_batch,y_true])
                        y_pred_batch = np.concatenate([y_pred_batch,y_pred])

                        validation_mae = mean_absolute_error(y_true_batch,y_pred_batch)
                        validation_r2 = r2_score(y_true_batch,y_pred_batch)
                        validation_mape = masked_MAPE(y_true_batch,y_pred_batch)
                        pbar.set_description('epoch:' + str(i) +\
                        ' training batch:' + str(training_batch_count) +\
                        ' total train loss:'+ str(round(mean_train_loss,3))+ \
                         ' validation_mae:'+ str(round(validation_mae,3)) +\
                        ' validation R2: '+ str(round(validation_r2,3)) +\
                        ' validation MAPE: '+ str(round(validation_mape,3)))
                    batch_count += 1
                        
                
                
    if count == 0: 
        y_true_total = y_true_batch
        y_pred_total = y_pred_batch
    else:
        y_true_total = np.concatenate([y_true_total,y_true_batch])
        y_pred_total = np.concatenate([y_pred_total,y_pred_batch])
    count += 1
    break

epoch:199 training batch:5257 total train loss:0.214 validation_mae:25.538 validation R2: 0.38 validation MAPE: 1.607:   0%| | 0/5


In [213]:
# y_true,y_pred = y_true.cpu().numpy(),y_pred.cpu().numpy()
print(r2_score(y_true_total,y_pred_total))
print(mean_absolute_error(y_true_total,y_pred_total))
print(mean_squared_error(y_true_total,y_pred_total,squared=False))
print(masked_MAPE(y_true_total,y_pred_total))


0.3800343019404123
25.537624
42.151985
1.6065904029925389
