In [1]:
import numpy as np
import pandas as pd
from joblib import * 
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import geopandas as gpd
# from geopy.distance import distance,geodesic
from joblib import Parallel, delayed
import warnings
import matplotlib.pyplot as plt
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf,adfuller, kpss,range_unit_root_test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import networkx as nx
from tqdm import tqdm
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')



# different methods, all tested on 09-26 to 10-02

In [2]:
def masked_MAPE(v, v_, axis=None):
    '''
    Mean absolute percentage error.
    :param v: np.ndarray or int, ground truth.
    :param v_: np.ndarray or int, prediction.
    :param axis: axis to do calculation.
    :return: int, MAPE averages on all elements of input.
    '''
    mask = (v == 0)
    percentage = np.abs(v_ - v) / np.abs(v)
    if np.any(mask):
        masked_array = np.ma.masked_array(percentage, mask=mask)  # mask the dividing-zero as invalid
        result = masked_array.mean(axis=axis)
        if isinstance(result, np.ma.MaskedArray):
            return result.filled(np.nan)
        else:
            return result
    return np.mean(percentage, axis).astype(np.float64)

# station-level incoming/outgoing

In [3]:
od = pd.read_csv('inoutwide.csv')
od = od.sort_values(by=['Date','Hour'])
od_test = od.iloc[-24*7:,:]
## lag linear regression, many to one

# 48 continuous hourly lag, 3-7 days ago same hour-of-day lag, 7 weeks ago lag
for station in tqdm([i for i in od.columns if '_flow' in i]):
    subod = od[['Date','Hour',station,
 'dow-0','dow-1','dow-2','dow-3','dow-4','dow-5','dow-6',
 'hour-0','hour-1','hour-2','hour-3','hour-4','hour-5','hour-6','hour-7','hour-8','hour-9','hour-10',
 'hour-11','hour-12','hour-13','hour-14','hour-15','hour-16','hour-17','hour-18','hour-19','hour-20',
 'hour-21','hour-22','hour-23']]
    for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
        temp = subod[[station]].shift(lag)
        temp.columns = ['station'+'-lag-'+str(lag)]
        subod = pd.concat([subod,temp],axis=1)

    subod = subod.sort_values(by=['Date','Hour'])
    subod = subod.dropna()

    x = subod[[col for col in subod.columns if '-lag-' in col]]
    y = subod[[station]]


    x_train = x.iloc[:-24*7,:].values
    y_train = y.iloc[:-24*7,].values

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)

    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)

    model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

#     print('out of sample R2')
    x_test = x.iloc[-24*7:,:].values
    y_test = y.iloc[-24*7:].values

    x_test = scaler.transform(x_test)
    y_pred = model.predict(x_test)
    y_pred = y_scaler.inverse_transform(y_pred)

    od_test[station+'_pred'] = y_pred

flow_columns = [i for i in od.columns if '_flow' in i]
y_test = od_test[flow_columns].values
y_pred = od_test[[i+'_pred' for i in flow_columns]].values
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))
print(mean_squared_error(y_test,y_pred,squared=False))
print(np.mean(mean_squared_error(y_test,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
print(masked_MAPE(y_test,y_pred))

100%|███████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:24<00:00,  1.45s/it]

0.9387008610819879
16.6033665998665
26.67203137438082
0.12036546191120305
0.41237860129069803





## adding  surrounding flows

In [8]:
od = pd.read_csv('inoutlongNear.csv')

od = od.sort_values(by=['station','Date','Hour'])
for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
    temp = od[['station','incoming_flow','outgoing_flow','nearby_incoming','nearby_outgoing']].shift(lag)
    temp.columns = ['station'+'-'+str(lag),
                    'incoming_flow'+'-'+str(lag),'outgoing_flow'+'-'+str(lag),
                    'nearby_incoming'+'-'+str(lag),'nearby_outgoing'+'-'+str(lag)]
    od = pd.concat([od,temp],axis=1)
    
od = od.sort_values(by=['Date','Hour','station'])
od = od.dropna()
od = od.loc[od['station']==od['station'+'-'+str(lag)]]
od = od.drop(columns=[i for i in od.columns if 'station-' in i])

In [10]:
# total nearby incoming outgoing
pbar = tqdm(od.station.unique())
for station in pbar:
    i = 0
    subod = od.loc[od['station']==station]
    x = subod.drop(columns=['Date','Hour','incoming_flow','outgoing_flow','station','nearby_incoming','nearby_outgoing'])
    y = subod[['incoming_flow','outgoing_flow']]

    # incoming model  training
    x_train = x.iloc[:-24*7,:].values
    x_test = x.iloc[-24*7:,:].values
    y_train_in = y.iloc[:-24*7,0].values.reshape(-1,1)
    y_train_out = y.iloc[:-24*7,1].values.reshape(-1,1)
    y_test_in = y.iloc[:-24*7,0].values.reshape(-1,1)
    y_test_out = y.iloc[-24*7:,1].values.reshape(-1,1)

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    
    y_in_scaler = StandardScaler()
    y_in_scaler.fit(y_train_in)
    y_train_in = y_in_scaler.transform(y_train_in)
    
    model_in = LinearRegression(fit_intercept=False).fit(x_train, y_train_in)
    y_pred_in = model_in.predict(x_test)
    y_pred_in = y_in_scaler.inverse_transform(y_pred_in)
    
    y_pred_in_train = model_in.predict(x_train)
    y_pred_in_train = y_in_scaler.inverse_transform(y_pred_in_train)
    
    in_trian_r2_score = r2_score(y_in_scaler.inverse_transform(y_train_in),y_pred_in_train)
    in_test_r2_score = r2_score(y_test_in,y_pred_in_train)
    # outgoing model training
    
    y_out_scaler = StandardScaler()
    y_out_scaler.fit(y_train_out)
    y_train_out = y_out_scaler.transform(y_train_out)

    model_out = LinearRegression(fit_intercept=False).fit(x_train, y_train_out)
    y_pred_out = model_out.predict(x_test)
    y_pred_out = y_out_scaler.inverse_transform(y_pred_out)
    
    y_pred_out_train = model_out.predict(x_train)
    y_pred_out_train = y_in_scaler.inverse_transform(y_pred_out_train)
    
    out_trian_r2_score = r2_score(y_in_scaler.inverse_transform(y_train_out),y_pred_out_train)
    out_test_r2_score = r2_score(y_test_out,y_pred_out)
    if i == 0: 
        y_true = y.iloc[-24*7:,:].values
        y_pred = np.concatenate([y_pred_in,y_pred_out],axis=1)
    else:
        y_true = np.concatenate([y_true,y.iloc[-24*7:,:].values])
        y_pred = np.concatenate([y_true,np.concatenate([y_pred_in,y_pred_out],axis=1)])
    i += 1
    
    pbar.set_description('incoming_train_R2:'+ str(round(in_trian_r2_score,4))+\
                        'incoming_test_R2:'+ str(round(in_test_r2_score,4))+\
                         'outgoing_train_R2:'+ str(round(out_trian_r2_score,4))+\
                        'outgoing_test_R2:'+ str(round(out_test_r2_score,4)))
    
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(mean_squared_error(y_true,y_pred,squared=False))
print(np.mean(mean_squared_error(y_true,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
print(masked_MAPE(y_true,y_pred))    




0.9662666138880707
16.912960001963743
24.40287604161488
0.11012522464007464
0.42478471631772374


In [11]:
# nearby stations incoming outgoing, separately
od = pd.read_csv('inoutwide.csv')
G = nx.read_gpickle('graph.pickle')
pbar = tqdm(list(G.nodes))
for station in pbar:
    i = 0
    neightbor = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    flows = ['incoming_flow-'+i for i in [station]+neightbor]+\
              ['outgoing_flow-'+i for i in [station]+neightbor]
    subod = od[['Date','Hour']+['hour-'+str(i) for i in range(24)]+['dow-'+str(i) for i in range(7)]+flows]
    
    for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
        temp = subod[flows].shift(lag)
        temp.columns = [i+'-lag-'+str(lag) for i in flows]
        subod = pd.concat([subod,temp],axis=1)

    subod = subod.sort_values(by=['Date','Hour'])
    subod = subod.dropna()
    lags = [i for i in subod.columns if 'lag' in i]
    x = subod[lags+['hour-'+str(i) for i in range(24)]+['dow-'+str(i) for i in range(7)]]
    y = subod[['incoming_flow-'+station,'outgoing_flow-'+station]]

    # incoming model  training
    x_train = x.iloc[:-24*7,:].values
    x_test = x.iloc[-24*7:,:].values
    y_train_in = y.iloc[:-24*7,0].values.reshape(-1,1)
    y_train_out = y.iloc[:-24*7,1].values.reshape(-1,1)
    y_test_in = y.iloc[:-24*7,0].values.reshape(-1,1)
    y_test_out = y.iloc[-24*7:,1].values.reshape(-1,1)

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    
    y_in_scaler = StandardScaler()
    y_in_scaler.fit(y_train_in)
    y_train_in = y_in_scaler.transform(y_train_in)
    
    model_in = LinearRegression(fit_intercept=False).fit(x_train, y_train_in)
    y_pred_in = model_in.predict(x_test)
    y_pred_in = y_in_scaler.inverse_transform(y_pred_in)
    
    y_pred_in_train = model_in.predict(x_train)
    y_pred_in_train = y_in_scaler.inverse_transform(y_pred_in_train)
    
    in_trian_r2_score = r2_score(y_in_scaler.inverse_transform(y_train_in),y_pred_in_train)
    in_test_r2_score = r2_score(y_test_in,y_pred_in_train)
    # outgoing model training
    
    y_out_scaler = StandardScaler()
    y_out_scaler.fit(y_train_out)
    y_train_out = y_out_scaler.transform(y_train_out)

    model_out = LinearRegression(fit_intercept=False).fit(x_train, y_train_out)
    y_pred_out = model_out.predict(x_test)
    y_pred_out = y_out_scaler.inverse_transform(y_pred_out)
    
    y_pred_out_train = model_out.predict(x_train)
    y_pred_out_train = y_in_scaler.inverse_transform(y_pred_out_train)
    
    out_trian_r2_score = r2_score(y_in_scaler.inverse_transform(y_train_out),y_pred_out_train)
    out_test_r2_score = r2_score(y_test_out,y_pred_out)
    if i == 0: 
        y_true = y.iloc[-24*7:,:].values
        y_pred = np.concatenate([y_pred_in,y_pred_out],axis=1)
    else:
        y_true = np.concatenate([y_true,y.iloc[-24*7:,:].values])
        y_pred = np.concatenate([y_true,np.concatenate([y_pred_in,y_pred_out],axis=1)])
    i += 1
    
    pbar.set_description('incoming_train_R2:'+ str(round(in_trian_r2_score,4))+\
                        'incoming_test_R2:'+ str(round(in_test_r2_score,4))+\
                         'outgoing_train_R2:'+ str(round(out_trian_r2_score,4))+\
                        'outgoing_test_R2:'+ str(round(out_test_r2_score,4)))
    
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(mean_squared_error(y_true,y_pred,squared=False))
print(np.mean(mean_squared_error(y_true,y_pred,multioutput='raw_values',squared=False)/np.std(y_test)))
print(masked_MAPE(y_true,y_pred))    




incoming_train_R2:0.9529incoming_test_R2:0.9529outgoing_train_R2:0.9535outgoing_test_R2:0.9747: 100%|█| 50/50 [00:17<00:00,  2.85i

0.9674945899484979
10.053871554330986
15.665256287706036
0.07069412083993876
0.3914008420693012





In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.__version__

'1.13.0'

In [4]:
class relevance_VNN(nn.Module):
    def __init__(self,input_size,n_neighbor, n_feature, output_size):
        super(relevance_VNN, self).__init__()
        
        self.relevance = nn.Sequential(
        nn.Linear(2, 1),
        nn.Sigmoid(),
        nn.Linear(1,1),
        )
        
        self.weight = nn.Sequential(
        nn.Linear((n_neighbor+1)*2, n_feature),
        nn.Sigmoid(),
        nn.Linear(n_feature,n_feature),
        nn.Sigmoid(),
#         nn.Linear(n_feature,n_feature),
#         nn.Sigmoid(),
        nn.Linear(n_feature,2),
        )
 
    def forward(self, x, n_neighbor,verbose=False):
        att = self.relevance(x)
        
        # relevance * flows
        flow = att.mul(x[:,:1])
        print(att.shape,x[:,:1].shape,flow.shape)
        # reshape to get weighted avaerge from all neighbors at each time lag
        flow = flow.view(int(flow.shape[0]/(n_neighbor+1)/2),(n_neighbor+1)*2)
        flow = self.weight(x)
        return flow

def get_loss_and_metrics(model, batch, n_neighbor,criterion, device):
  # Implement forward pass and loss calculation for one batch.
  # Remember to move the batch to device.
  # 
  # Return a tuple:
  # - loss for the batch (Tensor)
  # - number of correctly classified examples in the batch (Tensor)
    data, target = batch[0], batch[1]
    print(data.shape)
    target[:int(target.shape[0]/(58*2*(n_neighbor+1)))]
    data = torch.tensor(data, dtype=torch.float32)
    target = torch.tensor(target, dtype=torch.float32)
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    pred = model(data,n_neighbor)
    loss = criterion(pred, target)
    
    
    return (pred,target,loss)
    
def step(loss, optimizer):
  # Implement backward pass and update.

  # TODO
    loss = loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [17]:
nodes_dist.loc[(nodes_dist['o']==station)&(nodes_dist['dist']<1)]['d'].values.tolist()

['PLZA']

In [26]:
nodes_dist = pd.read_csv('nodes_dist.csv')
# neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
neighbors = nodes_dist.loc[(nodes_dist['o']==station)&\
                           (nodes_dist['dist']<1)&\
                           (nodes_dist['o']!=nodes_dist['d'])]['d'].values.tolist()
flows = ['incoming_flow-'+station,'outgoing_flow-'+station]+\
        ['incoming_flow-'+i for i in neighbors]+\
                  ['outgoing_flow-'+i for i in neighbors]

subod = od[['Date','Hour']+flows]
lags = list(range(1,4))+list(np.array(list(range(2,8)))*24)+list(np.array(list(range(2,8)))*24*7)
for lag in lags:
    temp = subod[flows].shift(lag)
    temp.columns = [i+'-lag-'+str(lag) for i in flows]
    subod = pd.concat([subod,temp],axis=1)
subod = subod.dropna()
print('here')
subod_melt = subod.drop(columns=flows).melt(id_vars=['Date','Hour'])
subod_melt = subod_melt.sort_values(by=['Date','Hour'])
subod_melt['nearby'] = subod_melt['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
subod_melt['degree'] = subod_melt.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)
subod_melt['path_distance'] = subod_melt.apply(lambda x:nx.dijkstra_path_length(G,station,x['nearby']),axis=1)

fts = subod_melt[['value','degree']]
# 58 is number of lags, 2 is bidirectional
fts_train = fts.iloc[:-24*7*len(lags)*2*(len(neighbors)+1),:].values

scaler = StandardScaler()
scaler.fit(fts_train)
fts_train = torch.tensor(scaler.transform(fts_train))

here


NameError: name 'torch' is not defined

In [None]:
G = nx.read_gpickle('graph.pickle')
od = pd.read_csv('inoutwide.csv')
pbar = tqdm(list(G.nodes))
count = 0

for station in pbar:
    
    neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    flows = ['incoming_flow-'+station,'outgoing_flow-'+station]+\
            ['incoming_flow-'+i for i in neighbors]+\
                      ['outgoing_flow-'+i for i in neighbors]

    subod = od[['Date','Hour']+flows]
    lags = list(range(1,4))+list(np.array(list(range(2,8)))*24)+list(np.array(list(range(2,8)))*24*7)
    for lag in lags:
        temp = subod[flows].shift(lag)
        temp.columns = [i+'-lag-'+str(lag) for i in flows]
        subod = pd.concat([subod,temp],axis=1)
    subod = subod.dropna()
    print('here')
    subod_melt = subod.drop(columns=flows).melt(id_vars=['Date','Hour'])
    subod_melt = subod_melt.sort_values(by=['Date','Hour'])
    subod_melt['nearby'] = subod_melt['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
    subod_melt['degree'] = subod_melt.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)

    fts = subod_melt[['value','degree']]
    # 58 is number of lags, 2 is bidirectional
    fts_train = fts.iloc[:-24*7*len(lags)*2*(len(neighbors)+1),:].values

    scaler = StandardScaler()
    scaler.fit(fts_train)
    fts_train = torch.tensor(scaler.transform(fts_train))

    y = subod[['incoming_flow-'+station]]
    y_train = y.iloc[:-24*7,:].values
    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = torch.tensor(y_scaler.transform(y_train))
    y_train = y_train.repeat(len(lags)*2*(len(neighbors)+1),1)
    train_dataset = torch.utils.data.TensorDataset(fts_train,y_train)
 

    fts_val = torch.tensor(scaler.transform(fts.iloc[-24*7*len(lags)*2*(len(neighbors)+1):,:].values)) 
    y_val = y.iloc[-24*7:,:].values
    y_val = torch.tensor(y_val)
    y_val = y_val.repeat(len(lags)*2*(len(neighbors)+1),1)
    validation_dataset = torch.utils.data.TensorDataset(fts_val, y_val)
    
    device = torch.device("mps")

    N_EPOCHS = 1000
    BATCH_SIZE = int(1*2*(len(neighbors)+1)*len(lags))
    print('here')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=len(validation_dataset),
                                                        num_workers=0)
    model = relevance_VNN(input_size=fts.shape[1],n_neighbor=len(neighbors),n_feature=64,output_size=2)
    model = model.to(device)
    criterion = nn.SmoothL1Loss()
    
    optimizer = torch.optim.SGD(model.parameters(), lr=0.00001) 
    train_losses = []

    for i in range(N_EPOCHS): 
        total_train_loss = 0.0
        model.train()

        for batch in train_dataloader:
            y_train,y_true,loss = get_loss_and_metrics(model, batch, len(neighbors),criterion, device)
            step(loss.float(), optimizer)
            total_train_loss += loss.item()
        for batch in validation_dataloader:
            with torch.no_grad(): 
                y_pred,y_true,loss = get_loss_and_metrics(model, batch,len(neighbors), criterion, device)
                y_pred,y_true = y_pred.cpu().numpy(),y_true.cpu().numpy()
                y_pred = y_scaler.inverse_transform(y_pred)
                validation_mae = mean_absolute_error(y_true,y_pred)
                validation_r2 = r2_score(y_true,y_pred)
                validation_mape = masked_MAPE(y_true,y_pred)
    
        mean_train_loss = total_train_loss / len(train_dataloader)
        pbar.set_description('train_loss:'+ str(round(mean_train_loss,4))+ \
                             ' validation_mae:'+ str(round(validation_mae,4)) +\
                            ' validation R2: '+ str(round(validation_r2,4)) +\
                            ' validation MAPE: '+ str(round(validation_mape,4)))
    if count == 0: 
        y_true_total = y_true
        y_pred_total = y_val
    else:
        y_true_total = np.concatenate([y_true_total,y_true])
        y_pred_total = np.concatenate([y_pred_total,y_val])
    count += 1

  0%|                                                                                                      | 0/50 [00:00<?, ?it/s]

In [6]:
int(1*2*(len(neighbors)+1)*58)

87

In [102]:
subod_melt = subod.drop(columns=['incoming_flow-'+station,
                                 'outgoing_flow-'+station]).melt(id_vars=['Date','Hour'])
subod_melt['nearby'] = subod_melt['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
subod_melt['degree'] = subod_melt.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)

In [188]:
station = '19TH'
neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
flows = ['incoming_flow-'+station,'outgoing_flow-'+station]+\
    ['incoming_flow-'+i for i in neighbors]+\
              ['outgoing_flow-'+i for i in neighbors]

subod = od[['Date','Hour']+flows]

for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
    temp = subod[flows].shift(lag)
    temp.columns = [i+'-lag-'+str(lag) for i in flows]
    subod = pd.concat([subod,temp],axis=1)
subod = subod.dropna()

subod_melt = subod.drop(columns=flows).melt(id_vars=['Date','Hour'])
subod_melt['nearby'] = subod_melt['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
subod_melt['degree'] = subod_melt.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)

fts = subod_melt[['value','degree']]
fts_train = fts.iloc[:-24*7*58*2*(len(neighbors)+1),:].values
scaler = StandardScaler()
scaler.fit(fts_train)
fts_train = torch.tensor(scaler.transform(fts_train))

y = subod[['incoming_flow-'+station]]
y_train = y.iloc[:-24*7,:].values
y_scaler = StandardScaler()
y_scaler.fit(y_train)
y_train = torch.tensor(y_scaler.transform(y_train))
y_train = y_train.repeat(58*2*(len(neighbors)+1),1)
train_dataset = torch.utils.data.TensorDataset(fts_train,y_train)

In [189]:

y_train.shape

torch.Size([1829088, 1])

In [190]:
y_train

tensor([[-0.7100],
        [-0.9838],
        [-0.9916],
        ...,
        [-0.6866],
        [-0.6084],
        [-0.5927]], dtype=torch.float64)

In [183]:
subod_melt.sort_values(by=['Date','Hour'])

Unnamed: 0,Date,Hour,variable,value,nearby,degree
0,2022-02-19,0,incoming_flow-19TH-lag-1,85.0,19TH,0
5424,2022-02-19,0,outgoing_flow-19TH-lag-1,56.0,19TH,0
10848,2022-02-19,0,incoming_flow-LAKE-lag-1,66.0,LAKE,2
16272,2022-02-19,0,incoming_flow-12TH-lag-1,95.0,12TH,1
21696,2022-02-19,0,outgoing_flow-LAKE-lag-1,19.0,LAKE,2
...,...,...,...,...,...,...
1865855,2022-10-02,23,outgoing_flow-19TH-lag-1176,22.0,19TH,0
1871279,2022-10-02,23,incoming_flow-LAKE-lag-1176,38.0,LAKE,2
1876703,2022-10-02,23,incoming_flow-12TH-lag-1176,52.0,12TH,1
1882127,2022-10-02,23,outgoing_flow-LAKE-lag-1176,19.0,LAKE,2


In [175]:
y_train.shape

torch.Size([1829088, 1])

In [176]:
fts_train.shape

torch.Size([1850784, 2])

In [146]:
fts_train.shape[0]

609696

In [147]:
fts_train.shape

torch.Size([609696, 2])

In [129]:
609696/116

5256.0

In [114]:
len(list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7))

58

In [118]:
640032/5424

118.0

In [119]:
58*2

116

In [117]:
subod.shape

(5424, 120)

In [115]:
subod_melt

Unnamed: 0,Date,Hour,variable,value,nearby,degree
0,2022-02-19,0,incoming_flow-CAST,7.0,CAST,0
1,2022-02-19,1,incoming_flow-CAST,1.0,CAST,0
2,2022-02-19,2,incoming_flow-CAST,0.0,CAST,0
3,2022-02-19,3,incoming_flow-CAST,0.0,CAST,0
4,2022-02-19,4,incoming_flow-CAST,0.0,CAST,0
...,...,...,...,...,...,...
640027,2022-10-02,19,outgoing_flow-CAST-lag-1176,14.0,CAST,0
640028,2022-10-02,20,outgoing_flow-CAST-lag-1176,13.0,CAST,0
640029,2022-10-02,21,outgoing_flow-CAST-lag-1176,10.0,CAST,0
640030,2022-10-02,22,outgoing_flow-CAST-lag-1176,12.0,CAST,0


In [72]:
neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
flows = ['incoming_flow-'+station,'outgoing_flow-'+station]+\
    ['incoming_flow-'+i for i in neighbors]+\
              ['outgoing_flow-'+i for i in neighbors]

subod = od[['Date','Hour']+flows]

for lag in list(range(1,48))+list(np.array(list(range(3,8)))*24)+list(np.array(list(range(2,8)))*24*7):
    temp = subod[flows].shift(lag)
    temp.columns = [i+'-lag-'+str(lag) for i in flows]
    subod = pd.concat([subod,temp],axis=1)
subod = subod.dropna()

In [100]:
test.head()

Unnamed: 0,Date,Hour,variable,value,nearby,degree
0,2022-02-19,0,incoming_flow-CAST,7.0,CAST,0
1,2022-02-19,1,incoming_flow-CAST,1.0,CAST,0
2,2022-02-19,2,incoming_flow-CAST,0.0,CAST,0
3,2022-02-19,3,incoming_flow-CAST,0.0,CAST,0
4,2022-02-19,4,incoming_flow-CAST,0.0,CAST,0


In [91]:
'incoming_flow-CAST'.split('flow-')[1].split('-')[0]

'CAST'

In [97]:
test = subod.melt(id_vars=['Date','Hour'])
test['nearby'] = test['variable'].apply(lambda x:x.split('flow-')[1].split('-')[0])
test['degree'] = test.apply(lambda x:nx.shortest_path_length(G,station,x['nearby']),axis=1)

In [87]:
test = np.array([1.1,1.2,1.3,2.1,2.2,2.3]).reshape(-1,1)
test = torch.tensor(test)
test.view((2,3))

tensor([[1.1000, 1.2000, 1.3000],
        [2.1000, 2.2000, 2.3000]], dtype=torch.float64)

In [53]:
print(r2_score(y_test,y_pred_vnn))
print(mean_absolute_error(y_test,y_pred_vnn))
print(mean_squared_error(y_test,y_pred_vnn,squared=False))
print(mean_squared_error(y_test,y_pred_vnn,squared=False)/np.mean(np.var(y_pred)))
print(masked_MAPE(y_test,y_pred_vnn))


0.9783333554260818
16.76745470988621
32.61365709111786
0.0006806279690539939
0.5030006782152145


In [446]:
for batch in validation_dataloader:
    with torch.no_grad(): 
        y_pred,y_true,loss = get_loss_and_metrics(model, batch, criterion, device)

In [452]:
y_true.numpy()

array([[33., 11.],
       [35., 33.],
       [57.,  4.],
       ...,
       [22., 16.],
       [ 7.,  5.],
       [71., 31.]], dtype=float32)

In [453]:
y_true,y_pred = y_true.cpu().numpy(),y_pred.cpu().numpy()
print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(mean_squared_error(y_true,y_pred,squared=False))
print(masked_MAPE(y_true,y_pred))


0.9097482689371424
21.502193
66.24005
0.30688763313285594
