In [30]:
import numpy as np
import pandas as pd
from joblib import * 
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_percentage_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import geopandas as gpd
# from geopy.distance import distance,geodesic
from joblib import Parallel, delayed
import warnings
import matplotlib.pyplot as plt
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf,adfuller, kpss,range_unit_root_test
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import networkx as nx
from tqdm import tqdm
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Different methods, all tested on the same week (09-26 to 10-02)

In [31]:
def masked_MAPE(v, v_, axis=None):
    '''
    Mean absolute percentage error that skips zero ground-truth entries.

    Elements where the ground truth is exactly 0 are masked out, so they
    contribute neither to the numerator nor to the element count.
    :param v: array-like or scalar, ground truth.
    :param v_: array-like or scalar, prediction (broadcastable against v).
    :param axis: axis to do calculation; None averages over all elements.
    :return: float (axis=None) or np.ndarray (axis given) with the MAPE as a
             fraction (not multiplied by 100); NaN where every element of a
             slice was masked.
    '''
    # asarray lets plain lists/tuples through; ndarrays are passed unchanged.
    v = np.asarray(v)
    v_ = np.asarray(v_)
    mask = (v == 0)
    # The zero-truth entries produce inf/nan here; they are masked just below,
    # so silence the division warnings locally instead of relying on a global filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        percentage = np.abs(v_ - v) / np.abs(v)
    if np.any(mask):
        masked_array = np.ma.masked_array(percentage, mask=mask)  # mask the dividing-zero as invalid
        result = masked_array.mean(axis=axis)
        if isinstance(result, np.ma.MaskedArray):
            # fully masked slices become NaN instead of staying masked
            return result.filled(np.nan)
        return result
    return np.mean(percentage, axis).astype(np.float64)

# station-level incoming/outgoing

In [None]:
# Hold out the last week (24*7 hourly rows) of every series for testing.
TEST_HOURS = 24*7

od = pd.read_csv('inoutwide.csv')
od = od.sort_values(by=['Date','Hour'])
# .copy() gives an independent frame, so the prediction columns added below are
# not written through a slice of `od` (chained assignment).
od_test = od.iloc[-TEST_HOURS:,:].copy()
## lag linear regression, many to one

# Lags in hours: 1-47 consecutive hours, the same hour-of-day 3-7 days ago,
# and the same hour-of-day 2-7 weeks ago.
lags = list(range(1,48))+[day*24 for day in range(3,8)]+[week*24*7 for week in range(2,8)]
calendar_columns = ['dow-'+str(d) for d in range(7)]+['hour-'+str(h) for h in range(24)]
flow_columns = [i for i in od.columns if '_flow' in i]

for station in tqdm(flow_columns):
    subod = od[['Date','Hour',station]+calendar_columns]

    # Build every lagged copy first and concatenate once (instead of growing
    # the frame by one column per iteration).
    lagged = []
    for lag in lags:
        temp = subod[[station]].shift(lag)
        temp.columns = ['station'+'-lag-'+str(lag)]
        lagged.append(temp)
    subod = pd.concat([subod]+lagged,axis=1)

    subod = subod.sort_values(by=['Date','Hour'])
    subod = subod.dropna()

    # Only the lagged flows are regressors; the calendar dummies are selected
    # above but are not fed to the model.
    x = subod[[col for col in subod.columns if '-lag-' in col]]
    y = subod[[station]]

    x_train = x.iloc[:-TEST_HOURS,:].values
    y_train = y.iloc[:-TEST_HOURS,].values

    # Scalers are fitted on the training rows only, then reused for the test rows.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)

    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)

    # No intercept: inputs and target are both centred by the scalers.
    model = LinearRegression(fit_intercept=False).fit(x_train, y_train)

    x_test = x.iloc[-TEST_HOURS:,:].values
    y_test = y.iloc[-TEST_HOURS:].values

    x_test = scaler.transform(x_test)
    y_pred = model.predict(x_test)
    y_pred = y_scaler.inverse_transform(y_pred)

    # predict() returns shape (n, 1); flatten it to a plain column.
    od_test[station+'_pred'] = y_pred.ravel()

y_test = od_test[flow_columns].values
y_pred = od_test[[i+'_pred' for i in flow_columns]].values

# Per-series RMSE via sqrt(MSE): the `squared=` keyword of mean_squared_error is
# deprecated since scikit-learn 1.4, so it is avoided here.
rmse_per_series = np.sqrt(mean_squared_error(y_test,y_pred,multioutput='raw_values'))

print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))
print(np.mean(rmse_per_series))
print(np.mean(rmse_per_series/np.std(y_test)))
print(masked_MAPE(y_test,y_pred))

 43%|███████████████████████████████████████▌                                                    | 43/100 [00:12<00:16,  3.35it/s]

## Adding surrounding flows

In [None]:
od = pd.read_csv('inoutlongNear.csv')

# Long format: sort so each station's rows are contiguous and chronological
# before shifting.
od = od.sort_values(by=['station','Date','Hour'])

# Lags in hours: 1-47 consecutive hours, same hour 3-7 days ago, same hour 2-7 weeks ago.
lags = list(range(1,48))+[day*24 for day in range(3,8)]+[week*24*7 for week in range(2,8)]
lag_source_columns = ['station','incoming_flow','outgoing_flow','nearby_incoming','nearby_outgoing']

# Shift every source column by each lag, then concatenate once.
lagged = []
for lag in lags:
    temp = od[lag_source_columns].shift(lag)
    temp.columns = [col+'-'+str(lag) for col in lag_source_columns]
    lagged.append(temp)
od = pd.concat([od]+lagged,axis=1)

od = od.sort_values(by=['Date','Hour','station'])
od = od.dropna()

# shift() is positional, so early rows of a station hold values from the
# previous station. Checking the station label at the LARGEST lag is enough
# when each station's rows are contiguous: if that row is the same station,
# every shorter lag is too.
max_lag = max(lags)
od = od.loc[od['station']==od['station'+'-'+str(max_lag)]]
od = od.drop(columns=[i for i in od.columns if 'station-' in i])

In [None]:
# total nearby incoming outgoing
# One pair of linear models (incoming / outgoing) per station, trained on the
# lagged columns of `od` and scored on the last week of rows.
TEST_HOURS = 24*7
target_columns = ['incoming_flow','outgoing_flow']
non_feature_columns = ['Date','Hour','incoming_flow','outgoing_flow','station','nearby_incoming','nearby_outgoing']

# Collect the hold-out truth/prediction of EVERY station; they are stacked
# after the loop so the summary metrics cover all stations.
y_true_parts = []
y_pred_parts = []

pbar = tqdm(od.station.unique())
for station in pbar:
    subod = od.loc[od['station']==station]
    x = subod.drop(columns=non_feature_columns)
    y = subod[target_columns]

    x_train = x.iloc[:-TEST_HOURS,:].values
    x_test = x.iloc[-TEST_HOURS:,:].values

    # Feature scaler is fitted on the training rows only.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    train_r2 = []       # [incoming, outgoing]
    test_r2 = []        # [incoming, outgoing]
    station_pred = []   # hold-out predictions, one (n, 1) array per target

    for column in range(len(target_columns)):
        y_train_col = y.iloc[:-TEST_HOURS,column].values.reshape(-1,1)
        y_test_col = y.iloc[-TEST_HOURS:,column].values.reshape(-1,1)

        # Each target gets its own scaler; predictions are mapped back with
        # that same scaler.
        y_col_scaler = StandardScaler()
        y_col_scaler.fit(y_train_col)

        # No intercept: inputs and target are both centred by the scalers.
        model = LinearRegression(fit_intercept=False).fit(x_train, y_col_scaler.transform(y_train_col))

        y_fit_train = y_col_scaler.inverse_transform(model.predict(x_train))
        y_fit_test = y_col_scaler.inverse_transform(model.predict(x_test))

        train_r2.append(r2_score(y_train_col,y_fit_train))
        # Test R2 compares hold-out truth with hold-out predictions.
        test_r2.append(r2_score(y_test_col,y_fit_test))
        station_pred.append(y_fit_test)

    y_true_parts.append(y.iloc[-TEST_HOURS:,:].values)
    y_pred_parts.append(np.concatenate(station_pred,axis=1))

    pbar.set_description('incoming_train_R2:'+ str(round(train_r2[0],4))+\
                        'incoming_test_R2:'+ str(round(test_r2[0],4))+\
                         'outgoing_train_R2:'+ str(round(train_r2[1],4))+\
                        'outgoing_test_R2:'+ str(round(test_r2[1],4)))

y_true = np.concatenate(y_true_parts)
y_pred = np.concatenate(y_pred_parts)

# Per-output RMSE via sqrt(MSE): the `squared=` keyword of mean_squared_error is
# deprecated since scikit-learn 1.4, so it is avoided here.
rmse_per_output = np.sqrt(mean_squared_error(y_true,y_pred,multioutput='raw_values'))

print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(np.mean(rmse_per_output))
print(np.mean(rmse_per_output/np.std(y_true)))
print(masked_MAPE(y_true,y_pred))




In [None]:
# nearby stations incoming outgoing, separately
# One pair of linear models (incoming / outgoing) per graph node, using the
# lagged flows of the node and of its radius-1 graph neighbours plus the
# calendar dummies; scored on the last week of rows.
TEST_HOURS = 24*7

od = pd.read_csv('inoutwide.csv')
# shift() is positional, so the rows must be chronological BEFORE the lags are built.
od = od.sort_values(by=['Date','Hour'])
# NOTE(review): nx.read_gpickle was removed in NetworkX 3.0; on newer versions
# this needs pickle.load on a trusted file instead - confirm the pinned version.
G = nx.read_gpickle('graph.pickle')

# Lags in hours: 1-47 consecutive hours, same hour 3-7 days ago, same hour 2-7 weeks ago.
lag_hours = list(range(1,48))+[day*24 for day in range(3,8)]+[week*24*7 for week in range(2,8)]
calendar_columns = ['hour-'+str(h) for h in range(24)]+['dow-'+str(d) for d in range(7)]

# Collect the hold-out truth/prediction of EVERY station; they are stacked
# after the loop so the summary metrics cover all stations.
y_true_parts = []
y_pred_parts = []

pbar = tqdm(list(G.nodes))
for station in pbar:
    neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    flows = ['incoming_flow-'+node for node in [station]+neighbors]+\
            ['outgoing_flow-'+node for node in [station]+neighbors]
    subod = od[['Date','Hour']+calendar_columns+flows]

    # Build every lagged copy first and concatenate once.
    lagged = []
    for lag in lag_hours:
        temp = subod[flows].shift(lag)
        temp.columns = [flow+'-lag-'+str(lag) for flow in flows]
        lagged.append(temp)
    # Take the lag column names from the frames just built rather than by a
    # 'lag' substring match, which would also pick up a time-t flow column
    # whenever a station name happens to contain "lag".
    lag_columns = [col for frame in lagged for col in frame.columns]
    subod = pd.concat([subod]+lagged,axis=1)

    subod = subod.dropna()
    x = subod[lag_columns+calendar_columns]
    y = subod[['incoming_flow-'+station,'outgoing_flow-'+station]]

    x_train = x.iloc[:-TEST_HOURS,:].values
    x_test = x.iloc[-TEST_HOURS:,:].values

    # Feature scaler is fitted on the training rows only.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    train_r2 = []       # [incoming, outgoing]
    test_r2 = []        # [incoming, outgoing]
    station_pred = []   # hold-out predictions, one (n, 1) array per target

    for column in range(y.shape[1]):
        y_train_col = y.iloc[:-TEST_HOURS,column].values.reshape(-1,1)
        y_test_col = y.iloc[-TEST_HOURS:,column].values.reshape(-1,1)

        # Each target gets its own scaler; predictions are mapped back with
        # that same scaler.
        y_col_scaler = StandardScaler()
        y_col_scaler.fit(y_train_col)

        # No intercept: inputs and target are both centred by the scalers.
        model = LinearRegression(fit_intercept=False).fit(x_train, y_col_scaler.transform(y_train_col))

        y_fit_train = y_col_scaler.inverse_transform(model.predict(x_train))
        y_fit_test = y_col_scaler.inverse_transform(model.predict(x_test))

        train_r2.append(r2_score(y_train_col,y_fit_train))
        # Test R2 compares hold-out truth with hold-out predictions.
        test_r2.append(r2_score(y_test_col,y_fit_test))
        station_pred.append(y_fit_test)

    y_true_parts.append(y.iloc[-TEST_HOURS:,:].values)
    y_pred_parts.append(np.concatenate(station_pred,axis=1))

    pbar.set_description('incoming_train_R2:'+ str(round(train_r2[0],4))+\
                        'incoming_test_R2:'+ str(round(test_r2[0],4))+\
                         'outgoing_train_R2:'+ str(round(train_r2[1],4))+\
                        'outgoing_test_R2:'+ str(round(test_r2[1],4)))

y_true = np.concatenate(y_true_parts)
y_pred = np.concatenate(y_pred_parts)

# Per-output RMSE via sqrt(MSE): the `squared=` keyword of mean_squared_error is
# deprecated since scikit-learn 1.4, so it is avoided here.
rmse_per_output = np.sqrt(mean_squared_error(y_true,y_pred,multioutput='raw_values'))

print(r2_score(y_true,y_pred))
print(mean_absolute_error(y_true,y_pred))
print(np.mean(rmse_per_output))
print(np.mean(rmse_per_output/np.std(y_true)))
print(masked_MAPE(y_true,y_pred))




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.__version__

In [None]:
class VNN(nn.Module):
    '''
    Fully connected network: three sigmoid-activated hidden layers of equal
    width followed by a linear layer with 2 outputs.

    :param input_size: int, number of input features per sample.
    :param n_feature: int, width of each hidden layer.
    '''
    def __init__(self, input_size, n_feature=128):
        super().__init__()

        # input -> hidden -> hidden -> hidden, each followed by a sigmoid
        widths = [input_size, n_feature, n_feature, n_feature]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.Sigmoid())
        # final projection to the two outputs, no activation
        layers.append(nn.Linear(n_feature, 2))

        self.weight = nn.Sequential(*layers)

    def forward(self, x):
        '''Map a (..., input_size) tensor to a (..., 2) tensor.'''
        return self.weight(x)


def get_loss_and_metrics(model, data, target,criterion, device):
    '''
    Run a forward pass on one batch and compute its loss.

    Gradients stored on the model's parameters are cleared first, so the
    returned loss can be back-propagated directly by the caller.

    :param model: torch.nn.Module, network to evaluate.
    :param data: array-like or torch.Tensor, input features of the batch.
    :param target: array-like or torch.Tensor, ground truth of the batch.
    :param criterion: callable, invoked as criterion(pred, target).
    :param device: torch.device, where the batch is moved before the forward pass.
    :return: tuple (pred, target, loss); target is the float32 tensor on `device`.
    '''
    # as_tensor converts arrays and tensors alike without the copy-construct
    # warning that torch.tensor(tensor) raises.
    data = torch.as_tensor(data, dtype=torch.float32).to(device)
    target = torch.as_tensor(target, dtype=torch.float32).to(device)
    # Clear gradients through the model itself instead of a global `optimizer`,
    # so the function depends only on its arguments. Equivalent to
    # optimizer.zero_grad() when the optimizer holds exactly model.parameters().
    model.zero_grad()
    pred = model(data)
    loss = criterion(pred, target)

    return (pred,target,loss)
    
def step(loss, optimizer):
    '''
    Back-propagate `loss` and apply one optimizer update.

    :param loss: scalar torch.Tensor attached to the autograd graph.
    :param optimizer: torch optimizer holding the parameters to update.
    '''
    loss.backward()
    optimizer.step()


In [None]:
# Train one VNN per station on the lagged flows of the station and of its
# nearby stations, and collect the hold-out predictions of all stations in
# y_true_total / y_pred_total.
TEST_HOURS = 24*7    # last week of hourly rows is held out
N_EPOCHS = 501
BATCH_SIZE = 64
EVAL_EVERY = 50      # validate every 50th epoch (from epoch 50 on)

nodes_dist = pd.read_csv('nodes_dist.csv')
# NOTE(review): nx.read_gpickle was removed in NetworkX 3.0; on newer versions
# this needs pickle.load on a trusted file instead - confirm the pinned version.
G = nx.read_gpickle('graph.pickle')
od = pd.read_csv('inoutwide.csv')
# NOTE(review): the lags below use positional shift(), so this assumes the CSV
# rows are already chronological - confirm (the linear baselines sort by
# Date/Hour first).
pbar = tqdm(list(G.nodes))
count = 0

# Prefer Apple's MPS backend as before, but fall back to CUDA or CPU so the
# cell also runs on machines without MPS.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Lags in hours: 1-23 consecutive hours, same hour 2-7 days ago, same hour 2-7 weeks ago.
lags = list(range(1,24))+[day*24 for day in range(2,8)]+[week*24*7 for week in range(2,8)]


def _progress_text(epoch, batch, train_loss, mae, r2, mape):
    # Single place that formats the progress-bar description.
    return ('epoch:' + str(epoch) +
            ' training batch:' + str(batch) +
            ' total train loss:'+ str(round(train_loss,3))+
            ' validation_mae:'+ str(round(mae,3)) +
            ' validation R2: '+ str(round(r2,3)) +
            ' validation MAPE: '+ str(round(mape,3)))


for station in pbar:

#     neighbors = list(nx.ego_graph(G, radius=1, n=station, distance='weight', center=False).nodes)
    # Neighbours: destinations within dist < 1 of this station, excluding itself.
    neighbors = nodes_dist.loc[(nodes_dist['o']==station)&\
                               (nodes_dist['dist']<1)&\
                               (nodes_dist['o']!=nodes_dist['d'])]['d'].values.tolist()
    flows = ['incoming_flow-'+station]+\
            ['incoming_flow-'+i for i in neighbors]+['outgoing_flow-'+station]+\
                      ['outgoing_flow-'+i for i in neighbors]

    fts = od[['Date','Hour']+flows]
    # Build every lagged copy first and concatenate once.
    lagged = []
    for lag in lags:
        temp = fts[flows].shift(lag)
        temp.columns = [i+'-lag-'+str(lag) for i in flows]
        lagged.append(temp)
    fts = pd.concat([fts]+lagged,axis=1)
    fts = fts.dropna()
    fts = fts.drop(columns=['Date','Hour'])

    # The first (len(neighbors)+1)*2 columns are the flows at time t (both
    # directions); skip them so only lagged flows are model inputs.
    n_current = (len(neighbors)+1)*2
    fts_train = fts.iloc[:-TEST_HOURS,n_current:].values
    scaler = StandardScaler()
    scaler.fit(fts_train)
    fts_train = torch.tensor(scaler.transform(fts_train))

    y = fts[['incoming_flow-'+station,'outgoing_flow-'+station]]
    y_train = y.iloc[:-TEST_HOURS,:].values
    y_scaler = StandardScaler()
    y_scaler.fit(y_train)
    y_train = y_scaler.transform(y_train)
    y_train = torch.tensor(y_train)
    train_dataset = torch.utils.data.TensorDataset(fts_train,y_train)

    # Validation inputs are scaled; validation targets stay in original units
    # and predictions are inverse-transformed before scoring.
    fts_val = torch.tensor(scaler.transform(fts.iloc[-TEST_HOURS:,n_current:].values))
    y_val = y.iloc[-TEST_HOURS:,:].values
    y_val = torch.tensor(y_val)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                                   num_workers=0)

    # len(lags) is number of lags, 2 is bidirectional
    model = VNN(input_size=(1+len(neighbors))*len(lags)*2)
    model = model.to(device)
    criterion = nn.SmoothL1Loss()

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # Placeholder values shown in the progress bar until the first validation.
    validation_mae,validation_r2,validation_mape = 999,999,999
    for i in range(N_EPOCHS):

        total_train_loss = 0.0
        model.train()
        training_batch_count = 1
        for batch in train_dataloader:
            # Only the loss is needed here; do not overwrite y_train.
            _, _, loss = get_loss_and_metrics(model,batch[0],batch[1], criterion, device)

            total_train_loss += loss.item()
            mean_train_loss = total_train_loss / training_batch_count
            pbar.set_description(_progress_text(i, training_batch_count, mean_train_loss,
                                                validation_mae, validation_r2, validation_mape))
            training_batch_count += 1

            step(loss,optimizer)

        # Also validate on the final epoch so y_pred_val / y_true_val always
        # exist for this station, whatever N_EPOCHS is.
        if (i%EVAL_EVERY == 0 and i >= EVAL_EVERY) or i == N_EPOCHS-1:
            model.eval()
            with torch.no_grad():

                val_pred,val_true,_ = get_loss_and_metrics(model, fts_val, y_val, criterion, device)
                y_pred_val,y_true_val = val_pred.cpu().numpy(),val_true.cpu().numpy()
                y_pred_val = y_scaler.inverse_transform(y_pred_val)

                validation_mae = mean_absolute_error(y_true_val,y_pred_val)
                validation_r2 = r2_score(y_true_val,y_pred_val)
                validation_mape = masked_MAPE(y_true_val,y_pred_val)
                pbar.set_description(_progress_text(i, training_batch_count, mean_train_loss,
                                                    validation_mae, validation_r2, validation_mape))

    # Append this station's last validation result; kept incremental so the
    # totals stay usable if the loop is interrupted.
    if count == 0:
        y_true_total = y_true_val
        y_pred_total = y_pred_val
    else:
        y_true_total = np.concatenate([y_true_total,y_true_val])
        y_pred_total = np.concatenate([y_pred_total,y_pred_val])
    count += 1


In [29]:
# Summary metrics of the neural model over the hold-out week of all stations.
# RMSE is the mean of the per-output sqrt(MSE): the `squared=` keyword of
# mean_squared_error is deprecated since scikit-learn 1.4, so it is avoided here.
rmse = np.mean(np.sqrt(mean_squared_error(y_true_total,y_pred_total,multioutput='raw_values')))

print(r2_score(y_true_total,y_pred_total))
print(mean_absolute_error(y_true_total,y_pred_total))
print(rmse)
# RMSE normalised by the standard deviation of the ground truth
print(rmse/np.std(y_true_total))

print(masked_MAPE(y_true_total,y_pred_total))

0.9732133272481034
17.420902
36.256836
0.16361974
0.4888177925810495
