In [1]:
import argparse
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import datetime
import time
import matplotlib.pyplot as plt
from torchinfo import summary
import yaml
import json
import sys
import glob
import copy
import random
from tqdm import tqdm, trange
# from lib.utils import print_log, StandardScaler, vrange
import torch.utils.data as utils

In [2]:
sys.path.append("..")
from lib.utils import (
    MaskedMAELoss,
    MaskedHuberLoss,
    print_log,
    seed_everything,
    set_cpu_num,
    masked_mae_loss,
    CustomJSONEncoder,
)
from lib.metrics import RMSE_MAE_MAPE
from lib.data_prepare import get_dataloaders_from_index_data
from model.STGformer import STGformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from lib.utils1 import adj_reader, shp_and_tmc_reader, feature_reader

In [8]:
# Draw a fresh seed for this run and echo it, so the exact value is kept in
# the cell output and can be hard-coded here to reproduce the run later.
seed = random.randint(0,1000)  # set random seed here
print(f"seed = {seed}")
# seed_everything / set_cpu_num come from lib.utils; presumably they seed the
# RNG libraries and cap CPU threads respectively — TODO confirm.
seed_everything(seed)
set_cpu_num(1)

In [9]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

In [6]:
# Dataset identifier, normalised to upper case.
dataset_name = 'NYC_INRNIX'.upper()

In [7]:
# Take the label from the model class itself so it cannot drift from the
# imported class. Not used in the visible cells; presumably it names logs or
# checkpoints later in the notebook — TODO confirm.
model_name = STGformer.__name__

In [10]:
# Root folder holding the Manhattan inputs. It defaults to the original
# absolute location, but can be redirected on another machine by setting the
# SPGAT_DATA_ROOT environment variable before starting the kernel.
DATA_ROOT = os.environ.get("SPGAT_DATA_ROOT", "/home/dachuan/Productivities/Spectral GAT")

# Adjacency matrix (.npy), pickled speed table, road-segment shapefile and
# TMC listing, all resolved relative to DATA_ROOT.
adj_path = os.path.join(DATA_ROOT, "NY", "adj_manhattan.npy")
feature_path = os.path.join(DATA_ROOT, "SPGAT", "Data", "speed_19_Manhattan_5min_py36")
shp_path = os.path.join(DATA_ROOT, "NY", "Manhattan_FinalVersion.shp")
tmc_path = os.path.join(DATA_ROOT, "NY", "TMC_FinalVersion.csv")

In [11]:
# Load the pickled feature table on its own so it can be inspected directly.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load files from a trusted source.
df = pd.read_pickle(feature_path )

In [16]:
# Mean over every cell of the table (all rows and columns together), as a
# quick check of the overall value scale.
df.values.mean()

17.783092395153858

In [32]:
class MinMaxScaler:
    """
    Min-Max scaler to scale input to [0, 1] (or custom range).
    Based on structure from Graph-WaveNet's StandardScaler.

    The mapping is linear: ``min_val`` lands on ``min_feature`` and
    ``max_val`` lands on ``max_feature``. Inputs outside
    ``[min_val, max_val]`` are not clipped.

    Args:
        min_val: Lower bound of the raw data (scalar or array-like that
            broadcasts against the data).
        max_val: Upper bound of the raw data, same kind as ``min_val``.
        min_feature: Lower bound of the scaled range (default 0).
        max_feature: Upper bound of the scaled range (default 1).
    """

    def __init__(self, min_val, max_val, min_feature=0, max_feature=1):
        self.min_val = min_val
        self.max_val = max_val
        self.min_feature = min_feature
        self.max_feature = max_feature

        val_range = max_val - min_val
        # A zero range (max_val == min_val) would make transform() divide by
        # zero. Replace zeros by 1 so constant inputs map to min_feature.
        # Adding the boolean mask works for plain numbers and, element-wise,
        # for arrays, without changing non-zero entries.
        self.val_range = val_range + (val_range == 0)
        self.feature_range = max_feature - min_feature

    def transform(self, data):
        """Scale raw ``data`` into the configured feature range."""
        return (data - self.min_val) / self.val_range * self.feature_range + self.min_feature

    def inverse_transform(self, data):
        """Map scaled ``data`` back to the original value range."""
        return (data - self.min_feature) / self.feature_range * self.val_range + self.min_val


In [24]:
# Scratch check: standard deviation of a large float32 array of uniform
# values in [0, 100), to see whether the reduction warns or loses precision.
# NOTE(review): the two calls below are identical, so they cannot show a
# difference; one was presumably meant to use another accumulator dtype
# (e.g. x.std(dtype=np.float64)) — confirm intent.
# The array has about 1.5e9 elements, so this cell needs several GB of RAM.
x = np.random.uniform(0, 100, size=(105095, 12, 1212)).astype(np.float32)
print(x.std())   # might give warning
print(x.std())   # fine

28.867449
28.867449


In [33]:
# Scaler that maps the fixed raw range [0, 99] linearly onto [0, 1].
# NOTE(review): 99 is presumably the observed maximum of the data — confirm
# before reusing this on another dataset.
scaler = MinMaxScaler(min_val=0, max_val=99, min_feature=0, max_feature=1)

In [26]:
# Scratch check: mean of a large float64 array of uniform values in [0, 100).
# NOTE(review): the two calls below are identical, so they cannot show a
# difference; presumably one was meant to be compared against a float32
# variant — confirm intent.
# The array has about 1.5e9 float64 elements, so this cell needs roughly
# 12 GB of RAM.
x = np.random.uniform(0, 100, size=(105095, 12, 1212)).astype(np.float64)
print(x.mean())   # might give warning
print(x.mean()) 

50.00041203897676
50.00041203897676


In [18]:
# Read the adjacency matrix and the TMC list, then build the three data
# loaders with the feature_reader imported from lib.utils1.
# Positional arguments, assuming the imported function has the same signature
# as the feature_reader defined later in this notebook — TODO confirm:
#   path, batch_size=32, seq_len=12, pred_len=1, train_proportion=0.7,
#   valid_proportion=0.2, tmc, split_indices_path=None, save_split=False.
# NOTE(review): the fourth return value is bound to `max_feature` here, while
# the feature_reader defined later in this notebook returns a scaler in that
# position — check which one the imported version returns.
adj = adj_reader(adj_path)
_, tmc = shp_and_tmc_reader(shp_path, tmc_path)
train_loader, valid_loader, test_loader, max_feature = feature_reader(  feature_path,
                                                                        32, 
                                                                        12, 
                                                                        1, 
                                                                        0.7, 
                                                                        0.2,
                                                                        tmc,
                                                                        None,
                                                                        False,
                                                                    )

--- Building Sequences ---


100%|██████████| 105095/105095 [00:06<00:00, 15913.29it/s]


--- Scaling Sequences ---


  x = um.multiply(x, x, out=x)


Trainset:	x-(73566, 12, 1212, 1)	y-(73566, 1, 1212, 1)
Valset:  	x-(21019, 12, 1212, 1)  	y-(21019, 1, 1212, 1)
Testset:	x-(10510, 12, 1212, 1)	y-(10510, 1, 1212, 1)


In [None]:
def feature_reader(path, batch_size, seq_len, pred_len,
                   train_proportion, valid_proportion, tmc,
                   split_indices_path=None, save_split=False, log=None):
    """Build train/valid/test DataLoaders of sliding windows from a pickled table.

    The table at ``path`` is reindexed to the columns in ``tmc`` (missing
    columns are filled with 0). It is then cut into overlapping windows of
    ``seq_len`` input rows followed by ``pred_len`` target rows. The windows
    are shuffled, split by the given proportions, and the inputs are
    standardised with the mean/std of the training inputs. Targets are left
    unscaled.

    Note: defining this function in the notebook shadows the ``feature_reader``
    imported from ``lib.utils1``.

    Args:
        path: Path of the pickled DataFrame (rows are time steps).
        batch_size: Batch size used by all three loaders.
        seq_len: Number of input time steps per sample.
        pred_len: Number of target time steps per sample.
        train_proportion: Fraction of samples used for training.
        valid_proportion: Fraction of samples used for validation; the
            remainder is the test set.
        tmc: Column labels to select, in the desired order.
        split_indices_path: Optional ``.npz`` file holding a saved shuffle
            order under the key ``'index'``. It is loaded when it exists.
        save_split: When True and ``split_indices_path`` is given but does not
            exist yet, the newly drawn order is saved there.
        log: Passed through to ``print_log`` for the shape summary.

    Returns:
        ``(train_dataloader, valid_dataloader, test_dataloader, scaler)``,
        where ``scaler`` is the ``StandardScaler`` fitted on the training
        inputs.

    Raises:
        ValueError: If the table is too short to yield a single window, or if
            a loaded split does not match the number of windows.
    """
    # StandardScaler is not imported at notebook level (that import line is
    # commented out), so bring it into scope here. Assumes lib.utils still
    # exports it — TODO confirm.
    from lib.utils import StandardScaler

    # Fixed seeds keep the shuffle order below identical between runs. This
    # resets the global NumPy and torch RNG state.
    np.random.seed(99)
    torch.manual_seed(99)

    df = pd.read_pickle(path)
    feature_matrix = df.reindex(columns=tmc, fill_value=0)
    time_len = feature_matrix.shape[0]

    n_windows = time_len - seq_len - pred_len
    if n_windows <= 0:
        raise ValueError(
            f"Not enough time steps ({time_len}) for seq_len={seq_len} and pred_len={pred_len}"
        )

    # Build sequences
    print ("--- Building Sequences ---")
    # Materialise the array once; slicing it per window gives the same rows
    # as slicing the DataFrame, without the per-iteration pandas overhead.
    values = feature_matrix.values
    feature_seq, feature_label = [], []
    for i in trange(n_windows):
        feature_seq.append(values[i:i+seq_len])
        feature_label.append(values[i+seq_len:i+seq_len+pred_len])
    # Append a trailing feature axis of size 1 to both stacked arrays.
    feature_seq = np.expand_dims(np.asarray(feature_seq), axis=-1)
    feature_label = np.expand_dims(np.asarray(feature_label), axis=-1)

    sample_size = feature_seq.shape[0]

    # --- Load or create consistent split indices ---
    if split_indices_path and os.path.exists(split_indices_path):
        # Close the archive as soon as the index array has been read.
        with np.load(split_indices_path) as split_data:
            index = split_data['index']
        print (f"Load split from {split_indices_path}")
        # A split saved for a different number of windows would silently
        # produce wrong subset sizes (or index out of range); fail loudly.
        if len(index) != sample_size:
            raise ValueError(
                f"Saved split has {len(index)} indices but {sample_size} samples were built"
            )
    else:
        index = np.arange(sample_size, dtype=int)
        np.random.shuffle(index)
        # This branch is only reached when the path is unset or missing, so
        # no extra existence check is needed before saving.
        if split_indices_path and save_split:
            print (f"New split saved to {split_indices_path}")
            np.savez(split_indices_path, index=index)

    # Split
    train_index = int(np.floor(sample_size * train_proportion))
    valid_index = int(np.floor(sample_size * (train_proportion + valid_proportion)))

    train_sel = index[:train_index]
    valid_sel = index[train_index:valid_index]
    test_sel = index[valid_index:]

    train_data, train_label = feature_seq[train_sel], feature_label[train_sel]
    valid_data, valid_label = feature_seq[valid_sel], feature_label[valid_sel]
    test_data, test_label = feature_seq[test_sel], feature_label[test_sel]

    # Compute the training statistics once and reuse them for both the
    # report and the scaler.
    train_mean, train_std = train_data.mean(), train_data.std()
    print (f"-- mean:{train_mean}, std:{train_std}, max:{train_data.max()}, min:{train_data.min()} --")

    print ("--- Scaling Sequences ---")
    scaler = StandardScaler(mean=train_mean, std=train_std)

    # Only the inputs are standardised; the labels keep their original scale.
    train_data = scaler.transform(train_data)
    valid_data = scaler.transform(valid_data)
    test_data = scaler.transform(test_data)

    print_log(f"Trainset:\tx-{train_data.shape}\ty-{train_label.shape}", log=log)
    print_log(f"Valset:  \tx-{valid_data.shape}  \ty-{valid_label.shape}", log=log)
    print_log(f"Testset:\tx-{test_data.shape}\ty-{test_label.shape}", log=log)

    # Convert to tensors
    train_dataset = utils.TensorDataset(torch.Tensor(train_data), torch.Tensor(train_label))
    valid_dataset = utils.TensorDataset(torch.Tensor(valid_data), torch.Tensor(valid_label))
    test_dataset = utils.TensorDataset(torch.Tensor(test_data), torch.Tensor(test_label))

    # Create dataloaders
    # NOTE(review): the validation and test loaders also shuffle and drop the
    # last incomplete batch, so up to batch_size - 1 samples are left out of
    # each evaluation pass. Kept as is — confirm this is intended.
    train_dataloader = utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    valid_dataloader = utils.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    test_dataloader = utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    return train_dataloader, valid_dataloader, test_dataloader, scaler

In [44]:
# Rebuild the loaders with a 60/20/20 split and keep the fitted scaler.
# Positional arguments, following the notebook's feature_reader signature:
#   path, batch_size=32, seq_len=12, pred_len=1, train_proportion=0.6,
#   valid_proportion=0.2, tmc, split_indices_path=None, save_split=False.
# NOTE(review): this rebinds `scaler`, replacing the MinMaxScaler instance
# created earlier in the notebook.
train_loader, valid_loader, test_loader, scaler = feature_reader(  feature_path,
                                                                        32, 
                                                                        12, 
                                                                        1, 
                                                                        0.6, 
                                                                        0.2,
                                                                        tmc,
                                                                        None,
                                                                        False,
                                                                    )

--- Building Sequences ---


100%|██████████| 105095/105095 [00:05<00:00, 18519.47it/s]


(63057, 12, 1212, 1) 14.71416593950576 99.0 0.0
--- Scaling Sequences ---
Trainset:	x-(63057, 12, 1212, 1)	y-(63057, 1, 1212, 1)
Valset:  	x-(21019, 12, 1212, 1)  	y-(21019, 1, 1212, 1)
Testset:	x-(21019, 12, 1212, 1)	y-(21019, 1, 1212, 1)
