In [34]:
import os
import sys
import math
import torch

PROJECT_DIR = os.path.abspath(os.path.abspath('') + "/..")
sys.path.append(PROJECT_DIR)

import numpy as np
from tqdm import tqdm
from basicts.data import TimeSeriesForecastingDataset


dataset_name = "METR-LA"
batch_size = 8
metric = "cosine" # metric used to calculate the similarity.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

history_seq_len = 12 # the historical sequence length used in computing the spatial indistinguishability ratio, which should be larger than or equal to the patch_size.
future_seq_len = 12 # the future sequence length used in computing the spatial indistinguishability ratio.


## utilities

In [35]:
# similarity computation
def cosine_similarity(x, y):
    # denominator
    l2_x = torch.norm(x, dim=2, p=2) + 1e-7
    l2_y = torch.norm(y, dim=2, p=2) + 1e-7
    l2_n = torch.matmul(l2_x.unsqueeze(dim=2), l2_y.unsqueeze(dim=2).transpose(1, 2))
    # numerator
    l2_d = torch.matmul(x, y.transpose(1, 2))
    return l2_d / l2_n

def get_similarity_matrix(data, metric):
    if metric == "cosine":
        sim = cosine_similarity(data, data)
    elif metric == "mse":
        sim = torch.cdist(data, data, p=2)
    elif metric == "mae":
        sim = torch.cdist(data, data, p=1)
    else:
        raise NotImplementedError
    return sim

In [36]:
# dataloader
def load_data(dataset_name, history_seq_len, future_seq_len, mode, batch_size):
    """Get a dataloader.

    Args:
        name (str): dataset name.
        tsformer_in_seq_len (int): historical sequence length.
        tsformer_out_seq_len (int): tsformer_out_seq_len does not matter, it is only used to load datasets.
        mode (str): ["train", "val", "test"].
        batch_size (int): batch size.
    """
    assert mode in ["train", "valid", "test"]
    data_path = PROJECT_DIR + "/datasets/{0}/data_in{1}_out{2}.pkl".format(dataset_name, history_seq_len, future_seq_len)
    index_path = PROJECT_DIR + "/datasets/{0}/index_in{1}_out{2}.pkl".format(dataset_name, history_seq_len, future_seq_len)
    dataset = TimeSeriesForecastingDataset(data_path, index_path, mode=mode)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=3)
    return dataloader


## Generate Similarity Matrix

In [37]:
# get similarity matrices

# inference pipeline for a given dataloader
history_adjs_all = []
future_adjs_all = []
def inference(dataloader):
    for batch in tqdm(dataloader):
        future_data, history_data = batch
        history_data = history_data[..., 0].transpose(1, 2) # batch_size, num_nodes, history_seq_len
        future_data = future_data[..., 0].transpose(1, 2) # batch_size, num_nodes, future_seq_len
        history_adjs = get_similarity_matrix(history_data, metric) # batch_size, num_nodes, num_nodes
        future_adjs = get_similarity_matrix(future_data, metric) # batch_size, num_nodes, num_nodes
        history_adjs_all.append(history_adjs)
        future_adjs_all.append(future_adjs)
# get similarity matrices
# for mode in ["valid"]:
for mode in ["train"]:
    dataloader = load_data(dataset_name, history_seq_len, future_seq_len, mode, batch_size)
    inference(dataloader)


100%|██████████| 2997/2997 [00:11<00:00, 250.75it/s]


In [28]:
# get spatial indistinguishability ratio
history_similarity = torch.cat(history_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes
future_similarity = torch.cat(future_adjs_all, dim=0).detach().cpu() # num_samples, num_modes, num_nodes


In [29]:
# save results
save_path = PROJECT_DIR + "/experiments/history_{0}_{1}.npy".format(dataset_name, metric)
with open(save_path, "wb") as f:
    np.save(f, history_similarity)

save_path = PROJECT_DIR + "/experiments/future_{0}_{1}.npy".format(dataset_name, metric)
with open(save_path, "wb") as f:
    np.save(f, future_similarity)


In [30]:
L, N, N = future_similarity.shape
print(future_similarity.shape)



torch.Size([36586, 21, 21])


## Get Spatial Indistinguishability Ratio

In [31]:
e_u = 0.9
e_l = 0.4

history_similarity_filtered = torch.where(history_similarity > e_u, torch.ones_like(history_similarity), torch.zeros_like(history_similarity))
future_similarity_filtered = torch.where(future_similarity < e_l, torch.ones_like(future_similarity), torch.zeros_like(future_similarity))
overlap = history_similarity_filtered * future_similarity_filtered


In [32]:
# overlap ratio
overlap_ratio = overlap.sum() / (L * N * N)
print(overlap_ratio * 100)

tensor(2.5232)


In [33]:
# indistinguishability ratio
indistinguishability_ratio = overlap.sum() / history_similarity_filtered.sum()
print(indistinguishability_ratio * 100)


tensor(11.0950)


## Test

In [None]:
import numpy as np
save_path = PROJECT_DIR + "/experiments/history_{0}_{1}.npy".format(dataset_name, metric)
with open(save_path, "rb") as f:
    history_similarity = np.load(f)

save_path = PROJECT_DIR + "/experiments/future_{0}_{1}.npy".format(dataset_name, metric)
with open(save_path, "rb") as f:
    future_similarity = np.load(f)

history_similarity = torch.Tensor(history_similarity)
future_similarity = torch.Tensor(future_similarity)


In [None]:
t_future_similarity = future_similarity[:10, :, :]
t_history_similarity = history_similarity[:10, :, :]


In [None]:
t_history_similarity_sorted = t_history_similarity.view(-1).sort()[0].numpy()[700000:]
t_future_similarity_sorted = t_future_similarity.view(-1).sort()[0].numpy()[700000:]

In [None]:
# plot
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 5]
plt.plot(range(t_history_similarity_sorted.size), t_history_similarity_sorted, linewidth=2)
plt.plot(range(t_future_similarity_sorted.size), t_future_similarity_sorted, linewidth=2)
plt.grid()
plt.show()

In [None]:
t_history_similarity_sorted.shape

In [None]:
t_history_similarity_sorted[600000:].shape

In [None]:
import torch

In [13]:
X = torch.randn(32, 207, 12) # batch_size, num_nodes, history_seq_len
Y = torch.randn(32, 207, 12) # batch_size, num_nodes, future_seq_len
# calculate the euclidean distance between each row of X and each row of Y
dist = torch.cdist(X, Y, p=2)
# calculate the batch cosine similarity pairwisely between each row of X and each row of Y




In [15]:
def batch_cosine_sim(data):
    # data: batch_size, num_nodes, seq_len
    # calculate the cosine similarity among nodes pairwisely in each sample
    data = data.transpose(1, 2) # batch_size, seq_len, num_nodes
    data = data.unsqueeze(1) # batch_size, 1, seq_len, num_nodes
    data = data.repeat(1, data.size(2), 1, 1) # batch_size, seq_len, seq_len, num_nodes
    data = data.transpose(2, 3) # batch_size, seq_len, num_nodes, seq_len
    data = data.reshape(-1, data.size(1), data.size(3)) # batch_size * num_nodes, seq_len, seq_len
    sim = torch.cosine_similarity(data, data, dim=2) # batch_size * num_nodes, seq_len
    sim = sim.reshape(-1, data.size(1), data.size(1)) # batch_size * num_nodes, seq_len, seq_len
    return sim


In [17]:
batch_cosine_sim(X).shape

torch.Size([552, 12, 12])