# Multi-Workers

In [21]:
from torch_geometric.datasets import Flickr
from torch.utils.data import DataLoader
import torch_geometric.transforms as T
import torch

# Load Flickr with an empty transform pipeline and take its first graph.
transform = T.Compose([])
dataset = Flickr(
    "/mnt/nfs-ssd/raw-datasets/pyg-format/Flickr",
    transform=transform,
)
data = dataset[0]

# Loader settings for the multi-worker experiment.
kwargs = dict(batch_size=64, num_workers=6, persistent_workers=True)

# Flat 1-D tensor with the positions at which train_mask is set.
node_index = torch.nonzero(data.train_mask, as_tuple=False).view(-1)

# Batch the training node ids (as plain Python ints) in shuffled order.
loader = DataLoader(node_index.tolist(), **kwargs, shuffle=True, pin_memory=True)
iter_loader = iter(loader)

In [None]:
# Query DataLoader worker metadata from the current process.
# NOTE(review): per the PyTorch docs this is expected to return None when it is
# called outside a DataLoader worker process — confirm.
torch.utils.data.get_worker_info()

# SubGraph

In [24]:
# Restrict the graph to the nodes selected by train_mask and show how many nodes remain.
train_data = data.subgraph(data.train_mask)
train_data.num_nodes

44625

In [None]:
from torch_geometric.utils import subgraph

# Functional variant: pass train_mask and the edge index to `subgraph`.
# NOTE(review): presumably this returns the filtered edge index plus the edge
# attributes (none are passed here) — confirm against the torch_geometric docs.
subg, attr = subgraph(data.train_mask, data.edge_index)
subg.size()

# EgoSampler
- 设置最大hop数目，可以限制时间开销随节点budget线性增长
- 时间开销随batch_size线性增长

每个batch的耗时（秒），格式为 `(batch_size) budget: 耗时`：

- (64) 100: 1.66; 200: 3.0; 400: 4.0
- (32) 100: 0.7
- (1) 100: 0.02

In [14]:
import sys
import os
# Make the project root importable so that the `src.*` imports in later cells resolve.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
curPath = os.path.abspath(os.path.dirname('/home/xhh/notebooks/GNN/pytorch-template/notebooks/'))
# The project root is the parent of the notebooks directory.
rootPath = os.path.split(curPath)[0]
# Guard the append so that re-running this cell does not pile up duplicate entries.
if rootPath not in sys.path:
    sys.path.append(rootPath)

In [16]:
from src.datamodules.datasets.loader import EgoGraphLoader
from src.models.components.assort_sampler import AdaptiveSampler

# Single-process loader configuration for the ego-graph sampler.
kwargs = dict(
    batch_size=32,
    num_workers=0,
    persistent_workers=False,
    pin_memory=False,
    shuffle=True,
)

# Sampler built with the positional value 50 and a hop limit of 10.
sampler = AdaptiveSampler(data, 50, max_hop=10)
ego_loader = EgoGraphLoader(data.train_mask, sampler, **kwargs)

# Keep the iterator around (the timing cell draws from it) and display one batch.
iter_graphs = iter(ego_loader)
next(iter_graphs)

EgoDataBatch(x=[1538, 500], y=[32], p=[1538], hop=[32], ego_ptr=[32], batch=[1538], ptr=[33], batch_size=32, adj_t=[1538, 1538, nnz=3304])

## Time Analysis

In [75]:
from time import perf_counter, time

# Average the latency of drawing `runs` batches from the existing `iter_graphs`
# iterator. perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring short durations; time() follows the
# system wall clock and can jump if that clock is adjusted.
runs = 5

# Start the timer only after the setup above so that just the draws are measured.
t = perf_counter()
for i in range(runs):
    batch = next(iter_graphs)

print(f'{(perf_counter() - t) / runs: .2f}s')

 0.72s


## Tensor Batch

In [64]:
from torch_sparse import SparseTensor
import copy
from src.datamodules.datasets.loader import to_sparse

# Shallow-copy the graph object and pull the two edge-index rows onto the CPU.
nd = copy.copy(data)
row, col = nd.edge_index.cpu()

# Store each edge's position as its value, then transpose the sparse matrix.
edge_ids = torch.arange(col.size(0))
adj_shape = (data.num_nodes, data.num_nodes)
self_adj_t = SparseTensor(row=row, col=col, value=edge_ids, sparse_sizes=adj_shape).t()

# Draw one shuffled mini-batch of 10 training node ids.
loader = DataLoader(node_index.tolist(), batch_size=10, shuffle=True)
iter_loader = iter(loader)
batch_nodes = next(iter_loader)
batch_size = batch_nodes.shape[0]

In [190]:
# One-hop expansion of the batch nodes: sample_adj is called with -1 as the
# neighbour count and replace=False. `v` is indexed further down by the column
# ids of the returned adjacency.
adj_t_1, v = self_adj_t.sample_adj(batch_nodes, -1, replace=False)
print(adj_t_1)

row, col, layer_e = adj_t_1.coo()

# For every batch position collect: the mask of its edges, its distinct
# neighbour columns, and a tensor repeating the batch position once per neighbour.
e_mask, v_idx, ptr = [], [], []
for bn in range(batch_size):
    m = row.eq(bn)
    idx = col[m].unique()
    e_mask += [m]
    v_idx += [idx]
    ptr += [torch.full((idx.size(0),), bn)]

true_v = v[torch.cat(v_idx)]
batch_idx = torch.cat(ptr)
num_candidates = true_v.size(0)

# Random placeholder score for every candidate.
p_v = torch.rand(num_candidates)

# Keep a candidate whenever a second, independent uniform draw exceeds 0.5.
mask = torch.rand(num_candidates) > 0.5

saved_p, next_v, next_idx = p_v[mask], true_v[mask], batch_idx[mask]

SparseTensor(row=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,
                           3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
                           5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9,
                           9, 9, 9]),
             col=tensor([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                           28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
                           46, 47, 48, 49, 50, 51, 52, 22, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                           63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
                           81, 82, 83]),
             val=tensor([289655, 378752, 389616, 395895, 413177, 423719, 475812, 573193, 627007,
                           790157, 827732, 860831,   8546,  77655,  95388, 510110, 796627,  86920,
                           211813, 2

In [162]:
# Pointers for each batch node: the distinct neighbour column ids gathered
# across all batch nodes.
# torch.cat needs a sequence of tensors; true_v.size(0) is a plain int and
# raises a TypeError, so concatenate the per-node index tensors in v_idx instead.
torch.cat(v_idx).unique()

tensor([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
        82, 83])

In [195]:
# Concatenate into the new u
# layer score: computed on the new graph (which contains redundant edges)
# ego score: independent
next_idx

tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6,
        6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9])

In [None]:
# Second expansion: the seeds are the entries of `v` selected by every batch
# node's neighbour indices (one entry per batch-node/neighbour pair).
neighbour_idx = torch.cat(v_idx)
u = v[neighbour_idx]
adj_t_2, u2 = self_adj_t.sample_adj(u, -1, replace=False)
row, col, layer_e = adj_t_2.coo()

# Print the adjacency followed by total and distinct counts for u and u2.
sizes = (len(u), len(u2), len(u.unique()), len(u2.unique()))
print(adj_t_2, *sizes)

In [157]:
# Selected range: print the first position in `row` matching ptr[node_i] and
# the first position matching ptr[node_i+1].
# NOTE(review): this compares `row` against ptr[node_i]; it presumably expects
# `ptr` to hold one row offset per batch node — confirm which definition of
# `ptr` this cell was run against.
node_i = 0
print((row == ptr[node_i]).nonzero()[0], (row==ptr[node_i+1]).nonzero()[0])

tensor([0]) tensor([140])


In [143]:
# Distinct entries of u2 together with the inverse indices, i.e. for each
# original entry its position in the distinct list.
torch.unique(u2, return_inverse=True)

(tensor([   11,    14,    25,  ..., 89197, 89234, 89240]),
 tensor([1349, 2126, 2224,  ..., 6447, 7765, 7908]))

In [34]:
# ego score
# compute the batch_node score
# compute the score of u at every layer
# compute the cosine according to batch_idx

# Toy check: broadcast a 3x1 column vector across 4 columns.
x = torch.Tensor([1, 2, 3]).unsqueeze(-1)
x.expand(-1, 4)

tensor([[1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.]])

In [45]:
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence

# Three float sequences of lengths 3, 2 and 1, listed longest first.
x1, x2, x3 = (torch.Tensor(values) for values in ([1, 2, 3], [1, 2], [1]))

# Pack them into a single PackedSequence and display it.
ps = pack_sequence([x1, x2, x3])
ps

PackedSequence(data=tensor([1., 1., 1., 2., 2., 3.]), batch_sizes=tensor([3, 2, 1]), sorted_indices=None, unsorted_indices=None)

# Graph Loader

In [8]:
from src.datamodules.datasets.data import get_data

# Load 'cora' through the project helper; only the first of the four returned values is kept.
# Note: this rebinds `nd`, which an earlier cell set to a copy of the Flickr graph.
nd, _, _, _ = get_data('cora')
nd

Data(x=[2708, 1433], edge_index=[2, 13264], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [9]:
from torch_geometric.utils import k_hop_subgraph

# 2-hop neighbourhood around the validation nodes.
# k_hop_subgraph documents its first argument as node indices, so convert the
# boolean val_mask into explicit indices rather than relying on how a boolean
# tensor happens to be interpreted.
val_nodes = nd.val_mask.nonzero(as_tuple=False).view(-1)
sub = k_hop_subgraph(val_nodes, 2, nd.edge_index)

# NOTE(review): per the torch_geometric docs the 4th returned item is the edge
# mask, so this sum counts the edges kept in the subgraph — confirm.
sub[3].sum()

tensor(11857)

In [11]:
from torch_geometric.loader import ClusterData
from src.datamodules.datasets.loader import NeighborLoader, ClusterLoader, SaintRwLoader, ShadowLoader

# Shared loader options for the graph-loader comparison.
kwargs = dict(batch_size=512, shuffle=True)

# Neighbour sampling seeded from the training nodes, with fan-outs 25 and 10.
train_loader = NeighborLoader(
    nd,
    input_nodes=nd.train_mask,
    num_neighbors=[25, 10],
    **kwargs,
)
# Alternative loaders, kept for reference:
# train_loader = ClusterLoader(ClusterData(nd, num_parts=1500, recursive=False, save_dir=dataset.processed_dir,), **kwargs)
# train_loader = SaintRwLoader(nd, batch_size=6000, walk_length=2, num_steps=5, sample_coverage=100, save_dir=dataset.processed_dir)
# train_loader = ShadowLoader(nd, depth=2, num_neighbors=10, node_idx=data.train_mask, **kwargs)

# Fetch the first batch from a fresh iterator and display it.
batch = next(iter(train_loader))
batch

Data(x=[1355, 1433], y=[140], train_mask=[1355], val_mask=[1355], test_mask=[1355], batch_size=140, adj_t=[1355, 1355, nnz=3556], ego_ptr=[140])