## group_hetero_graph 操作

利用 PyG 的 `group_hetero_graph` 方法, 将异构图转换为节点 index 统一编号的形式.

注意到, 该函数的输入仅为基本的 torch 数据 (`edge_index_dict` 和 `num_nodes_dict`), 与 PyG 的图结构无关. 因此在 DGL 中只要构造出对应类型的输入, 即可直接使用!


In [1]:
from torch_geometric.utils.hetero import group_hetero_graph

# load sample torch_geometric data
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader
from torch_geometric.utils import degree

# dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
# data = dataset[0]
# print(data)

# heterograph OGB_MAG
from torch_geometric.datasets import OGB_MAG
data = OGB_MAG(root='/tmp/OGB_MAG', transform=None)[0]



  from .autonotebook import tqdm as notebook_tqdm
Downloading http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip
Extracting /tmp/OGB_MAG/mag/raw/mag.zip
Processing...
Done!


In [2]:
data

HeteroData(
  paper={
    x=[736389, 128],
    year=[736389],
    y=[736389],
    train_mask=[736389],
    val_mask=[736389],
    test_mask=[736389]
  },
  author={ num_nodes=1134649 },
  institution={ num_nodes=8740 },
  field_of_study={ num_nodes=59965 },
  (author, affiliated_with, institution)={ edge_index=[2, 1043998] },
  (author, writes, paper)={ edge_index=[2, 7145660] },
  (paper, cites, paper)={ edge_index=[2, 5416271] },
  (paper, has_topic, field_of_study)={ edge_index=[2, 7505078] }
)

In [3]:
type(data)

torch_geometric.data.hetero_data.HeteroData

In [4]:
# data.node_year_dict = None
edge_index_dict = data.edge_index_dict

In [5]:
out = group_hetero_graph(data.edge_index_dict, data.num_nodes_dict)

In [15]:
# {('author','affiliated_with','institution'): Tensor(2,M)}
data.edge_index_dict
# {'paper': 736389}
data.num_nodes_dict

{'paper': 736389,
 'author': 1134649,
 'institution': 8740,
 'field_of_study': 59965}

In [6]:
# pyg/utils/hetero.py
import torch

""" 实际调用的转换函数, 可以看到, 和PYG的图结构无关! 因此DGL也可以使用 """
def _maybe_num_nodes_dict(edge_index_dict, num_nodes_dict=None):
    """Return per-type node counts, inferring every type missing from *num_nodes_dict*.

    Counts that the caller supplied are kept untouched. For any other node
    type the count is the largest node index seen in the edges plus one
    (0 when the type only appears in empty edge tensors).
    """
    counts = {} if num_nodes_dict is None else dict(num_nodes_dict)
    given = set(counts)
    for keys, edge_index in edge_index_dict.items():
        # Row 0 indexes nodes of type keys[0], row 1 nodes of type keys[-1].
        for node_key, row in ((keys[0], edge_index[0]),
                              (keys[-1], edge_index[1])):
            if node_key in given:
                continue
            inferred = int(row.max()) + 1 if row.numel() > 0 else 0
            counts[node_key] = max(inferred, counts.get(node_key, 0))
    return counts


def pyg_group_hetero_graph(edge_index_dict, num_nodes_dict=None):
    """Merge a heterogeneous graph into a single graph with global node ids.

    Only plain tensors and dicts are used, so the function does not depend
    on any PyG graph container.

    Args:
        edge_index_dict: Maps an edge-type key (a sequence whose first item is
            the source node type and whose last item is the destination node
            type) to an edge index tensor with two rows (source, destination).
        num_nodes_dict: Maps node type to its number of nodes. Node types that
            are missing (or all of them when ``None``) are inferred from the
            largest index used in ``edge_index_dict``.

    Returns:
        A tuple ``(edge_index, edge_type, node_type, local_node_idx,
        local2global, key2int)``:

        * ``edge_index``: all edges concatenated, expressed in global ids.
        * ``edge_type``: integer edge-type id of every edge.
        * ``node_type``: integer node-type id of every global node.
        * ``local_node_idx``: per-type local index of every global node.
        * ``local2global``: global ids of each node type, addressable both by
          the node-type key and by its integer id.
        * ``key2int``: integer id assigned to every node-type and edge-type key.

    Raises:
        IndexError: If ``edge_index_dict`` is empty.
    """
    num_nodes_dict = _maybe_num_nodes_dict(edge_index_dict, num_nodes_dict)

    # Any edge tensor serves as the dtype/device template for the outputs.
    tmp = list(edge_index_dict.values())[0]

    key2int = {}

    cumsum, offset = 0, {}  # Running node total and per-type start offset.
    node_types, local_node_indices = [], []
    local2global = {}
    for i, (key, N) in enumerate(num_nodes_dict.items()):
        key2int[key] = i
        node_types.append(tmp.new_full((N, ), i))
        local_node_indices.append(torch.arange(N, device=tmp.device))
        offset[key] = cumsum
        local2global[key] = local_node_indices[-1] + cumsum
        local2global[i] = local2global[key]
        cumsum += N

    node_type = torch.cat(node_types, dim=0)
    local_node_idx = torch.cat(local_node_indices, dim=0)

    edge_indices, edge_types = [], []
    for i, (keys, edge_index) in enumerate(edge_index_dict.items()):
        key2int[keys] = i
        # Shift source and destination rows by the offsets of their node types.
        inc = torch.tensor([offset[keys[0]], offset[keys[-1]]],
                           device=tmp.device).view(2, 1)
        edge_indices.append(edge_index + inc)
        edge_types.append(tmp.new_full((edge_index.size(1), ), i))

    edge_index = torch.cat(edge_indices, dim=-1)
    edge_type = torch.cat(edge_types, dim=0)

    return (edge_index, edge_type, node_type, local_node_idx, local2global,
            key2int)

# Convert `data` into the merged representation and print the edges in global node ids.
edge_index, edge_type, node_type, local_node_idx, local2global, key2int = pyg_group_hetero_graph(data.edge_index_dict, data.num_nodes_dict)
print(edge_index)

tensor([[ 736389,  736390,  736391,  ...,  736388,  736388,  736388],
        [1871883, 1872034, 1874235,  ..., 1901236, 1902061, 1911712]])


In [16]:
data.x_dict

{'paper': tensor([[-0.0954,  0.0408, -0.2109,  ...,  0.0616, -0.0277, -0.1338],
         [-0.1510, -0.1073, -0.2220,  ...,  0.3458, -0.0277, -0.2185],
         [-0.1148, -0.1760, -0.2606,  ...,  0.1731, -0.1564, -0.2780],
         ...,
         [ 0.0228, -0.0865,  0.0981,  ..., -0.0547, -0.2077, -0.2305],
         [-0.2891, -0.2029, -0.1525,  ...,  0.1042,  0.2041, -0.3528],
         [-0.0890, -0.0348, -0.2642,  ...,  0.2601, -0.0875, -0.5171]])}

In [17]:
data.x_dict['paper'].shape

torch.Size([736389, 128])

In [18]:
data.y_dict

{'paper': tensor([246, 131, 189,  ..., 266, 289,   1])}

## 在DGL图上实现 pyg.HeteroData 接口

总结 sampler.py 涉及到的data接口 (HeteroData)

```python
data.edge_index_dict: {('author','affiliated_with','institution'): Tensor(2,M)}
data.num_nodes_dict: {'paper': 736389}
# N=736389 paper数量
data.x_dict: 只包含 {'paper': Tensor(N,features)}
data.y_dict: 只包含 {'paper': Tensor(N)}
```

In [60]:
from torch_geometric.data.hetero_data import HeteroData

data = HeteroData()
data.y_dict
data.x_dict
data.num_nodes_dict
data.edge_index_dict


{}

In [44]:
from dgl.data.utils import load_graphs, save_graphs, Subset
pre_processed_file_path = "/home/ec2-user/workspace/tab2graph/datasets/tiny/dgl_data_processed"
# pre_processed_file_path = "/home/ec2-user/workspace/ogb/dataset/tiny/dgl_data_processed"
graph, label_dict = load_graphs(pre_processed_file_path)
graph

[Graph(num_nodes={'affiliated_with': 5, 'author': 5, 'cites': 100, 'has_topic': 100, 'institution': 3, 'paper': 100, 'paper2': 100, 'writes': 100},
       num_edges={('affiliated_with', 'affiliated_with-author-author', 'author'): 5, ('affiliated_with', 'affiliated_with-institution-institution', 'institution'): 5, ('author', 'reverse-affiliated_with-author-author', 'affiliated_with'): 5, ('author', 'reverse-writes-author-author', 'writes'): 100, ('cites', 'cites-paper_cite-paper', 'paper'): 100, ('cites', 'cites-paper_cited-paper2', 'paper2'): 100, ('has_topic', 'has_topic-paper-paper', 'paper'): 100, ('institution', 'reverse-affiliated_with-institution-institution', 'affiliated_with'): 5, ('paper', 'reverse-cites-paper_cite-paper', 'cites'): 100, ('paper', 'reverse-has_topic-paper-paper', 'has_topic'): 100, ('paper', 'reverse-writes-paper-paper', 'writes'): 100, ('paper2', 'reverse-cites-paper_cited-paper2', 'cites'): 100, ('writes', 'writes-author-author', 'author'): 100, ('writes', '

In [45]:
from dgl import DGLGraph

g:DGLGraph = graph[0]
type(g)

dgl.heterograph.DGLGraph

In [54]:
def add_properties_to_dgl(g:DGLGraph):
    """Return a clone of *g* carrying HeteroData-style dict attributes.

    The clone gets four extra attributes: ``x_dict``, ``y_dict``,
    ``edge_index_dict`` and ``num_nodes_dict``. The input graph itself is
    not modified.
    """
    enriched = g.clone()

    # The 'feat' and 'label' node data are expected to have been stored
    # beforehand by convert_dglData.py.
    enriched.x_dict = enriched.ndata['feat']
    enriched.y_dict = enriched.ndata['label']

    # One stacked (source, destination) tensor per canonical edge type.
    enriched.edge_index_dict = {
        canonical: torch.stack(
            enriched.all_edges(form='uv', etype=canonical, order='srcdst'),
            dim=0,
        )
        for canonical in enriched.canonical_etypes
    }

    # Node count of every node type.
    enriched.num_nodes_dict = {
        node_kind: enriched.number_of_nodes(node_kind)
        for node_kind in enriched.ntypes
    }

    return enriched

g_added = add_properties_to_dgl(g)
print(g_added)

Graph(num_nodes={'affiliated_with': 5, 'author': 5, 'cites': 100, 'has_topic': 100, 'institution': 3, 'paper': 100, 'paper2': 100, 'writes': 100},
      num_edges={('affiliated_with', 'affiliated_with-author-author', 'author'): 5, ('affiliated_with', 'affiliated_with-institution-institution', 'institution'): 5, ('author', 'reverse-affiliated_with-author-author', 'affiliated_with'): 5, ('author', 'reverse-writes-author-author', 'writes'): 100, ('cites', 'cites-paper_cite-paper', 'paper'): 100, ('cites', 'cites-paper_cited-paper2', 'paper2'): 100, ('has_topic', 'has_topic-paper-paper', 'paper'): 100, ('institution', 'reverse-affiliated_with-institution-institution', 'affiliated_with'): 5, ('paper', 'reverse-cites-paper_cite-paper', 'cites'): 100, ('paper', 'reverse-has_topic-paper-paper', 'has_topic'): 100, ('paper', 'reverse-writes-paper-paper', 'writes'): 100, ('paper2', 'reverse-cites-paper_cited-paper2', 'cites'): 100, ('writes', 'writes-author-author', 'author'): 100, ('writes', 'wr

In [48]:
# # https://docs.dgl.ai/en/latest/api/python/dgl.DGLGraph.html
# # 相较于 etypes, g.canonical_etypes 是三元组形式的
# g.etypes
# g.canonical_etypes
# etype = 'affiliated_with-author-author'
# etype = ('affiliated_with', 'affiliated_with-author-author', 'author')

# # g.edges(etype=etype, order='srcdst') #, form='all')
# for etype in g.canonical_etypes:
#     print(etype)
#     print(g.edges(etype=etype, order='srcdst'))


('affiliated_with', 'affiliated_with-author-author', 'author')
(tensor([0, 1, 2, 3, 4]), tensor([0, 1, 2, 3, 4]))
('affiliated_with', 'affiliated_with-institution-institution', 'institution')
(tensor([0, 2, 1, 3, 4]), tensor([0, 0, 1, 1, 2]))
('author', 'reverse-affiliated_with-author-author', 'affiliated_with')
(tensor([0, 1, 2, 3, 4]), tensor([0, 1, 2, 3, 4]))
('author', 'reverse-writes-author-author', 'writes')
(tensor([3, 1, 4, 4, 3, 4, 4, 1, 0, 0, 1, 0, 3, 0, 4, 1, 2, 4, 4, 3, 2, 3, 1, 2,
        1, 0, 4, 0, 1, 3, 4, 4, 2, 0, 1, 1, 1, 1, 3, 1, 3, 3, 2, 2, 0, 1, 3, 2,
        1, 2, 2, 0, 3, 3, 2, 1, 3, 0, 3, 0, 1, 2, 1, 1, 0, 1, 2, 3, 0, 0, 1, 1,
        1, 4, 2, 1, 4, 0, 2, 4, 0, 3, 3, 2, 4, 4, 0, 3, 3, 1, 4, 2, 3, 2, 2, 1,
        3, 2, 3, 1]), tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,

In [31]:
# g.ntypes
# g.num_nodes(ntype='paper')

100

In [62]:
# 测试 torch的 Linear层
from torch import nn
import torch

# 1. 定义一个线性层
linear = nn.Linear(5, 3)
# 2. 定义一个输入
x = torch.randn(2, 5)
# 3. 计算输出
y = linear(x)
print(y)


tensor([[-0.0409,  0.1130,  0.5719],
        [-0.6358, -0.0969, -0.0738]], grad_fn=<AddmmBackward0>)


In [63]:
type(x)

torch.Tensor

In [64]:
x.dtype

torch.float32

## 空间占用分析



In [None]:
import torch

def get_size_tensor(t:torch.Tensor):
    """Return the number of bytes taken by the elements of tensor *t*."""
    return t.element_size() * t.numel()

def get_size(x):
    """Return the total size in bytes of the tensors held by *x*.

    Args:
        x: A tensor (including subclasses such as ``nn.Parameter``), a list or
            tuple of supported objects, or a dict whose values are supported
            objects. Containers may be nested.

    Returns:
        The summed byte size of all tensors found; 0 for an empty container.

    Raises:
        TypeError: If *x* (or a nested item) is of an unsupported type. This
            replaces the silent ``None`` result, which only failed later when
            the caller did arithmetic on it.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if isinstance(x, torch.Tensor):
        return get_size_tensor(x)
    if isinstance(x, dict):
        return sum(get_size(value) for value in x.values())
    if isinstance(x, (list, tuple)):
        return sum(get_size(item) for item in x)
    raise TypeError(
        f'get_size expects a tensor, list, tuple or dict, got {type(x).__name__}'
    )

# Print the memory footprint of each named object.
# NOTE(review): `x_dict` and `y_global` are not defined in the visible cells;
# presumably they are created elsewhere. Confirm before running this cell.
for name in 'x_dict edge_type node_type local_node_idx y_global'.split():
    # Look the object up by its variable name; the names are hard-coded on the
    # line above, so no external input reaches eval.
    x = eval(name)
    size_GB = get_size(x) / 1024**3  # divide by 1024**3 to express the size in GB
    print(f'{name}: {size_GB:.2f} GB')