统计图的统计特征

1. degree_info(graph): 统计图上每种类型的边的入度 (in_degrees) 分位数信息

In [13]:
import numpy as np
import pandas as pd
import torch
import dgl


In [2]:
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator, DglNodePropPredDataset

# Load the OGB `ogbn-mag` node-property-prediction dataset through its DGL wrapper.
dataset = DglNodePropPredDataset(name='ogbn-mag')
# Use the first entry of `dataset.graph` as the working graph and print its summary.
graph = dataset.graph[0]
print(graph)

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'affiliated_with', 'institution'): 1043998, ('author', 'writes', 'paper'): 7145660, ('paper', 'cites', 'paper'): 5416271, ('paper', 'has_topic', 'field_of_study'): 7505078},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('paper', 'paper', 'cites'), ('paper', 'field_of_study', 'has_topic')])


In [16]:
# Display every (source node type, edge type, destination node type) triple of the graph.
graph.canonical_etypes

[('author', 'affiliated_with', 'institution'),
 ('author', 'writes', 'paper'),
 ('paper', 'cites', 'paper'),
 ('paper', 'has_topic', 'field_of_study')]

In [15]:
# Display the node type names of the graph.
graph.ntypes

['author', 'field_of_study', 'institution', 'paper']

In [9]:

# Context: calling the built-in dgl.add_reverse_edges(graph) on this graph fails with
#   AssertionError: add_reverse_edges is not well defined for unidirectional
#   bipartite graphs, but ('author', 'affiliated_with', 'institution') is
#   unidirectional bipartite
# Built-in alternatives that were considered:
# dgl.to_bidirected(graph)
# dgl.add_reverse_edges(graph)
def convert_hetro_bidirected_to_directed(graph):
    """Rebuild ``graph`` as a fresh heterograph containing the same edges.

    For every canonical edge type the (source ids, destination ids) pair is
    read from ``graph`` and handed to ``dgl.heterograph`` unchanged, so the
    result has the same relations and the same edge lists as the input.

    The node count of every node type is passed explicitly. Without it,
    ``dgl.heterograph`` infers the counts from the largest node ID that
    appears in an edge, which would drop isolated nodes with the highest IDs.

    Only the edge lists and node counts are forwarded; node and edge feature
    data are not copied by this function.

    Args:
        graph: A DGL heterogeneous graph.

    Returns:
        A new DGL heterograph with the same canonical edge types and edges.
    """
    # NOTE(review): the original author flagged this function as suspect --
    # heterograph relations appear to already be unidirectional bipartite by
    # default, so this rebuild may not change anything about edge direction.
    edges_by_etype = {
        etype: graph.edges(etype=etype)
        for etype in graph.canonical_etypes
    }
    # Preserve the exact node counts instead of letting DGL infer them.
    num_nodes_by_ntype = {
        ntype: graph.num_nodes(ntype)
        for ntype in graph.ntypes
    }
    return dgl.heterograph(edges_by_etype, num_nodes_dict=num_nodes_by_ntype)
# graph_directed = convert_hetro_bidirected_to_directed(graph)
# print(graph_directed)


In [18]:
def add_reverse_edges(graph: dgl.DGLGraph):
    """Return a new heterograph with a reversed copy of every relation.

    For each canonical edge type ``(src_type, name, dst_type)`` in ``graph``
    the result contains the original relation plus a relation
    ``(dst_type, name + "_rev", src_type)`` whose edges are the originals
    with source and destination swapped.

    The node count of every node type is passed explicitly to
    ``dgl.heterograph``; otherwise the counts would be inferred from the
    largest node ID appearing in an edge and trailing isolated nodes would be
    lost.

    Only the edge lists and node counts are forwarded; node and edge feature
    data are not copied by this function. If ``graph`` already has a relation
    whose canonical type equals a generated ``*_rev`` key, the later
    assignment to that key wins.

    Args:
        graph: A DGL heterogeneous graph.

    Returns:
        A new DGL heterograph with twice as many canonical edge types.
    """
    edges_by_etype = {}
    for etype in graph.canonical_etypes:
        src_type, rel_name, dst_type = etype
        # With the default form, graph.edges returns (source ids, destination ids).
        src_ids, dst_ids = graph.edges(etype=etype)
        edges_by_etype[etype] = (src_ids, dst_ids)
        # The reverse relation swaps both the endpoint types and the id tensors.
        edges_by_etype[(dst_type, rel_name + "_rev", src_type)] = (dst_ids, src_ids)

    # Preserve the exact node counts instead of letting DGL infer them.
    num_nodes_by_ntype = {
        ntype: graph.num_nodes(ntype)
        for ntype in graph.ntypes
    }
    return dgl.heterograph(edges_by_etype, num_nodes_dict=num_nodes_by_ntype)

# Build the graph augmented with "*_rev" relations and print its summary.
graph_rev = add_reverse_edges(graph)
print(graph_rev)

Graph(num_nodes={'author': 1134649, 'field_of_study': 59965, 'institution': 8740, 'paper': 736389},
      num_edges={('author', 'affiliated_with', 'institution'): 1043998, ('author', 'writes', 'paper'): 7145660, ('field_of_study', 'has_topic_rev', 'paper'): 7505078, ('institution', 'affiliated_with_rev', 'author'): 1043998, ('paper', 'cites', 'paper'): 5416271, ('paper', 'cites_rev', 'paper'): 5416271, ('paper', 'has_topic', 'field_of_study'): 7505078, ('paper', 'writes_rev', 'author'): 7145660},
      metagraph=[('author', 'institution', 'affiliated_with'), ('author', 'paper', 'writes'), ('institution', 'author', 'affiliated_with_rev'), ('paper', 'paper', 'cites'), ('paper', 'paper', 'cites_rev'), ('paper', 'field_of_study', 'has_topic'), ('paper', 'author', 'writes_rev'), ('field_of_study', 'paper', 'has_topic_rev')])


In [19]:
# Adapted from tab2graph: summarize the in-degree distribution of each edge type.
def degree_info(graph, quantiles=(0.2, 0.4, 0.6, 0.8, 1.0)):
    """Tabulate in-degree quantiles for every canonical edge type of ``graph``.

    Args:
        graph: Object exposing ``canonical_etypes`` (an iterable of
            ``(src_type, edge_name, dst_type)`` triples) and
            ``in_degrees(etype=...)``, e.g. a DGL heterograph.
        quantiles: Quantile levels in ``[0, 1]`` passed to ``np.quantile``.
            Defaults to the 20/40/60/80/100 percent levels.

    Returns:
        A ``pandas.DataFrame`` with one row per edge type and one column per
        quantile level (labelled as percentages, e.g. ``"20%"``). Rows are
        labelled with the bare edge name; when several canonical edge types
        share the same edge name they are labelled ``"src:name:dst"`` instead,
        so that they do not overwrite each other.
    """
    from collections import Counter

    canonical = list(graph.canonical_etypes)
    levels = list(quantiles)
    # Edge names are only unique per (src, dst) pair, so detect collisions first.
    name_counts = Counter(name for _, name, _ in canonical)

    rows = {}
    for etype in canonical:
        src_type, name, dst_type = etype
        if name_counts[name] == 1:
            label = name
        else:
            label = f"{src_type}:{name}:{dst_type}"
        rows[label] = np.quantile(graph.in_degrees(etype=etype), levels)

    column_labels = [f"{level * 100:g}%" for level in levels]
    # Built column-wise (one column per edge type), then flipped to one row each.
    return pd.DataFrame(rows, index=column_labels).transpose()

# In-degree quantiles per edge type for the original graph.
print(degree_info(graph))

                 20%  40%   60%   80%      100%
affiliated_with  2.0  5.0  16.0  78.0   28169.0
writes           2.0  3.0   4.0   7.0    5050.0
cites            1.0  2.0   4.0   9.0    4744.0
has_topic        1.0  3.0   7.0  30.0  736389.0


In [20]:
# In-degree quantiles per edge type for the graph that also holds the "*_rev" relations.
print(degree_info(graph_rev))

                      20%   40%   60%   80%      100%
affiliated_with       2.0   5.0  16.0  78.0   28169.0
writes                2.0   3.0   4.0   7.0    5050.0
has_topic_rev        10.0  10.0  11.0  11.0      14.0
affiliated_with_rev   0.0   1.0   1.0   1.0      47.0
cites                 1.0   2.0   4.0   9.0    4744.0
cites_rev             1.0   3.0   5.0  11.0     609.0
has_topic             1.0   3.0   7.0  30.0  736389.0
writes_rev            1.0   1.0   2.0   4.0    1046.0
