In [6]:
import os
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T


FILE_DIR = ""
data_path = os.path.join(FILE_DIR, "data", "processed_normalized_data.pkl")
output_path = os.path.join(FILE_DIR, "data", "hetero_data.pt")

data: pd.DataFrame = pd.read_pickle(data_path)

In [4]:
data.columns

Index(['name', 'abstract', 'url', 'authors', 'conference', 'year',
       'embedding'],
      dtype='object')

In [5]:
data.year

0        2020
1        2020
2        2020
3        2020
4        2020
         ... 
63849    2012
63850    2012
63851    2012
63852    2012
63853    2012
Name: year, Length: 63854, dtype: Int64

In [9]:
# Graph stats
#!/usr/bin/env python3
import argparse
from collections import Counter, defaultdict

import torch
import networkx as nx
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T


def main(path: str, list_top: int):
    # 1) Load hetero graph
    hetero_data = torch.load(path, weights_only=False)
    # Ensure undirected (safe even if already done)
    hetero_data = T.ToUndirected()(hetero_data)

    # 2) Collapse to a single homogeneous graph (keeps node/edge type ids)
    homo = hetero_data.to_homogenous()

    # 3) Convert to a NetworkX graph
    #    Using an undirected simple Graph for connected components
    G = to_networkx(homo, to_undirected=True)

    # 4) Connected components and sizes
    components = list(nx.connected_components(G))
    sizes = [len(c) for c in components]
    sizes_sorted = sorted(sizes, reverse=True)

    print(f"# Nodes (homogeneous): {G.number_of_nodes()}")
    print(f"# Edges (homogeneous): {G.number_of_edges()}")
    print(f"# Connected components: {len(components)}\n")

    # 5) Print size list (largest first)
    print("Component sizes (descending):")
    print(sizes_sorted)
    if list_top > 0:
        print(f"\nTop {min(list_top, len(components))} components shown with type breakdown:")

    # 6) Optional: per-component type breakdown
    #    - homo.node_type is a tensor of ints mapping each node -> type index
    #    - hetero_data.node_types lists type names in matching order
    node_type_tensor = homo.node_type.cpu() if hasattr(homo, "node_type") else None
    type_names = list(getattr(hetero_data, "node_types", []))

    # Build a helper for quick lookups
    # G nodes are 0..N-1 in the same order as homogeneous nodes
    for rank, comp_nodes in enumerate(
        sorted(components, key=lambda c: len(c), reverse=True)[:max(0, list_top)]
    ):
        comp_size = len(comp_nodes)
        breakdown = {}
        if node_type_tensor is not None and type_names:
            # Count node types inside this component
            counts = Counter(int(node_type_tensor[n].item()) for n in comp_nodes)
            breakdown = {type_names[t]: counts[t] for t in counts}
        print(f"  - Component #{rank+1}: size={comp_size}, types={breakdown if breakdown else 'n/a'}")

    # 7) If you’d like the raw list for downstream use, you can easily return or save it here.
    # (For now we just print it.)


main("data/hetero_data.pt", list_top=0)

AttributeError: 'HeteroData' has no attribute 'to_homogenous'