### Dataset statistics


In [None]:
from tqdm import tqdm
import numpy as np
import os
import torch
import dgl
import latex_utils as lu

# Alternative dataset selections, kept for reference:
# DATASETS = ["amazon-ratings", "minesweeper", "questions", "roman-empire", "tolokers", "Children", "Computers", "Fitness", "History", "Photo", "cora", "pubmed", "citeseer"]
# DATASETS = ["cora", "pubmed", "citeseer"]

# One row per dataset: (dataset name, display name, domain, feature embedding method).
# Keeping the four attributes side by side guarantees the derived lists stay aligned.
_DATASET_ROWS = [
    ("Children", "Children", "E-commerce", "PLMs"),
    ("Computers", "Comp.", "E-commerce", "PLMs"),
    ("Fitness", "Fitness", "E-commerce", "PLMs"),
    ("History", "History", "E-commerce", "PLMs"),
    ("Photo", "Photo", "E-commerce", "PLMs"),
    ("amazon-ratings", "Amazon.", "E-commerce", "FastText"),
    ("minesweeper", "Mines.", "Games", "One-hot"),
    ("questions", "Questions", "Website", "FastText"),
    ("roman-empire", "Roman.", "Website", "FastText"),
    ("tolokers", "tolokers", "Social", "Statistics"),
]
DATASETS, DATASETS_SHOW_NAME, DATASETS_DOMAIN, DATASETS_EMBEDDING_METHOD = (
    list(column) for column in zip(*_DATASET_ROWS)
)

# Per-dataset accumulators; each gets one entry per dataset, in DATASETS order.
NODES, EDGES, NODE_FEATURES, CLASSES, AVE_DEGREES = [], [], [], [], []
H_NODE, H_EDGE, H_CLASS, H_ADJ = [], [], [], []
AVE_FI = []

def _cached(cache_path, compute):
    """Return the array stored at ``cache_path``, computing and saving it on a cache miss.

    Args:
        cache_path: Path of the ``.npy`` file used as the cache.
        compute: Zero-argument callable returning the value to cache; it is
            only invoked when ``cache_path`` does not exist yet.

    Returns:
        The value loaded with ``np.load`` on a hit, otherwise the freshly
        computed value (which is also written with ``np.save``).
    """
    if os.path.exists(cache_path):
        return np.load(cache_path)
    value = compute()
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        # np.save does not create missing parent directories.
        os.makedirs(cache_dir, exist_ok=True)
    np.save(cache_path, value)
    return value


def _normalized_adjacency(graph, num_nodes):
    """Build the normalized sparse adjacency matrix of ``graph``.

    NOTE(review): the matrix is densified to ``num_nodes x num_nodes`` before
    being passed to ``lu.normalize_tensor``, which is memory hungry on large
    graphs; presumably the helper requires a dense input - confirm.
    """
    edge_index = torch.stack(graph.edges())
    adj = torch.sparse_coo_tensor(edge_index, torch.ones(edge_index.shape[1]), [num_nodes, num_nodes])
    adj = lu.normalize_tensor(adj.to_dense(), symmetric=0)
    return adj.to_sparse()


for dataset_name in tqdm(DATASETS):
    # The context manager closes the .npz archive once the arrays are copied into tensors.
    with np.load(os.path.join('../data', f'{dataset_name.replace("-", "_")}.npz')) as data:
        node_features = torch.tensor(data['node_features'])
        labels = torch.tensor(data['node_labels'])
        edges = torch.tensor(data['edges'])

    num_nodes = len(node_features)
    graph = dgl.graph((edges[:, 0], edges[:, 1]), num_nodes=num_nodes, idtype=torch.long)

    # Basic size statistics.
    NODES.append(num_nodes)
    EDGES.append(len(edges))
    NODE_FEATURES.append(node_features.shape[1])
    CLASSES.append(len(labels.unique()))
    AVE_DEGREES.append(edges.shape[0] / num_nodes)

    h_node_path = f'../data/homophily/table_latex/{dataset_name}_h_node.npy'
    h_edge_path = f'../data/homophily/table_latex/{dataset_name}_h_edge.npy'
    h_class_path = f'../data/homophily/table_latex/{dataset_name}_h_class.npy'
    h_adj_path = f'../data/homophily/table_latex/{dataset_name}_h_adj.npy'
    mi_agg_path = f'../data/homophily/table_latex/{dataset_name}_mi_agg.npy'

    # Only build the (expensive) normalized adjacency when at least one
    # homophily metric that needs it has not been cached yet.
    adj_metric_paths = (h_node_path, h_edge_path, h_class_path, h_adj_path)
    if all(os.path.exists(path) for path in adj_metric_paths):
        adj = None
    else:
        adj = _normalized_adjacency(graph, num_nodes)

    print('get H_NODE...')
    H_NODE.append(_cached(h_node_path, lambda: lu.node_homophily(adj, labels).numpy()))

    print('get H_EDGE...')
    H_EDGE.append(_cached(h_edge_path, lambda: lu.edge_homophily(adj, labels).numpy()))

    print('get H_CLASS...')
    H_CLASS.append(_cached(h_class_path, lambda: lu.class_homophily(adj, labels).numpy()))

    print('get H_ADJ...')
    H_ADJ.append(_cached(h_adj_path, lambda: lu.adjusted_homo(adj, labels).numpy()))

    print('get AVE_FI...')
    # The cache stores the full mi_agg array; only its mean goes into the table.
    mi_agg = _cached(mi_agg_path, lambda: lu.mi_agg(graph, node_features, labels).numpy())
    AVE_FI.append(np.mean(mi_agg))

 10%|█         | 1/10 [00:02<00:24,  2.76s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 20%|██        | 2/10 [00:06<00:25,  3.13s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 30%|███       | 3/10 [00:09<00:23,  3.41s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 40%|████      | 4/10 [00:13<00:20,  3.40s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 50%|█████     | 5/10 [00:15<00:15,  3.13s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 60%|██████    | 6/10 [00:16<00:09,  2.26s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 70%|███████   | 7/10 [00:16<00:04,  1.66s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 80%|████████  | 8/10 [00:19<00:04,  2.05s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


 90%|█████████ | 9/10 [00:20<00:01,  1.57s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...


100%|██████████| 10/10 [00:20<00:00,  2.07s/it]

get H_NODE...
get H_EDGE...
get H_CLASS...
get H_ADJ...
get AVE_FI...





In [None]:
import pandas as pd

def _scalar(value):
    """Convert a single-element tensor/array, a number, or a numeric string to a float.

    ``.item()`` is used for tensors and arrays so that any single-element
    container is unwrapped, whatever its number of dimensions. Strings are
    accepted so that re-running this cell on already formatted values works.
    """
    if isinstance(value, (torch.Tensor, np.ndarray)):
        return float(value.item())
    return float(value)


# Optional: thousands separators for the integer columns (disabled).
# NODES = [f"{int(n):,}" for n in NODES]
# EDGES = [f"{int(n):,}" for n in EDGES]
# NODE_FEATURES = [f"{int(n):,}" for n in NODE_FEATURES]
# CLASSES = [f"{int(n):,}" for n in CLASSES]

# Pre-format the homophily metrics and degrees as fixed-precision strings.
H_NODE = [f"{_scalar(h):.4f}" for h in H_NODE]
H_EDGE = [f"{_scalar(h):.4f}" for h in H_EDGE]
H_CLASS = [f"{_scalar(h):.4f}" for h in H_CLASS]
H_ADJ = [f"{_scalar(h):.4f}" for h in H_ADJ]
AVE_DEGREES = [f"{_scalar(h):.2f}" for h in AVE_DEGREES]

# collect data into a dictionary (keys are the LaTeX column headers)
data_dict = {
    r"\textbf{Dataset}": DATASETS_SHOW_NAME,
    r"\textbf{\#Nodes}": NODES,
    r"\textbf{\#Edges}": EDGES,
    r"\textbf{\#Features}": NODE_FEATURES,
    r"\textbf{\#Classes}": CLASSES,
    r"\textbf{Ave. Degrees}": AVE_DEGREES,
    r"\textbf{Domain}": DATASETS_DOMAIN,
    r"\textbf{Feat. Modeling}": DATASETS_EMBEDDING_METHOD,
    r"$\mathbf{h_{node}}$": H_NODE,
    r"$\mathbf{h_{edge}}$": H_EDGE,
    r"$\mathbf{h_{class}}$": H_CLASS,
    r"$\mathbf{h_{adj}}$": H_ADJ,
    r"\textbf{Ave. TFI}": AVE_FI
}

# create DataFrame
df = pd.DataFrame(data_dict)

# generate LaTeX table; float_format only affects columns that are still numeric (Ave. TFI)
latex_table = df.to_latex(index=False, column_format="*{" + str(len(data_dict)) + "}{c}", escape=False,
                          label="tab:dataset_statistics",
                          float_format="{:.4f}".format,
                          position="htbp",)

# Wrap the tabular in a \resizebox and add the caption below it.
# The \label emitted by pandas is moved after the \caption: a label placed
# before the caption does not pick up the table number for \ref.
label_line = r'\label{tab:dataset_statistics}'
lines = [line for line in latex_table.splitlines() if line.strip() != label_line]

# Locate the tabular environment by content rather than by a fixed line index.
tabular_begin = next(i for i, line in enumerate(lines) if line.lstrip().startswith(r'\begin{tabular}'))
lines.insert(tabular_begin, r'\resizebox{1\hsize}{!}{')
tabular_end = next(i for i, line in enumerate(lines) if line.lstrip().startswith(r'\end{tabular}'))
lines[tabular_end + 1:tabular_end + 1] = [r'}', r'\caption{Dataset Statistics}', label_line]

# combine into final LaTeX table
latex_table_with_rules = "\n".join(lines)

# output LaTeX table
print(latex_table_with_rules)

\begin{table}[htbp]
\label{tab:dataset_statistics}
\resizebox{1\hsize}{!}{
\begin{tabular}{*{13}{c}}
\toprule
\textbf{Dataset} & \textbf{\#Nodes} & \textbf{\#Edges} & \textbf{\#Features} & \textbf{\#Classes} & \textbf{Ave. Degrees} & \textbf{Domain} & \textbf{Feat. Modeling} & $\mathbf{h_{node}}$ & $\mathbf{h_{edge}}$ & $\mathbf{h_{class}}$ & $\mathbf{h_{adj}}$ & \textbf{Ave. TFI} \\
\midrule
Children & 76,875 & 1,554,578 & 768 & 24 & 20.22 & E-commerce & PLMs & 0.4579 & 0.4220 & 0.2372 & 0.2913 & 0.0225 \\
Comp. & 87,229 & 721,081 & 768 & 10 & 8.27 & E-commerce & PLMs & 0.8469 & 0.8322 & 0.7601 & 0.7988 & 0.0208 \\
Fitness & 173,055 & 1,773,500 & 768 & 13 & 10.25 & E-commerce & PLMs & 0.8991 & 0.9004 & 0.7940 & 0.8528 & 0.0366 \\
History & 41,551 & 358,574 & 768 & 12 & 8.63 & E-commerce & PLMs & 0.7812 & 0.6626 & 0.2654 & 0.5463 & 0.0296 \\
Photo & 48,362 & 500,939 & 768 & 12 & 10.36 & E-commerce & PLMs & 0.7792 & 0.7491 & 0.7229 & 0.6892 & 0.0234 \\
Amazon. & 24,492 & 93,050 & 300 & 

In [8]:
# Display the statistics DataFrame built in the previous cell.
df

Unnamed: 0,\textbf{Dataset},\textbf{#Nodes},\textbf{#Edges},\textbf{#Features},\textbf{#Classes},\textbf{Ave. Degrees},\textbf{Domain},\textbf{Feat. Modeling},$\mathbf{h_{node}}$,$\mathbf{h_{edge}}$,$\mathbf{h_{class}}$,$\mathbf{h_{adj}}$,\textbf{Ave. TFI}
0,Children,76875,1554578,768,24,20.22,E-commerce,PLMs,0.4579,0.422,0.2372,0.2913,0.022467
1,Comp.,87229,721081,768,10,8.27,E-commerce,PLMs,0.8469,0.8322,0.7601,0.7988,0.020779
2,Fitness,173055,1773500,768,13,10.25,E-commerce,PLMs,0.8991,0.9004,0.794,0.8528,0.036571
3,History,41551,358574,768,12,8.63,E-commerce,PLMs,0.7812,0.6626,0.2654,0.5463,0.029634
4,Photo,48362,500939,768,12,10.36,E-commerce,PLMs,0.7792,0.7491,0.7229,0.6892,0.023436
5,Amazon.,24492,93050,300,5,3.8,E-commerce,FastText,0.3793,0.3804,0.127,0.1357,0.017658
6,Mines.,10000,39402,7,2,3.94,Games,One-hot,0.6832,0.6828,0.0094,0.0108,0.020152
7,Questions,48921,153540,301,2,3.14,Website,FastText,0.8963,0.8396,0.0722,0.2759,0.004896
8,Roman.,22662,32927,300,18,1.45,Website,FastText,0.0415,0.0469,0.023,-0.0778,0.487023
9,tolokers,11758,519000,10,2,44.14,Social,Statistics,0.6331,0.5945,0.1867,0.0887,0.004429
