In [None]:
from gnnbench.data.io import SparseGraph, load_dataset
from tqdm.notebook import tqdm
from grapht.data import get_BASBM, get_benchmark
from grapht.perturb import khop_edge_deletion, khop_rewire
import scipy.sparse as sp
import numpy as np
import networkx as nx
import pandas as pd
import pickle
import requests

In [None]:
# Names of the gnn-benchmark datasets handled by this notebook. Each name is
# used as the file stem of an .npz archive under ../data.
datasets = [
    'cora',
    'cora_full',
    'citeseer',
    'pubmed',
    'amazon_electronics_photo',
    'amazon_electronics_computers',
    'ms_academic_cs',
    'ms_academic_phy',
]

def gnnbench_data(name):
    """Load a gnn-bench dataset, standardise it and return its components.

    Parameters
    ----------
    name : str
        File stem of the dataset; it is loaded from ``../data/{name}``.

    Returns
    -------
    tuple
        The three values produced by ``unpack()``, in order ``(A, X, y)``.
        Presumably the adjacency matrix, attribute matrix and labels, matching
        the ``SparseGraph(adj_matrix=A, attr_matrix=X, labels=y)`` call used
        elsewhere in this notebook.
    """
    graph = load_dataset(f'../data/{name}')
    # NOTE(review): the return value of standardize() is discarded, so this
    # relies on it modifying the graph in place; confirm against gnn-bench.
    graph.standardize()
    adjacency, features, labels = graph.unpack()
    return adjacency, features, labels

# Download and standardise data using gnn-bench
Processes the data using gnn-bench and saves it. This functionality will eventually be added to the data submodule.

In [None]:
# Fetch every raw gnn-benchmark archive and store it under ../data.
for dataset in tqdm(datasets, desc='Downloading data'):
    response = requests.get(
        f'https://raw.github.com/shchur/gnn-benchmark/master/data/npz/{dataset}.npz',
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Fail loudly on HTTP errors instead of saving an error page as a .npz file.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(f'../data/{dataset}.npz', 'wb') as npz_file:
        npz_file.write(response.content)

HBox(children=(FloatProgress(value=0.0, description='Downloading data', max=8.0, style=ProgressStyle(descripti…




In [None]:
# Standardise every benchmark graph and store its three components together
# in a single archive per dataset.
for dataset in datasets:
    A, X, y = gnnbench_data(dataset)
    target = f'../data/standardised_{dataset}'  # np.savez appends the .npz suffix
    np.savez(target, A=A, X=X, y=y)

# Linegraph path lengths

Caching all pairwise distances in the linegraph makes processing path lengths faster. On my computer, cora took 15 seconds and citeseer took 6 seconds. It may not be feasible to compute this for larger datasets.

In [None]:
import scipy.sparse as sp
from timebudget import timebudget

In [None]:
# Cache the all-pairs distance matrix of each dataset's line graph.
for dataset in ['cora', 'citeseer']:
    A, X, y = gnnbench_data(dataset)
    # Convert the adjacency matrix to a graph, take its line graph, and go
    # back to a sparse adjacency matrix for SciPy.
    G = nx.line_graph(nx.from_scipy_sparse_matrix(A))
    A = nx.to_scipy_sparse_matrix(G)
    # Only the Dijkstra call is inside the timed section.
    with timebudget(dataset):
        D = sp.csgraph.dijkstra(A)
    out_path = f'../data/linegraph_distances_{dataset}'  # np.save appends .npy
    np.save(out_path, D)

cora took 13.412sec
citeseer took 5.885sec
BASBM took 153.22ms


# BASBM dataset

In [None]:
def get_valid_graph(sizes, p):
    """Sample BASBM graphs until one admits both required perturbations.

    A graph is accepted when a 2-hop edge deletion of 2% of its edges and a
    2-hop rewiring of 5% of its edges both return something other than
    ``None``. There is no retry limit, so the loop runs until such a graph
    is found.

    Parameters
    ----------
    sizes, p
        Forwarded unchanged to ``get_BASBM``.

    Returns
    -------
    The first sampled graph for which both perturbations succeeded.
    """
    while True:
        candidate = get_BASBM(sizes, p)
        # The edge count is queried separately for each call, exactly as the
        # perturbation budgets were originally computed.
        removal = khop_edge_deletion(
            candidate, k=2, r=int(candidate.number_of_edges()*0.02))
        rewiring = khop_rewire(
            candidate, k=2, r=int(candidate.number_of_edges()*0.05))
        if removal is not None and rewiring is not None:
            return candidate

In [None]:
# Generate A: adjacency matrix of a BASBM graph that admits the required
# perturbations (see get_valid_graph).
sizes = [100, 100, 100]
p = 0.001
G = get_valid_graph(sizes, p)
# Use the canonical function name; the `adj_matrix` alias is deprecated and
# no longer available in NetworkX 3.
A = nx.adjacency_matrix(G)

# Generate X: a single categorical feature in {0, 1, 2} per node. Each block
# draws from its own category distribution, one row of `ps` per block, with
# the rows sampled from a flat Dirichlet.
ps = np.random.dirichlet(alpha=[1, 1, 1], size=len(sizes))
print(ps)
X = np.concatenate([
    np.random.choice([0, 1, 2], size=size, p=ps[i])
    for i, size in enumerate(sizes)
])
X = np.expand_dims(X, 1)  # one column: shape (number of nodes, 1)
X = sp.csr_matrix(X)

# Generate y: the block index, repeated once for every node in that block.
y = np.repeat(np.arange(len(sizes)), sizes)

[[0.34514    0.21521006 0.43964994]
 [0.36976793 0.44430321 0.18592886]
 [0.02348958 0.73739406 0.23911636]]


In [None]:
# Wrap the generated BASBM components in a gnn-bench SparseGraph so they pass
# through the same standardize()/unpack() steps as the downloaded datasets.
G = SparseGraph(adj_matrix=A, attr_matrix=X, labels=y)
# NOTE(review): the return value is discarded, so this presumably relies on
# standardize() working in place; confirm against gnn-bench.
G.standardize()
A, X, y = G.unpack()
# Saved with the same A/X/y keys as the other standardised_* archives.
np.savez('../data/standardised_BASBM', A=A, X=X, y=y)

In [None]:
# Cache the all-pairs line-graph distances for the synthetic BASBM graph.
# The dataset name is set explicitly: reusing a `dataset` variable left over
# from an earlier loop would, on a fresh Restart & Run All, label the timing
# and the output file with that loop's last dataset and overwrite its cache.
dataset = 'BASBM'
A, X, y = get_benchmark(dataset)
G = nx.from_scipy_sparse_matrix(A)
G = nx.line_graph(G)
A = nx.to_scipy_sparse_matrix(G)
# Only the Dijkstra call is inside the timed section.
with timebudget(dataset):
    D = sp.csgraph.dijkstra(A)
np.save(f'../data/linegraph_distances_{dataset}', D)

BASBM took 148.12ms
