In [None]:
from gnnbench.data.io import load_dataset
from tqdm.notebook import tqdm
import numpy as np
import networkx as nx
import pandas as pd
import pickle
import requests

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [None]:
# Names of the gnn-benchmark datasets handled by this notebook. Each name is
# interpolated into the download URL and into the local ../data file paths.
datasets = [
    'cora',
    'cora_full',
    'citeseer',
    'pubmed',
    'amazon_electronics_photo',
    'amazon_electronics_computers',
    'ms_academic_cs',
    'ms_academic_phy',
]

def gnnbench_data(name):
    """Load the gnn-bench dataset ``name`` and return its unpacked parts.

    Parameters
    ----------
    name : str
        Dataset file stem; the graph is read from ``../data/{name}``.

    Returns
    -------
    tuple
        The three values produced by ``unpack()``, in order. Callers in this
        notebook bind them as ``A, X, y`` and pass ``A`` to
        ``nx.from_scipy_sparse_matrix`` -- presumably adjacency matrix, node
        features and labels; TODO confirm against gnnbench.
    """
    graph = load_dataset(f'../data/{name}')
    # NOTE(review): the return value of standardize() is discarded, so this
    # relies on it modifying the graph in place -- confirm against gnnbench.
    graph.standardize()
    adjacency, features, labels = graph.unpack()
    return adjacency, features, labels

# Download and standardise data using gnn-bench
Processes the data using gnn-bench and saves it. Eventually this functionality will be added to the data submodule.

In [None]:
# Fetch every raw .npz archive from the gnn-benchmark repository into ../data.
for dataset in tqdm(datasets, desc='Downloading data'):
    request = requests.get(
        f'https://raw.github.com/shchur/gnn-benchmark/master/data/npz/{dataset}.npz',
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Fail loudly on a 4xx/5xx response instead of silently writing the
    # error body to disk under an .npz name.
    request.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(f'../data/{dataset}.npz', 'wb') as npz_file:
        npz_file.write(request.content)

HBox(children=(FloatProgress(value=0.0, description='Downloading data', max=8.0, style=ProgressStyle(descripti…




In [None]:
# Standardise every dataset and store the resulting (A, X, y) triple in
# ../data/standardised_<name>.npz (np.savez appends the .npz suffix).
# NOTE(review): if A or X are scipy sparse matrices (A is later passed to
# nx.from_scipy_sparse_matrix), np.savez stores them as pickled object
# arrays, so readers would need np.load(..., allow_pickle=True) -- confirm.
for dataset in datasets:
    A, X, y = gnnbench_data(dataset)
    np.savez(f'../data/standardised_{dataset}', A=A, X=X, y=y)

# Linegraph path lengths

This caches all pairwise shortest-path distances in the line graph so that path lengths can be looked up quickly later. On my computer, Cora took about 15 seconds and Citeseer about 6 seconds. It may not be feasible to compute this for larger datasets.

In [None]:
import scipy.sparse as sp
from timebudget import timebudget

In [None]:
# Cache all-pairs shortest-path distances in the line graph of each dataset.
for dataset in ['cora', 'citeseer']:
    A, X, y = gnnbench_data(dataset)
    G = nx.from_scipy_sparse_matrix(A)
    # Line graph: one node per edge of the original graph.
    G = nx.line_graph(G)
    # From here on A is the line graph's adjacency matrix, not the dataset's.
    A = nx.to_scipy_sparse_matrix(G)
    # Only the Dijkstra call is timed; loading and graph construction are not.
    with timebudget(dataset):
        D = sp.csgraph.dijkstra(A)
    # NOTE(review): rows/columns of D presumably follow the node order used by
    # nx.to_scipy_sparse_matrix; that order is not saved alongside D --
    # confirm consumers can map indices back to edges.
    # np.save appends '.npy' to the file name.
    np.save(f'../data/linegraph_distances_{dataset}', D)

cora took 14.870sec
citeseer took 6.020sec
