In [32]:
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import networkx as nx
import sys
import ogb
from ogb.nodeproppred import NodePropPredDataset
print ('python version: ', sys.version_info)

python version:  sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)


### Check the dataset

In [44]:
dataset = 'pubmed'
names = ['feature', 'label', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = []
for i in range(len(names)):
    f = open("./data/{}/{}.bin".format(dataset, names[i]), 'rb')
    if sys.version_info > (3, 0): # if python==3.x
        objects.append(pkl.load(f, encoding='latin1'))
    else: # if python==2.x
        objects.append(pkl.load(f))
feature, label, graph, idx_train, idx_eval, idx_test = objects

print ("Below shows the type of the stored objects:")
print ("-- feature: type={}, shape={}".format(type(feature), feature.shape))
print ("-- label: type={}, shape={}".format(type(label), label.shape))
print ("-- graph: type={}, node num={}".format(type(graph), len(graph)))
print ("-- idx_train: type={}, size={}".format(type(idx_train), len(idx_train)))
print ("-- idx_eval: type={}, size={}".format(type(idx_eval), len(idx_eval)))
print ("-- idx_test: type={}, size={}".format(type(idx_test), len(idx_test)))

Below shows the type of the stored objects:
-- feature: type=<class 'scipy.sparse.lil.lil_matrix'>, shape=(19717, 500)
-- label: type=<class 'numpy.ndarray'>, shape=(19717, 3)
-- graph: type=<class 'collections.defaultdict'>, node num=19717
-- idx_train: type=<class 'list'>, size=60
-- idx_eval: type=<class 'list'>, size=500
-- idx_test: type=<class 'list'>, size=1000


In [42]:
# convert scipy sparse matrix to numpy array
dense_features = features.toarray()
print (dense_features.shape, type(dense_features))

(2708, 1433) <class 'numpy.ndarray'>


### Process facebook_page data
* Source link: https://www.kaggle.com/rozemberczki/musae-facebook-pagepage-network?select=musae_facebook_target.csv

In [15]:
# TODO: fill here to process facebook data, and upload to ./data/facebook_page folder

### Process amazon_product data
* Source link: https://ogb.stanford.edu/docs/nodeprop/#loader

In [28]:
# followint the instruction of the website to download data
dataset = NodePropPredDataset(name = 'ogbn-products')

split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
graph, label = dataset[0] # graph: library-agnostic graph object

This will download 1.38GB. Will you proceed? (y/N)
y
Downloading https://snap.stanford.edu/ogb/data/nodeproppred/products.zip


Downloaded 1.38 GB: 100%|██████████| 1414/1414 [28:39<00:00,  1.22s/it]


Extracting dataset/products.zip
Loading necessary files...
This might take a while.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing graphs...


100%|██████████| 1/1 [00:02<00:00,  2.65s/it]


Saving...


In [31]:
print (graph['edge_index'].shape, graph['node_feat'].shape, label.shape)

(2, 123718280) (2449029, 100) (2449029, 1)


In [14]:
# TODO: fill here to process amazon data, and upload to ./data/amazon_product folder

### Process Youtube Data

In [1]:
import scipy.io
mat = scipy.io.loadmat('../dataset/youtube.mat')
print (mat.keys())
print (mat['group'].shape, mat['network'].shape)

dict_keys(['__header__', '__version__', '__globals__', 'group', 'network'])
(1138499, 47) (1138499, 1138499)


### Process citation data
* Source link: https://github.com/kimiyoung/planetoid, where x, allx and tx are labeled data, unlabeled data and test data for inductive learning
* Process and split train/eval/test sets based on https://github.com/PetarV-/GAT/blob/master/utils/process.py

In [33]:
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index
def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

In [34]:
def process_citation_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("./data/.raw_citation/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)    
    test_idx_reorder = parse_index_file("./data/.raw_citation/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)
    
#     print (x.shape, y.shape, tx.shape, allx.shape)
#     print (len(test_idx_reorder), test_idx_reorder[:10], test_idx_range[:10])

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    
    idx_test = test_idx_range.tolist()
    idx_train = list(range(len(y)))
    idx_val = list(range(len(y), len(y)+500))
    
    print ("labeled instance total num: ", len(idx_test)+len(idx_train)+len(idx_val))
    print ("instance total num*feature_dim, with class num: ", features.shape, labels.shape[1])
    print ("node num in graph: ", len(graph))

    return features, labels, graph, idx_train, idx_val, idx_test

In [40]:
dataset = 'pubmed'
names = ['feature', 'label', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = process_citation_data(dataset) # features, graph, idx_train, idx_val, idx_test
for i in range(len(names)):
    pkl.dump(objects[i], open("./data/{}/{}.bin".format(dataset, names[i]), 'wb'))

labeled instance total num:  1560
instance total num*feature_dim, with class num:  (19717, 500) 3
node num in graph:  19717


In [46]:
a=list(range(10))
print (a.shape)

AttributeError: 'list' object has no attribute 'shape'