In [1]:
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import networkx as nx
import sys
# Log the interpreter version; the pickle-loading code in this notebook branches on sys.version_info.
print ('python version: ', sys.version_info)

python version:  sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)


### Check the dataset

In [2]:
# Load the pickled objects of one dataset from ./data/<dataset>/<name>.bin
# and print a short type/size summary of each.
dataset = 'cora'
names = ['features', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = []
for name in names:
    # `with` closes each handle right after unpickling (the handle used to leak).
    # NOTE(security): pickle can execute arbitrary code -- only load trusted files.
    with open("./data/{}/{}.bin".format(dataset, name), 'rb') as f:
        if sys.version_info > (3, 0): # if python==3.x
            # presumably the files were pickled under Python 2, hence latin1 -- TODO confirm
            objects.append(pkl.load(f, encoding='latin1'))
        else: # if python==2.x
            objects.append(pkl.load(f))
features, graph, idx_train, idx_eval, idx_test = objects

print ("Below shows the type of the stored objects:")
print ("-- features: type={}, shape={}".format(type(features), features.shape))
print ("-- graph: type={}, node num={}".format(type(graph), len(graph)))
print ("-- idx_train: type={}, size={}".format(type(idx_train), len(idx_train)))
print ("-- idx_eval: type={}, size={}".format(type(idx_eval), len(idx_eval)))
print ("-- idx_test: type={}, size={}".format(type(idx_test), len(idx_test)))

Below shows the type of the stored objects:
-- features: type=<class 'scipy.sparse.lil.lil_matrix'>, shape=(2708, 1433)
-- graph: type=<class 'collections.defaultdict'>, node num=2708
-- idx_train: type=<class 'list'>, size=140
-- idx_eval: type=<class 'list'>, size=500
-- idx_test: type=<class 'list'>, size=1000


In [3]:
# convert scipy sparse matrix to numpy array
# (toarray() builds a dense copy; the print below confirms the resulting shape and type)
dense_features = features.toarray()
print (dense_features.shape, type(dense_features))

(2708, 1433) <class 'numpy.ndarray'>


### Process facebook_page data
* Source link: https://www.kaggle.com/rozemberczki/musae-facebook-pagepage-network?select=musae_facebook_target.csv

In [None]:
# TODO: fill in this cell to process the facebook_page data

### Process amazon_product data
* Source link: https://ogb.stanford.edu/docs/nodeprop/#loader

In [None]:
# TODO: fill in this cell to process the amazon_product data

### Process citation data
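* Source link: https://github.com/kimiyoung/planetoid, where `x` holds the labeled training instances, `allx` holds both the labeled and unlabeled training instances (a superset of `x`), and `tx` holds the test instances (inductive-learning setting)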
* Process and split train/eval/test sets based on https://github.com/PetarV-/GAT/blob/master/utils/process.py

In [26]:
def parse_index_file(filename):
    """Parse index file.

    Reads `filename` as text and converts each line to an integer.

    Args:
        filename: path of a text file containing one integer per line
            (surrounding whitespace on a line is ignored).

    Returns:
        list of int, in file order.

    Raises:
        ValueError: if a line (including a blank one) is not a valid integer.
    """
    # The context manager closes the handle deterministically; the bare
    # `open()` used previously left that to the garbage collector.
    with open(filename) as f:
        return [int(line.strip()) for line in f]
def sample_mask(idx, l):
    """Create mask.

    Args:
        idx: indices (anything NumPy accepts as an index) to set to True.
        l: length (or shape) of the mask to allocate.

    Returns:
        numpy boolean array of shape `l` that is True at `idx` and False elsewhere.
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

In [79]:
def process_citation_data(dataset_str): # {'pubmed', 'citeseer', 'cora'}
    """Load a Planetoid-format citation dataset and build its train/val/test index split.

    Reads the pickles ./data/ind.<dataset_str>.{x,y,tx,ty,allx,ally,graph} and the
    text file ./data/ind.<dataset_str>.test.index, then prints a short summary.

    Args:
        dataset_str: dataset name used in the file names ('pubmed', 'citeseer' or 'cora').

    Returns:
        Tuple (features, graph, idx_train, idx_val, idx_test):
        - features: scipy LIL matrix of `allx` stacked on `tx`, with the test rows
          moved to the positions listed in the test index file.
        - graph: the object unpickled from the `graph` file, returned unchanged.
        - idx_train: list of indices 0 .. len(y)-1.
        - idx_val: list of the 500 indices that follow idx_train.
        - idx_test: sorted test indices as a list.

    Note:
        The label matrix is assembled only to report the class count; it is not returned.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        # NOTE(security): pickle can execute arbitrary code -- only load trusted files.
        with open("./data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                # presumably the files were pickled under Python 2, hence latin1 -- TODO confirm
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # Test indices in file order, plus the same indices sorted ascending.
    test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack training and test rows, then permute the test rows so that row k of
    # the result corresponds to index k as listed in the test index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    # Materialize as lists: on Python 3 a bare range() would be returned (and later
    # pickled) as a `range` object, unlike idx_test which is already a list.
    idx_train = list(range(len(y)))
    idx_val = list(range(len(y), len(y)+500))

    print ("labeled instance total num: ", len(idx_test)+len(idx_train)+len(idx_val))
    print ("instance total num*feature_dim, with class num: ", features.shape, labels.shape[1])
    print ("node num in graph: ", len(graph))

    return features, graph, idx_train, idx_val, idx_test

In [86]:
# Process one citation dataset and store each returned object as
# ./data/<dataset>/<name>.bin (the target directory must already exist).
dataset = 'pubmed'
names = ['features', 'graph', 'idx_train', 'idx_eval', 'idx_test']
objects = process_citation_data(dataset) # features, graph, idx_train, idx_val, idx_test
for name, obj in zip(names, objects):
    # `with` flushes and closes each file before the next one is written;
    # the previous inline open() never closed its handle explicitly.
    with open("./data/{}/{}.bin".format(dataset, name), 'wb') as f:
        pkl.dump(obj, f)

('labeled instance total num: ', 1560)
('instance total num*feature_dim, with class num: ', (19717, 500), 3)
('node num in graph: ', 19717)
