In [24]:
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import os
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys

# Directory containing the raw ``ind.<dataset>.*`` files read by load_data().
data_root = '../../dropbox/raw_data/citation'

def parse_index_file(filename):
    """Read a text file that holds one integer per line.

    Args:
        filename: path of the index file to read.

    Returns:
        A list of ints, in the order they appear in the file.

    Raises:
        ValueError: if a line, after stripping whitespace, is not an integer.
    """
    # The context manager closes the handle as soon as parsing finishes (or
    # fails) instead of leaving that to garbage collection.
    with open(filename) as index_file:
        return [int(line.strip()) for line in index_file]

def sample_mask(idx, l):
    """Build a boolean mask that is True exactly at the positions in ``idx``.

    Args:
        idx: index or sequence of indices to mark.
        l: length (shape) of the mask to create.

    Returns:
        A numpy array of dtype bool with shape ``l``.
    """
    # The builtin ``bool`` is used as the dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and is missing from several later releases.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask

def load_data(dataset_str):
    """Load one citation dataset and split it into train/val/test parts.

    Reads the pickled objects ``ind.<dataset_str>.{x,y,tx,ty,allx,ally,graph}``
    and the text file ``ind.<dataset_str>.test.index`` from ``data_root``,
    prints the shapes of the loaded pieces, and assembles the full feature,
    label and adjacency structures.

    Args:
        dataset_str: dataset name used in the file names (e.g. 'nell').
            For 'citeseer' and 'nell' the test block is padded with zero rows
            for positions that have no test entry.

    Returns:
        An 11-tuple ``(adj, features, y_train, y_val, y_test, train_mask,
        val_mask, test_mask, idx_train, idx_val, idx_test)``:
        ``adj`` is the sparse adjacency matrix built from ``graph``;
        ``features`` is a LIL matrix stacking ``allx`` over ``tx``; each
        ``y_*`` array has the shape of the full label matrix and is zero
        outside its split; each ``*_mask`` is a boolean vector over all rows;
        each ``idx_*`` is a list of row indices.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("{}/ind.{}.{}".format(data_root, dataset_str, name), 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code; only point
            # data_root at files from a trusted source.
            if sys.version_info > (3, 0):
                # The encoding argument is only passed on Python 3.
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(data_root, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    # Formatted explicitly so the printed line is identical whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("{} {} {}".format(allx.shape, x.shape, tx.shape))
    if dataset_str == 'citeseer' or dataset_str == 'nell':
        # Some positions between the end of ``allx`` and the end of the graph
        # have no test entry (isolated nodes). Allocate a zero row for every
        # such position and copy the known test rows into their slots.
        # An earlier variant sized the block from the test indices instead:
        # test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        test_idx_range_full = range(allx.shape[0], len(graph))
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - allx.shape[0], :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - allx.shape[0], :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    # Move the test rows from sorted order into the order given by the index file.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    print(features.shape)

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    print(adj.shape)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    # Materialised as lists so all three idx_* values have the same type on
    # Python 2 and Python 3.
    idx_train = list(range(len(y)))
    # Validation rows are the 500 rows that directly follow the training rows.
    idx_val = list(range(len(y), len(y) + 500))

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    # Each y_* keeps the labels of its own split and is zero everywhere else.
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, idx_train, idx_val, idx_test



In [25]:
# Dataset to load; load_data() reads its files from data_root.
d = 'nell'
# The three index collections are gathered under their names in one dict.
idxes = {}
(adj, features,
 y_train, y_val, y_test,
 train_mask, val_mask, test_mask,
 idxes['idx_train'], idxes['idx_val'], idxes['idx_test']) = load_data(d)


(8922, 5414) (105, 5414) (969, 5414)
(65755, 5414)
(65755, 65755)


In [26]:
# Dataset to export; the name also selects the output directory below.
d = 'nell'
# The three index collections are gathered under their names in one dict.
idxes = {}
(adj, features,
 y_train, y_val, y_test,
 train_mask, val_mask, test_mask,
 idxes['idx_train'], idxes['idx_val'], idxes['idx_test']) = load_data(d)

# All exported text files for this dataset are written under this directory.
output_root = '../../dropbox/data/%s' % d

# Create the directory (and any missing parents) on first run only.
if not os.path.isdir(output_root):
    os.makedirs(output_root)
    
# First feature row; not read again in this cell, kept defined at notebook scope.
a = features[0]

# features.txt: one line per row, "<nnz> <col>:<val> <col>:<val> ...".
# The dataset name is fixed for the whole export, so the value precision
# (8 decimals for pubmed, 2 otherwise) is chosen once up front.
if d == 'pubmed':
    entry_fmt = ' %d:%.8f'
else:
    entry_fmt = ' %d:%.2f'

with open('%s/features.txt' % output_root, 'w') as f:
    for i in range(features.shape[0]):
        row, col, val = sp.find(features[i])
        entries = ''.join(entry_fmt % (c, v) for c, v in zip(col, val))
        f.write('%d' % len(col) + entries + '\n')
        
# meta.txt: a single line "<number of rows> <number of label columns> <number of feature columns>".
meta_fields = (len(train_mask), len(y_train[0]), features.shape[1])
with open('%s/meta.txt' % output_root, 'w') as f:
    f.write('%d %d %d\n' % meta_fields)

# label.txt: one line per row holding num_label integers, each followed by a
# space. A row takes its labels from the first split whose mask selects it
# (train, then val, then test); rows in no split are written as all zeros.
num_label = len(y_train[0])
with open('%s/label.txt' % output_root, 'w') as f:
    for i in range(features.shape[0]):
        if train_mask[i]:
            y = y_train[i]
        elif val_mask[i]:
            y = y_val[i]
        elif test_mask[i]:
            y = y_test[i]
        else:
            y = None

        if y is None:
            row_text = '0 ' * num_label
        else:
            row_text = ''.join('%d ' % y[j] for j in range(num_label))
        f.write(row_text)
        f.write('\n')

# adj_list.txt: one line per row of adj, "<degree> <neighbour> <neighbour> ...",
# where the neighbours are the column indices of that row's nonzero entries.
with open('%s/adj_list.txt' % output_root, 'w') as f:
    for i in range(adj.shape[0]):
        _, col, _ = sp.find(adj[i])
        neighbours = ''.join(' %d' % c for c in col)
        f.write('%d' % len(col) + neighbours + '\n')
        
# <split>_idx.txt: the row indices of each split, one index per line.
for p in ['train', 'val', 'test']:
    idx = idxes['idx_%s' % p]
    with open('%s/%s_idx.txt' % (output_root, p), 'w') as f:
        f.writelines('%d\n' % i for i in idx)

(8922, 5414) (105, 5414) (969, 5414)
(65755, 5414)
(65755, 65755)
