In [6]:
import numpy as np
from scipy import sparse

def extract_nodes(edges):
    """Assign a compact integer index to every distinct node id in an edge list.

    Rows of ``edges`` are scanned from first to last; within a row, column 0
    is examined before column 1. An id receives the next unused index
    (0, 1, 2, ...) the first time it is encountered.

    Args:
        edges: 2-D array exposing ``.shape`` and ``[row, col]`` indexing.
            Only columns 0 and 1 are read.

    Returns:
        dict mapping node id -> index, in order of first appearance.
    """
    index_of = {}
    for row in range(edges.shape[0]):
        for endpoint in (edges[row, 0], edges[row, 1]):
            if endpoint not in index_of:
                # The number of ids stored so far is exactly the next free index.
                index_of[endpoint] = len(index_of)
    return index_of

# Read the edge list as an integer array: one row per line of the file, fields
# separated by tabs. extract_nodes() reads columns 0 and 1 as the two node ids
# of each edge.
edges = np.loadtxt('cora/cora.cites', delimiter='\t', dtype=int)
# Map every node id that appears in an edge to a compact index, numbered in
# order of first appearance.
node_list = extract_nodes(edges)

In [9]:
# One row/column per distinct node id found in the edge list.
num_nodes = len(node_list)

# Dense 0/1 adjacency matrix. For each edge row (a, b) only the entry
# [index(a), index(b)] is set; the reverse entry is not mirrored here.
adj_mat = np.zeros((num_nodes, num_nodes), dtype=np.int8)
for src_id, dst_id in zip(edges[:, 0], edges[:, 1]):
    adj_mat[node_list[src_id], node_list[dst_id]] = 1

[[0 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
# Dense per-node tables, indexed by the compact node index from node_list.
# Each content row is expected to supply 1433 feature values.
node_features = np.zeros((num_nodes, 1433), dtype=np.int8)
# Nodes that never appear in the content file keep the empty-string category.
node_category = ['' for _ in range(num_nodes)]
with open('cora/cora.content') as feature_file:
    for frow in feature_file:
        if not frow.strip():
            # Blank line (e.g. a trailing one at end of file): nothing to parse.
            continue
        # Remove only the line terminator. Slicing off the last character
        # unconditionally (frow[:-1]) truncates the category label of a final
        # line that has no trailing newline.
        row_content = frow.rstrip('\n').split('\t')
        # Row layout: <node id> \t <feature values ...> \t <category label>.
        # A node id that is absent from node_list raises KeyError.
        node_idx = node_list[int(row_content[0])]
        node_feat = np.array([int(val) for val in row_content[1:-1]])
        node_cat = row_content[-1]
        node_features[node_idx] = node_feat
        node_category[node_idx] = node_cat

print(node_features.shape)
print(node_category)

(2708, 1433)
['Genetic_Algorithms', 'Genetic_Algorithms', 'Neural_Networks', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Neural_Networks', 'Genetic_Algorithms', 'Theory', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algorithms', 'Genetic_Algor

In [19]:
# Show the distinct category labels present in node_category. A set has no
# defined order, so the order printed here can differ between runs.
print(list(set(node_category)))

['Theory', 'Genetic_Algorithms', 'Probabilistic_Methods', 'Rule_Learning', 'Case_Based', 'Neural_Networks', 'Reinforcement_Learning']


In [20]:
# Frozen label order: the position of a label in this list is its one-hot column.
fixed_cat = ['Theory', 'Genetic_Algorithms', 'Probabilistic_Methods', 'Rule_Learning', 'Case_Based', 'Neural_Networks', 'Reinforcement_Learning']
column_of = {label: column for column, label in enumerate(fixed_cat)}

# One row per node, one column per label. A node whose category is not in
# fixed_cat keeps an all-zero row.
node_category_array = np.zeros((num_nodes, len(fixed_cat)))
for row, label in enumerate(node_category):
    if label in column_of:
        node_category_array[row, column_of[label]] = 1

# Row sums: 1 for every node that received a label column, 0 otherwise.
print(np.sum(node_category_array, axis=1))
print(node_category_array)

[1. 1. 1. ... 1. 1. 1.]
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [21]:
# Append the one-hot category columns to the right of the feature columns, so
# each row holds a node's features followed by its label encoding.
node_whole_features = np.concatenate([node_features, node_category_array], axis=1)
print(node_whole_features)
print(node_whole_features.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
(2708, 1440)


In [22]:
# Adjacency matrix: convert to compressed-sparse-column form and write it out.
sparse_adj_mat = sparse.csc_matrix(adj_mat)
sparse.save_npz('cora_adjacency.npz', sparse_adj_mat)

# Combined feature + one-hot label matrix: same conversion and storage format.
sparse_feat_mat = sparse.csc_matrix(node_whole_features)
sparse.save_npz('cora_features_raw.npz', sparse_feat_mat)

In [25]:
import json
# Convert both ids and indices to built-in ints before serialising.
node_list_serial = dict((int(node_id), int(position)) for node_id, position in node_list.items())
# Serialise first, then write the finished string in one call.
serialized_node_list = json.dumps(node_list_serial, sort_keys=True)
with open('cora_node_list.json', 'w+') as jfile:
    jfile.write(serialized_node_list)

In [1]:
import data_utils

# Keep only the third value returned by load_cora(); the first two are discarded.
# NOTE(review): presumably the return is (adjacency, features, labels) -- confirm
# against data_utils.
_, _, labels = data_utils.load_cora()

In [23]:
import numpy as np

def split_data(cora_labels, train_each_class=20, validation=500, seed=None):
    """Randomly partition the nodes into train / validation / test masks.

    Args:
        cora_labels: 2-D array of shape (num_nodes, num_classes). A nonzero
            entry in column ``c`` marks the node as a member of class ``c``.
        train_each_class: Number of training nodes drawn (without replacement)
            from the members of each class.
        validation: Number of validation nodes drawn (without replacement)
            from the nodes that were not selected for training.
        seed: Optional seed for a private ``np.random.RandomState``. With the
            default ``None`` the draws come from NumPy's global random state,
            so an earlier ``np.random.seed(...)`` call still controls the
            result.

    Returns:
        Tuple ``(train_mask, val_mask, test_mask)`` of float arrays with shape
        ``(num_nodes,)``. Each entry is 1.0 if the node belongs to that split
        and 0.0 otherwise; every node belongs to exactly one of the three.

    Raises:
        ValueError: Propagated from ``choice`` when a class has fewer than
            ``train_each_class`` members, or fewer than ``validation`` nodes
            remain after the training draw.
    """
    num_nodes = cora_labels.shape[0]
    rng = np.random if seed is None else np.random.RandomState(seed)

    train_mask = np.zeros((num_nodes,))
    for class_idx in range(cora_labels.shape[1]):
        class_members = np.argwhere(cora_labels[:, class_idx])[:, 0]
        train_mask[rng.choice(class_members, size=train_each_class, replace=False)] = 1

    # Validation nodes are drawn only from nodes not already used for training.
    not_in_train = np.argwhere(1 - train_mask)[:, 0]
    val_mask = np.zeros((num_nodes,))
    val_mask[rng.choice(not_in_train, size=validation, replace=False)] = 1

    # Whatever is neither train nor validation becomes the test set.
    test_mask = 1 - train_mask - val_mask
    return train_mask, val_mask, test_mask

# Draw one split with the default sizes, then print the number of rows in
# `labels` followed by the count of nonzero entries in each returned mask.
train, val, test = split_data(labels)
print(labels.shape[0])
print(np.count_nonzero(train))
print(np.count_nonzero(val))
print(np.count_nonzero(test))

def check_disjoint(mask1, mask2):
    """Print the positions at which both masks are nonzero.

    The masks are multiplied element-wise and the result of ``np.nonzero`` on
    that product is printed; an empty index array means the masks share no
    position. Nothing is returned.
    """
    overlap = np.multiply(mask1, mask2)
    shared_positions = np.nonzero(overlap)
    print(shared_positions)

# Pairwise overlap checks: each call prints the positions shared by two masks.
check_disjoint(train, val)
check_disjoint(train, test)
check_disjoint(test, val)

# Positions that are nonzero in all three masks at once.
print(np.nonzero(np.multiply(train, np.multiply(test, val))))

2708
140
500
2068
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
