In [21]:
import argparse, time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import register_data_args, citegrh

from gcn import GCN
#from gcn_mp import GCN
#from gcn_spmv import GCN
# from graphsage_utils import *

def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

def load_data_dgl(dataset='cora'):
    if dataset == 'cora':
        return citegrh.load_cora()
    elif dataset == 'citeseer':
        return citegrh.load_citeseer()
    elif dataset == 'pubmed':
        return citegrh.load_pubmed()    
    

In [27]:
def load_data(prefix, normalize=True, load_walks=False):
    G_data = json.load(open(prefix + "-G.json"))
    G = json_graph.node_link_graph(G_data)
    if isinstance(G.nodes()[0], int):
        def conversion(n): return int(n)
    else:
        def conversion(n): return n

    if os.path.exists(prefix + "-feats.npy"):
        feats = np.load(prefix + "-feats.npy")
    else:
        print("No features present.. Only identity features will be used.")
        feats = None
    class_map = json.load(open(prefix + "-class_map.json"))
    if isinstance(list(class_map.values())[0], list):
        def lab_conversion(n): return n
    else:
        def lab_conversion(n): return int(n)

    class_map = {conversion(k): lab_conversion(v)
                 for k, v in class_map.items()}

    # Remove all nodes that do not have val/test annotations
    # (necessary because of networkx weirdness with the Reddit data)
    broken_count = 0
    for node in G.nodes():
        if not 'val' in G.node[node] or not 'test' in G.node[node]:
            G.remove_node(node)
            broken_count += 1
    print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(
        broken_count))

    # Make sure the graph has edge train_removed annotations
    # (some datasets might already have this..)
    print("Loaded data.. now preprocessing..")
    for edge in G.edges():
        if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
                G.node[edge[0]]['test'] or G.node[edge[1]]['test']):
            G[edge[0]][edge[1]]['train_removed'] = True
        else:
            G[edge[0]][edge[1]]['train_removed'] = False

    if normalize and not feats is None:
        from sklearn.preprocessing import StandardScaler
        train_ids = np.array([n for n in G.nodes(
        ) if not G.node[n]['val'] and not G.node[n]['test']])
        train_feats = feats[train_ids]
        scaler = StandardScaler()
        scaler.fit(train_feats)
        feats = scaler.transform(feats)

    return G, feats, class_map

def _sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return mask


In [135]:
from easydict import EasyDict
# train_prefix = '../graphzoom/dataset/cora/cora'
# G, features, class_map = load_data(train_prefix)
dataset = 'cora'
dataset_dir = f'../graphzoom/dataset/{dataset}'
G      = json_graph.node_link_graph(json.load(open(dataset_dir + "/{}-G.json".format(dataset))))
labels = json.load(open(dataset_dir + "/{}-class_map.json".format(dataset)))
feats = np.load(dataset_dir + f"/{dataset}-feats.npy")

train_ids    = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
test_ids     = [n for n in G.nodes() if G.node[n]['test']]
val_ids     = test_ids[1000:1500]
test_ids     = test_ids[:1000]
# train_labels = [labels[str(i)] for i in train_ids]
# test_labels  = [labels[str(i)] for i in test_ids]
labels = torch.LongTensor(list(class_map.values()))
train_mask = _sample_mask(train_ids, labels.shape[0])
test_mask =  _sample_mask(test_ids, labels.shape[0])
val_mask =  _sample_mask(val_ids, labels.shape[0])
# val_mask = _sample_mask(range(200, 500), labels.shape[0])
onehot_labels = F.one_hot(labels)
print(len(train_labels))
print(len(test_ids))
print(len(val_ids))

140
1000
500


In [160]:
G

<networkx.classes.graph.Graph at 0x7ff06b03cdd0>

In [136]:
data = EasyDict({
    'graph': G,
    'labels': labels,
    'onehot_labels': onehot_labels,
    'features': feats,
    'train_mask':train_mask,
    'val_mask': val_mask,
    'test_mask': test_mask,
    'num_classes': onehot_labels.shape[1],
    
})

In [83]:
from scipy.sparse import csr_matrix
def construct_proj_laplacian(laplacian, levels, proj_dir):
    coarse_laplacian = []
    projections = []
    for i in range(levels):
        projection_name = "{}/Projection_{}.mtx".format(proj_dir, i+1)
        projection = mtx2matrix(projection_name)
        projections.append(projection)
        coarse_laplacian.append(laplacian)
        if i != (levels-1):
            laplacian = projection @ laplacian @ (projection.transpose())
    return projections, coarse_laplacian

def mtx2matrix(proj_name):
    data = []
    row = []
    col = []
    with open(proj_name) as ff:
        for i, line in enumerate(ff):
            info = line.split()
            if i == 0:
                NumReducedNodes = int(info[0])
                NumOriginNodes = int(info[1])
            else:
                row.append(int(info[0])-1)
                col.append(int(info[1])-1)
                data.append(1)
    matrix = csr_matrix((data, (row, col)), shape=(
        NumReducedNodes, NumOriginNodes))
    return matrix
levels = 2
reduce_results = f"../graphzoom/reduction_results/Cora"
original_adj = nx.adj_matrix(G)
projections, coarse_adj = construct_proj_laplacian(
    original_adj, levels, reduce_results)

(1169, 1169)

In [206]:
from torch.nn.functional import softmax
# softmax(labels)
coarse_feats = projections[0] @ data.features
coarse_labels = projections[0] @ data.onehot_labels 
coarse_graph = nx.Graph(coarse_adj[1])
rows_sum = coarse_labels.sum(axis=1)[:, np.newaxis]
norm_coarse_labels = coarse_labels / rows_sum
# list(map(np.shape, [coarse_embed, coarse_labels]))

tensor([[0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 2., 0.]])


tensor([[0.1147, 0.1147, 0.1147,  ..., 0.1147, 0.1147, 0.3118],
        [0.0747, 0.0747, 0.0747,  ..., 0.0747, 0.5519, 0.0747],
        [0.1147, 0.1147, 0.1147,  ..., 0.1147, 0.1147, 0.1147],
        ...,
        [0.0747, 0.0747, 0.0747,  ..., 0.0747, 0.5519, 0.0747],
        [0.0383, 0.0383, 0.0383,  ..., 0.0383, 0.0383, 0.7700],
        [0.5519, 0.0747, 0.0747,  ..., 0.0747, 0.0747, 0.0747]])

In [186]:
coarse_train_mask = _sample_mask(range(100), norm_coarse_labels.shape[0])
coarse_test_mask = _sample_mask(range(100,700), norm_coarse_labels.shape[0])
coarse_val_mask = _sample_mask(range(700,1000), norm_coarse_labels.shape[0])

In [187]:
coarse_data = EasyDict({
    'graph': coarse_graph,
    'labels': coarse_labels,
#     'onehot_labels': onehot_labels,
    'features': coarse_feats,
    'train_mask':coarse_train_mask,
    'val_mask': coarse_val_mask,
    'test_mask': coarse_test_mask,
    'num_classes': norm_coarse_labels.shape[1],
})

In [188]:
data = coarse_data
data.val_mask.shape

(1169,)

In [224]:
from torch.nn.functional import log_softmax
import pdb
def main(args):
    # load and preprocess dataset
    features = torch.FloatTensor(data.features)
    labels = torch.FloatTensor(data.labels)
#     g, features, class_map = load_data(train_prefix)
#     labels = torch.LongTensor(list(class_map.values()))
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(data.train_mask)
        val_mask = torch.BoolTensor(data.val_mask)
        test_mask = torch.BoolTensor(data.test_mask)
    in_feats = data.features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.int().sum().item(),
              val_mask.int().sum().item(),
              test_mask.int().sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    g = data.graph
    # add self loop
    if args.self_loop:
        print('add self_loop')
        g.remove_edges_from(nx.selfloop_edges(g))
        g.add_edges_from(zip(g.nodes(), g.nodes()))
    g = DGLGraph(g)
    n_edges = g.number_of_edges()
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # create GCN model
    model = GCN(g,
                in_feats,
                args.n_hidden,
                n_classes,
                args.n_layers,
                F.relu,
                args.dropout,)
    print(model)

    if cuda:
        model.cuda()
#     loss_fcn = torch.nn.CrossEntropyLoss()
    loss_fcn = torch.nn.KLDivLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits = log_softmax(model(features), 1)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

#         acc = evaluate(model, features, labels, val_mask)
        acc=0
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss.item(),
                                             acc, n_edges / np.mean(dur) / 1000))

    print()
#     acc = evaluate(model, features, labels, test_mask)
    print("Test accuracy {:.2%}".format(acc))

In [225]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='GCN')
#     register_data_args(parser)
    parser.add_argument("--dropout", type=float, default=0.5,
            help="dropout probability")
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
            help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
            help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
            help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
            help="Weight for L2 loss")
    parser.add_argument("--self-loop", action='store_true',
            help="graph self-loop (default=False)")
#     parser.add_argument("--dataset", default='cora')
    parser.set_defaults(self_loop=True)
    parser.add_argument("--dataset", default='cora')
#     args = parser.parse_args()
#     args = parser.parse_args()[1:]
    args = parser.parse_known_args()[0]
    print(args)

    main(args)

Namespace(dataset='cora', dropout=0.5, gpu=0, lr=0.01, n_epochs=200, n_hidden=16, n_layers=1, self_loop=True, weight_decay=0.0005)
----Data statistics------'
      #Edges 3782
      #Classes 7
      #Train samples 100
      #Val samples 300
      #Test samples 600
add self_loop
GCN(
  (layers): ModuleList(
    (0): GraphConv(in=1433, out=16, normalization=True, activation=<function relu at 0x7ff0c5a1e290>)
    (1): GraphConv(in=16, out=7, normalization=True, activation=None)
  )
  (dropout): Dropout(p=0.5, inplace=False)
)
Epoch 00000 | Time(s) nan | Loss 0.7513 | Accuracy 0.0000 | ETputs(KTEPS) nan
Epoch 00001 | Time(s) nan | Loss 0.6971 | Accuracy 0.0000 | ETputs(KTEPS) nan
Epoch 00002 | Time(s) nan | Loss 0.6527 | Accuracy 0.0000 | ETputs(KTEPS) nan
Epoch 00003 | Time(s) 0.0033 | Loss 0.6172 | Accuracy 0.0000 | ETputs(KTEPS) 1944.93
Epoch 00004 | Time(s) 0.0033 | Loss 0.5619 | Accuracy 0.0000 | ETputs(KTEPS) 1915.42
Epoch 00005 | Time(s) 0.0033 | Loss 0.5300 | Accuracy 0.0000 | ETpu

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch 00040 | Time(s) 0.0033 | Loss 0.2886 | Accuracy 0.0000 | ETputs(KTEPS) 1917.06
Epoch 00041 | Time(s) 0.0033 | Loss 0.2888 | Accuracy 0.0000 | ETputs(KTEPS) 1914.74
Epoch 00042 | Time(s) 0.0033 | Loss 0.2885 | Accuracy 0.0000 | ETputs(KTEPS) 1915.72
Epoch 00043 | Time(s) 0.0033 | Loss 0.2809 | Accuracy 0.0000 | ETputs(KTEPS) 1916.94
Epoch 00044 | Time(s) 0.0033 | Loss 0.2847 | Accuracy 0.0000 | ETputs(KTEPS) 1917.07
Epoch 00045 | Time(s) 0.0033 | Loss 0.2783 | Accuracy 0.0000 | ETputs(KTEPS) 1918.94
Epoch 00046 | Time(s) 0.0033 | Loss 0.2915 | Accuracy 0.0000 | ETputs(KTEPS) 1918.20
Epoch 00047 | Time(s) 0.0033 | Loss 0.2820 | Accuracy 0.0000 | ETputs(KTEPS) 1918.39
Epoch 00048 | Time(s) 0.0033 | Loss 0.2899 | Accuracy 0.0000 | ETputs(KTEPS) 1912.69
Epoch 00049 | Time(s) 0.0033 | Loss 0.2825 | Accuracy 0.0000 | ETputs(KTEPS) 1912.17
Epoch 00050 | Time(s) 0.0033 | Loss 0.2869 | Accuracy 0.0000 | ETputs(KTEPS) 1909.75
Epoch 00051 | Time(s) 0.0033 | Loss 0.2732 | Accuracy 0.0000 | ET

## experiments
dataset | level | acc
--- | --- | ---
cora | 0 | 81.1