In [21]:
import argparse, time
import json
import os

import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import register_data_args, citegrh
from networkx.readwrite import json_graph

from gcn import GCN
#from gcn_mp import GCN
#from gcn_spmv import GCN
# from graphsage_utils import *

def evaluate(model, features, labels, mask):
    """Return the classification accuracy of ``model`` on the masked rows.

    The model is switched to eval mode and run once over ``features`` with
    gradient tracking disabled. For every row selected by ``mask`` the
    highest-scoring class is compared against the matching entry of
    ``labels``.

    Args:
        model: Callable module; ``model(features)`` must return 2-D logits.
        features: Input passed unchanged to the model.
        labels: Ground-truth class index per row.
        mask: Index/boolean mask selecting the rows to score.

    Returns:
        Fraction of selected rows whose predicted class equals the label.
    """
    model.eval()
    with torch.no_grad():
        masked_logits = model(features)[mask]
        masked_labels = labels[mask]
        # torch.max over dim=1 returns (values, indices); only indices matter.
        predicted = torch.max(masked_logits, dim=1)[1]
        n_correct = (predicted == masked_labels).sum()
        return n_correct.item() * 1.0 / len(masked_labels)

def load_data_dgl(dataset='cora'):
    """Load one of DGL's built-in citation datasets by name.

    Args:
        dataset: One of ``'cora'``, ``'citeseer'`` or ``'pubmed'``.

    Returns:
        Whatever the matching ``citegrh.load_*`` function returns.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    if dataset == 'cora':
        return citegrh.load_cora()
    if dataset == 'citeseer':
        return citegrh.load_citeseer()
    if dataset == 'pubmed':
        return citegrh.load_pubmed()
    # An unknown name used to fall through and return None, which only
    # surfaced later as an unrelated AttributeError in the caller.
    raise ValueError(
        "Unknown dataset {!r}; expected 'cora', 'citeseer' or 'pubmed'".format(dataset))
    

In [27]:
def load_data(prefix, normalize=True, load_walks=False):
    """Load a graph, its node features and its class map from ``prefix``-named files.

    Reads ``<prefix>-G.json`` (node-link graph), ``<prefix>-class_map.json``
    (node id -> label) and, when it exists, ``<prefix>-feats.npy``.

    Args:
        prefix: Path prefix shared by the dataset files.
        normalize: When true and features exist, standardize them with a
            ``StandardScaler`` fitted only on training nodes (nodes whose
            ``val`` and ``test`` attributes are both falsy).
        load_walks: Unused; kept so existing callers keep working.

    Returns:
        ``(G, feats, class_map)``. ``feats`` is ``None`` when no feature file
        exists. Nodes lacking a ``val`` or ``test`` attribute are removed from
        ``G`` and every remaining edge gets a boolean ``train_removed``
        attribute.
    """
    with open(prefix + "-G.json") as graph_file:
        G = json_graph.node_link_graph(json.load(graph_file))

    # JSON object keys are always strings, so class-map keys are cast to int
    # when the graph uses integer node ids. Inspect the first node id itself:
    # on networkx >= 2, `G.nodes()[0]` looks up the node *named* 0 (returning
    # its attribute dict or raising KeyError) instead of the first node.
    first_node = next(iter(G.nodes()), None)
    if isinstance(first_node, int):
        def conversion(n): return int(n)
    else:
        def conversion(n): return n

    feats_path = prefix + "-feats.npy"
    if os.path.exists(feats_path):
        feats = np.load(feats_path)
    else:
        print("No features present.. Only identity features will be used.")
        feats = None

    with open(prefix + "-class_map.json") as class_map_file:
        class_map = json.load(class_map_file)
    # List-valued labels are kept as-is; scalar labels are cast to int.
    # An empty class map is tolerated (nothing to convert).
    first_label = next(iter(class_map.values()), None)
    if isinstance(first_label, list):
        def lab_conversion(n): return n
    else:
        def lab_conversion(n): return int(n)

    class_map = {conversion(k): lab_conversion(v)
                 for k, v in class_map.items()}

    # Remove all nodes that do not have val/test annotations
    # (necessary because of networkx weirdness with the Reddit data).
    # Iterate over a snapshot: removing nodes while walking the live node
    # view raises "dictionary changed size during iteration" on networkx >= 2.
    broken_count = 0
    for node in list(G.nodes()):
        attrs = G.nodes[node]
        if 'val' not in attrs or 'test' not in attrs:
            G.remove_node(node)
            broken_count += 1
    print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(
        broken_count))

    # Make sure the graph has edge train_removed annotations
    # (some datasets might already have this..)
    print("Loaded data.. now preprocessing..")
    for u, v in G.edges():
        # An edge is excluded from training when either endpoint is a
        # validation or test node.
        G[u][v]['train_removed'] = bool(
            G.nodes[u]['val'] or G.nodes[v]['val'] or
            G.nodes[u]['test'] or G.nodes[v]['test'])

    if normalize and feats is not None:
        from sklearn.preprocessing import StandardScaler
        # Fit the scaler on training rows only so val/test statistics do not
        # leak into the normalization. Node ids are used directly as row
        # indices into `feats`.
        train_ids = np.array([n for n in G.nodes()
                              if not G.nodes[n]['val'] and not G.nodes[n]['test']])
        train_feats = feats[train_ids]
        scaler = StandardScaler()
        scaler.fit(train_feats)
        feats = scaler.transform(feats)

    return G, feats, class_map

def _sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return mask


In [135]:
from easydict import EasyDict

# Dataset location (GraphZoom layout: <dir>/<name>-G.json, -class_map.json, -feats.npy).
dataset = 'cora'
dataset_dir = f'../graphzoom/dataset/{dataset}'

# Load graph, label map and features; `with` closes the JSON files promptly.
with open(f"{dataset_dir}/{dataset}-G.json") as graph_file:
    G = json_graph.node_link_graph(json.load(graph_file))
with open(f"{dataset_dir}/{dataset}-class_map.json") as class_map_file:
    # Defined here so the cell no longer depends on a `class_map` left in the
    # kernel by an earlier (now commented-out) load_data() call.
    class_map = json.load(class_map_file)
feats = np.load(f"{dataset_dir}/{dataset}-feats.npy")

# `G.nodes[n]` replaces `G.node[n]`, which was removed in networkx 2.4.
train_ids = [n for n in G.nodes() if not G.nodes[n]['val'] and not G.nodes[n]['test']]
all_test_ids = [n for n in G.nodes() if G.nodes[n]['test']]
# The first 1000 test-flagged nodes are used for testing and the next 500
# for validation.
test_ids = all_test_ids[:1000]
val_ids = all_test_ids[1000:1500]

# NOTE(review): this relies on the class-map file listing nodes in node-id
# order, because masks and features are indexed by node id — confirm.
labels = torch.LongTensor(list(class_map.values()))
train_mask = _sample_mask(train_ids, labels.shape[0])
test_mask = _sample_mask(test_ids, labels.shape[0])
val_mask = _sample_mask(val_ids, labels.shape[0])
onehot_labels = F.one_hot(labels)

# `train_labels` was never defined in this notebook; report the split sizes
# from the id lists instead.
print(len(train_ids))
print(len(test_ids))
print(len(val_ids))

140
1000
500


In [131]:
# len(labels)
# _sample_mask(test_ids, len(labels)).nonzero()[0][:1000]

array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   33,
         34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,
         45,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,
         56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,
         67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,
         78,   79,   80,   81,   82,   83,   84,   86,   87,   88,   89,
         90,   91,   92,   93,   94,   95,   96,   97,   98,   99,  100,
        101,  102,  103,  104,  105,  106,  107,  108,  109,  110,  111,
        112,  113,  114,  115,  116,  117,  118,  119,  120,  121,  122,
        123,  124,  125,  126,  127,  128,  129,  130,  131,  132,  133,
        134,  135,  136,  137,  138,  139,  140,  141,  142,  143,  144,
        146,  147,  148,  149,  150,  151,  152,  1

In [136]:
# Bundle everything the training cell needs into one attribute-accessible
# container (e.g. data.features, data.train_mask).
data = EasyDict(
    graph=G,
    labels=labels,
    onehot_labels=onehot_labels,
    features=feats,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
    # One column per class in the one-hot encoding.
    num_classes=onehot_labels.shape[1],
)

In [83]:
from scipy.sparse import csr_matrix
def construct_proj_laplacian(laplacian, levels, proj_dir):
    """Build the chain of coarsened matrices from stored projection files.

    For each level ``k`` in ``1..levels`` the file
    ``<proj_dir>/Projection_<k>.mtx`` is loaded with ``mtx2matrix``. The
    matrix for the next level is ``P @ M @ P.T`` where ``P`` is the current
    projection and ``M`` the current matrix.

    Args:
        laplacian: Matrix of the finest level (any object supporting ``@``).
        levels: Number of levels / projection files to load.
        proj_dir: Directory holding the ``Projection_<k>.mtx`` files.

    Returns:
        ``(projections, coarse_laplacian)``: two lists of length ``levels``.
        ``coarse_laplacian[0]`` is the input matrix; the last projection is
        loaded and returned but not applied.
    """
    projections = []
    coarse_laplacian = []
    final_level = levels - 1
    for level in range(levels):
        proj = mtx2matrix("{}/Projection_{}.mtx".format(proj_dir, level + 1))
        projections.append(proj)
        coarse_laplacian.append(laplacian)
        if level < final_level:
            # Coarsen for the next iteration; skipped on the last level.
            laplacian = proj @ laplacian @ proj.transpose()
    return projections, coarse_laplacian

def mtx2matrix(proj_name):
    """Read a coordinate-format projection file into a binary CSR matrix.

    The first data line holds ``<rows> <cols> ...`` (the matrix shape); every
    later data line holds a 1-based ``<row> <col> ...`` pair. Any further
    columns on a line are ignored: each listed entry contributes the value 1
    (duplicate coordinates are summed by ``csr_matrix``).

    Blank lines and lines starting with ``%`` (MatrixMarket banner/comments)
    are skipped instead of crashing the parser.

    Args:
        proj_name: Path of the file to read.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(rows, cols)``.

    Raises:
        ValueError: If the file contains no size line.
    """
    row = []
    col = []
    shape = None
    with open(proj_name) as ff:
        for line in ff:
            info = line.split()
            if not info or info[0].startswith('%'):
                continue
            if shape is None:
                # First data line: number of reduced (coarse) and original nodes.
                shape = (int(info[0]), int(info[1]))
            else:
                # File indices are 1-based; scipy expects 0-based.
                row.append(int(info[0]) - 1)
                col.append(int(info[1]) - 1)
    if shape is None:
        raise ValueError("No size line found in {!r}".format(proj_name))
    data = [1] * len(row)
    return csr_matrix((data, (row, col)), shape=shape)
# Number of coarsening levels and the directory holding Projection_<k>.mtx.
levels = 2
reduce_results = "../graphzoom/reduction_results/Cora"
# `nx.adj_matrix` was a deprecated alias (removed in networkx 3.0);
# `adjacency_matrix` is the canonical name and returns the same matrix.
original_adj = nx.adjacency_matrix(G)
# Despite the helper's name, the matrix being coarsened here is the adjacency.
projections, coarse_adj = construct_proj_laplacian(
    original_adj, levels, reduce_results)

In [89]:
# Shape of the adjacency after applying the first projection (level 1).
coarse_adj[1].shape

(1169, 1169)

In [100]:
# Aggregate fine-level node features onto the level-1 coarse nodes
# (sparse projection times dense feature matrix).
coarse_embed = projections[0] @ data.features
# NOTE(review): the original line was cut off at `data.oneho`, which raises
# on attribute lookup. Reconstructed as the same projection applied to the
# one-hot labels (per-coarse-node class counts) — confirm this is intended.
coarse_labels = projections[0] @ data.onehot_labels.numpy()
# Preview the first coarse node's aggregated feature row.
coarse_embed[0]

array([-0.12038585, -0.08481889, -0.14797909, ..., -0.12038585,
       -0.14797909, -0.08481889])

In [124]:
def main(args):
    """Train a GCN on the notebook-level ``data`` bundle and print accuracy.

    Reads features, labels, masks, class count and graph from the module-level
    ``data`` object, optionally adds self-loops, builds a ``DGLGraph`` with a
    symmetric-normalization factor stored in ``ndata['norm']``, trains with
    Adam + cross-entropy on the training mask, prints validation accuracy
    every epoch and test accuracy at the end.

    Args:
        args: Namespace with ``gpu``, ``self_loop``, ``n_hidden``,
            ``n_layers``, ``dropout``, ``lr``, ``weight_decay`` and
            ``n_epochs``. A negative ``gpu`` selects the CPU.
    """
    # load and preprocess dataset
    features = torch.FloatTensor(data.features)
    labels = data.labels
    if hasattr(torch, 'BoolTensor'):
        mask_type = torch.BoolTensor
    else:
        # Older PyTorch has no bool tensors; without this fallback the masks
        # were simply left undefined and the next statement raised NameError.
        mask_type = torch.ByteTensor
    train_mask = mask_type(data.train_mask)
    val_mask = mask_type(data.val_mask)
    test_mask = mask_type(data.test_mask)
    in_feats = data.features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.int().sum().item(),
              val_mask.int().sum().item(),
              test_mask.int().sum().item()))

    cuda = args.gpu >= 0
    if cuda and not torch.cuda.is_available():
        # The default --gpu 0 used to crash on CPU-only machines.
        print("CUDA is not available; falling back to CPU")
        cuda = False
    if cuda:
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    g = data.graph
    # add self loop
    if args.self_loop:
        print('add self_loop')
        # Work on a copy so re-running main() does not keep mutating the
        # shared data.graph, and materialize the self-loop edges first:
        # removing edges while iterating the generator modifies the graph
        # mid-iteration.
        g = g.copy()
        g.remove_edges_from(list(nx.selfloop_edges(g)))
        g.add_edges_from([(n, n) for n in g.nodes()])
    g = DGLGraph(g)
    n_edges = g.number_of_edges()
    # normalization: deg^-1/2, with isolated nodes (inf) mapped to 0
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # create GCN model
    model = GCN(g,
                in_feats,
                args.n_hidden,
                n_classes,
                args.n_layers,
                F.relu,
                args.dropout)

    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # Per-epoch wall-clock durations; the first 3 epochs are treated as
    # warm-up and not timed.
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        acc = evaluate(model, features, labels, val_mask)
        # np.mean([]) emits "Mean of empty slice" RuntimeWarnings during the
        # warm-up epochs; report nan explicitly instead (same printed text).
        mean_dur = np.mean(dur) if dur else float('nan')
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, mean_dur, loss.item(),
                                             acc, n_edges / mean_dur / 1000))

    print()
    acc = evaluate(model, features, labels, test_mask)
    print("Test accuracy {:.2%}".format(acc))

In [137]:
if __name__ == '__main__':
    # Command-line style configuration for main(); defaults are used when the
    # cell is run inside the notebook.
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument("--dropout", type=float, default=0.5,
            help="dropout probability")
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
            help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
            help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
            help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
            help="Weight for L2 loss")
    # With default=True a lone store_true flag could never be switched off
    # (and its help text claimed default=False); pair it with an explicit
    # --no-self-loop writing to the same destination.
    parser.add_argument("--self-loop", dest="self_loop", action='store_true',
            help="add graph self-loops (default=True)")
    parser.add_argument("--no-self-loop", dest="self_loop", action='store_false',
            help="do not add graph self-loops")
    parser.set_defaults(self_loop=True)
    # Parsed for display only: main() reads the already-loaded `data` bundle.
    parser.add_argument("--dataset", default='cora')
    # parse_known_args ignores unrecognized argv entries — presumably needed
    # because the Jupyter kernel is launched with its own arguments.
    args = parser.parse_known_args()[0]
    print(args)

    main(args)

Namespace(dataset='cora', dropout=0.5, gpu=0, lr=0.01, n_epochs=200, n_hidden=16, n_layers=1, self_loop=True, weight_decay=0.0005)
----Data statistics------'
      #Edges 5278
      #Classes 7
      #Train samples 140
      #Val samples 500
      #Test samples 1000
add self_loop
Epoch 00000 | Time(s) nan | Loss 1.9537 | Accuracy 0.5240 | ETputs(KTEPS) nan
Epoch 00001 | Time(s) nan | Loss 1.8581 | Accuracy 0.6000 | ETputs(KTEPS) nan
Epoch 00002 | Time(s) nan | Loss 1.7498 | Accuracy 0.6380 | ETputs(KTEPS) nan
Epoch 00003 | Time(s) 0.0033 | Loss 1.6401 | Accuracy 0.6600 | ETputs(KTEPS) 4036.95
Epoch 00004 | Time(s) 0.0032 | Loss 1.4867 | Accuracy 0.6820 | ETputs(KTEPS) 4108.05
Epoch 00005 | Time(s) 0.0032 | Loss 1.3996 | Accuracy 0.7060 | ETputs(KTEPS) 4112.35
Epoch 00006 | Time(s) 0.0032 | Loss 1.2872 | Accuracy 0.7320 | ETputs(KTEPS) 4107.37
Epoch 00007 | Time(s) 0.0032 | Loss 1.1587 | Accuracy 0.7560 | ETputs(KTEPS) 4108.14
Epoch 00008 | Time(s) 0.0032 | Loss 1.0270 | Accuracy 0.7600 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Epoch 00023 | Time(s) 0.0033 | Loss 0.2125 | Accuracy 0.7900 | ETputs(KTEPS) 4031.52
Epoch 00024 | Time(s) 0.0033 | Loss 0.1987 | Accuracy 0.7900 | ETputs(KTEPS) 4014.21
Epoch 00025 | Time(s) 0.0033 | Loss 0.1740 | Accuracy 0.7920 | ETputs(KTEPS) 4015.74
Epoch 00026 | Time(s) 0.0033 | Loss 0.1655 | Accuracy 0.7920 | ETputs(KTEPS) 4020.49
Epoch 00027 | Time(s) 0.0033 | Loss 0.1709 | Accuracy 0.7980 | ETputs(KTEPS) 4021.01
Epoch 00028 | Time(s) 0.0033 | Loss 0.1656 | Accuracy 0.7980 | ETputs(KTEPS) 4029.34
Epoch 00029 | Time(s) 0.0033 | Loss 0.1754 | Accuracy 0.8000 | ETputs(KTEPS) 4036.90
Epoch 00030 | Time(s) 0.0033 | Loss 0.1387 | Accuracy 0.8060 | ETputs(KTEPS) 3991.49
Epoch 00031 | Time(s) 0.0033 | Loss 0.1416 | Accuracy 0.8040 | ETputs(KTEPS) 3996.66
Epoch 00032 | Time(s) 0.0033 | Loss 0.1151 | Accuracy 0.8100 | ETputs(KTEPS) 4001.66
Epoch 00033 | Time(s) 0.0033 | Loss 0.1164 | Accuracy 0.8100 | ETputs(KTEPS) 4005.93
Epoch 00034 | Time(s) 0.0033 | Loss 0.1056 | Accuracy 0.8100 | ET

## experiments
dataset | level | acc
--- | --- | ---
cora | 0 | 81.1