In [292]:
import argparse, time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import register_data_args, citegrh

# from gcn import GCN
#from gcn_mp import GCN
#from gcn_spmv import GCN
# from graphsage_utils import *

def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits, _ = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

def load_data_dgl(dataset='cora'):
    if dataset == 'cora':
        return citegrh.load_cora()
    elif dataset == 'citeseer':
        return citegrh.load_citeseer()
    elif dataset == 'pubmed':
        return citegrh.load_pubmed()    
    

In [293]:
"""GCN using DGL nn package

References:
- Semi-Supervised Classification with Graph Convolutional Networks
- Paper: https://arxiv.org/abs/1609.02907
- Code: https://github.com/tkipf/gcn
"""
import torch
import torch.nn as nn
from dgl.nn.pytorch import GraphConv

class GCN(nn.Module):
    def __init__(self,
                 g,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout, log_softmax=False):
        super(GCN, self).__init__()
        self.g = g
        self.layers = nn.ModuleList()
        # input layer
        self.layers.append(GraphConv(in_feats, n_hidden, activation=activation))
        # hidden layers
        for i in range(n_layers - 1):
            self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation))
        # output layer
        self.layers.append(GraphConv(n_hidden, n_classes))
        self.dropout = nn.Dropout(p=dropout)
        self.log_softmax = log_softmax

    def forward(self, features):
        h = features
        for i, layer in enumerate(self.layers):
            if i != 0:
                h = self.dropout(h)
            emb = h
            h = layer(self.g, h)
        if self.log_softmax:
            return nn.functional.log_softmax(h, 1), emb
        return h, emb

In [294]:
def load_data(prefix, normalize=True, load_walks=False):
    G_data = json.load(open(prefix + "-G.json"))
    G = json_graph.node_link_graph(G_data)
    if isinstance(G.nodes()[0], int):
        def conversion(n): return int(n)
    else:
        def conversion(n): return n

    if os.path.exists(prefix + "-feats.npy"):
        feats = np.load(prefix + "-feats.npy")
    else:
        print("No features present.. Only identity features will be used.")
        feats = None
    class_map = json.load(open(prefix + "-class_map.json"))
    if isinstance(list(class_map.values())[0], list):
        def lab_conversion(n): return n
    else:
        def lab_conversion(n): return int(n)

    class_map = {conversion(k): lab_conversion(v)
                 for k, v in class_map.items()}

    # Remove all nodes that do not have val/test annotations
    # (necessary because of networkx weirdness with the Reddit data)
    broken_count = 0
    for node in G.nodes():
        if not 'val' in G.node[node] or not 'test' in G.node[node]:
            G.remove_node(node)
            broken_count += 1
    print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(
        broken_count))

    # Make sure the graph has edge train_removed annotations
    # (some datasets might already have this..)
    print("Loaded data.. now preprocessing..")
    for edge in G.edges():
        if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
                G.node[edge[0]]['test'] or G.node[edge[1]]['test']):
            G[edge[0]][edge[1]]['train_removed'] = True
        else:
            G[edge[0]][edge[1]]['train_removed'] = False

    if normalize and not feats is None:
        from sklearn.preprocessing import StandardScaler
        train_ids = np.array([n for n in G.nodes(
        ) if not G.node[n]['val'] and not G.node[n]['test']])
        train_feats = feats[train_ids]
        scaler = StandardScaler()
        scaler.fit(train_feats)
        feats = scaler.transform(feats)

    return G, feats, class_map

def _sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return mask


In [305]:
from easydict import EasyDict
# train_prefix = '../graphzoom/dataset/cora/cora'
# G, features, class_map = load_data(train_prefix)
dataset = 'citeseer'
dataset_dir = f'../graphzoom/dataset/{dataset}'
G      = json_graph.node_link_graph(json.load(open(dataset_dir + "/{}-G.json".format(dataset))))
labels = json.load(open(dataset_dir + "/{}-class_map.json".format(dataset)))
feats = np.load(dataset_dir + f"/{dataset}-feats.npy")

train_ids    = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
test_ids     = [n for n in G.nodes() if G.node[n]['test']]
val_ids     = test_ids[1000:1500]
test_ids     = test_ids[:1000]
# train_labels = [labels[str(i)] for i in train_ids]
# test_labels  = [labels[str(i)] for i in test_ids]
labels = torch.LongTensor(list(labels.values()))
train_mask = _sample_mask(train_ids, labels.shape[0])
test_mask =  _sample_mask(test_ids, labels.shape[0])
val_mask =  _sample_mask(val_ids, labels.shape[0])
# val_mask = _sample_mask(range(200, 500), labels.shape[0])
onehot_labels = F.one_hot(labels)
print(len(train_labels))
print(len(test_ids))
print(len(val_ids))

140
1000
500


In [306]:
data = EasyDict({
    'graph': G,
    'labels': labels,
    'onehot_labels': onehot_labels,
    'features': feats,
    'train_mask':train_mask,
    'val_mask': val_mask,
    'test_mask': test_mask,
    'num_classes': onehot_labels.shape[1],
    'coarse': False
    
})

In [300]:
from scipy.sparse import csr_matrix
def construct_proj_laplacian(laplacian, levels, proj_dir):
    coarse_laplacian = []
    projections = []
    for i in range(levels):
        projection_name = "{}/Projection_{}.mtx".format(proj_dir, i+1)
        projection = mtx2matrix(projection_name)
        projections.append(projection)
        coarse_laplacian.append(laplacian)
        if i != (levels-1):
            laplacian = projection @ laplacian @ (projection.transpose())
    return projections, coarse_laplacian

def mtx2matrix(proj_name):
    data = []
    row = []
    col = []
    with open(proj_name) as ff:
        for i, line in enumerate(ff):
            info = line.split()
            if i == 0:
                NumReducedNodes = int(info[0])
                NumOriginNodes = int(info[1])
            else:
                row.append(int(info[0])-1)
                col.append(int(info[1])-1)
                data.append(1)
    matrix = csr_matrix((data, (row, col)), shape=(
        NumReducedNodes, NumOriginNodes))
    return matrix
levels = 2
reduce_results = f"../graphzoom/reduction_results/{dataset}"
original_adj = nx.adj_matrix(G)
projections, coarse_adj = construct_proj_laplacian(
    original_adj, levels, reduce_results)

In [269]:
from torch.nn.functional import softmax
# softmax(labels)
# level = 1
coarse_feats = projections[0] @ data.features
coarse_labels = projections[0] @ data.onehot_labels 
coarse_graph = nx.Graph(coarse_adj[1])
rows_sum = coarse_labels.sum(axis=1)[:, np.newaxis]
norm_coarse_labels = coarse_labels / rows_sum
# list(map(np.shape, [coarse_embed, coarse_labels]))

In [270]:
coarse_train_mask = _sample_mask(range(100), norm_coarse_labels.shape[0])
coarse_test_mask = _sample_mask(range(100,700), norm_coarse_labels.shape[0])
coarse_val_mask = _sample_mask(range(700,1000), norm_coarse_labels.shape[0])

In [307]:
coarse_data = EasyDict({
    'graph': coarse_graph,
    'labels': coarse_labels,
#     'onehot_labels': onehot_labels,
    'features': coarse_feats,
    'train_mask':coarse_train_mask,
    'val_mask': coarse_val_mask,
    'test_mask': coarse_test_mask,
    'num_classes': norm_coarse_labels.shape[1],
    'coarse' : True
})

In [310]:
data = coarse_data
data.val_mask.shape
data.features.shape

(1402, 3703)

In [311]:
from torch.nn.functional import log_softmax
import pdb
def main(args):
    # load and preprocess dataset
    features = torch.FloatTensor(data.features)
    if data.coarse:
        labels = torch.FloatTensor(data.labels)
        loss_fcn = torch.nn.KLDivLoss()
    else:
        labels = torch.LongTensor(data.labels)
        loss_fcn = torch.nn.CrossEntropyLoss()
#     g, features, class_map = load_data(train_prefix)
#     labels = torch.LongTensor(list(class_map.values()))
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(data.train_mask)
        val_mask = torch.BoolTensor(data.val_mask)
        test_mask = torch.BoolTensor(data.test_mask)
    in_feats = data.features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
              train_mask.int().sum().item(),
              val_mask.int().sum().item(),
              test_mask.int().sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    g = data.graph
    # add self loop
    if args.self_loop:
        print('add self_loop')
        g.remove_edges_from(nx.selfloop_edges(g))
        g.add_edges_from(zip(g.nodes(), g.nodes()))
    g = DGLGraph(g)
    n_edges = g.number_of_edges()
    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    # create GCN model
    model = GCN(g,
                in_feats,
                args.n_hidden,
                n_classes,
                args.n_layers,
                F.relu,
                args.dropout,log_softmax=data.coarse)
    print(model)

    if cuda:
        model.cuda()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()
        # forward
        logits, h = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        acc = evaluate(model, features, labels, val_mask)
#         acc=0
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}". format(epoch, np.mean(dur), loss.item(),
                                             acc, n_edges / np.mean(dur) / 1000))

    print()
    print(h.shape)
    np.save(f'{dataset}_emb_level_1', h.detach().cpu().numpy())
    acc = evaluate(model, features, labels, test_mask)
    print("Test accuracy {:.2%}".format(acc))

In [312]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='GCN')
#     register_data_args(parser)
    parser.add_argument("--dropout", type=float, default=0.5,
            help="dropout probability")
    parser.add_argument("--gpu", type=int, default=0,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
            help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=128,
            help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
            help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
            help="Weight for L2 loss")
    parser.add_argument("--self-loop", action='store_true',
            help="graph self-loop (default=False)")
#     parser.add_argument("--dataset", default='cora')
    parser.set_defaults(self_loop=True)
    parser.add_argument("--dataset", default='cora')
#     args = parser.parse_args()
#     args = parser.parse_args()[1:]
    args = parser.parse_known_args()[0]
    print(args)

    main(args)

Namespace(dataset='cora', dropout=0.5, gpu=0, lr=0.01, n_epochs=200, n_hidden=128, n_layers=1, self_loop=True, weight_decay=0.0005)
----Data statistics------'
      #Edges 3371
      #Classes 6
      #Train samples 100
      #Val samples 300
      #Test samples 600
add self_loop
GCN(
  (layers): ModuleList(
    (0): GraphConv(in=3703, out=128, normalization=True, activation=<function relu at 0x7ff0c5a1e290>)
    (1): GraphConv(in=128, out=6, normalization=True, activation=None)
  )
  (dropout): Dropout(p=0.5, inplace=False)
)


RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target'

## experiments
dataset | level | shape | acc
--- | --- | --- | ---
cora | 0 | 81.1
citeseer | 1402, 128 | 65.20
pubmed | 7903, 128 | 79.8

In [304]:
# emb_cora_l1 = np.load('../graphzoom/embed_results/cora/cora_level_2.npy')
emb_cora_l1 = np.load('pubmed_emb_level_1.npy')
emb_cora_l1.shape

(19717, 128)

In [240]:
# emb_cora_l1

array([[0.8133336 , 1.074242  , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.9198053 , ..., 1.4515474 , 0.25990957,
        0.        ],
       [0.82546675, 0.        , 0.        , ..., 0.17736198, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.5836211 , ..., 1.0919161 , 0.        ,
        0.        ],
       [0.        , 0.        , 2.675375  , ..., 1.2003655 , 0.36214033,
        0.        ],
       [0.6243051 , 0.        , 0.        , ..., 0.        , 1.064039  ,
        0.        ]], dtype=float32)

In [275]:
!pwd


/yushi/repo/GraphZoom/dgl_gcn


In [None]:
citeseer 