In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np


"""GCN using DGL nn package

References:
- Semi-Supervised Classification with Graph Convolutional Networks
- Paper: https://arxiv.org/abs/1609.02907
- Code: https://github.com/tkipf/gcn
"""
import torch
import torch.nn as nn
from dgl.nn.pytorch import GraphConv

class GCN(nn.Module):
    """Stack of DGL ``GraphConv`` layers evaluated on one fixed graph.

    The graph is stored on the module at construction time, so ``forward``
    only receives the node feature tensor.

    Args:
        g: graph handed as the first argument to every ``GraphConv`` layer.
        in_feats: input feature size of the first layer.
        n_hidden: output size of the first layer and of every hidden layer.
        n_classes: output size of the last layer.
        n_layers: the stack holds one input layer, ``n_layers - 1`` hidden
            layers and one output layer.
        activation: activation passed to every layer except the last one.
        dropout: probability of the dropout applied to the input of every
            layer except the first one.
    """

    def __init__(self, g, in_feats, n_hidden, n_classes, n_layers,
                 activation, dropout):
        super().__init__()
        self.g = g
        # (in, out) sizes of the activated layers: the input layer followed
        # by the n_layers - 1 hidden layers.
        activated_sizes = [(in_feats, n_hidden)]
        activated_sizes += [(n_hidden, n_hidden)] * (n_layers - 1)
        convs = [GraphConv(size_in, size_out, activation=activation)
                 for size_in, size_out in activated_sizes]
        # The output layer is created last and has no activation.
        convs.append(GraphConv(n_hidden, n_classes))
        self.layers = nn.ModuleList(convs)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, features):
        """Run ``features`` through every layer and return the last output."""
        remaining = iter(self.layers)
        # The first layer sees the raw features; dropout only precedes the
        # layers after it.
        h = next(remaining)(self.g, features)
        for conv in remaining:
            h = conv(self.g, self.dropout(h))
        return h

In [None]:
import numpy as np
import time
import torch
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.data import load_data
import networkx as nx
import os
import sys
thismodule = sys.modules[__name__]
from sklearn.neighbors import NearestNeighbors
class Args(object):
    """Empty attribute container; settings are attached by plain attribute assignment."""


class InnerProductDecoder(torch.nn.Module):
    """Decode node embeddings into a dense matrix of pairwise scores.

    The score of a node pair is the inner product of the two embedding rows,
    passed through ``activation``.

    Args:
        activation: callable applied element-wise to ``z @ z.T``
            (``torch.sigmoid`` by default).
        dropout: dropout probability applied to the embeddings before the
            inner product. Only active while the module is in training mode.
    """

    def __init__(self, activation=torch.sigmoid, dropout=0.1):
        super(InnerProductDecoder, self).__init__()
        self.dropout = dropout
        self.activation = activation

    def forward(self, z):
        """Return ``activation(z @ z.T)`` for a 2-D embedding matrix ``z``."""
        # F.dropout defaults to training=True, so the module's own mode has
        # to be forwarded explicitly; otherwise dropout keeps firing after
        # .eval() and inference becomes random.
        z = F.dropout(z, self.dropout, training=self.training)
        adj = self.activation(torch.mm(z, z.t()))
        return adj

# crossentropy = torch.nn.CrossEntropyLoss()
def lp_loss(logits, labels, edges):
    """Link-prediction loss: BCE on edge scores plus a penalty on the embeddings.

    Args:
        logits: 2-D tensor of node embeddings, indexed by node id.
        labels: target for each row of ``edges``, in the form accepted by
            ``F.binary_cross_entropy_with_logits``.
        edges: 2-D integer tensor; columns 0 and 1 hold the two endpoints.

    Returns:
        Scalar tensor: the BCE-with-logits of the per-edge inner products
        plus ``0.1`` times the mean squared entry of ``logits``.
    """
    heads = logits[edges[:, 0]]
    tails = logits[edges[:, 1]]
    edge_scores = (heads * tails).sum(dim=1)
    bce = F.binary_cross_entropy_with_logits(edge_scores, labels)
    penalty = 0.1 * torch.mean(logits.pow(2))
    return bce + penalty


def unsup_loss(logits, labels, edges, pos_weight):
    """Summed binary cross-entropy over node pairs scored by inner product.

    Each row of ``edges`` is scored with the inner product of its two
    endpoint embeddings. Pairs with a nonzero label contribute
    ``-log(sigmoid(score))``; the others contribute
    ``-log(1 - sigmoid(score))``.

    The log-sigmoid form is used instead of ``log(sigmoid(x))`` so that the
    loss stays finite when the sigmoid saturates to exactly 0 or 1.

    Args:
        logits: 2-D tensor of node embeddings, indexed by node id.
        labels: one entry per row of ``edges``; any nonzero entry (bool,
            uint8 or float) marks a positive pair.
        edges: 2-D integer tensor; columns 0 and 1 hold the two endpoints.
        pos_weight: accepted for interface compatibility; not used.

    Returns:
        Scalar tensor with the sum of the per-pair losses.
    """
    scores = torch.sum(logits[edges[:, 0]] * logits[edges[:, 1]], dim=1)
    # An explicit comparison yields a mask torch.where accepts whatever the
    # label dtype is.
    is_positive = labels != 0
    # 1 - sigmoid(x) == sigmoid(-x), hence the negated argument for negatives.
    per_pair = torch.where(is_positive,
                           -F.logsigmoid(scores),
                           -F.logsigmoid(-scores))
    return per_pair.sum()

def accuracy(logits, labels):
    """Return the fraction of rows whose arg-max column equals the label.

    Args:
        logits: 2-D tensor with one row of class scores per sample.
        labels: 1-D tensor of class indices; it is cast to a CPU
            ``LongTensor`` before the comparison.

    Returns:
        Python float: number of matches divided by ``len(labels)``.
    """
    predicted = torch.max(logits, dim=1)[1]
    targets = labels.type(torch.LongTensor)
    n_correct = (predicted == targets).sum().item()
    return n_correct * 1.0 / len(labels)
    
class DataWrapper(object):
    """Edge list read from a text file, re-indexed to consecutive node ids.

    The file at ``path`` is read as space-separated columns without a header;
    columns 0 and 1 are the two endpoints of each edge.

    Attributes set by the constructor:
        mapping: dict from original node id to an integer in
            ``range(number of distinct ids)``.
        labels: boolean Series with one entry per row of the file. When
            ``checkerPath`` is given, an entry is True if the string
            ``"<col0> <col1>"`` (original ids) does NOT occur in the first
            column of the checker file. Otherwise it is ``column 2 > 0.5``.
        graph: undirected ``networkx.Graph`` built from the re-indexed edges.
        features: 1-D array of zeros with one entry per graph node.
        edgelist: the two re-indexed endpoint columns.
    """

    def __init__(self, path, checkerPath=None):
        frame = pd.read_csv(path, sep=' ', header=None)

        node_ids = set(frame[0]) | set(frame[1])
        self.mapping = {node: index for index, node in enumerate(node_ids)}

        # Labels are derived from the ORIGINAL ids, so this must happen
        # before the endpoint columns are overwritten below.
        if checkerPath is None:
            self.labels = frame[2] > 0.5
        else:
            checker = pd.read_csv(checkerPath, header=None)[0].values
            pair_keys = frame[0].astype(str) + ' ' + frame[1].astype(str)
            self.labels = ~pair_keys.isin(checker)

        for endpoint in (0, 1):
            frame[endpoint] = frame[endpoint].map(self.mapping)

        self.graph = nx.from_pandas_edgelist(frame, source=0, target=1,
                                             create_using=nx.Graph)
        self.features = np.zeros(len(self.graph))
        self.edgelist = frame[[0, 1]]
        
def accuracy_lp(logits, labels):
    """Return the fraction of scores whose thresholded sigmoid equals the label.

    A score counts as a predicted 1 when ``sigmoid(score) > 0.5``. The
    predictions are converted to a CPU float tensor and compared
    element-wise with ``labels``.

    Returns:
        Python float: mean of the element-wise matches.
    """
    predicted = (torch.sigmoid(logits) > 0.5).type(torch.FloatTensor)
    matches = (predicted == labels).type(torch.FloatTensor)
    return matches.mean().item()


def evaluate(model, features, labels, edges, mask, task):
    """Score ``model`` on the samples selected by ``mask``.

    The model is switched to eval mode and left in that mode; the forward
    pass runs under ``torch.no_grad()``.

    Args:
        model: callable module; ``model(features)`` must return a 2-D tensor
            indexed by node id.
        features: input passed to ``model``.
        labels: targets, indexed by ``mask``.
        edges: 2-D integer tensor of node pairs (columns 0 and 1 are the
            endpoints); only read when ``task == 'lp'``.
        mask: index selecting the evaluated nodes (``'nc'``) or the evaluated
            rows of ``edges`` / ``labels`` (``'lp'``).
        task: ``'nc'`` scores the model output with ``accuracy``; ``'lp'``
            scores inner products of endpoint embeddings with
            ``accuracy_lp``.

    Returns:
        The accuracy as a Python float.

    Raises:
        ValueError: if ``task`` is neither ``'nc'`` nor ``'lp'``.
    """
    # Reject unknown tasks up front instead of silently returning None after
    # the model has already been switched to eval mode.
    if task not in ('nc', 'lp'):
        raise ValueError("unknown task {!r}; expected 'nc' or 'lp'".format(task))

    model.eval()
    with torch.no_grad():
        logits = model(features)
        if task == 'nc':
            return accuracy(logits[mask], labels[mask])
        preds = torch.sum(logits[edges[mask, 0]] * logits[edges[mask, 1]], dim=1)
        return accuracy_lp(preds, labels[mask])

def get_data(name):
    """Load a dataset by name.

    ``'cora'``, ``'citeseer'`` and ``'pubmed'`` are fetched through
    ``dgl.data.load_data`` using an ``Args`` object whose ``dataset``
    attribute is the name. Any other value is treated as a file path and
    wrapped in ``DataWrapper``.
    """
    if name not in ('cora', 'citeseer', 'pubmed'):
        return DataWrapper(name)
    request = Args()
    request.dataset = name
    return load_data(request)


In [None]:
# Settings copied attribute-by-attribute onto an ``Args`` object by
# ``init_gcn``. NOTE(review): 'aggregator_type', 'gpu' and 'self_loop' are not
# read by any code visible in this notebook; 'store_true' looks like a
# leftover argparse action name - confirm before relying on it.
gcnrgs = dict(
    n_hidden=64,
    n_layers=2,
    dropout=0.5,
    aggregator_type='gcn',
    gpu=1,
    epochs=300,
    lr=1e-2,
    weight_decay=5e-4,
    self_loop='store_true',
)

In [None]:

class EarlyStopping:
    """Track the best score seen so far and signal when to stop training.

    A higher score is better. Every score that is not strictly below the
    best one is stored as the new best and triggers a checkpoint; every
    strictly lower score increments a counter, and once the counter reaches
    ``patience`` the ``early_stop`` flag is raised.

    Args:
        patience: number of consecutive non-improving steps tolerated.
        exp_name: prefix of the checkpoint file name
            (``<exp_name>_es_checkpoint.pt``).
    """

    def __init__(self, patience=10, exp_name='gat'):
        self.exp_name = exp_name + '_es_checkpoint.pt'
        self.patience = patience
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def step(self, acc, model):
        """Record one score and return the current ``early_stop`` flag."""
        first_call = self.best_score is None

        if not first_call and acc < self.best_score:
            # No improvement: count it and check the patience budget.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return self.early_stop

        # First score, or a score at least as good as the best one.
        self.best_score = acc
        self.save_checkpoint(model)
        if not first_call:
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Write ``model.state_dict()`` to the checkpoint file ``self.exp_name``."""
        torch.save(model.state_dict(), self.exp_name)

In [None]:
def init_gcn(g, num_feats, n_classes):
    """Build a ``GCN`` configured from the module-level ``gcnrgs`` settings.

    Args:
        g: graph the model operates on.
        num_feats: input feature size.
        n_classes: output size of the last layer.

    Returns:
        ``(model, args)`` where ``args`` is an ``Args`` object carrying every
        ``gcnrgs`` entry as an attribute.
    """
    args = Args()
    # Copy every setting onto the namespace object in one go.
    vars(args).update(gcnrgs)
    model = GCN(
        g,
        num_feats,
        args.n_hidden,
        n_classes,
        args.n_layers,
        F.relu,
        args.dropout,
    )
    return model, args

def init_model(g, m, num_feats, num_classes):
    """Dispatch to the module-level factory named ``init_<m>``.

    The factory is looked up on ``thismodule`` and called with
    ``(g, num_feats, num_classes)``; its return value is passed through.
    """
    factory = getattr(thismodule, f'init_{m}')
    return factory(g, num_feats, num_classes)

In [None]:
def create_lp_dataset(G, seed=0):
    """Build a labelled link-prediction sample set from graph ``G``.

    Positive samples are the edges of ``G`` (label 1); negative samples come
    from ``negative_sampling`` (label 0). The rows are then split by one
    random permutation.

    Args:
        G: networkx graph.
        seed: value passed to ``np.random.seed`` (this reseeds NumPy's
            global generator).

    Returns:
        ``(edgelist, labels, train_mask, val_mask, test_mask)``. ``edgelist``
        holds the two endpoint columns and ``labels`` the third column of the
        stacked sample array. With ``N`` rows in total, the masks are index
        arrays taken from the permutation: the first ``N // 3 + 1`` entries
        for training, the entries up to position ``N // 2 + 1`` for
        validation, and the rest for testing.
    """
    np.random.seed(seed)

    positives = np.array(G.edges)
    positives = np.hstack([positives, np.ones((positives.shape[0], 1))])
    # Negatives are drawn after seeding and before the split permutation, so
    # both draws come from the same seeded stream in this order.
    negatives = negative_sampling(G)
    samples = np.vstack([positives, negatives])

    edgelist, labels = samples[:, :2], samples[:, 2]

    order = np.random.permutation(edgelist.shape[0])
    train_end = order.shape[0] // 3 + 1
    val_end = order.shape[0] // 2 + 1
    train_mask = order[:train_end]
    val_mask = order[train_end:val_end]
    test_mask = order[val_end:]
    return edgelist, labels, train_mask, val_mask, test_mask

def negative_sampling(G):
    """Sample non-edges of ``G`` among nodes with similar adjacency rows.

    A cosine-metric ``NearestNeighbors`` index is fitted on the dense
    adjacency matrix. For every row, each returned neighbour that is not
    connected to that row's index by an edge becomes a candidate negative
    pair. A random subset of at most ``G.number_of_edges()`` candidates is
    returned.

    NOTE(review): row indices are passed straight to ``G.has_edge``, which
    presumably assumes the nodes are labelled 0..n-1 in adjacency order -
    confirm against the caller.

    Returns:
        Float array with three columns: the two endpoints and a constant 0
        label.
    """
    adjacency = nx.to_numpy_array(G)
    knn = NearestNeighbors(metric='cosine')
    knn.fit(adjacency)
    _, neighbours = knn.kneighbors()

    candidates = np.array([
        [node, other]
        for node, row in enumerate(neighbours)
        for other in row
        if not G.has_edge(node, other)
    ])

    shuffled = np.random.permutation(candidates.shape[0])
    chosen = candidates[shuffled[:G.number_of_edges()]]
    zero_labels = np.zeros((chosen.shape[0], 1))
    return np.concatenate([chosen, zero_labels], 1)

In [None]:
# Experiment driver: train GCN node embeddings with a dense adjacency
# reconstruction loss, score them on a link-prediction sample set, and write
# the embeddings to CSV. Results are collected in res[dataset][task][model].
res = {}
# NOTE(review): 'new_results' is created here, but nothing in this cell
# writes into it.
if not os.path.exists('new_results'):
    os.mkdir('new_results')
for d in ['hse']:
    print(d)
    data = DataWrapper("../HSE Data Extracting/Data/edgelist_train.txt", "../HSE Data Extracting/Data/edgelist__2015nw.txt")
    # Every node gets the same single constant feature (a column of ones).
    features = torch.FloatTensor(np.ones((data.features.shape[0], 1)))
    num_feats = features.shape[1]
    n_edges = data.graph.number_of_edges()
    g = data.graph
    # Remove self-loops in place (g is data.graph, so data.graph is modified
    # too) before converting to a DGL graph.
    # NOTE(review): newer networkx releases expose this as
    # nx.selfloop_edges(g) instead of a Graph method - confirm the pinned
    # version.
    g.remove_edges_from(g.selfloop_edges())
    g = DGLGraph(g)
    # From here on n_edges is the edge count reported by the DGL graph.
    n_edges = g.number_of_edges()
    res[d] = {}
    task = 'lp'
    res[d][task] = {}
    
    # Per-node degree^-0.5 with infinities (degree 0) replaced by 0, stored
    # as node data under 'norm'. It is not read again in this cell.
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    g.ndata['norm'] = norm.unsqueeze(1)
    # Dense adjacency matrix: the reconstruction target of the training loss.
    adj = g.adjacency_matrix().to_dense()
    # Ratio of zero entries to nonzero entries in adj, used to up-weight the
    # positive entries in the BCE loss.
    pos_weight = torch.Tensor([float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
    
    # Labelled positive/negative node pairs, used only by evaluate() below.
    edgelist, labels, train_mask, val_mask, test_mask = create_lp_dataset(data.graph)
    
    edgelist = torch.LongTensor(edgelist)
    labels = torch.ByteTensor(labels)
    train_mask = torch.LongTensor(train_mask)
    val_mask = torch.LongTensor(val_mask)
    test_mask = torch.LongTensor(test_mask)
    # Passed to init_model as the output size of the GCN, i.e. the width of
    # the learned embeddings.
    n_classes = 64
    # NOTE(review): loss_fcn is assigned but never called in this cell; the
    # training loop uses F.binary_cross_entropy_with_logits directly.
    loss_fcn = unsup_loss
    
    for m in ['gcn']:
        print(m)
        model, args = init_model(g, m, num_feats, n_classes)
        stopper = EarlyStopping(patience=100, exp_name=task + '_' + m)
        # Identity activation: the decoder returns raw inner products, which
        # is what the BCE-with-logits loss expects.
        decoder = InnerProductDecoder(activation=lambda x: x)
        # InnerProductDecoder defines no parameters, so only the GCN weights
        # are actually optimised.
        optimizer = torch.optim.Adam(list(model.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay)
        dur = []
        for epoch in range(args.epochs):
            model.train()
            # The first three epochs are excluded from the timing statistics.
            if epoch >= 3:
                t0 = time.time()
            # forward
            logits = model(features)
            
            preds = decoder(logits)
            
            # Reconstruct the full dense adjacency matrix from the embeddings.
            loss = F.binary_cross_entropy_with_logits(preds, adj, pos_weight=pos_weight)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if epoch >= 3:
                dur.append(time.time() - t0)

            # evaluate() switches the model to eval mode; model.train() at the
            # top of the loop switches it back.
            train_acc = evaluate(model, features, labels, edgelist, train_mask,  task)

            val_acc = evaluate(model, features, labels, edgelist, val_mask, task)
            # Early stopping tracks the negated TRAINING loss (higher is
            # better); val_acc is only reported, not used for stopping.
            if stopper.step(-loss.item(), model):   
                break

            # At epoch 0 dur is still empty, so the time and throughput
            # fields are computed from the mean of an empty list.
            if (epoch % 10) == 0:
                print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | TrainAcc {:.4f} |"
                      " ValAcc {:.4f} | ETputs(KTEPS) {:.2f}".
                      format(epoch, np.mean(dur), loss.item(), train_acc,
                             val_acc, n_edges / np.mean(dur) / 1000))

        print()
        # Restore the weights saved by the early-stopping checkpoint, then
        # score the held-out test pairs.
        model.load_state_dict(torch.load(task + '_' + m + '_es_checkpoint.pt'))
        acc = evaluate(model, features, labels, edgelist, test_mask, task)
        res[d][task][m] = acc
        # The model is still in eval mode here because evaluate() left it so.
        embeddings = model(features).data.numpy()
        pd.DataFrame(embeddings).to_csv(f'{m}_{d}_wo_feats.csv', index=False)
        print("Test Accuracy {:.4f}".format(acc))

In [None]:
# Fraction of rows in the loaded edge file whose label is True.
data.labels.mean()

In [None]:
import pickle

# Invert the id mapping (consecutive index -> original node id) and pair each
# original id, in index order, with the matching row of ``embeddings``.
invmap = dict((index, node) for node, index in data.mapping.items())
emb_dict_default = {
    node: vector
    for node, vector in zip([invmap[k] for k in range(len(invmap))], embeddings)
}
# Persist the {original id: embedding row} dictionary.
with open('emb_dict_train_gcn_unsup.pkl', 'wb') as f:
    pickle.dump(emb_dict_default, f)