In [1]:
import networkx as nx
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.utils import to_networkx
from copy import deepcopy
from scipy.sparse import coo_matrix
from sklearn.metrics import roc_auc_score
import itertools
import dgl
from dgl.nn import SAGEConv
import dgl.function as fn

In [2]:
import sys
sys.path.append('../src')
sys.path.append('..')

import src.synthetic as synthetic
import src.transform as transform

In [3]:
def create_train_test_split_edge(data):
    # Create a list of positive and negative edges
    u, v = data.edges()
    u, v = u.numpy(), v.numpy()
    edge_index = np.array((u, v))
    adj = coo_matrix((np.ones(data.num_edges()), edge_index))
    adj_neg = 1 - adj.todense() - np.eye(data.num_nodes())
    neg_u, neg_v = np.where(adj_neg != 0)

    # Create train/test edge split
    test_size = int(np.floor(data.num_edges() * 0.1))
    eids = np.random.permutation(np.arange(data.num_edges())) # Create an array of 'edge IDs'

    train_pos_u, train_pos_v = edge_index[:, eids[test_size:]]
    test_pos_u, test_pos_v   = edge_index[:, eids[:test_size]]

    # Sample an equal amount of negative edges from  the graph, split into train/test
    neg_eids = np.random.choice(len(neg_u), data.num_edges())
    test_neg_u, test_neg_v = (
        neg_u[neg_eids[:test_size]],
        neg_v[neg_eids[:test_size]],
    )
    train_neg_u, train_neg_v = (
        neg_u[neg_eids[test_size:]],
        neg_v[neg_eids[test_size:]],
    )

    # Remove test edges from original graph
    train_g = deepcopy(data)
    train_g.remove_edges(eids[:test_size]) # Remove positive edges from the testing set from the network

    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=data.num_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=data.num_nodes())

    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=data.num_nodes())
    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=data.num_nodes())

    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    )
    return F.binary_cross_entropy_with_logits(scores, labels)


def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]
    ).numpy()
    return roc_auc_score(labels, scores)

In [4]:
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    

class DotPredictor(torch.nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata["h"] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v("h", "h", "score"))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata["score"][:, 0]

In [16]:
def engineer_features(G):
    # TODO Work on getting this to be more feature agnostic - i.e. take the join of all this stuff and null if not present
    # Also need a stored one-hot 

    # Change type to two features, is_student, and is_org
    G_eng = deepcopy(G)
    _type = np.asarray(list(nx.get_node_attributes(G_eng, 'type').items()))
    is_student = np.asarray(_type[:,1] == 'student', dtype='double')
    # commitment_limit = list(nx.get_node_attributes(G, 'commitment_limit').values())

    X = np.column_stack([is_student, 1-is_student])
    nx.set_node_attributes(G_eng, dict(zip(_type[:,0], X)), 'X')

    # TODO Add major in as one-hot

    # TODO Add Year in as one-hot


    return G_eng

In [17]:
G = synthetic.synthesize_graph()


In [18]:
G_eng = engineer_features(G)

In [27]:
G = dgl.from_networkx(G_eng, node_attrs=['X']) # TODO Investigate the slowness here

In [28]:
G

Graph(num_nodes=315, num_edges=1016,
      ndata_schemes={'X': Scheme(shape=(2,), dtype=torch.float64)}
      edata_schemes={})

In [29]:
train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = create_train_test_split_edge(G)

model = GraphSAGE(train_g.ndata["X"].shape[1], 32)
pred = DotPredictor()
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()), lr=0.01
)

In [21]:
# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(1001):
    # forward
    h = model(train_g, train_g.ndata["X"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))

    # ----------- 5. check results ------------------------ #
    if e % 100 == 0:
        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            print("AUC", compute_auc(pos_score, neg_score))

RuntimeError: expected scalar type Double but found Float

In [30]:
train_g

Graph(num_nodes=315, num_edges=915,
      ndata_schemes={'X': Scheme(shape=(2,), dtype=torch.float64)}
      edata_schemes={})