# Import Package

In [1]:
import random
from decimal import *
import numpy as np
import collections
from tqdm import tqdm
from vose_sampler import VoseAlias
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn
import torch.optim
from torch_geometric.datasets import Planetoid

# Load Data

In [2]:
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]

In [3]:
data.is_undirected()

True

# Define hyperparameter

In [4]:
power = 0.75
batchsize=5
epochs=100
negativepower = 0.75
negsamplesize = 5

# Data Preprocessing

In [5]:
df = pd.DataFrame(data.edge_index.T.numpy())
weight = torch.randint(1,100,( data.edge_index.shape[1] , ) )
df['weight'] = weight

In [6]:
edgedistdict = collections.defaultdict(int)
nodedistdict = collections.defaultdict(int)

weightsdict = collections.defaultdict(int)
nodedegrees = collections.defaultdict(int)

weightsum = 0
negprobsum = 0

nlines = 0
maxindex = 0

In [7]:
for row in df.values:
    node1 = row[0]
    node2 = row[1]
    weight = row[2]
    
    edgedistdict[tuple([node1, node2])] = weight # edge의 weight기록
    nodedistdict[node1] += weight # node에서 나가는 weight를 더해준다 -> directed
    
    weightsdict[tuple([node1, node2])] = weight # edge의 weight 기록
    nodedegrees[node1] += weight # node에서 나가는 weight를 더해준다 -> directed
    
    weightsum += weight # W
    negprobsum += np.power(weight, power) # negative sampling시 3/4 term
    
    # maxindex기록(왜 필요?)
    if node1 > maxindex:
        maxindex = node1
    elif node2 > maxindex:
        maxindex = node2

In [8]:
for node, outdegree in nodedistdict.items(): # negative sampling term으로 변환
    nodedistdict[node] = np.power(outdegree, power) / negprobsum 
    
for edge, weight in edgedistdict.items(): # wij/W로 변환
    edgedistdict[edge] = weight / weightsum

In [9]:
# VoseAlias Sampling
edgesampler = VoseAlias(edgedistdict)
nodesampler = VoseAlias(nodedistdict)

In [10]:
batchrange = int(len(edgedistdict)/batchsize)

In [11]:
def negSampleBatch(sourcenode, targetnode, negsamplesize, nodedegrees, nodesampler, t=10e-3):
    """
    For generating negative samples.
    """
    negsamples = 0
    while negsamples < negsamplesize:
        samplednode = nodesampler.sample_n(1)
        if (samplednode == sourcenode) or (samplednode == targetnode):
            continue
        else:
            negsamples += 1
            yield samplednode

In [12]:
def makeData(samplededges, negsamplesize, nodedegrees, nodesampler):
    for e in samplededges:
        sourcenode, targetnode = e[0], e[1]
        negnodes = []
        for negsample in negSampleBatch(sourcenode, targetnode, negsamplesize, nodedegrees, nodesampler, t=10e-3):
            for node in negsample:
                negnodes.append(node)
        yield [e[0], e[1]] + negnodes
# 0, 1에는 source, target이고 2~ 는 negative sample

# Model

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [14]:
class Line(nn.Module):
    def __init__(self, size, embed_dim=128, order=2):
        super(Line, self).__init__()

        assert order in [1, 2], print("Order should either be int(1) or int(2)")

        self.embed_dim = embed_dim
        self.order = order
        self.nodes_embeddings = nn.Embedding(size, embed_dim)

        if order == 2:
            self.contextnodes_embeddings = nn.Embedding(size, embed_dim)
            # Initialization
            self.contextnodes_embeddings.weight.data = self.contextnodes_embeddings.weight.data.uniform_(
                -.5, .5) / embed_dim

        # Initialization
        self.nodes_embeddings.weight.data = self.nodes_embeddings.weight.data.uniform_(
            -.5, .5) / embed_dim

    def forward(self, v_i, v_j, negsamples):

        v_i = self.nodes_embeddings(v_i)

        if self.order == 2:
            v_j = self.contextnodes_embeddings(v_j)
            negativenodes = -self.contextnodes_embeddings(negsamples)

        else:
            v_j = self.nodes_embeddings(v_j)
            negativenodes = -self.nodes_embeddings(negsamples)

        mulpositivebatch = torch.mul(v_i, v_j)
        positivebatch = F.logsigmoid(torch.sum(mulpositivebatch, dim=1))

        mulnegativebatch = torch.mul(v_i.view(len(v_i), 1, self.embed_dim), negativenodes)
        negativebatch = torch.sum(
            F.logsigmoid(
                torch.sum(mulnegativebatch, dim=2)
            ),
            dim=1)
        loss = positivebatch + negativebatch
        return -torch.mean(loss)

In [15]:
model = Line(size=maxindex+1)
model.to(device)

Line(
  (nodes_embeddings): Embedding(2708, 128)
  (contextnodes_embeddings): Embedding(2708, 128)
)

In [16]:
opt = torch.optim.Adam(model.parameters(), lr=0.001)

In [17]:
lossdata = {"it": [], "loss": []}
it = 0

# Learing

In [18]:
for epoch in range(epochs):
    total_loss = 0
    for b in range(batchrange): # 왜 이만큼 반복해야하지?
        
        # forward
        samplededges = edgesampler.sample_n(batchsize) # batchsize만큼 edge를 sampling
        batch = list(makeData(samplededges, negsamplesize, nodedegrees, nodesampler))
        batch = torch.LongTensor(batch)
        v_i = batch[:, 0]
        v_j = batch[:, 1]
        negsamples = batch[:, 2:]
        
        # initialize
        model.zero_grad()
        
        # loss
        loss = model(v_i.to(device), v_j.to(device), negsamples.to(device))
        
        # backward
        loss.backward()
        
        # update
        opt.step()

        # writing
        lossdata["loss"].append(loss.item())
        lossdata["it"].append(it)
        it += 1
        
        total_loss += loss
        
    train_loss = total_loss / batchrange
    if (epoch+1) % 10 == 0:
        print("epoch:{}, loss:{}".format(epoch+1, train_loss))

epoch:10, loss:0.37782058119773865
epoch:20, loss:0.19048920273780823
epoch:30, loss:0.17091511189937592
epoch:40, loss:0.15600639581680298
epoch:50, loss:0.16630582511425018
epoch:60, loss:0.16645987331867218
epoch:70, loss:0.15748505294322968
epoch:80, loss:0.1596524566411972
epoch:90, loss:0.17914359271526337
epoch:100, loss:0.1658223420381546
