<a href="https://colab.research.google.com/github/GeorgeM2000/DMTE/blob/master/code/DMTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries & Tools***

In [1]:
import argparse
import os
import random
import numpy as np
import tensorflow as tf
from math import pow
from datetime import datetime
from pathlib import Path
import shutil
from tqdm import tqdm
import zipfile
from collections import defaultdict
from sklearn.preprocessing import normalize

In [None]:
!pwd

# ***Global Variables***

In [2]:
MAX_LEN=300
neg_table_size=1000000
NEG_SAMPLE_POWER=0.75
batch_size=64
num_epoch=200 # Default: 200
embed_size=200
word_embed_size=200
lr=1e-3

In [None]:
datasetName = "cora"
dataTextFile = "data.txt"
ratio = 0.15

# Original parameters 
alpha = 0.2
beta = 0.05

# Additional parameters 
gamma = 0.0
delta = 0.0

Count the words of the smallest and largest abstract

In [None]:
max_word_count = 0
min_word_count = float('inf')

with open(f'/content/datasets/{datasetName}/{dataTextFile}', 'r') as file:
    for line in file:
        word_count = len(line.split())

        if word_count > max_word_count:
            max_word_count = word_count

        if word_count < min_word_count:
            min_word_count = word_count

print("Max word count:", max_word_count)
print("Min word count:", min_word_count)


MAX_LEN = max_word_count + 1

Max word count: 473
Min word count: 10


Run the cell below if the data file with the astracts is too large and needs extracting 

In [None]:
zip_file_path = '/content/data.zip'
extract_to = f'/content/datasets/{datasetName}'

# Open and extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Extraction complete!")

Extraction complete!


In [4]:
def sub_Mat(P, node):

    sub_P = np.zeros((len(node),len(node)))
    for i in range(len(node)):
        for j in range(len(node)):
            sub_P[i,j] = P[node[i],node[j]]

    return sub_P

# ***DataSet***

In [None]:
class dataSet:
    def __init__(self, text_path, graph_path, labels_path=None):

        #text_file, graph_file, labels_file = self.load(text_path, graph_path, labels_path)
        text_file, graph_file = self.load(text_path, graph_path)

        self.edges = self.load_edges(graph_file)

        #self.labels = self.load_labels(labels_file)

        self.text, self.num_vocab, self.num_nodes = self.load_text(text_file)

        self.nodes = range(0, self.num_nodes)

        self.negative_table = InitNegTable(self.edges)

        self.P = self.P_matrix(self.edges, self.num_nodes)

    """
    def load(self, text_path, graph_path, labels_path):
        text_file = open(text_path, 'rb').readlines()
        graph_file = open(graph_path, 'rb').readlines()
        labels_file = open(labels_path, 'rb').readlines()

        return text_file, graph_file, label_file
    """

    def load(self, text_path, graph_path):
        text_file = open(text_path, 'rb').readlines()
        graph_file = open(graph_path, 'rb').readlines()

        return text_file, graph_file

    def load_edges(self, graph_file):
        edges = []
        for i in graph_file:
            if np.random.uniform(0.0, 1.0) <= ratio:
                edges.append(list(map(int, i.strip().decode().split('\t'))))

        return edges

    def load_labels(self, labels_file):
      labels = []
      for i in labels_file:
        labels.append(list(map(int, i.strip().decode().split(','))))

      return labels

    def adj_list(self, edges):
      node1, node2 = zip(*edges)

      # Create adjacency list
      adj_list = defaultdict(list)
      for n1, n2 in edges:
          adj_list[n1].append(n2)
          adj_list[n2].append(n1)

      return dict(adj_list)

    def load_text(self, text_file):
        """
        Adapting with adapt(text_data):

        vectorize_layer.adapt(text_data) analyzes text_data, builds a vocabulary, and assigns a unique integer ID to each word based on its frequency (most frequent words get lower IDs).
        Transforming with vectorize_layer(text_data):

        This maps each word in text_data to its corresponding integer token ID, producing a 2D array where each row represents a sequence of token IDs for a given input line, padded or truncated to max_len.
        """
        vectorize_layer = tf.keras.layers.TextVectorization(
            max_tokens=None,  # Set a limit if needed
            output_mode='int',
            output_sequence_length=MAX_LEN
        )

        text_data = [line.strip() for line in text_file]

        vectorize_layer.adapt(text_data)

        text = vectorize_layer(text_data).numpy()

        num_vocab = len(vectorize_layer.get_vocabulary())
        print(f'Vocabulary: {num_vocab}')
        num_nodes = len(text)

        return text, num_vocab, num_nodes

    def negative_sample(self, edges):
        # edges is the sample_edges in self.generate_batches()
        node1, node2 = zip(*edges)
        sample_edges = []

        # The negative table contains edges that don not exist
        func = lambda: self.negative_table[random.randint(0, neg_table_size - 1)] # Pick a random node from the negative table

        # For each edge...
        for i in range(len(edges)):
            neg_node = func() # Pick a negative node

            # If the negative node is identical to the first and second node in the current edge...
            while node1[i] == neg_node or node2[i] == neg_node:
                neg_node = func() # Pick another negative node until the neg node is different than the first and second node in the current edge

            # Create a new type of edge that has an additional node, the negative node
            sample_edges.append([node1[i], node2[i], neg_node])

        return sample_edges

    def generate_batches(self, mode=None):

        num_batch = len(self.edges) // batch_size
        edges = self.edges
        if mode == 'add':
            num_batch += 1
            edges.extend(edges[:(batch_size - len(self.edges) % batch_size)])
        if mode != 'add':
            random.shuffle(edges)
        sample_edges = edges[:num_batch * batch_size]

        # For each edge in "sample_edges", add a negative edge
        sample_edges = self.negative_sample(sample_edges)

        # Create batches of edges

        """
        The first batch range is 0 -- batch_size - 1
        The second batch range is batch_size -- 2 * batch_size - 1
        The third batch range is 2 * batch_size -- 3* batch_size - 1 and so on
        """
        batches = []
        for i in range(num_batch):
            batches.append(sample_edges[i * batch_size:(i + 1) * batch_size])

        return batches

    def nodes_batches(self, mode=None):

        num_batch = len(self.nodes) // batch_size
        nodes = self.nodes
        if mode == 'add':
            num_batch += 1
            nodes.extend(nodes[:(batch_size - len(self.nodes) % batch_size)])
            random.shuffle(nodes)
        if mode != 'add':
            random.shuffle(nodes)
        sample_nodes = nodes[:num_batch * batch_size]

        batches = []
        for i in range(num_batch):
            batches.append(sample_nodes[i * batch_size:(i + 1) * batch_size])
        return batches

    def P_matrix(self, edges, num_nodes):
        # Take all the edges
        a_list,b_list=zip(*edges)
        a_list=list(a_list) # This list contains the first nodes in all edges
        b_list=list(b_list) # This list contains the second nodes in all edges

        P = np.zeros((num_nodes,num_nodes))

        for i in range(len(a_list)):
            P[a_list[i],b_list[i]]=1 # The prob of transitioning from "a_list[i]" to "b_list[i]". Initially it's 1 for unweighted graphs
            P[b_list[i],a_list[i]]=1 # The prob of transitioning from "b_list[i]" to "a_list[i]". Initially it's 1 for unweighted graphs

        P = normalize(P, axis=1, norm='l1') # We normalize P

        return P

# ***DMTE***

In [75]:
class Model:
    def __init__(self, vocab_size, num_nodes, alpha, beta, num_labels=None):
        # '''hyperparameter'''
        with tf.name_scope('read_inputs') as scope:
            self.Text_a = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Ta')
            self.Text_b = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tb')
            self.Text_neg = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tneg')
            self.Node_a = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n1')
            self.Node_b = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n2')
            self.Node_neg = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n3')
            self.P_a = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pa')
            self.P_b = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pb')
            self.P_neg = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pneg')
            #self.labels = tf.compat.v1.placeholder(tf.int32, [batch_size, num_labels], name='L')

        with tf.name_scope('initialize_embedding') as scope:
            self.text_embed = tf.Variable(tf.random.truncated_normal([vocab_size, word_embed_size], stddev=0.3))
            self.node_embed = tf.Variable(tf.random.truncated_normal([num_nodes, embed_size // 2], stddev=0.3))
            self.node_embed = tf.clip_by_norm(self.node_embed, clip_norm=1, axes=1)

        with tf.name_scope('lookup_embeddings') as scope:
            self.TA = tf.nn.embedding_lookup(self.text_embed, self.Text_a)
            self.T_A = tf.expand_dims(self.TA, -1)

            self.TB = tf.nn.embedding_lookup(self.text_embed, self.Text_b)
            self.T_B = tf.expand_dims(self.TB, -1)

            self.TNEG = tf.nn.embedding_lookup(self.text_embed, self.Text_neg)
            self.T_NEG = tf.expand_dims(self.TNEG, -1)

            self.N_A = tf.nn.embedding_lookup(self.node_embed, self.Node_a)
            self.N_B = tf.nn.embedding_lookup(self.node_embed, self.Node_b)
            self.N_NEG = tf.nn.embedding_lookup(self.node_embed, self.Node_neg)

        self.alpha = alpha
        self.beta = beta
        self.convA, self.convB, self.convNeg = self.conv()
        self.loss = self.compute_loss()

    def conv(self):

        W0 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        W1 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        W2 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        #W3 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        #W4 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))

        mA = tf.reduce_mean(self.T_A, axis=1, keepdims=True)
        mB = tf.reduce_mean(self.T_B, axis=1, keepdims=True)
        mNEG = tf.reduce_mean(self.T_NEG, axis=1, keepdims=True)

        convA = tf.tanh(tf.squeeze(mA))
        convB = tf.tanh(tf.squeeze(mB))
        convNEG = tf.tanh(tf.squeeze(mNEG))

        attA = (tf.matmul(convA, W0) +
                self.alpha * tf.matmul(tf.matmul(self.P_a, convA), W1) +
                self.beta * tf.matmul(tf.matmul(tf.math.square(self.P_a), convA), W2))
                #gamma * tf.matmul(tf.matmul(tf.math.pow(self.P_a, 3), convA), W3) +
                #delta * tf.matmul(tf.matmul(tf.math.pow(self.P_a, 4), convA), W4))

        attB = (tf.matmul(convB, W0) +
                self.alpha * tf.matmul(tf.matmul(self.P_b, convB), W1) +
                self.beta * tf.matmul(tf.matmul(tf.math.square(self.P_b), convB), W2))
                #gamma * tf.matmul(tf.matmul(tf.math.pow(self.P_b, 3), convB), W3) +
                #delta * tf.matmul(tf.matmul(tf.math.pow(self.P_b, 4), convB), W4))


        attNEG = (tf.matmul(convNEG, W0) +
                  self.alpha * tf.matmul(tf.matmul(self.P_a, convNEG), W1) +
                  self.beta * tf.matmul(tf.matmul(tf.math.square(self.P_a), convNEG), W2))
                  #gamma * tf.matmul(tf.matmul(tf.math.pow(self.P_a, 3), convNEG), W3) +
                  #delta * tf.matmul(tf.matmul(tf.math.pow(self.P_a, 4), convNEG), W4))

        return attA, attB, attNEG

    def compute_loss(self):

        # Loss functions for:


        # Text-Text
        p1 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.convA, self.convB), 1)) + 0.001)

        p2 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.convA, self.convNeg), 1)) + 0.001)

        p11 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.convB, self.convA), 1)) + 0.001)

        p12 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.convB, self.convNeg), 1)) + 0.001)



        # Node-Node
        p3 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.N_A +
                                                                 self.alpha * tf.matmul(self.P_a, self.N_A) +
                                                                 self.beta * tf.matmul(tf.math.square(self.P_a), self.N_A), self.N_B), 1)) + 0.001)
                                                                 #gamma * tf.matmul(tf.math.pow(self.P_a, 3), self.N_A) +
                                                                 #delta * tf.matmul(tf.math.pow(self.P_a, 4), self.N_A), self.N_B), 1)) + 0.001)

        p4 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.N_A +
                                                                  self.alpha * tf.matmul(self.P_a, self.N_A) +
                                                                  self.beta * tf.matmul(tf.math.square(self.P_a), self.N_A), self.N_NEG), 1)) + 0.001)
                                                                  #gamma * tf.matmul(tf.math.pow(self.P_a, 3), self.N_A) +
                                                                  #delta * tf.matmul(tf.math.pow(self.P_a, 4), self.N_A), self.N_NEG), 1)) + 0.001)

        p13 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.N_B +
                                                                  self.alpha * tf.matmul(self.P_b, self.N_B) +
                                                                  self.beta * tf.matmul(tf.math.square(self.P_b), self.N_B), self.N_A), 1)) + 0.001)
                                                                  #gamma * tf.matmul(tf.math.pow(self.P_b, 3), self.N_B) +
                                                                  #delta * tf.matmul(tf.math.pow(self.P_b, 4), self.N_B), self.N_A), 1)) + 0.001)

        p14 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.N_B +
                                                                   self.alpha * tf.matmul(self.P_b, self.N_B) +
                                                                   self.beta * tf.matmul(tf.math.square(self.P_b), self.N_B), self.N_NEG), 1)) + 0.001)
                                                                   #gamma * tf.matmul(tf.math.pow(self.P_b, 3), self.N_B) +
                                                                   #delta * tf.matmul(tf.math.pow(self.P_b, 4), self.N_B), self.N_NEG), 1)) + 0.001)




        # Node-Text
        p5 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.N_A +
                                                                 self.alpha * tf.matmul(self.P_a, self.N_A) +
                                                                 self.beta * tf.matmul(tf.math.square(self.P_a), self.N_A), self.convB), 1)) + 0.001)
                                                                 #gamma * tf.matmul(tf.math.pow(self.P_a, 3), self.N_A) +
                                                                 #delta * tf.matmul(tf.math.pow(self.P_a, 4), self.N_A), self.convB), 1)) + 0.001)

        p6 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.N_A +
                                                                  self.alpha * tf.matmul(self.P_a, self.N_A) +
                                                                  self.beta * tf.matmul(tf.math.square(self.P_a), self.N_A), self.convNeg), 1)) + 0.001)
                                                                  #gamma * tf.matmul(tf.math.pow(self.P_a, 3), self.N_A) +
                                                                  #delta * tf.matmul(tf.math.pow(self.P_a, 4), self.N_A), self.convNeg), 1)) + 0.001)

        p15 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.N_B +
                                                                  self.alpha * tf.matmul(self.P_b, self.N_B) +
                                                                  self.beta * tf.matmul(tf.math.square(self.P_b), self.N_B), self.convA), 1)) + 0.001)
                                                                  #gamma * tf.matmul(tf.math.pow(self.P_b, 3), self.N_B) +
                                                                  #delta * tf.matmul(tf.math.pow(self.P_b, 4), self.N_B), self.convA), 1)) + 0.001)

        p16 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.N_B +
                                                                   self.alpha * tf.matmul(self.P_b, self.N_B) +
                                                                   self.beta * tf.matmul(tf.math.square(self.P_b), self.N_B), self.convNeg), 1)) + 0.001)
                                                                   #gamma * tf.matmul(tf.math.pow(self.P_b, 3), self.N_B) +
                                                                   #delta * tf.matmul(tf.math.pow(self.P_b, 4), self.N_B), self.convNeg), 1)) + 0.001)



        # Text-Node
        p7 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.convA, self.N_B), 1)) + 0.001)

        p8 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.convA, self.N_NEG), 1)) + 0.001)

        p17 = tf.math.log(tf.nn.sigmoid(tf.reduce_sum(tf.multiply(self.convB, self.N_A), 1)) + 0.001)

        p18 = tf.math.log(tf.nn.sigmoid(-tf.reduce_sum(tf.multiply(self.convB, self.N_NEG), 1)) + 0.001)

        rho1 = 1.0
        rho2 = 1.0
        rho3 = 0.5
        temp_loss = rho1 * (p1 + p2 + p11 + p12) + rho2 * (p3 + p4 + p13 + p14) + rho3 * (p5 + p6 + p15 + p16) + rho3 * (p7 + p8 + p17 + p18)
        loss = -tf.reduce_sum(temp_loss)
        return loss

# ***Negative Sample***

In [8]:
def InitNegTable(edges):
    a_list, b_list = zip(*edges)
    a_list = list(a_list)
    b_list = list(b_list)
    node = a_list
    node.extend(b_list)

    node_degree = {}
    for i in node:
        if i in node_degree:
            node_degree[i] += 1
        else:
            node_degree[i] = 1
    sum_degree = 0
    for i in node_degree.values():
        sum_degree += pow(i, 0.75)

    por = 0
    cur_sum = 0
    vid = -1
    neg_table = []
    degree_list = list(node_degree.values())
    node_id = list(node_degree.keys())
    for i in range(neg_table_size):
        if ((i + 1) / float(neg_table_size)) > por:
            cur_sum += pow(degree_list[vid + 1], NEG_SAMPLE_POWER)
            por = cur_sum / sum_degree
            vid += 1
        neg_table.append(node_id[vid])
    print(f'Neg. table size: {len(neg_table)}')
    return neg_table


# ***Run***

In [9]:
def prepareData(datasetName, ratio):
  f = open('/content/datasets/%s/graph.txt' % datasetName, 'rb')
  edges = [i for i in f]
  selected = int(len(edges) * float(ratio))
  selected = selected - selected % batch_size
  selected = random.sample(edges, selected)
  remain = [i for i in edges if i not in selected]
  try:
    temp_dir = Path('temp')

    # Check if the directory exists, if so, delete it
    if temp_dir.exists() and temp_dir.is_dir():
        shutil.rmtree(temp_dir)
        print("Existing directory deleted.")

    # Create the directory
    temp_dir.mkdir(parents=True, exist_ok=True)
    print("Directory created successfully.")

  except Exception as e:
      print(f"An error occurred: {e}")

  fw1 = open('temp/graph.txt', 'wb')
  fw2 = open('temp/test_graph.txt', 'wb')

  for i in selected:
      fw1.write(i)
  for i in remain:
      fw2.write(i)

In [68]:
prepareData(datasetName, ratio)

Existing directory deleted.
Directory created successfully.


In [69]:
# load data
dataset_name = datasetName
graph_path = os.path.join('/content/temp/graph.txt')
text_path = os.path.join("/content", "datasets", dataset_name, dataTextFile)

data = dataSet(text_path, graph_path)

Vocabulary: 14696
Neg. table size: 1000000


In [72]:
with tf.Graph().as_default():
    sess = tf.compat.v1.Session()
    with sess.as_default():
        model = Model(data.num_vocab, data.num_nodes)
        opt = tf.compat.v1.train.AdamOptimizer(lr)#tf.keras.optimizers.Adam(learning_rate=lr)
        train_op = opt.minimize(model.loss)#opt.minimize(model.loss, var_list=model.trainable_variables)
        sess.run(tf.compat.v1.global_variables_initializer())
        time = 0

        # training
        print('start training.......')


        for epoch in tqdm(range(num_epoch), desc="Epochs"):
            start = datetime.now()
            loss_epoch = 0
            batches = data.generate_batches()
            h1 = 0
            num_batch = len(batches)
            for i in range(num_batch):
                batch = batches[i]

                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]
                #labels1, labels2 = data.labels[node1], data.labels[node2]
                #labels = [max(a, b) for a, b in zip(labels1, labels2)]
                P1, P2, P3 = sub_Mat(data.P, node1), sub_Mat(data.P, node2), sub_Mat(data.P, node3)


                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3,
                    model.P_a: P1,
                    model.P_b: P2,
                    model.P_neg: P3
                    #model.labels: labels
                }

                # run the graph
                _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)

                loss_epoch += loss_batch

            end = datetime.now()
            time += (end - start).total_seconds()
            print('epoch: ', epoch + 1, ' loss: ', loss_epoch)

        print(f'Time: {time}')
        # Saving embeddings
        with open('temp/embed.txt', 'wb') as file:
            batches = data.generate_batches(mode='add')
            num_batch = len(batches)
            embed = [[] for _ in range(data.num_nodes)]

            for i in range(num_batch):
                batch = batches[i]
                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]
                #labels1, labels2 = data.labels[node1], data.labels[node2]
                #labels = [max(a, b) for a, b in zip(labels1, labels2)]
                P1, P2, P3 = sub_Mat(data.P, node1), sub_Mat(data.P, node2), sub_Mat(data.P, node3)

                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3,
                    model.P_a: P1,
                    model.P_b: P2,
                    model.P_neg: P3
                    #model.labels: labels
                }

                # Fetch embeddings
                #convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B])
                convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B], feed_dict=feed_dict)

                # For each node in the batch
                for j in range(batch_size):
                    em = list(convA[j]) + list(TA[j]) # Create an embedding by concatenating the text (convA) and node (TA) embeddings
                    embed[node1[j]].append(em) # A node can appear many times in edges. Thus, each time that node will have a different embedding. Append the different embeddings for a particular node

                    em = list(convB[j]) + list(TB[j]) # Create an embedding by concatenating the text (convA) and node (TA) embeddings
                    embed[node2[j]].append(em)


            for i in range(data.num_nodes):
                if embed[i]:
                    tmp = np.mean(embed[i], axis=0) #/ len(embed[i]) # If a node has many different embeddings, take their mean.
                    file.write((' '.join(map(str, tmp)) + '\n').encode())
                else:
                    file.write('\n'.encode())


start training.......


Epochs:   2%|▎         | 5/200 [00:03<01:37,  2.01it/s]

epoch:  1  loss:  1169.6209716796875
epoch:  2  loss:  1169.963623046875
epoch:  3  loss:  1170.62548828125
epoch:  4  loss:  1159.5347900390625
epoch:  5  loss:  1157.07666015625
epoch:  6  loss:  1168.103271484375
epoch:  7  loss:  1166.83056640625
epoch:  8  loss:  1158.626953125


Epochs:   6%|▋         | 13/200 [00:03<00:27,  6.76it/s]

epoch:  9  loss:  1153.518798828125
epoch:  10  loss:  1148.9827880859375
epoch:  11  loss:  1148.7830810546875
epoch:  12  loss:  1129.75244140625
epoch:  13  loss:  1127.093505859375
epoch:  14  loss:  1090.1806640625
epoch:  15  loss:  1087.0775146484375


Epochs:  10%|█         | 21/200 [00:03<00:13, 13.29it/s]

epoch:  16  loss:  1015.8990478515625
epoch:  17  loss:  957.2886962890625
epoch:  18  loss:  929.2559814453125
epoch:  19  loss:  852.0103149414062
epoch:  20  loss:  768.510009765625
epoch:  21  loss:  720.5796508789062
epoch:  22  loss:  665.7279663085938
epoch:  23  loss:  635.3858642578125
epoch:  24  loss:  580.2509765625


Epochs:  16%|█▌        | 32/200 [00:03<00:06, 24.94it/s]

epoch:  25  loss:  545.997314453125
epoch:  26  loss:  523.072021484375
epoch:  27  loss:  508.63531494140625
epoch:  28  loss:  493.94598388671875
epoch:  29  loss:  485.21875
epoch:  30  loss:  477.4057922363281
epoch:  31  loss:  466.4040222167969
epoch:  32  loss:  465.57708740234375
epoch:  33  loss:  463.22027587890625
epoch:  34  loss:  456.7693786621094
epoch:  35  loss:  458.1394348144531
epoch:  36  loss:  453.41217041015625


Epochs:  22%|██▏       | 44/200 [00:04<00:04, 37.04it/s]

epoch:  37  loss:  454.68365478515625
epoch:  38  loss:  452.77001953125
epoch:  39  loss:  449.3072509765625
epoch:  40  loss:  450.08819580078125
epoch:  41  loss:  452.5135498046875
epoch:  42  loss:  450.4064025878906
epoch:  43  loss:  452.56988525390625
epoch:  44  loss:  448.18731689453125
epoch:  45  loss:  447.72314453125
epoch:  46  loss:  448.20831298828125
epoch:  47  loss:  445.70709228515625
epoch:  48  loss:  447.58990478515625


Epochs:  28%|██▊       | 56/200 [00:04<00:03, 45.86it/s]

epoch:  49  loss:  446.4619140625
epoch:  50  loss:  446.343017578125
epoch:  51  loss:  446.20001220703125
epoch:  52  loss:  445.55645751953125
epoch:  53  loss:  445.90081787109375
epoch:  54  loss:  444.69683837890625
epoch:  55  loss:  444.41851806640625
epoch:  56  loss:  442.7498779296875
epoch:  57  loss:  442.1947021484375
epoch:  58  loss:  440.49334716796875
epoch:  59  loss:  443.51263427734375
epoch:  60  loss:  441.1300964355469


Epochs:  34%|███▍      | 68/200 [00:04<00:02, 50.96it/s]

epoch:  61  loss:  442.69989013671875
epoch:  62  loss:  441.224853515625
epoch:  63  loss:  439.1410217285156
epoch:  64  loss:  440.91375732421875
epoch:  65  loss:  440.49090576171875
epoch:  66  loss:  442.0968933105469
epoch:  67  loss:  440.8128356933594
epoch:  68  loss:  437.5782470703125
epoch:  69  loss:  437.7692565917969
epoch:  70  loss:  437.917236328125
epoch:  71  loss:  439.93109130859375
epoch:  72 

Epochs:  40%|████      | 80/200 [00:04<00:02, 53.80it/s]

 loss:  433.8986511230469
epoch:  73  loss:  436.2981262207031
epoch:  74  loss:  432.802734375
epoch:  75  loss:  433.486572265625
epoch:  76  loss:  436.9244079589844
epoch:  77  loss:  434.3404541015625
epoch:  78  loss:  432.0978698730469
epoch:  79  loss:  432.6258544921875
epoch:  80  loss:  431.27691650390625
epoch:  81  loss:  432.33203125
epoch:  82  loss:  433.2620849609375
epoch:  83  loss:  431.9076843261719


Epochs:  46%|████▌     | 92/200 [00:05<00:01, 55.72it/s]

epoch:  84  loss:  430.6609802246094
epoch:  85  loss:  432.6119689941406
epoch:  86  loss:  432.54058837890625
epoch:  87  loss:  429.9193420410156
epoch:  88  loss:  430.59716796875
epoch:  89  loss:  432.6406555175781
epoch:  90  loss:  429.65509033203125
epoch:  91  loss:  428.2223205566406
epoch:  92  loss:  425.322509765625
epoch:  93  loss:  428.6480712890625
epoch:  94  loss:  425.0893859863281
epoch:  95  loss:  428.22430419921875


Epochs:  52%|█████▏    | 104/200 [00:05<00:01, 56.41it/s]

epoch:  96  loss:  423.54400634765625
epoch:  97  loss:  428.75970458984375
epoch:  98  loss:  426.0213623046875
epoch:  99  loss:  424.700927734375
epoch:  100  loss:  423.6951904296875
epoch:  101  loss:  424.987060546875
epoch:  102  loss:  425.4794006347656
epoch:  103  loss:  423.714599609375
epoch:  104  loss:  423.49151611328125
epoch:  105  loss:  422.9898986816406
epoch:  106  loss:  423.1294860839844
epoch:  107  loss:  422.8922424316406


Epochs:  58%|█████▊    | 116/200 [00:05<00:01, 56.47it/s]

epoch:  108  loss:  424.3688659667969
epoch:  109  loss:  418.89508056640625
epoch:  110  loss:  418.05810546875
epoch:  111  loss:  417.0315246582031
epoch:  112  loss:  418.3421630859375
epoch:  113  loss:  415.5439453125
epoch:  114  loss:  420.70025634765625
epoch:  115  loss:  416.85223388671875
epoch:  116  loss:  415.3298034667969
epoch:  117  loss:  414.750244140625
epoch:  118  loss:  415.8599853515625
epoch:  119  loss:  418.99114990234375


Epochs:  64%|██████▍   | 128/200 [00:05<00:01, 55.92it/s]

epoch:  120  loss:  415.11090087890625
epoch:  121  loss:  416.4937744140625
epoch:  122  loss:  408.50726318359375
epoch:  123  loss:  416.77044677734375
epoch:  124  loss:  416.33935546875
epoch:  125  loss:  409.8394775390625
epoch:  126  loss:  407.8341064453125
epoch:  127  loss:  413.97296142578125
epoch:  128  loss:  413.28228759765625
epoch:  129  loss:  412.2596130371094
epoch:  130  loss:  409.40350341796875
epoch:  131  loss:  411.113037109375


Epochs:  70%|███████   | 141/200 [00:05<00:01, 57.35it/s]

epoch:  132  loss:  409.3389892578125
epoch:  133  loss:  409.797119140625
epoch:  134  loss:  412.151123046875
epoch:  135  loss:  407.48004150390625
epoch:  136  loss:  407.74188232421875
epoch:  137  loss:  404.7742919921875
epoch:  138  loss:  402.31011962890625
epoch:  139  loss:  409.4803161621094
epoch:  140  loss:  405.86505126953125
epoch:  141  loss:  403.855224609375
epoch:  142  loss:  402.86090087890625
epoch:  143  loss:  405.601318359375
epoch:  144

Epochs:  76%|███████▋  | 153/200 [00:06<00:00, 57.31it/s]

  loss:  402.9423522949219
epoch:  145  loss:  403.2458801269531
epoch:  146  loss:  402.548095703125
epoch:  147  loss:  404.3426818847656
epoch:  148  loss:  405.399169921875
epoch:  149  loss:  398.0605163574219
epoch:  150  loss:  402.21881103515625
epoch:  151  loss:  402.0821533203125
epoch:  152  loss:  396.88299560546875
epoch:  153  loss:  397.8475341796875
epoch:  154  loss:  398.72509765625
epoch:  155  loss:  396.38922119140625
epoch:  156  loss:  395.6197509765625


Epochs:  82%|████████▎ | 165/200 [00:06<00:00, 57.89it/s]

epoch:  157  loss:  395.77362060546875
epoch:  158  loss:  393.75701904296875
epoch:  159  loss:  392.53619384765625
epoch:  160  loss:  394.5004577636719
epoch:  161  loss:  394.4031982421875
epoch:  162  loss:  393.7326354980469
epoch:  163  loss:  392.3907470703125
epoch:  164  loss:  389.8310546875
epoch:  165  loss:  385.72747802734375
epoch:  166  loss:  382.6465759277344
epoch:  167  loss:  386.7355041503906
epoch:  168  loss:  389.4393310546875


Epochs:  88%|████████▊ | 177/200 [00:06<00:00, 57.86it/s]

epoch:  169  loss:  386.60498046875
epoch:  170  loss:  387.56842041015625
epoch:  171  loss:  390.28839111328125
epoch:  172  loss:  391.700927734375
epoch:  173  loss:  388.78216552734375
epoch:  174  loss:  387.66680908203125
epoch:  175  loss:  387.30523681640625
epoch:  176  loss:  384.2142333984375
epoch:  177  loss:  378.93963623046875
epoch:  178  loss:  379.3531494140625
epoch:  179  loss:  381.21649169921875
epoch:  180  loss:  377.4583740234375


Epochs:  94%|█████████▍| 189/200 [00:06<00:00, 56.48it/s]

epoch:  181  loss:  385.106201171875
epoch:  182  loss:  378.49505615234375
epoch:  183  loss:  382.8277282714844
epoch:  184  loss:  383.4664306640625
epoch:  185  loss:  387.161865234375
epoch:  186  loss:  377.10650634765625
epoch:  187  loss:  376.3406066894531
epoch:  188  loss:  381.91455078125
epoch:  189  loss:  379.9837951660156
epoch:  190  loss:  384.17193603515625
epoch:  191  loss:  372.5572509765625


Epochs: 100%|██████████| 200/200 [00:06<00:00, 28.84it/s]


epoch:  192  loss:  376.53167724609375
epoch:  193  loss:  372.5558166503906
epoch:  194  loss:  379.68817138671875
epoch:  195  loss:  383.0722351074219
epoch:  196  loss:  376.029296875
epoch:  197  loss:  372.38604736328125
epoch:  198  loss:  377.3175048828125
epoch:  199  loss:  379.6077575683594
epoch:  200  loss:  379.413818359375
Time: 6.762818999999999


In [73]:
node2vec = {}
f = open('temp/embed.txt', 'rb')
for i, j in enumerate(f):
    if j.decode() != '\n':
        node2vec[i] = list(map(float, j.strip().decode().split(' ')))
f1 = open(os.path.join('temp/test_graph.txt'), 'rb')
edges = [list(map(int, i.strip().decode().split('\t'))) for i in f1]
nodes = list(set([i for j in edges for i in j]))
a = 0
b = 0
result = []
for i, j in edges:
    if i in node2vec.keys() and j in node2vec.keys():
        dot1 = np.dot(node2vec[i], node2vec[j])
        random_node = random.sample(nodes, 1)[0]
        while random_node == j or random_node not in node2vec.keys():
            random_node = random.sample(nodes, 1)[0]
        dot2 = np.dot(node2vec[i], node2vec[random_node])
        result.append(np.asarray([dot1,dot2]))
        if dot1 > dot2:
            a += 1
        elif dot1 == dot2:
            a += 0.5
        b += 1

print("Auc value:", float(a) / b)

Auc value: 0.71
