<a href="https://colab.research.google.com/github/GeorgeM2000/DMTE/blob/master/code/DMTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries & Tools***

In [None]:
import argparse
import os
import random
import numpy as np
import tensorflow as tf
from math import pow
from datetime import datetime
from pathlib import Path
import shutil
from tqdm import tqdm
import zipfile
from collections import defaultdict
from sklearn.preprocessing import normalize

In [None]:
# Show the current working directory. Later cells write the train/test split to
# the relative folder `temp/` but read it back from `/content/temp/`, so the two
# only coincide when this prints /content.
!pwd

# ***Global Variables and General Functionality***

In [None]:
# Hyperparameters read as module-level globals by the data pipeline and the model.
MAX_LEN=300  # token length each abstract is padded/truncated to; reassigned below from the longest abstract
neg_table_size=1000000  # number of slots in the negative-sampling table built by InitNegTable
NEG_SAMPLE_POWER=0.75  # exponent applied to node degrees when filling the negative table
batch_size=64  # edges per batch; also fixes the placeholder shapes inside Model
num_epoch=200 # Default: 200
embed_size=200  # width of the exported embedding; Model gives embed_size // 2 to each of the text and node halves
word_embed_size=200  # width of the word-embedding table
lr=1e-3  # learning rate handed to the Adam optimizer

In [None]:
datasetName = "cora"  # sub-folder of /content/datasets that holds graph.txt and the text file
dataTextFile = "data.txt"  # file with one abstract per line
ratio = 0.15  # fraction of edges kept for training (used by prepareData and by dataSet.load_edges)

# Original parameters
alpha = 0.3  # weight of the first-order diffusion term (P) in Model
beta = 0.1  # weight of the second-order diffusion term (square of P) in Model

# Additional parameters
# gamma / delta are only referenced by commented-out third/fourth-order terms in Model.
gamma = 0.0
delta = 0.0

Count the words in the shortest and the longest abstract. The maximum (plus one) is then used as `MAX_LEN`.

In [None]:
# Scan the abstracts once and record how many whitespace-separated words each line has.
with open(f'/content/datasets/{datasetName}/{dataTextFile}', 'r') as file:
    word_counts = [len(line.split()) for line in file]

# The defaults mirror the values an empty file would leave behind (0 and +inf).
max_word_count = max(word_counts, default=0)
min_word_count = min(word_counts, default=float('inf'))

print("Max word count:", max_word_count)
print("Min word count:", min_word_count)


# Sequence length used downstream: one slot more than the longest abstract.
MAX_LEN = max_word_count + 1

Max word count: 473
Min word count: 10


Execute the cell below only if the data file with the abstracts is too large to upload as-is and has to be extracted from a zip archive.

In [None]:
# Archive uploaded to the Colab root and the dataset folder it is unpacked into.
zip_file_path = '/content/data.zip'
extract_to = f'/content/datasets/{datasetName}'

# Unpack every member of the archive into the dataset folder.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to)

print("Extraction complete!")

Extraction complete!


In [None]:
def sub_Mat(P, node):
    """Extract the square sub-matrix of ``P`` restricted to the given nodes.

    Args:
        P: 2-D array (in this notebook the dense, row-normalised transition
            matrix produced by ``dataSet.P_matrix``).
        node: Sequence of integer node ids; repeated ids are allowed.

    Returns:
        A new float64 ``numpy.ndarray`` of shape ``(len(node), len(node))``
        whose entry ``[i, j]`` equals ``P[node[i], node[j]]``.
    """
    idx = np.asarray(node, dtype=np.intp)

    # np.ix_ builds an open mesh from the two index vectors, so a single
    # fancy-indexing call gathers every (row, column) pair. This replaces a
    # len(node)**2 Python loop that ran three times per batch.
    # astype always copies, so the caller gets an independent float64 array.
    return np.asarray(P)[np.ix_(idx, idx)].astype(np.float64)

# ***DataSet***

In [None]:
class dataSet:
    """Text-attributed graph used to train DMTE.

    Loads the abstracts (one per line, line index == node id) and the
    tab-separated edge list, and derives everything the training loop needs.

    Attributes:
        edges: list of ``[node_a, node_b]`` integer pairs kept for training.
        text: integer token matrix, one row per node, ``MAX_LEN`` columns.
        num_vocab: vocabulary size reported by the text vectorizer.
        num_nodes: number of lines in the text file.
        nodes: ``range(num_nodes)``.
        negative_table: sampling table returned by ``InitNegTable``.
        P: dense ``num_nodes x num_nodes`` row-normalised adjacency matrix.

    Reads the notebook globals ``ratio``, ``MAX_LEN`` and ``batch_size``.
    """

    def __init__(self, text_path, graph_path, labels_path=None):
        """Load text and graph files and build the derived structures.

        Args:
            text_path: path of the file with one abstract per line.
            graph_path: path of the tab-separated edge list.
            labels_path: accepted for interface compatibility; label loading
                is currently disabled, so the value is not read.
        """
        text_file, graph_file = self.load(text_path, graph_path)

        self.edges = self.load_edges(graph_file)

        self.text, self.num_vocab, self.num_nodes = self.load_text(text_file)

        self.nodes = range(0, self.num_nodes)

        self.negative_table = InitNegTable(self.edges)

        self.P = self.P_matrix(self.edges, self.num_nodes)

    def load(self, text_path, graph_path):
        """Read both input files as lists of raw byte lines.

        The files are opened with context managers so the handles are closed
        as soon as the lines have been read.
        """
        with open(text_path, 'rb') as text_handle:
            text_file = text_handle.readlines()
        with open(graph_path, 'rb') as graph_handle:
            graph_file = graph_handle.readlines()

        return text_file, graph_file

    def load_edges(self, graph_file):
        """Parse edge lines, keeping each one with probability ``ratio``.

        Args:
            graph_file: iterable of byte lines of the form ``b"a\\tb\\n"``.

        Returns:
            List of ``[a, b]`` integer pairs.
        """
        # NOTE(review): in this notebook graph_path points at temp/graph.txt,
        # which prepareData has already cut down to roughly `ratio` of the
        # edges. Sampling again here leaves about ratio**2 of the full graph
        # for training -- confirm that this double sampling is intended.
        edges = []
        for raw_line in graph_file:
            if np.random.uniform(0.0, 1.0) <= ratio:
                edges.append(list(map(int, raw_line.strip().decode().split('\t'))))

        return edges

    # This method has been modified to be compatible with newer versions
    def load_text(self, text_file):
        """Tokenise the abstracts into a fixed-width integer matrix.

        ``vectorize_layer.adapt(text_data)`` analyses the text, builds a
        vocabulary and assigns an integer id to each word based on frequency
        (most frequent words get lower ids). Calling the layer afterwards maps
        every line to a row of token ids, padded or truncated to ``MAX_LEN``.

        Returns:
            ``(text, num_vocab, num_nodes)``: the token matrix, the vocabulary
            size and the number of rows (one per node).
        """
        vectorize_layer = tf.keras.layers.TextVectorization(
            max_tokens=None,  # Set a limit if needed
            output_mode='int',
            output_sequence_length=MAX_LEN
        )

        text_data = [line.strip() for line in text_file]

        vectorize_layer.adapt(text_data)

        text = vectorize_layer(text_data).numpy()

        num_vocab = len(vectorize_layer.get_vocabulary())
        print(f'Vocabulary: {num_vocab}')
        num_nodes = len(text)

        return text, num_vocab, num_nodes

    def negative_sample(self, edges):
        """Attach one negative node to every edge.

        Args:
            edges: list of ``[node_a, node_b]`` pairs (the edges of the
                batches being built).

        Returns:
            List of ``[node_a, node_b, neg_node]`` triples where ``neg_node``
            is drawn from the negative table and differs from both endpoints.
            An empty input yields an empty list.
        """
        if not edges:
            return []

        table = self.negative_table
        last_slot = len(table) - 1

        sample_edges = []
        for node_a, node_b in edges:
            neg_node = table[random.randint(0, last_slot)]
            # Redraw until the negative node is neither endpoint of the edge.
            while neg_node == node_a or neg_node == node_b:
                neg_node = table[random.randint(0, last_slot)]
            sample_edges.append([node_a, node_b, neg_node])

        return sample_edges

    @staticmethod
    def _pad_to_full_batches(items):
        """Return ``items`` extended to a multiple of ``batch_size``.

        Padding re-uses items from the front of the list, wrapping around when
        there are fewer items than missing slots. A list that is empty or
        already aligned is returned unchanged.
        """
        remainder = len(items) % batch_size
        if items and remainder:
            missing = batch_size - remainder
            items = items + [items[k % len(items)] for k in range(missing)]
        return items

    def generate_batches(self, mode=None):
        """Split the training edges into batches of ``batch_size`` triples.

        Args:
            mode: ``'add'`` keeps the stored edge order and pads the tail so
                every edge lands in a batch (used when exporting embeddings).
                Any other value shuffles the edges and drops the incomplete
                tail (used for training).

        Returns:
            List of batches; each batch is a list of
            ``[node_a, node_b, neg_node]`` triples. Batch ``i`` covers edges
            ``i * batch_size`` to ``(i + 1) * batch_size - 1``.
        """
        # Work on a copy: padding or shuffling must not alter self.edges,
        # otherwise every mode='add' call would grow the training set.
        edges = list(self.edges)
        if mode == 'add':
            edges = self._pad_to_full_batches(edges)
        else:
            random.shuffle(edges)

        num_batch = len(edges) // batch_size
        sample_edges = self.negative_sample(edges[:num_batch * batch_size])

        return [sample_edges[i * batch_size:(i + 1) * batch_size]
                for i in range(num_batch)]

    def nodes_batches(self, mode=None):
        """Split the node ids into shuffled batches of ``batch_size``.

        Args:
            mode: ``'add'`` pads the node list so every node lands in a batch;
                any other value drops the incomplete tail.

        Returns:
            List of batches, each a list of node ids.
        """
        # self.nodes is a range, which supports neither extend nor in-place
        # shuffling, so materialise it as a list first.
        nodes = list(self.nodes)
        if mode == 'add':
            nodes = self._pad_to_full_batches(nodes)
        random.shuffle(nodes)

        num_batch = len(nodes) // batch_size
        return [nodes[i * batch_size:(i + 1) * batch_size]
                for i in range(num_batch)]

    def P_matrix(self, edges, num_nodes):
        """Build the row-normalised, symmetric adjacency matrix.

        Args:
            edges: list of ``[node_a, node_b]`` pairs.
            num_nodes: number of rows/columns of the result.

        Returns:
            Dense ``num_nodes x num_nodes`` array. Each row is L1-normalised,
            so a non-empty row holds the uniform transition probabilities to
            that node's neighbours; an isolated node keeps an all-zero row.
        """
        P = np.zeros((num_nodes, num_nodes))

        if edges:
            a_list, b_list = zip(*edges)
            src = np.asarray(a_list)  # first endpoint of every edge
            dst = np.asarray(b_list)  # second endpoint of every edge
            # The graph is unweighted and undirected: mark both directions with 1.
            P[src, dst] = 1
            P[dst, src] = 1

        return normalize(P, axis=1, norm='l1')

# ***DMTE***

In [None]:
class Model:
    """DMTE model built as a TF1-style static graph.

    Each training example is an edge ``(a, b)`` plus a negative node ``neg``.
    Every node has a text embedding (mean-pooled word vectors, diffused over
    the batch transition matrix) and a structure embedding (a row of
    ``node_embed``). The loss is a negative-sampling objective over the
    text-text, node-node, node-text and text-node pairings.

    Reads the notebook globals ``batch_size``, ``MAX_LEN``, ``embed_size`` and
    ``word_embed_size``.

    Placeholders (all with a fixed leading ``batch_size`` dimension):
        Text_a / Text_b / Text_neg: int32 token ids, ``[batch_size, MAX_LEN]``.
        Node_a / Node_b / Node_neg: int32 node ids, ``[batch_size]``.
        P_a / P_b / P_neg: float32 ``[batch_size, batch_size]`` matrices; the
            training cell feeds ``sub_Mat(data.P, nodes)`` for each role.
    """

    def __init__(self, vocab_size, num_nodes, alpha, beta, num_labels=None):
        """Create placeholders, variables, embeddings and the loss tensor.

        Args:
            vocab_size: number of rows of the word-embedding table.
            num_nodes: number of rows of the node-embedding table.
            alpha: weight of the first-order diffusion term.
            beta: weight of the second-order diffusion term.
            num_labels: unused; kept for interface compatibility.
        """
        with tf.name_scope('read_inputs'):
            self.Text_a = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Ta')
            self.Text_b = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tb')
            self.Text_neg = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tneg')
            self.Node_a = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n1')
            self.Node_b = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n2')
            self.Node_neg = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n3')
            self.P_a = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pa')
            self.P_b = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pb')
            self.P_neg = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pneg')

        with tf.name_scope('initialize_embedding'):
            self.text_embed = tf.Variable(tf.random.truncated_normal([vocab_size, word_embed_size], stddev=0.3))
            self.node_embed = tf.Variable(tf.random.truncated_normal([num_nodes, embed_size // 2], stddev=0.3))
            # Every lookup goes through the norm-clipped view of the table
            # (each row has L2 norm at most 1).
            self.node_embed = tf.clip_by_norm(self.node_embed, clip_norm=1, axes=1)

        with tf.name_scope('lookup_embeddings'):
            # Word vectors get a trailing singleton axis:
            # [batch_size, MAX_LEN, word_embed_size, 1].
            self.TA = tf.nn.embedding_lookup(self.text_embed, self.Text_a)
            self.T_A = tf.expand_dims(self.TA, -1)

            self.TB = tf.nn.embedding_lookup(self.text_embed, self.Text_b)
            self.T_B = tf.expand_dims(self.TB, -1)

            self.TNEG = tf.nn.embedding_lookup(self.text_embed, self.Text_neg)
            self.T_NEG = tf.expand_dims(self.TNEG, -1)

            self.N_A = tf.nn.embedding_lookup(self.node_embed, self.Node_a)
            self.N_B = tf.nn.embedding_lookup(self.node_embed, self.Node_b)
            self.N_NEG = tf.nn.embedding_lookup(self.node_embed, self.Node_neg)

        self.alpha = alpha
        self.beta = beta
        self.convA, self.convB, self.convNeg = self.conv()
        self.loss = self.compute_loss()

    def conv(self):
        """Build the diffused text embeddings for the a, b and negative nodes.

        For each role the word vectors are mean-pooled over the sequence axis,
        passed through ``tanh`` and combined as
        ``x W0 + alpha (P x) W1 + beta (P*P x) W2`` where ``P*P`` is the
        element-wise square of the role's batch transition matrix.

        Third- and fourth-order terms (``gamma``/``delta`` with extra weight
        matrices) were tried by the author and are currently disabled.

        Returns:
            ``(attA, attB, attNEG)``, each ``[batch_size, embed_size // 2]``.
        """
        proj_shape = [word_embed_size, embed_size // 2]
        W0 = tf.Variable(tf.random.truncated_normal(proj_shape, stddev=0.3))
        W1 = tf.Variable(tf.random.truncated_normal(proj_shape, stddev=0.3))
        W2 = tf.Variable(tf.random.truncated_normal(proj_shape, stddev=0.3))

        def encode(word_vectors, P):
            # Mean over the sequence axis -> [batch_size, 1, word_embed_size, 1].
            pooled = tf.reduce_mean(word_vectors, axis=1, keepdims=True)
            # Squeeze only the two singleton axes so a batch of size 1 keeps
            # its batch dimension.
            feat = tf.tanh(tf.squeeze(pooled, axis=[1, 3]))
            return (tf.matmul(feat, W0) +
                    self.alpha * tf.matmul(tf.matmul(P, feat), W1) +
                    self.beta * tf.matmul(tf.matmul(tf.math.square(P), feat), W2))

        attA = encode(self.T_A, self.P_a)
        attB = encode(self.T_B, self.P_b)
        # The negative texts are diffused over the negative nodes' own matrix.
        # The earlier code reused P_a here, which left the P_neg placeholder
        # fed by the training loop without any effect.
        attNEG = encode(self.T_NEG, self.P_neg)

        return attA, attB, attNEG

    def _diffuse_nodes(self, P, N):
        """Return ``N + alpha * P N + beta * (P*P) N`` (``P*P`` element-wise)."""
        return (N +
                self.alpha * tf.matmul(P, N) +
                self.beta * tf.matmul(tf.math.square(P), N))

    @staticmethod
    def _log_sigmoid(x, y, negative=False):
        """Per-example ``log(sigmoid(+/- <x, y>) + 0.001)``.

        The inner product runs over the feature axis; ``negative=True`` flips
        its sign (used for the negative-sample terms). The 0.001 offset keeps
        the logarithm finite when the sigmoid saturates at 0.
        """
        score = tf.reduce_sum(tf.multiply(x, y), 1)
        if negative:
            score = -score
        return tf.math.log(tf.nn.sigmoid(score) + 0.001)

    def compute_loss(self):
        """Assemble the scalar negative-sampling loss.

        Four groups of terms are built in both edge directions (a->b, b->a),
        each with a positive and a negative-sample part, and combined with
        weights 1.0 (text-text), 1.0 (node-node) and 0.3 (node-text and
        text-node).

        Returns:
            Scalar tensor: minus the weighted sum of the log-likelihood terms
            over the batch.
        """
        # The diffused structure embeddings are shared by the node-node and
        # node-text terms, so build them once.
        diff_A = self._diffuse_nodes(self.P_a, self.N_A)
        diff_B = self._diffuse_nodes(self.P_b, self.N_B)

        # Text-Text
        p1 = self._log_sigmoid(self.convA, self.convB)
        p2 = self._log_sigmoid(self.convA, self.convNeg, negative=True)
        p11 = self._log_sigmoid(self.convB, self.convA)
        p12 = self._log_sigmoid(self.convB, self.convNeg, negative=True)

        # Node-Node
        p3 = self._log_sigmoid(diff_A, self.N_B)
        p4 = self._log_sigmoid(diff_A, self.N_NEG, negative=True)
        p13 = self._log_sigmoid(diff_B, self.N_A)
        p14 = self._log_sigmoid(diff_B, self.N_NEG, negative=True)

        # Node-Text
        p5 = self._log_sigmoid(diff_A, self.convB)
        p6 = self._log_sigmoid(diff_A, self.convNeg, negative=True)
        p15 = self._log_sigmoid(diff_B, self.convA)
        p16 = self._log_sigmoid(diff_B, self.convNeg, negative=True)

        # Text-Node
        p7 = self._log_sigmoid(self.convA, self.N_B)
        p8 = self._log_sigmoid(self.convA, self.N_NEG, negative=True)
        p17 = self._log_sigmoid(self.convB, self.N_A)
        p18 = self._log_sigmoid(self.convB, self.N_NEG, negative=True)

        rho1 = 1.0  # text-text weight
        rho2 = 1.0  # node-node weight
        rho3 = 0.3  # node-text and text-node weight
        temp_loss = (rho1 * (p1 + p2 + p11 + p12) +
                     rho2 * (p3 + p4 + p13 + p14) +
                     rho3 * (p5 + p6 + p15 + p16) +
                     rho3 * (p7 + p8 + p17 + p18))
        loss = -tf.reduce_sum(temp_loss)
        return loss

# ***Negative Sample***

In [None]:
def InitNegTable(edges, table_size=None, power=None):
    """Build the negative-sampling lookup table.

    Every node that appears in ``edges`` receives a share of the table
    proportional to ``degree ** power``; drawing a uniform random slot then
    samples nodes with that distribution.

    Args:
        edges: non-empty list of ``[node_a, node_b]`` pairs.
        table_size: number of slots; defaults to the global ``neg_table_size``.
        power: degree exponent; defaults to the global ``NEG_SAMPLE_POWER``.
            The same exponent is used for the normaliser and for the running
            sum, so the cumulative share ends at 1.

    Returns:
        List of ``table_size`` node ids, grouped by node in order of first
        appearance (all first endpoints, then all second endpoints).

    Raises:
        ValueError: if ``edges`` is empty.
    """
    if table_size is None:
        table_size = neg_table_size
    if power is None:
        power = NEG_SAMPLE_POWER
    if not edges:
        raise ValueError("InitNegTable needs at least one edge to build the sampling table")

    a_list, b_list = zip(*edges)

    # Degree of every node, keyed in order of first appearance.
    node_degree = {}
    for n in list(a_list) + list(b_list):
        node_degree[n] = node_degree.get(n, 0) + 1

    node_id = list(node_degree)
    weights = [degree ** power for degree in node_degree.values()]
    sum_degree = 0
    for w in weights:
        sum_degree += w

    last = len(node_id) - 1
    por = 0       # cumulative share of the table covered up to node `vid`
    cur_sum = 0
    vid = -1
    neg_table = []
    for i in range(table_size):
        # Move to the node whose cumulative share covers this slot. The
        # `vid < last` guard stops floating-point rounding (a final share
        # just below 1.0) from stepping past the last node.
        while ((i + 1) / float(table_size)) > por and vid < last:
            vid += 1
            cur_sum += weights[vid]
            por = cur_sum / sum_degree
        neg_table.append(node_id[vid])
    print(f'Neg. table size: {len(neg_table)}')
    return neg_table


# ***Run (Single execution)***

In [None]:
def prepareData(datasetName, ratio, data_root='/content/datasets', out_dir='temp'):
    """Split a dataset's edge list into a training and a test file.

    Reads ``<data_root>/<datasetName>/graph.txt``, randomly selects a
    ``ratio`` fraction of its lines (rounded down to a multiple of the global
    ``batch_size``) and writes them to ``<out_dir>/graph.txt``; every other
    line goes to ``<out_dir>/test_graph.txt``. ``out_dir`` is deleted and
    recreated on every call.

    Args:
        datasetName: name of the dataset sub-folder.
        ratio: fraction of edges used for training (anything ``float()`` accepts).
        data_root: folder that contains the dataset sub-folders.
        out_dir: folder that receives the two output files.

    Raises:
        OSError: if the input cannot be read or the output folder cannot be
            (re)created. These errors were previously printed and swallowed,
            only for the following ``open`` to fail anyway.
    """
    graph_file = Path(data_root) / datasetName / 'graph.txt'
    with open(graph_file, 'rb') as f:
        edges = f.readlines()

    num_selected = int(len(edges) * float(ratio))
    num_selected -= num_selected % batch_size

    # Sample line positions rather than line contents: membership tests are
    # O(1), and a duplicated line is not removed from the test split merely
    # because one of its copies was selected.
    picked = random.sample(range(len(edges)), num_selected)
    picked_set = set(picked)
    selected = [edges[i] for i in picked]
    remain = [line for i, line in enumerate(edges) if i not in picked_set]

    temp_dir = Path(out_dir)

    # Start from a clean output folder.
    if temp_dir.is_dir():
        shutil.rmtree(temp_dir)
        print("Existing directory deleted.")

    temp_dir.mkdir(parents=True, exist_ok=True)
    print("Directory created successfully.")

    # Context managers make sure both files are flushed and closed before a
    # later cell reads them.
    with open(temp_dir / 'graph.txt', 'wb') as fw1:
        fw1.writelines(selected)
    with open(temp_dir / 'test_graph.txt', 'wb') as fw2:
        fw2.writelines(remain)

In [None]:
# Write the random train/test edge split for the selected dataset into ./temp.
prepareData(datasetName, ratio)

Directory created successfully.


In [None]:
# load data
dataset_name = datasetName
graph_path = os.path.join('/content/temp/graph.txt')
text_path = os.path.join("/content", "datasets", dataset_name, dataTextFile)

data = dataSet(text_path, graph_path)

Vocabulary: 14696
Neg. table size: 1000000


In [None]:
# Build the TF1 graph, train DMTE for `num_epoch` epochs, then export one
# embedding per node to temp/embed.txt (one line per node id).
# NOTE(review): the session is entered with as_default() but never closed in
# this cell -- confirm no later cell relies on `sess` before adding a close().
with tf.Graph().as_default():
    sess = tf.compat.v1.Session()
    with sess.as_default():
        model = Model(data.num_vocab, data.num_nodes, alpha, beta)
        opt = tf.compat.v1.train.AdamOptimizer(lr)#tf.keras.optimizers.Adam(learning_rate=lr)
        train_op = opt.minimize(model.loss)#opt.minimize(model.loss, var_list=model.trainable_variables)
        sess.run(tf.compat.v1.global_variables_initializer())
        time = 0  # accumulated wall-clock training time in seconds

        # training
        print('start training.......')


        for epoch in tqdm(range(num_epoch), desc="Epochs"):
            start = datetime.now()
            loss_epoch = 0
            # Fresh shuffle and fresh negative samples for every epoch.
            batches = data.generate_batches()
            h1 = 0  # NOTE(review): h1 is never read in this cell
            num_batch = len(batches)
            for i in range(num_batch):
                batch = batches[i]

                # Each batch entry is [node_a, node_b, neg_node]; split into three id arrays.
                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                # Token rows of the three node groups.
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]
                #labels1, labels2 = data.labels[node1], data.labels[node2]
                #labels = [max(a, b) for a, b in zip(labels1, labels2)]
                # Transition sub-matrices restricted to the nodes of each group.
                P1, P2, P3 = sub_Mat(data.P, node1), sub_Mat(data.P, node2), sub_Mat(data.P, node3)


                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3,
                    model.P_a: P1,
                    model.P_b: P2,
                    model.P_neg: P3
                    #model.labels: labels
                }

                # run the graph
                _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)

                loss_epoch += loss_batch

            end = datetime.now()
            time += (end - start).total_seconds()
            print('epoch: ', epoch + 1, ' loss: ', loss_epoch)

        print(f'Time: {time}')
        # Saving embeddings
        with open('temp/embed.txt', 'wb') as file:
            # mode='add' pads the edge list so the last batch is full and no training edge is dropped.
            batches = data.generate_batches(mode='add')
            num_batch = len(batches)
            # embed[n] collects every embedding produced for node n across the batches.
            embed = [[] for _ in range(data.num_nodes)]

            for i in range(num_batch):
                batch = batches[i]
                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]
                #labels1, labels2 = data.labels[node1], data.labels[node2]
                #labels = [max(a, b) for a, b in zip(labels1, labels2)]
                P1, P2, P3 = sub_Mat(data.P, node1), sub_Mat(data.P, node2), sub_Mat(data.P, node3)

                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3,
                    model.P_a: P1,
                    model.P_b: P2,
                    model.P_neg: P3
                    #model.labels: labels
                }

                # Fetch embeddings
                # convA/convB are the text embeddings; TA/TB receive the node (structure) embeddings N_A/N_B.
                #convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B])
                convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B], feed_dict=feed_dict)

                # For each node in the batch
                for j in range(batch_size):
                    em = list(convA[j]) + list(TA[j]) # Create an embedding by concatenating the text (convA) and node (TA) embeddings
                    embed[node1[j]].append(em) # A node can appear in many edges and gets a different embedding each time, so collect all of them

                    em = list(convB[j]) + list(TB[j]) # Same for the second endpoint: text (convB) followed by node (TB) embedding
                    embed[node2[j]].append(em)


            # Line i of the file belongs to node i; nodes that never appeared in a training edge get an empty line.
            for i in range(data.num_nodes):
                if embed[i]:
                    tmp = np.mean(embed[i], axis=0) #/ len(embed[i]) # If a node has many different embeddings, take their mean.
                    file.write((' '.join(map(str, tmp)) + '\n').encode())
                else:
                    file.write('\n'.encode())


start training.......


Epochs:   2%|▎         | 5/200 [00:03<01:49,  1.78it/s]

epoch:  1  loss:  1173.5244140625
epoch:  2  loss:  1174.24072265625
epoch:  3  loss:  1154.207275390625
epoch:  4  loss:  1172.4483642578125
epoch:  5  loss:  1152.0401611328125
epoch:  6  loss:  1151.056884765625
epoch:  7  loss:  1164.86083984375
epoch:  8  loss:  1154.0650634765625


Epochs:   7%|▋         | 14/200 [00:03<00:27,  6.77it/s]

epoch:  9  loss:  1151.578857421875
epoch:  10  loss:  1158.53173828125
epoch:  11  loss:  1162.73046875
epoch:  12  loss:  1123.0736083984375
epoch:  13  loss:  1146.65771484375
epoch:  14  loss:  1124.2763671875
epoch:  15  loss:  1144.7598876953125
epoch:  16  loss:  1116.37841796875
epoch:  17  loss:  1111.78955078125


Epochs:  12%|█▏        | 23/200 [00:04<00:12, 13.84it/s]

epoch:  18  loss:  1087.02685546875
epoch:  19  loss:  1078.768798828125
epoch:  20  loss:  985.6742553710938
epoch:  21  loss:  953.3301391601562
epoch:  22  loss:  923.7408447265625
epoch:  23  loss:  797.2402954101562
epoch:  24  loss:  721.5189208984375
epoch:  25  loss:  665.926025390625
epoch:  26  loss:  589.0726928710938


Epochs:  16%|█▌        | 32/200 [00:04<00:07, 21.88it/s]

epoch:  27  loss:  541.81591796875
epoch:  28  loss:  518.4888916015625
epoch:  29  loss:  487.5359191894531
epoch:  30  loss:  478.1725158691406
epoch:  31  loss:  462.22637939453125
epoch:  32  loss:  457.2728271484375
epoch:  33  loss:  456.0660400390625
epoch:  34  loss:  453.733154296875
epoch:  35  loss:  458.64306640625


Epochs:  20%|██        | 40/200 [00:04<00:05, 28.06it/s]

epoch:  36  loss:  457.6021728515625
epoch:  37  loss:  457.1626892089844
epoch:  38  loss:  453.93988037109375
epoch:  39  loss:  457.95416259765625
epoch:  40  loss:  454.21343994140625
epoch:  41  loss:  456.1550598144531
epoch:  42  loss:  454.9295959472656
epoch:  43  loss:  459.04974365234375


Epochs:  24%|██▍       | 48/200 [00:04<00:04, 32.58it/s]

epoch:  44  loss:  454.1771545410156
epoch:  45  loss:  454.8286437988281
epoch:  46  loss:  451.9372863769531
epoch:  47  loss:  451.1806640625
epoch:  48  loss:  450.6763916015625
epoch:  49  loss:  447.9947509765625
epoch:  50  loss:  447.8292236328125
epoch:  51  loss:  446.8936767578125


Epochs:  28%|██▊       | 57/200 [00:05<00:03, 38.14it/s]

epoch:  52  loss:  445.7368469238281
epoch:  53  loss:  442.2054443359375
epoch:  54  loss:  443.56732177734375
epoch:  55  loss:  440.69512939453125
epoch:  56  loss:  442.2057189941406
epoch:  57  loss:  440.3218994140625
epoch:  58  loss:  438.1889953613281
epoch:  59  loss:  439.9997863769531
epoch:  60  loss:  437.0
epoch:  61  loss:  437.03546142578125
epoch:  62  loss:  436.3720397949219


Epochs:  34%|███▍      | 69/200 [00:05<00:02, 47.79it/s]

epoch:  63  loss:  437.1236267089844
epoch:  64  loss:  437.3717346191406
epoch:  65  loss:  437.6768798828125
epoch:  66  loss:  436.0179138183594
epoch:  67  loss:  434.22137451171875
epoch:  68  loss:  433.870849609375
epoch:  69  loss:  434.51593017578125
epoch:  70  loss:  433.2633056640625
epoch:  71  loss:  432.7579650878906
epoch:  72  loss:  432.4459533691406
epoch:  73  loss:  431.6860656738281
epoch:  74  loss:  429.9345397949219


Epochs:  41%|████      | 82/200 [00:05<00:02, 54.64it/s]

epoch:  75  loss:  427.79095458984375
epoch:  76  loss:  431.9444580078125
epoch:  77  loss:  430.1364440917969
epoch:  78  loss:  429.7733154296875
epoch:  79  loss:  431.08953857421875
epoch:  80  loss:  430.47845458984375
epoch:  81  loss:  430.384521484375
epoch:  82  loss:  427.6012878417969
epoch:  83  loss:  425.7528381347656
epoch:  84  loss:  426.482666015625
epoch:  85  loss:  424.9984130859375
epoch:  86  loss:  424.6258850097656
epoch:  87  loss:  423.8175048828125


Epochs:  48%|████▊     | 96/200 [00:05<00:01, 58.64it/s]

epoch:  88  loss:  424.8148498535156
epoch:  89  loss:  424.79217529296875
epoch:  90  loss:  419.984375
epoch:  91  loss:  421.1540222167969
epoch:  92  loss:  422.2255554199219
epoch:  93  loss:  422.36376953125
epoch:  94  loss:  420.3765869140625
epoch:  95  loss:  419.77191162109375
epoch:  96  loss:  416.397216796875
epoch:  97  loss:  421.25640869140625
epoch:  98  loss:  417.799072265625
epoch:  99  loss:  417.11053466796875
epoch:  100  loss:  419.8067321777344


Epochs:  55%|█████▍    | 109/200 [00:05<00:01, 56.78it/s]

epoch:  101  loss:  419.2234191894531
epoch:  102  loss:  421.1321716308594
epoch:  103  loss:  416.65020751953125
epoch:  104  loss:  415.495849609375
epoch:  105  loss:  414.22412109375
epoch:  106  loss:  413.58001708984375
epoch:  107  loss:  414.04736328125
epoch:  108  loss:  413.18243408203125
epoch:  109  loss:  413.91009521484375
epoch:  110  loss:  412.515869140625
epoch:  111  loss:  412.89642333984375
epoch:  112  loss:  413.33819580078125


Epochs:  62%|██████▏   | 123/200 [00:06<00:01, 59.36it/s]

epoch:  113  loss:  410.89801025390625
epoch:  114  loss:  414.0845947265625
epoch:  115  loss:  407.47027587890625
epoch:  116  loss:  412.28851318359375
epoch:  117  loss:  408.349853515625
epoch:  118  loss:  410.3521728515625
epoch:  119  loss:  406.89605712890625
epoch:  120  loss:  407.666259765625
epoch:  121  loss:  407.37847900390625
epoch:  122  loss:  404.1093444824219
epoch:  123  loss:  406.41461181640625
epoch:  124  loss:  404.8984375
epoch:  125  loss:  405.539794921875


Epochs:  68%|██████▊   | 137/200 [00:06<00:01, 61.30it/s]

epoch:  126  loss:  407.5130310058594
epoch:  127  loss:  407.4278564453125
epoch:  128  loss:  402.0755615234375
epoch:  129  loss:  402.1451721191406
epoch:  130  loss:  402.386474609375
epoch:  131  loss:  405.47869873046875
epoch:  132  loss:  394.49072265625
epoch:  133  loss:  398.353759765625
epoch:  134  loss:  400.8099670410156
epoch:  135  loss:  396.1330261230469
epoch:  136  loss:  394.53656005859375
epoch:  137  loss:  397.3466796875
epoch:  138  loss:  397.70733642578125


Epochs:  76%|███████▌  | 151/200 [00:06<00:00, 60.98it/s]

epoch:  139  loss:  395.216064453125
epoch:  140  loss:  398.314453125
epoch:  141  loss:  395.197265625
epoch:  142  loss:  393.09503173828125
epoch:  143  loss:  397.183349609375
epoch:  144  loss:  393.8685607910156
epoch:  145  loss:  391.1815185546875
epoch:  146  loss:  392.2414245605469
epoch:  147  loss:  393.87994384765625
epoch:  148  loss:  390.54718017578125
epoch:  149  loss:  387.286865234375
epoch:  150  loss:  389.736083984375
epoch:  151  loss:  388.5830383300781


Epochs:  79%|███████▉  | 158/200 [00:06<00:00, 59.25it/s]

epoch:  152  loss:  389.41644287109375
epoch:  153  loss:  384.4251708984375
epoch:  154  loss:  384.6579284667969
epoch:  155  loss:  385.1009521484375
epoch:  156  loss:  383.4337158203125
epoch:  157  loss:  384.91632080078125
epoch:  158  loss:  388.1527099609375
epoch:  159  loss:  383.04254150390625
epoch:  160  loss:  383.5155334472656
epoch:  161  loss:  388.5513000488281
epoch:  162  loss:  380.39703369140625


Epochs:  86%|████████▌ | 171/200 [00:06<00:00, 57.10it/s]

epoch:  163  loss:  374.1988525390625
epoch:  164  loss:  378.568115234375
epoch:  165  loss:  377.6250915527344
epoch:  166  loss:  380.1463623046875
epoch:  167  loss:  375.5496826171875
epoch:  168  loss:  380.25164794921875
epoch:  169  loss:  378.221435546875
epoch:  170  loss:  381.31927490234375
epoch:  171  loss:  377.1251525878906
epoch:  172  loss:  379.8698425292969
epoch:  173  loss:  379.19085693359375
epoch:  174  loss:  377.0843200683594
epoch:  175  loss:  376.36199951171875


Epochs:  92%|█████████▏| 184/200 [00:07<00:00, 59.03it/s]

epoch:  176  loss:  382.3316955566406
epoch:  177  loss:  378.7127685546875
epoch:  178  loss:  378.74945068359375
epoch:  179  loss:  373.7747802734375
epoch:  180  loss:  373.53643798828125
epoch:  181  loss:  374.49932861328125
epoch:  182  loss:  372.39154052734375
epoch:  183  loss:  366.13671875
epoch:  184  loss:  365.71014404296875
epoch:  185  loss:  364.03631591796875
epoch:  186  loss:  366.6719970703125
epoch:  187  loss:  362.9066467285156
epoch:  188  loss:  360.530029296875


Epochs: 100%|██████████| 200/200 [00:07<00:00, 26.98it/s]

epoch:  189  loss:  372.97332763671875
epoch:  190  loss:  374.51824951171875
epoch:  191  loss:  358.698486328125
epoch:  192  loss:  366.5995178222656
epoch:  193  loss:  373.2900695800781
epoch:  194  loss:  373.1017761230469
epoch:  195  loss:  367.3129577636719
epoch:  196  loss:  370.6121826171875
epoch:  197  loss:  370.6998291015625
epoch:  198  loss:  361.46771240234375
epoch:  199  loss:  351.54949951171875
epoch:  200  loss:  360.9666748046875
Time: 7.201402000000006





In [None]:
# Link-prediction AUC over the test graph: for every test edge (i, j), the dot
# product of the two endpoint embeddings is compared with the dot product of i
# and a randomly drawn node. A win scores 1, a tie 0.5.

# Load the saved embeddings. The 0-based line index is used as the node id and
# a blank line means that node has no embedding.
node2vec = {}
with open('temp/embed.txt', 'rb') as f:
    for i, j in enumerate(f):
        if j.decode() != '\n':
            node2vec[i] = list(map(float, j.strip().decode().split(' ')))

# Load the test-graph edges: one tab-separated pair of integer node ids per line.
with open(os.path.join('temp/test_graph.txt'), 'rb') as f1:
    edges = [list(map(int, i.strip().decode().split('\t'))) for i in f1]
nodes = list(set([i for j in edges for i in j]))

# Only nodes that have an embedding can act as the random comparison node.
# Filtering once here yields the same uniform draw as rejection-sampling over
# all of `nodes` for every edge, without the repeated misses.
embedded_nodes = [n for n in nodes if n in node2vec]

a = 0  # accumulated score (1 per win, 0.5 per tie)
b = 0  # number of test edges that were scored
result = []  # [dot1, dot2] pair for every scored edge
for i, j in edges:
    if i in node2vec and j in node2vec:
        if len(embedded_nodes) < 2:
            # The only embedded node is j itself (a self-loop edge), so there
            # is nothing to draw; skip instead of looping forever.
            continue
        dot1 = np.dot(node2vec[i], node2vec[j])
        random_node = random.choice(embedded_nodes)
        while random_node == j:
            random_node = random.choice(embedded_nodes)
        dot2 = np.dot(node2vec[i], node2vec[random_node])
        result.append(np.asarray([dot1, dot2]))
        if dot1 > dot2:
            a += 1
        elif dot1 == dot2:
            a += 0.5
        b += 1

# Report 0 rather than dividing by zero when no test edge could be scored.
print("Auc value:", float(a) / b if b > 0 else 0)

Auc value: 0.8064516129032258


# ***Run (Multiple executions)***

In [None]:
graph_paths = ['/path/to/graph1.txt']
text_paths = ['/path/to/text1.txt', '/path/to/text2.txt']

# Log file to save execution details
log_file = 'DMTE_execution_logs.txt'


def _build_feed_dict(model, data, batch):
    """Build the placeholder feed for one batch of (node_a, node_b, node_neg) triples.

    Returns the feed dict plus the node_a and node_b id arrays, which the
    embedding-export pass uses to attribute each output row to its node.
    """
    node1, node2, node3 = (np.array(ids) for ids in zip(*batch))
    feed_dict = {
        model.Text_a: data.text[node1],
        model.Text_b: data.text[node2],
        model.Text_neg: data.text[node3],
        model.Node_a: node1,
        model.Node_b: node2,
        model.Node_neg: node3,
        model.P_a: sub_Mat(data.P, node1),
        model.P_b: sub_Mat(data.P, node2),
        model.P_neg: sub_Mat(data.P, node3),
    }
    return feed_dict, node1, node2


# Train one model and export its embeddings for each (graph_path, text_path) combination
for graph_path in graph_paths:
    for text_path in tqdm(text_paths, desc="Processing Text Files"):
        data = dataSet(text_path, graph_path)

        # Logging the execution details
        with open(log_file, 'a') as log:
            log.write(f'Processing graph_path: {graph_path}, text_path: {text_path}\n')

        with tf.Graph().as_default():
            # Used as a context manager the session is installed as the default
            # session and closed on exit, so runs do not accumulate open sessions.
            with tf.compat.v1.Session() as sess:
                model = Model(data.num_vocab, data.num_nodes, alpha, beta)
                opt = tf.compat.v1.train.AdamOptimizer(lr)
                train_op = opt.minimize(model.loss)
                sess.run(tf.compat.v1.global_variables_initializer())

                # Training
                print('start training.......')
                start_time = datetime.now()
                # Bound up front so the log line below is valid even if num_epoch
                # is 0 and never picks up a stale value from an earlier cell.
                loss_epoch = 0
                for epoch in tqdm(range(num_epoch), desc="Epochs"):
                    loss_epoch = 0
                    batches = data.generate_batches()
                    for batch in batches:
                        feed_dict = _build_feed_dict(model, data, batch)[0]

                        # run the graph
                        _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)
                        loss_epoch += loss_batch

                end_time = datetime.now()
                # The logged loss is the summed batch loss of the final epoch only.
                with open(log_file, 'a') as log:
                    log.write(f'Loss: {loss_epoch}, Time: {(end_time - start_time).total_seconds()}\n')

                # Collect embeddings before opening the output file, so a failure
                # during inference does not leave a truncated file behind.
                embed = [[] for _ in range(data.num_nodes)]
                batches = data.generate_batches(mode='add')
                for batch in batches:
                    feed_dict, node1, node2 = _build_feed_dict(model, data, batch)

                    # Fetch embeddings
                    convA, convB, TA, TB = sess.run(
                        [model.convA, model.convB, model.N_A, model.N_B],
                        feed_dict=feed_dict,
                    )

                    # For each node in the batch
                    for j in range(batch_size):
                        # An embedding is the concatenation of the text (convA/convB)
                        # and node (TA/TB) embeddings. A node can appear in many edges
                        # and gets a different embedding each time, so collect them all.
                        embed[node1[j]].append(list(convA[j]) + list(TA[j]))
                        embed[node2[j]].append(list(convB[j]) + list(TB[j]))

                # Save embeddings with a unique name
                embed_file = f'temp/embed_{os.path.basename(graph_path)}_{os.path.basename(text_path)}.txt'
                os.makedirs(os.path.dirname(embed_file), exist_ok=True)
                with open(embed_file, 'wb') as file:
                    for i in range(data.num_nodes):
                        if embed[i]:
                            # If a node has many different embeddings, take their mean.
                            tmp = np.mean(embed[i], axis=0)
                            file.write((' '.join(map(str, tmp)) + '\n').encode())
                        else:
                            # Keep line index == node id: nodes without an embedding get a blank line.
                            file.write('\n'.encode())

                # Log completion
                with open(log_file, 'a') as log:
                    log.write(f'Embeddings saved to: {embed_file}\n')


In [None]:
embed_files = ["embed1.txt", "embed2.txt", "embed3.txt"]
test_graph_file = "temp/test_graph.txt"
auc_log_file = "DMTE_auc_results.log"

# Initialize a log file to store the AUC results. The header lists the same
# three tab-separated columns that every result row below writes.
with open(auc_log_file, "w") as auc_file:
    auc_file.write("Embed File\tTest Graph File\tAUC Value\n")

# Load the edges from the test graph file once; they are the same for every
# embedding file. Each line is one tab-separated pair of integer node ids.
with open(test_graph_file, 'rb') as f1:
    edges = [list(map(int, i.strip().decode().split('\t'))) for i in f1]
nodes = list(set([i for j in edges for i in j]))

# Loop through each embed.txt file
for embed_file in tqdm(embed_files, desc="Processing embed files"):
    # The 0-based line index is used as the node id; a blank line means that
    # node has no embedding.
    node2vec = {}
    with open(embed_file, 'rb') as f:
        for i, j in enumerate(f):
            if j.decode() != '\n':
                node2vec[i] = list(map(float, j.strip().decode().split(' ')))

    # Only nodes with an embedding can act as the random comparison node.
    # Filtering once per file yields the same uniform draw as rejection-sampling
    # over all of `nodes` for every edge.
    embedded_nodes = [n for n in nodes if n in node2vec]

    a = 0  # accumulated score (1 per win, 0.5 per tie)
    b = 0  # number of test edges that were scored
    result = []  # [dot1, dot2] pair for every scored edge
    for i, j in edges:
        if i in node2vec and j in node2vec:
            if len(embedded_nodes) < 2:
                # The only embedded node is j itself (a self-loop edge), so
                # there is nothing to draw; skip instead of looping forever.
                continue
            dot1 = np.dot(node2vec[i], node2vec[j])
            random_node = random.choice(embedded_nodes)
            while random_node == j:
                random_node = random.choice(embedded_nodes)
            dot2 = np.dot(node2vec[i], node2vec[random_node])
            result.append(np.asarray([dot1, dot2]))
            if dot1 > dot2:
                a += 1
            elif dot1 == dot2:
                a += 0.5
            b += 1

    # Report 0 when no test edge could be scored for this embedding file.
    auc_value = float(a) / b if b > 0 else 0
    print(f"AUC value for {embed_file}: {auc_value}")

    # Log the result
    with open(auc_log_file, "a") as auc_file:
        auc_file.write(f"{embed_file}\t{test_graph_file}\t{auc_value}\n")