<a href="https://colab.research.google.com/github/GeorgeM2000/DMTE/blob/master/code/DMTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries & Tools***

In [None]:
import argparse
import os
import random
import numpy as np
import tensorflow as tf
import shutil
import zipfile
import gc


from math import pow
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import normalize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
!pwd

# ***Global Variables and General Functionality***

In [None]:
# --- Model / training hyper-parameters ---------------------------------------
MAX_LEN = 300               # token sequence length fed to the text placeholders
MAX_LENS = []               # filled later: one max length per text file (multiple execution)
neg_table_size = 1_000_000  # number of entries in the negative-sampling table
NEG_SAMPLE_POWER = 0.75     # exponent applied to node degree when building that table
batch_size = 64
num_epoch = 50              # Default: 200
embed_size = 200            # size of the final (text + node) embedding
word_embed_size = 200       # size of each word embedding
lr = 0.001                  # Adam learning rate

In [None]:
# ---- Dataset location --------------------------------------------------------
dataset_name = "arxiv"
parent_path = f'/content/datasets/{dataset_name}'

# ---- Input files (all looked up under parent_path) ----------------------------
graph_file = 'graph.txt'            # full edge list
categories_file = 'group.txt'       # node labels for node classification
data_text_file = "RAKE5.txt"        # node text used by the single-execution cells
data_text_files = ["RAKE5.txt", "data.txt", "YAKE5.txt"]  # node texts for multiple executions

# ---- Output files (written under parent_path/Results/DMTE) --------------------
log_file = 'DMTE_Execution_Logs.txt'
link_pred_results_file = 'DMTE_Link_Pred_Res.txt'
node_clf_results_file = 'DMTE_Node_Clf_Res.txt'

# ---- Pre-split graphs for link prediction --------------------------------------
split_graph_file = 'sgraph15.txt'   # training edges (single execution)
test_graph_file = 'tgraph85.txt'    # held-out edges (single execution)
split_graph_files = ['sgraph15.txt', 'sgraph45.txt', 'sgraph75.txt']
test_graph_files = ['tgraph85.txt', 'tgraph55.txt', 'tgraph25.txt']

# Fraction of edge lines kept by dataSet.load_edges() / prepareData()
ratio = 0.15

# Original parameters (weights of the 1st- and 2nd-order diffusion terms in Model)
alpha = 0.3
beta = 0.1

# Additional parameters (only referenced by the commented-out higher-order terms)
gamma = 0.0
delta = 0.0

In [None]:
# Node-classification experiment settings
clf_ratio = [0.15, 0.45, 0.75]  # training fractions to evaluate
clf_num = 5                     # repetitions per training fraction
train_classifier = True         # switch for the node-classification cell

In [None]:
# Scan every candidate text file and record its longest line, measured in
# whitespace-separated tokens. The result (+1) is stored in MAX_LENS in the
# same order as data_text_files.
#
# NOTE: the loop variable must NOT be named `tf`: at module level that would
# rebind the TensorFlow alias to a file-name string and break every later
# `tf.*` call in the notebook.
MAX_LENS.clear()  # keep the list aligned with data_text_files when the cell is re-run
for text_name in data_text_files:
    max_word_count = 0
    min_word_count = float('inf')

    with open(f'{parent_path}/{text_name}', 'r') as file:
        for line in file:
            word_count = len(line.split())
            max_word_count = max(max_word_count, word_count)
            min_word_count = min(min_word_count, word_count)

    MAX_LENS.append(max_word_count + 1)
    print(f'=== {text_name} ===')
    print("Max word count:", max_word_count)
    print("Min word count:", min_word_count)
    print()

gc.collect()

In [None]:
MAX_LENS

In [None]:
# Pick the sequence length for a single run: the last value appended to MAX_LENS.
# NOTE(review): MAX_LENS is filled in data_text_files order, so [-1] belongs to the
# last file in that list — confirm it matches the data_text_file actually used.
MAX_LEN = MAX_LENS[-1] # For single execution

Execute the code below if the data file with the abstracts is too large and needs to be extracted from a zip archive.

In [None]:
# Unpack the uploaded archive into the dataset directory
with zipfile.ZipFile('/content/PartialData.zip', mode='r') as archive:
    archive.extractall(path=parent_path)

print("Extraction complete!")

In [None]:
def sub_Mat(P, node):
    """Return the square sub-matrix of ``P`` restricted to the ids in ``node``.

    Entry ``[i, j]`` of the result equals ``P[node[i], node[j]]``. Ids may
    repeat; the output then repeats the corresponding rows/columns.

    Args:
        P: 2-D NumPy array indexed by node id on both axes.
        node: 1-D sequence (list, tuple or array) of integer node ids.

    Returns:
        A new float64 array of shape ``(len(node), len(node))``; it does not
        share memory with ``P``.
    """
    idx = np.asarray(node, dtype=np.intp)
    # np.ix_ builds the open mesh so a single fancy-indexing call replaces the
    # len(node)**2 Python-level element reads; astype always returns a copy.
    return P[np.ix_(idx, idx)].astype(np.float64)

In [None]:
# Integer zero vector with one entry per embedding dimension
zero_list = np.array([0] * embed_size)

In [None]:
def get_vectors_from_file(file_path):
  """Read one whitespace-separated float vector per line of ``file_path``.

  Returns a dict mapping the 0-based line number to the parsed list of
  floats; a blank line yields an empty list for that line number.
  """
  with open(f'{file_path}', "r") as handle:
      return {
          line_no: [float(token) for token in row.strip().split()]
          for line_no, row in enumerate(handle)
      }

Use the python code below only for node classification tasks

In [None]:
# Create the edge list. Store the unique nodes in the list "nodes"
with open(f'{parent_path}/{graph_file}', 'r') as f:
  eedges = f.readlines()

edge_list = []
nodes = [] # "nodes" will contain all the unique nodes of the graph
for ee in eedges:
  edge_list.append(list(ee.split()))
for ll in edge_list:
  for ed in ll:
    if ed not in nodes:
      nodes.append(ed)
    else:
      continue

In [None]:
len(nodes)

In [None]:
len(edge_list)

# ***DataSet***

In [None]:
class dataSet:
    """Text, edges and transition matrix of one textual network.

    Attributes set by ``__init__``:
        edges: list of ``[node_a, node_b]`` integer pairs kept by ``load_edges``.
        text: 2-D integer array, one row of token ids per line of the text file.
        num_vocab: vocabulary size reported by the vectorisation layer.
        num_nodes: number of lines in the text file.
        nodes: ``range(num_nodes)``.
        negative_table: list returned by ``InitNegTable(edges)``.
        P: row-normalised (L1) adjacency matrix of shape (num_nodes, num_nodes).
    """

    def __init__(self, text_path, graph_path, labels_path=None):
        # labels_path is accepted but not used by this class.
        text_file, graph_file = self.load(text_path, graph_path)
        self.edges = self.load_edges(graph_file)
        self.text, self.num_vocab, self.num_nodes = self.load_text(text_file)
        self.nodes = range(0, self.num_nodes)
        self.negative_table = InitNegTable(self.edges)
        self.P = self.P_matrix(self.edges, self.num_nodes)

    def load(self, text_path, graph_path):
        """Return the raw byte lines of the text file and of the graph file."""
        # Context managers close both handles even if reading fails.
        with open(text_path, 'rb') as text_handle:
            text_file = text_handle.readlines()
        with open(graph_path, 'rb') as graph_handle:
            graph_file = graph_handle.readlines()

        return text_file, graph_file

    def load_edges(self, graph_file):
        """Parse tab-separated integer edges, keeping each line with probability ``ratio``.

        NOTE(review): the global ``ratio`` sub-samples whatever graph file is
        passed in, including files that are already a split — confirm this
        double sampling is intended.
        """
        edges = []
        for line in graph_file:
            if np.random.uniform(0.0, 1.0) <= ratio:
                edges.append(list(map(int, line.strip().decode().split('\t'))))

        return edges

    # This method has been modified to be compatible with newer versions
    def load_text(self, text_file):
        """Convert the text lines into fixed-length rows of integer token ids.

        ``vectorize_layer.adapt(text_data)`` builds the vocabulary from the
        data; calling the layer then maps every line to a row of token ids
        whose length is ``MAX_LEN`` (``output_sequence_length``).

        Returns:
            (text, num_vocab, num_nodes): the 2-D id array, the vocabulary
            size and the number of rows.
        """
        vectorize_layer = tf.keras.layers.TextVectorization(
            max_tokens=None,  # Set a limit if needed
            output_mode='int',
            output_sequence_length=MAX_LEN
        )

        text_data = [line.strip() for line in text_file]

        vectorize_layer.adapt(text_data)

        text = vectorize_layer(text_data).numpy()

        num_vocab = len(vectorize_layer.get_vocabulary())
        num_nodes = len(text)

        return text, num_vocab, num_nodes

    def negative_sample(self, edges):
        """Attach one negative node to every edge.

        Args:
            edges: iterable of ``[node_a, node_b]`` pairs.

        Returns:
            List of ``[node_a, node_b, neg_node]`` triples where ``neg_node``
            is drawn from ``self.negative_table`` and differs from both
            endpoints. An empty input yields an empty list.
        """
        table = self.negative_table
        table_size = len(table)
        sample_edges = []

        for node_a, node_b in edges:
            neg_node = table[random.randrange(table_size)]
            # Redraw until the negative node is neither endpoint of the edge.
            while neg_node == node_a or neg_node == node_b:
                neg_node = table[random.randrange(table_size)]
            sample_edges.append([node_a, node_b, neg_node])

        return sample_edges

    @staticmethod
    def _pad_to_batch_multiple(items):
        """Return a new list padded (cyclically) to a multiple of ``batch_size``."""
        base = list(items)
        if not base:
            return base
        missing = -len(base) % batch_size  # 0 when already a multiple
        return base + [base[k % len(base)] for k in range(missing)]

    def generate_batches(self, mode=None):
        """Split the edges into batches of ``batch_size`` negative-sampled triples.

        Args:
            mode: ``'add'`` keeps the edge order and pads the last batch by
                repeating edges from the start, so every edge is covered
                (used when exporting embeddings). Any other value shuffles
                ``self.edges`` in place and drops the incomplete last batch
                (used for training).

        Returns:
            List of batches; each batch is a list of ``batch_size`` triples.
        """
        if mode == 'add':
            # Pad a copy: extending self.edges itself made the edge list grow
            # on every call, and an already-full edge list needs no extra batch.
            edges = self._pad_to_batch_multiple(self.edges)
        else:
            edges = self.edges
            random.shuffle(edges)

        num_batch = len(edges) // batch_size
        sample_edges = self.negative_sample(edges[:num_batch * batch_size])

        # Batch k covers positions k*batch_size .. (k+1)*batch_size - 1
        return [sample_edges[k * batch_size:(k + 1) * batch_size]
                for k in range(num_batch)]

    def nodes_batches(self, mode=None):
        """Split the shuffled node ids into batches of ``batch_size``.

        Args:
            mode: ``'add'`` pads with repeated ids so that every node appears
                in some batch; otherwise the incomplete last batch is dropped.

        Returns:
            List of batches; each batch is a list of ``batch_size`` node ids.
        """
        # self.nodes is a range, which supports neither extend() nor in-place
        # shuffling, so always work on a list copy.
        if mode == 'add':
            nodes = self._pad_to_batch_multiple(self.nodes)
        else:
            nodes = list(self.nodes)
        random.shuffle(nodes)

        num_batch = len(nodes) // batch_size
        return [nodes[k * batch_size:(k + 1) * batch_size]
                for k in range(num_batch)]

    def P_matrix(self, edges, num_nodes):
        """Build the row-normalised transition matrix of the undirected graph.

        Every edge sets both ``P[a, b]`` and ``P[b, a]`` to 1 (unweighted
        graph); rows are then L1-normalised.
        """
        P = np.zeros((num_nodes, num_nodes))

        for node_a, node_b in edges:
            P[node_a, node_b] = 1
            P[node_b, node_a] = 1

        return normalize(P, axis=1, norm='l1')

# ***DMTE***

In [None]:
class Model:
    """Graph-mode (``tf.compat.v1``) DMTE model.

    Builds placeholders for one batch of (node a, node b, negative node)
    triples, the word/node embedding tables, the diffusion-weighted text
    embeddings and the training loss.

    Args:
        vocab_size: number of rows of the word embedding table.
        num_nodes: number of rows of the node embedding table.
        alpha: weight of the first-order diffusion term.
        beta: weight of the second-order diffusion term.
        num_labels: accepted but not used.

    Attributes used by callers: the placeholders ``Text_a/Text_b/Text_neg``,
    ``Node_a/Node_b/Node_neg``, ``P_a/P_b/P_neg``; the outputs ``convA``,
    ``convB``, ``convNeg``, ``N_A``, ``N_B``, ``N_NEG`` and ``loss``.
    """

    def __init__(self, vocab_size, num_nodes, alpha, beta, num_labels=None):
        # '''hyperparameter'''
        with tf.name_scope('read_inputs') as scope:
            # Token ids of the texts of node a / node b / negative node
            self.Text_a = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Ta')
            self.Text_b = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tb')
            self.Text_neg = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tneg')
            # Node ids
            self.Node_a = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n1')
            self.Node_b = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n2')
            self.Node_neg = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n3')
            # Batch-restricted transition sub-matrices
            self.P_a = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pa')
            self.P_b = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pb')
            self.P_neg = tf.compat.v1.placeholder(tf.float32, [batch_size, batch_size], name='Pneg')

        with tf.name_scope('initialize_embedding') as scope:
            self.text_embed = tf.Variable(tf.random.truncated_normal([vocab_size, word_embed_size], stddev=0.3))
            self.node_embed = tf.Variable(tf.random.truncated_normal([num_nodes, embed_size // 2], stddev=0.3))
            # Lookups below read the row-wise norm-clipped tensor, not the raw variable
            self.node_embed = tf.clip_by_norm(self.node_embed, clip_norm=1, axes=1)

        with tf.name_scope('lookup_embeddings') as scope:
            self.TA = tf.nn.embedding_lookup(self.text_embed, self.Text_a)
            self.T_A = tf.expand_dims(self.TA, -1)

            self.TB = tf.nn.embedding_lookup(self.text_embed, self.Text_b)
            self.T_B = tf.expand_dims(self.TB, -1)

            self.TNEG = tf.nn.embedding_lookup(self.text_embed, self.Text_neg)
            self.T_NEG = tf.expand_dims(self.TNEG, -1)

            self.N_A = tf.nn.embedding_lookup(self.node_embed, self.Node_a)
            self.N_B = tf.nn.embedding_lookup(self.node_embed, self.Node_b)
            self.N_NEG = tf.nn.embedding_lookup(self.node_embed, self.Node_neg)

        self.alpha = alpha
        self.beta = beta
        self.convA, self.convB, self.convNeg = self.conv()
        self.loss = self.compute_loss()

    def _text_diffusion(self, P, H, W0, W1, W2):
        """Return ``H W0 + alpha (P H) W1 + beta (P∘P H) W2``.

        NOTE(review): ``tf.math.square(P)`` is the element-wise square of P,
        not the matrix product ``P @ P`` — confirm which one is intended.
        Higher-order terms (gamma, delta with extra weight matrices) can be
        appended here in the same way.
        """
        return (tf.matmul(H, W0) +
                self.alpha * tf.matmul(tf.matmul(P, H), W1) +
                self.beta * tf.matmul(tf.matmul(tf.math.square(P), H), W2))

    def _node_diffusion(self, P, N):
        """Return ``N + alpha P N + beta (P∘P) N`` for node embeddings ``N``."""
        return (N +
                self.alpha * tf.matmul(P, N) +
                self.beta * tf.matmul(tf.math.square(P), N))

    @staticmethod
    def _log_sigmoid(x, y, negative=False):
        """Row-wise ``log(sigmoid(±<x, y>) + 0.001)``; the sign is ``-`` for negatives."""
        score = tf.reduce_sum(tf.multiply(x, y), 1)
        if negative:
            score = -score
        return tf.math.log(tf.nn.sigmoid(score) + 0.001)

    def conv(self):
        """Build the text embeddings of the a, b and negative nodes.

        Each text is the tanh of the mean of its word embeddings, combined
        with its diffusion terms through the shared weights W0, W1, W2.

        Returns:
            (attA, attB, attNEG), each of shape [batch_size, embed_size // 2].
        """
        W0 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        W1 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))
        W2 = tf.Variable(tf.random.truncated_normal([word_embed_size, embed_size // 2], stddev=0.3))

        # Mean over the sequence axis, then drop the singleton axes
        mA = tf.reduce_mean(self.T_A, axis=1, keepdims=True)
        mB = tf.reduce_mean(self.T_B, axis=1, keepdims=True)
        mNEG = tf.reduce_mean(self.T_NEG, axis=1, keepdims=True)

        convA = tf.tanh(tf.squeeze(mA))
        convB = tf.tanh(tf.squeeze(mB))
        convNEG = tf.tanh(tf.squeeze(mNEG))

        attA = self._text_diffusion(self.P_a, convA, W0, W1, W2)
        attB = self._text_diffusion(self.P_b, convB, W0, W1, W2)
        # The negative texts are diffused with the negative nodes' own
        # sub-matrix; previously P_a was used here and the fed P_neg
        # placeholder was never read.
        attNEG = self._text_diffusion(self.P_neg, convNEG, W0, W1, W2)

        return attA, attB, attNEG

    def compute_loss(self):
        """Return the scalar training loss.

        The loss is the negated sum, over the batch, of weighted
        log-sigmoid terms for four pairings (text-text, node-node,
        node-text, text-node), each in both directions (a→b and b→a) with
        one positive and one negative term.
        """
        pos = self._log_sigmoid

        def neg(x, y):
            return self._log_sigmoid(x, y, negative=True)

        # Diffused node embeddings are shared by the node-node and node-text terms
        diffA = self._node_diffusion(self.P_a, self.N_A)
        diffB = self._node_diffusion(self.P_b, self.N_B)

        # Text-Text
        p1 = pos(self.convA, self.convB)
        p2 = neg(self.convA, self.convNeg)
        p11 = pos(self.convB, self.convA)
        p12 = neg(self.convB, self.convNeg)

        # Node-Node
        p3 = pos(diffA, self.N_B)
        p4 = neg(diffA, self.N_NEG)
        p13 = pos(diffB, self.N_A)
        p14 = neg(diffB, self.N_NEG)

        # Node-Text
        p5 = pos(diffA, self.convB)
        p6 = neg(diffA, self.convNeg)
        p15 = pos(diffB, self.convA)
        p16 = neg(diffB, self.convNeg)

        # Text-Node
        p7 = pos(self.convA, self.N_B)
        p8 = neg(self.convA, self.N_NEG)
        p17 = pos(self.convB, self.N_A)
        p18 = neg(self.convB, self.N_NEG)

        # Relative weights of text-text, node-node and the two mixed pairings
        rho1 = 0.7
        rho2 = 1.0
        rho3 = 0.1
        temp_loss = rho1 * (p1 + p2 + p11 + p12) + rho2 * (p3 + p4 + p13 + p14) + rho3 * (p5 + p6 + p15 + p16) + rho3 * (p7 + p8 + p17 + p18)
        loss = -tf.reduce_sum(temp_loss)
        return loss

# ***Negative Sample***

In [None]:
def InitNegTable(edges):
    a_list, b_list = zip(*edges)
    a_list = list(a_list)
    b_list = list(b_list)
    node = a_list
    node.extend(b_list)

    node_degree = {}
    for i in node:
        if i in node_degree:
            node_degree[i] += 1
        else:
            node_degree[i] = 1
    sum_degree = 0
    for i in node_degree.values():
        sum_degree += pow(i, 0.75)

    por = 0
    cur_sum = 0
    vid = -1
    neg_table = []
    degree_list = list(node_degree.values())
    node_id = list(node_degree.keys())
    for i in range(neg_table_size):
        if ((i + 1) / float(neg_table_size)) > por:
            cur_sum += pow(degree_list[vid + 1], NEG_SAMPLE_POWER)
            por = cur_sum / sum_degree
            vid += 1
        neg_table.append(node_id[vid])
    print(f'Neg. table size: {len(neg_table)}')
    return neg_table


# ***Classify***

In [None]:
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier whose ``predict`` keeps the k most probable classes per sample."""

    def predict(self, X, top_k_list):
        """Return a 0/1 indicator row for each entry of ``top_k_list``.

        Row ``n`` marks the ``top_k_list[n]`` classes with the highest
        predicted probability for sample ``n`` of ``X``.
        """
        scores = np.asarray(super().predict_proba(X))
        indicator_rows = []
        for sample_idx, top_k in enumerate(top_k_list):
            row = scores[sample_idx, :]
            # Classes of the top_k largest probabilities (argsort is ascending)
            chosen = self.classes_[row.argsort()[-top_k:]].tolist()
            row[:] = 0
            row[chosen] = 1
            indicator_rows.append(row)
        return np.asarray(indicator_rows)


class Classifier(object):
    """Multi-label node classifier on top of precomputed node embeddings."""

    def __init__(self, vectors, clf):
        # vectors: mapping from integer node id to its embedding vector
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on nodes ``X`` with labels ``Y``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[int(node)] for node in X]  # embedding of every training node
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def evaluate(self, X, Y):
        """Return ``{"micro": f1, "macro": f1}`` for nodes ``X`` with true labels ``Y``."""
        # Predict as many labels per node as the node truly has
        top_k_list = [len(labels) for labels in Y]
        predicted = self.predict(X, top_k_list)
        expected = self.binarizer.transform(Y)
        return {
            average: f1_score(expected, predicted, average=average)
            for average in ("micro", "macro")
        }

    def predict(self, X, top_k_list):
        """Return the top-k indicator matrix for the nodes in ``X``."""
        features = np.asarray([self.embeddings[int(node)] for node in X])
        return self.clf.predict(features, top_k_list=top_k_list)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle with ``seed``, train on the first ``train_precent`` share, evaluate on the rest.

        The global NumPy random state is saved before seeding and restored
        after training, so the call does not disturb other random draws.
        """
        saved_state = np.random.get_state()
        training_size = int(train_precent * len(X))
        np.random.seed(seed)
        order = np.random.permutation(np.arange(len(X)))
        train_idx = order[:training_size]
        test_idx = order[training_size:]

        X_train = [X[k] for k in train_idx]
        Y_train = [Y[k] for k in train_idx]
        X_test = [X[k] for k in test_idx]
        Y_test = [Y[k] for k in test_idx]

        self.train(X_train, Y_train, Y)  # Y holds the labels of all nodes
        np.random.set_state(saved_state)
        return self.evaluate(X_test, Y_test)



def load_embeddings(filename):
    """Load embeddings from a text file with a ``<node_num> <size>`` header.

    Every following line is ``<node> v1 ... v<size>`` separated by single
    spaces.

    Returns:
        Dict mapping the node token (str) to its list of floats.

    Raises:
        AssertionError: if a line does not have ``size`` values or the number
            of distinct nodes differs from ``node_num``.
    """
    # The context manager closes the file even when parsing or an assert fails.
    with open(filename, 'r') as fin:
        node_num, size = [int(x) for x in fin.readline().strip().split()]
        vectors = {}
        for l in fin:
            vec = l.strip().split(' ')
            assert len(vec) == size + 1
            vectors[vec[0]] = [float(x) for x in vec[1:]]
    assert len(vectors) == node_num
    return vectors

def read_node_label(filename):
    """Read ``<node> <label> [<label> ...]`` lines separated by single spaces.

    Returns:
        X: node tokens in file order.
        Y: list of label lists, aligned with X.
        XY_dic: first label -> list of nodes having it as first label.
        X_Y_dic: node -> its first label.

    Raises:
        IndexError: if a line carries no label.
    """
    X = []
    Y = []
    XY_dic = {}
    X_Y_dic = {}
    # The context manager closes the file even when a malformed line raises.
    with open(filename, 'r') as fin:
        for l in fin:
            vec = l.strip().split(' ')
            node, labels = vec[0], vec[1:]
            X.append(node)
            Y.append(labels)
            first_label = str(labels[0])
            X_Y_dic[str(node)] = first_label
            XY_dic.setdefault(first_label, []).append(str(node))
    return X, Y, XY_dic, X_Y_dic

# ***Run (Single execution)***

In [None]:
def prepareData(graph, ratio):
  """Randomly split the edge lines of ``graph`` into a train and a test file.

  ``int(len(edges) * ratio)`` lines, rounded down to a multiple of
  ``batch_size``, are written to ``temp/graph.txt``; every line not equal
  to a selected one is written to ``temp/test_graph.txt``. An existing
  ``temp`` directory is deleted first.

  Args:
    graph: file name of the edge list, relative to ``parent_path``.
    ratio: fraction of edge lines to select for training.
  """
  with open(f'{parent_path}/{graph}', 'rb') as f:
    edges = f.readlines()

  selected = int(len(edges) * float(ratio))
  selected = selected - selected % batch_size
  selected = random.sample(edges, selected)

  # Set membership keeps the split linear in the number of edges
  selected_lines = set(selected)
  remain = [line for line in edges if line not in selected_lines]

  temp_dir = Path('temp')
  try:
    # Check if the directory exists, if so, delete it
    if temp_dir.exists() and temp_dir.is_dir():
        shutil.rmtree(temp_dir)
        print("Existing directory deleted.")

    # Create the directory
    temp_dir.mkdir(parents=True, exist_ok=True)
    print("Directory created successfully.")

  except OSError as e:
      # Best effort: report and continue; the writes below fail if the
      # directory is really unusable.
      print(f"An error occurred: {e}")

  with open('temp/graph.txt', 'wb') as f:
    f.writelines(selected)

  with open('temp/test_graph.txt', 'wb') as f:
    f.writelines(remain)

In [None]:
# Create temp/graph.txt and temp/test_graph.txt from the full graph using the global ratio
prepareData(graph_file, ratio)

In [None]:
# Load data: node texts plus the training split of the graph
#graph_path = os.path.join('/content/temp/graph.txt') # Use this if you executed the prepareData() function

data = dataSet(f'{parent_path}/{data_text_file}',
               f'{parent_path}/{split_graph_file}')

# Saving embeddings: the output file name encodes the graph split and the text file
# (each taken up to its first '.')
embed_file = f"{parent_path}/Results/DMTE/embed_link_pred_{split_graph_file.split('.')[0]}_{data_text_file.split('.')[0]}.txt"
#embed_file = f"{parent_path}/Results/DMTE/embed_node_clf_{graph_file.split('.')[0]}_{data_text_file.split('.')[0]}.txt" # For node classification the whole graph ('graph.txt') is used

In [None]:
def _batch_feed(model, data, batch):
    """Build the feed_dict for one batch of [node_a, node_b, node_neg] triples.

    Returns:
        (feed_dict, node_a_ids, node_b_ids) where the ids are NumPy arrays.
    """
    node1, node2, node3 = (np.array(column) for column in zip(*batch))
    feed_dict = {
        model.Text_a: data.text[node1],
        model.Text_b: data.text[node2],
        model.Text_neg: data.text[node3],
        model.Node_a: node1,
        model.Node_b: node2,
        model.Node_neg: node3,
        model.P_a: sub_Mat(data.P, node1),
        model.P_b: sub_Mat(data.P, node2),
        model.P_neg: sub_Mat(data.P, node3),
    }
    return feed_dict, node1, node2


# The session is used as a context manager so its resources are released
# when training and embedding extraction are done (or fail).
with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    model = Model(data.num_vocab, data.num_nodes, alpha, beta)
    opt = tf.compat.v1.train.AdamOptimizer(lr)
    train_op = opt.minimize(model.loss)
    sess.run(tf.compat.v1.global_variables_initializer())

    # training
    print('start training.......')
    start_time = datetime.now()
    for epoch in tqdm(range(num_epoch), desc="Epochs"):
        loss_epoch = 0
        for batch in data.generate_batches():
            feed_dict, _, _ = _batch_feed(model, data, batch)

            # run the graph
            _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)
            loss_epoch += loss_batch

        #print('epoch: ', epoch + 1, ' loss: ', loss_epoch)

    end_time = datetime.now()
    print(f'Total time: {((end_time - start_time).total_seconds()) / 60.0} min')

    # Collect embeddings: a node can appear in many edges and gets one
    # embedding per appearance; they are averaged below.
    embed = [[] for _ in range(data.num_nodes)]
    for batch in data.generate_batches(mode='add'):
        feed_dict, node1, node2 = _batch_feed(model, data, batch)

        # Fetch text embeddings (convA/convB) and node embeddings (TA/TB)
        convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B], feed_dict=feed_dict)

        for j in range(batch_size):
            # Final embedding = text embedding concatenated with node embedding
            embed[node1[j]].append(list(convA[j]) + list(TA[j]))
            embed[node2[j]].append(list(convB[j]) + list(TB[j]))

# Saving embeddings: line i holds the mean embedding of node i, or is empty
# when node i occurs in no edge.
embed_dir = os.path.dirname(embed_file)
if embed_dir:
    os.makedirs(embed_dir, exist_ok=True)  # the results folder may not exist yet

with open(embed_file, 'wb') as f:
    for node_vectors in embed:
        if node_vectors:
            tmp = np.sum(node_vectors, axis=0) / len(node_vectors)
            f.write((' '.join(map(str, tmp)) + '\n').encode())
        else:
            f.write('\n'.encode())
            #f.write((' '.join(map(str, zero_list)) + '\n').encode()) # For node classification

gc.collect()

## ***Link Prediction***

In [None]:
# Load the saved embeddings: line i of embed_file belongs to node i; blank
# lines mark nodes without an embedding and are skipped.
node2vec = {}
with open(embed_file, 'rb') as f:
  for i, j in enumerate(f):
    if j.strip():
      node2vec[i] = list(map(float, j.strip().decode().split(' ')))


# Held-out edges, one tab-separated integer pair per line
with open(os.path.join(f'{parent_path}/{test_graph_file}'), 'rb') as f:
  edges = [list(map(int, i.strip().decode().split('\t'))) for i in f]


# Candidate negative nodes: test-graph nodes that have an embedding.
# Kept under its own name so the string node list `nodes` built from the
# full graph (used by the node-classification cell) is not overwritten.
test_nodes = sorted({n for e in edges for n in e if n in node2vec})

a = 0  # edges ranked above their random negative (ties count 0.5)
b = 0  # edges scored
for i, j in edges:
  if i not in node2vec or j not in node2vec:
    continue
  if all(n == j for n in test_nodes[:2]):
    # No candidate other than j exists; resampling would never terminate.
    continue
  dot1 = np.dot(node2vec[i], node2vec[j])
  random_node = random.choice(test_nodes)
  while random_node == j:
    random_node = random.choice(test_nodes)
  dot2 = np.dot(node2vec[i], node2vec[random_node])
  if dot1 > dot2:
    a += 1
  elif dot1 == dot2:
    a += 0.5
  b += 1

if b:
  print("Auc value:", float(a) / b)
else:
  print("Auc value: undefined (no test edge has embeddings for both endpoints)")

Auc value: 0.8064516129032258


## ***Node Classification***

In [None]:
with open(f'{parent_path}/{categories_file}', 'r') as f:
  tags = f.readlines() # One line per node, of the form: nodeID     label

if train_classifier:

  clf_test_len = len(nodes) # The number of nodes will be the same in each run since we're using the whole graph and thus, all of its nodes
  print('Train classifier start!')

  node_ids = set(nodes) # Set membership instead of scanning the node list for every index
  new_vector = get_vectors_from_file(embed_file)

  X = []
  Y = []
  for jk in range(clf_test_len):
    if str(jk) not in node_ids: # Skip indices that are not nodes of the graph
      continue
    tag_list = tags[jk].strip().split() # For node "jk", take this info: jk     label
    if len(tag_list) < 2: # No label on this line
      continue
    if np.any(new_vector[jk]): # Keep only nodes whose embedding has a non-zero entry
      X.append(jk)
      # Take the first label (if there are multiple) of node "jk". It is wrapped
      # in a list because MultiLabelBinarizer and Classifier.evaluate expect one
      # iterable of labels per node; a bare string would be split into characters.
      Y.append([tag_list[1]])

  # This part of the code uses only the X and Y lists created above
  embed_name = embed_file.split('/')[-1] # Computed outside the f-string: nested same-type quotes need Python 3.12+
  with open(f'{parent_path}/Results/DMTE/{node_clf_results_file}', 'a') as f:
    f.write(f'{embed_name} \n')
    print(embed_name)
    for train_ratio in clf_ratio: # Experiment with each ratio
      li1 = []
      li2 = []
      for j in range(clf_num):

        clf = Classifier(vectors=new_vector, # All node embeddings
                        clf=LogisticRegression())

        # A different seed per repetition; with the default seed every
        # repetition would evaluate the very same split.
        result = clf.split_train_evaluate(X, Y, train_ratio, seed=j)

        # Results
        li1.append(result['micro'])
        li2.append(result['macro'])

      mi = {f'{train_ratio}-micro': sum(li1) / clf_num}
      ma = {f'{train_ratio}-macro': sum(li2) / clf_num}

      print(mi)
      print(ma)
      print()

      f.write(f'{mi}{ma}\n')

# ***Run (Multiple executions)***

In [None]:
def _make_feed_dict(model, data, batch):
    """Build the placeholder feed for one batch of (node_a, node_b, node_neg) triples.

    Returns the feed dict together with the node_a and node_b index arrays,
    which the caller needs to map fetched embeddings back to node ids.
    """
    node1, node2, node3 = zip(*batch)
    node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
    feed_dict = {
        model.Text_a: data.text[node1],
        model.Text_b: data.text[node2],
        model.Text_neg: data.text[node3],
        model.Node_a: node1,
        model.Node_b: node2,
        model.Node_neg: node3,
        model.P_a: sub_Mat(data.P, node1),
        model.P_b: sub_Mat(data.P, node2),
        model.P_neg: sub_Mat(data.P, node3)
    }
    return feed_dict, node1, node2


# Train one model per (graph file, text file) pair and save its node embeddings.
# NOTE(review): an earlier cell uses `tf` as a loop variable over the text files;
# make sure `tf` still refers to TensorFlow when this cell runs.
for gf in split_graph_files:  # For link prediction. For node classification just use: for gf in ['graph.txt']:
    for t, txtf in enumerate(data_text_files):

        MAX_LEN = MAX_LENS[t]
        print(f'The maximum length is: {MAX_LEN}')

        data = dataSet(f'{parent_path}/{txtf}', f'{parent_path}/{gf}')

        # Logging the execution details
        with open(f'{parent_path}/Results/DMTE/{log_file}', 'a') as f:
            f.write(f'Processing graph: {gf}, text: {txtf}\n')

        print(f'Processing graph: {gf}, text: {txtf}')

        with tf.Graph().as_default():
            # Using the session as a context manager installs it as the default
            # session AND closes it on exit, so its resources are released
            # before the next (graph, text) run starts.
            with tf.compat.v1.Session() as sess:
                model = Model(data.num_vocab, data.num_nodes, alpha, beta)
                opt = tf.compat.v1.train.AdamOptimizer(lr)
                train_op = opt.minimize(model.loss)
                sess.run(tf.compat.v1.global_variables_initializer())

                # Training
                print('start training.......')
                start_time = datetime.now()
                for epoch in tqdm(range(num_epoch)):
                    loss_epoch = 0  # Summed batch losses of this epoch (handy when debugging)
                    batches = data.generate_batches()
                    for batch in batches:
                        feed_dict, _, _ = _make_feed_dict(model, data, batch)

                        # run the graph
                        _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)
                        loss_epoch += loss_batch

                end_time = datetime.now()
                with open(f'{parent_path}/Results/DMTE/{log_file}', 'a') as f:
                    f.write(f'Time: {((end_time - start_time).total_seconds()) / 60.0}\n')

                print(f'Total Time: {((end_time - start_time).total_seconds()) / 60.0} min')

                # Save embeddings with a unique name
                embed_file = f"{parent_path}/Results/DMTE/embed_link_pred_{gf.split('.')[0]}_{txtf.split('.')[0]}.txt"
                #embed_file = f"{parent_path}/Results/DMTE/embed_node_clf_{gf.split('.')[0]}_{txtf.split('.')[0]}.txt"

                # Collect the embeddings first and open the output file afterwards,
                # so a failure during extraction does not leave a truncated file behind.
                batches = data.generate_batches(mode='add')
                embed = [[] for _ in range(data.num_nodes)]

                for batch in batches:
                    feed_dict, node1, node2 = _make_feed_dict(model, data, batch)

                    # Fetch embeddings
                    convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B], feed_dict=feed_dict)

                    # For each edge in the batch (use the real batch length rather than
                    # assuming every batch holds exactly `batch_size` edges)
                    for j in range(len(node1)):
                        # Embedding = text part (conv*) concatenated with node part (T*).
                        # A node can appear in many edges and gets one embedding per occurrence.
                        embed[node1[j]].append(list(convA[j]) + list(TA[j]))
                        embed[node2[j]].append(list(convB[j]) + list(TB[j]))

                with open(embed_file, 'wb') as f:
                    for node_embeds in embed:
                        if node_embeds:
                            # If a node has many different embeddings, take their mean.
                            tmp = np.sum(node_embeds, axis=0) / len(node_embeds)
                            f.write((' '.join(map(str, tmp)) + '\n').encode())
                        else:
                            # Node never appeared in an edge: write an empty line
                            f.write('\n'.encode())
                            #f.write((' '.join(map(str, zero_list)) + '\n').encode()) # For node classification

                # Log completion
                with open(f'{parent_path}/Results/DMTE/{log_file}', 'a') as f:
                    f.write(f'Embeddings saved to: {embed_file}\n')

        gc.collect()

## ***Node Classification***

In [None]:
# Node classification for every embedding file listed in `embed_files`.
# For each file and each train ratio in `clf_ratio`, a logistic-regression
# classifier is trained/evaluated `clf_num` times and the averaged micro/macro
# scores are printed and appended to the node classification results file.
embed_files = [f'{parent_path}/Results/DMTE/embed_node_clf_graph_data.txt']

with open(f'{parent_path}/{categories_file}', 'r') as f:
    tags = f.readlines()  # One raw text line per entry; each line is split below as: nodeID label [label ...]

if train_classifier:

    clf_test_len = len(nodes)  # Candidate node indices are 0 .. len(nodes) - 1
    print('Train classifier start!')

    # Set membership is O(1); the original list scan was O(n) per node.
    # NOTE(review): membership is tested with str(jk), so `nodes` must hold string ids here;
    # if it holds ints (as built in the link prediction cells) nothing matches — confirm.
    node_set = set(nodes)

    for ef in embed_files:
        X = []  # Node indices used as samples
        Y = []  # First label of each sampled node
        new_vector = get_vectors_from_file(ef)

        for jk in range(0, clf_test_len):
            if str(jk) in node_set:  # If the index "jk" is a node
                tag_list = tags[jk].strip().split()  # For node "jk": [nodeID, label, ...]
                # Require at least one label after the node id; a line holding only
                # the id (or an empty line) is skipped instead of raising IndexError.
                if len(tag_list) > 1:
                    # Keep the node only if its embedding differs from the all-zero placeholder
                    # (compares "has any non-zero entry" of the embedding with that of zero_list)
                    if np.array(new_vector[jk]).any() != np.array(zero_list).any():
                        X.append(jk)
                        Y.append(tag_list[1])  # Take the first label (if there are multiple) of node "jk"

        # This part of the code uses only the X and Y lists created above
        # The file name is extracted outside the f-string: reusing the enclosing quote
        # inside an f-string expression is a SyntaxError before Python 3.12.
        embed_name = ef.split('/')[-1]
        with open(f'{parent_path}/Results/DMTE/{node_clf_results_file}', 'a') as f:
            f.write(f'{embed_name} \n')
            print(embed_name)
            for train_ratio in clf_ratio:  # Experiment with each ratio
                li1 = []  # Micro scores of the repetitions
                li2 = []  # Macro scores of the repetitions
                for _ in range(clf_num):
                    clf = Classifier(vectors=new_vector,  # All node embeddings
                                     clf=LogisticRegression())

                    result = clf.split_train_evaluate(X, Y, train_ratio)

                    # Results
                    li1.append(result['micro'])
                    li2.append(result['macro'])

                # Average over the repetitions
                mi = {str(train_ratio) + '-micro': sum(li1) / clf_num}
                ma = {str(train_ratio) + '-macro': sum(li2) / clf_num}

                print(mi)
                print(ma)
                print()

                f.write(str(mi) + str(ma))
                f.write('\n')

        gc.collect()

## ***Link Prediction***

In [None]:
# Link prediction over several test graphs.
# embed_files[k] lists the embedding files to evaluate against test_graph_files[k].
embed_files = [[f'{parent_path}/Results/DMTE/embed_link_pred_sgraph15_RAKE5.txt']]

# Initialize a log file to store the AUC results (header matches the three columns written below)
with open(f'{parent_path}/Results/DMTE/{link_pred_results_file}', "a") as f:
    f.write("Embed File\tTest Graph\tAUC Value\n")

# zip() pairs each test graph with its embedding files and stops at the shorter
# list, so listing fewer embedding groups than test graphs no longer raises IndexError.
for tgf, graph_embed_files in zip(test_graph_files, embed_files):

    # Load the edges from the test graph file (once per test graph; blank lines are skipped)
    with open(f'{parent_path}/{tgf}', 'rb') as f:
        edges = [list(map(int, i.strip().decode().split())) for i in f if i.strip()]

    nodes = list(set([i for j in edges for i in j]))

    for ef in graph_embed_files:
        node2vec = {}

        # Load the embeddings from the current embed file.
        # Line index == node id; empty lines mark nodes without an embedding.
        with open(ef, 'rb') as f:
            for i, j in enumerate(f):
                values = j.decode().split()
                if values:
                    node2vec[i] = list(map(float, values))

        # Calculate AUC: a test edge "wins" when it scores higher than (i, random node)
        a = 0  # Accumulated wins (ties count one half)
        b = 0  # Number of edges that could be scored
        for i, j in edges:
            if i in node2vec and j in node2vec:
                dot1 = np.dot(node2vec[i], node2vec[j])

                # Rejection-sample a negative node: it must differ from j and have an embedding.
                # NOTE(review): this loop cannot terminate if no such node exists — confirm
                # the data rules this out.
                random_node = random.sample(nodes, 1)[0]
                while random_node == j or random_node not in node2vec:
                    random_node = random.sample(nodes, 1)[0]

                dot2 = np.dot(node2vec[i], node2vec[random_node])
                if dot1 > dot2:
                    a += 1
                elif dot1 == dot2:
                    a += 0.5
                b += 1

        auc_value = float(a) / b if b > 0 else 0
        print(f"AUC value for {ef.split('/')[-1]}: {auc_value}")

        # Log the result
        with open(f'{parent_path}/Results/DMTE/{link_pred_results_file}', "a") as f:
            f.write(f"{ef}\t{tgf}\t{auc_value}\n")

        gc.collect()