<a href="https://colab.research.google.com/github/GeorgeM2000/DeepEmLAN/blob/main/DeepEmLAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Libraries***

In [1]:
import numpy as np
import random
import gc
import sys
import tensorflow as tf; tf.compat.v1.disable_eager_execution()

from math import pow
from datetime import datetime
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from time import time
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ***Global Variables & General Functionality***

In [9]:
MAX_LEN = 100
MAX_LENS = []
neg_table_size = 1000000
NEG_SAMPLE_POWER = 0.75
batch_size = 64
num_epoch = 50
embed_size = 200
lr = 1e-3

tag_size = 7 # Number of categories
Beta = 1.0
negative_ratio = 1
edge_num = 5214 # 5214 # Number of edges # Default: 5429

# Importance coefficients
rho1 = 0.3
rho2 = 0.1
rho3 = 0.3

In [3]:
clf_ratio = [0.15, 0.45, 0.75]
clf_num = 5
train_classifier = True

In [4]:
node_clf_results_file = 'DeepEmLAN_Node_Clf_Res.txt'
link_pred_results_file = 'DeepEmLAN_Link_Pred_Res.txt'
log_file = 'DeepEmLAN_Execution_Logs.txt' # Log file to save execution details

split_graph_file = 'sgraph15.txt'
split_graph_files = ['sgraph15.txt', 'sgraph45.txt', 'sgraph75.txt']
test_graph_file = 'tgraph85.txt'
test_graph_files = ['tgraph85.txt', 'tgraph55.txt', 'tgraph25.txt']


dataset_name = "cora"
categories_file = 'group-v3.txt'
data_text_file = "data.txt" # For a single execution
graph_file = "graph.txt" # For a single execution
data_text_files = ["data.txt", "YAKE.txt", "YAKE10.txt"] # Used for multiple executions of the model
parent_path = f'/content/Datasets/{dataset_name}'

In [17]:
for txtf in data_text_files:#['data.txt']:#data_text_files:
  max_word_count = 0
  min_word_count = float('inf')

  with open(f'{parent_path}/{txtf}', 'r') as file:
      for line in file:
          word_count = len(line.split())

          if word_count > max_word_count:
              max_word_count = word_count

          if word_count < min_word_count:
              min_word_count = word_count

  MAX_LENS.append(max_word_count+1)
  print(f'=== {txtf} ===')
  print("Max word count:", max_word_count)
  print("Min word count:", min_word_count)
  print()

gc.collect()

=== data.txt ===
Max word count: 410
Min word count: 30

=== YAKE.txt ===
Max word count: 15
Min word count: 14

=== YAKE10.txt ===
Max word count: 30
Min word count: 27



0

In [18]:
MAX_LENS

[411, 16, 31]

In [None]:
MAX_LEN = MAX_LENS[-1] # For single execution

In [None]:
# Open and extract the zip file
with zipfile.ZipFile('/content/PartialData.zip', 'r') as zip_ref:
    zip_ref.extractall(parent_path)

print("Extraction complete!")

In [7]:
def get_vectors_from_file(file_path):
  vectors = {}

  with open(f'{file_path}', "r") as f:
      for idx, line in enumerate(f):
          vector = list(map(float, line.strip().split()))  # Convert to list of floats
          vectors[idx] = vector  # Assign embedding to node idx

  return vectors

# ***Dataset***

In [8]:
class dataSet:
	def __init__(self, text_path, graph_path, label_dic):

		text_file, graph_file = self.load(text_path, graph_path)
		self.edges = self.load_edges(graph_file)
		self.label_dic = label_dic # This dictionary contains the label of each node
		self.text, self.num_vocab, self.num_nodes = self.load_text(text_file)
		self.negative_table = InitNegTable(self.edges) # "negative_table" uses only the available edges (depends soley on the graph provided as a parameter)


	def load(self, text_path, graph_path):
		text_file = open(text_path, 'rb').readlines()
		graph_file = open(graph_path, 'rb').readlines()
		return text_file, graph_file

	def load_edges(self, graph_file):
		edges = [] # "edges" will be a 2D list. Each sublist will have two values: [node1, node2]
		for i in graph_file:
			edges.append(list(map(int, i.split()))) # Original: map(int, i.split()) --> This returns an object that is not subscriptable
		return edges

	# Use when the text file has 1s and 0s in each line representing presence or absence of a word
	''' def load_text(self, text_file):
		text_one = [0] * MAX_LEN # Create a zeros vector with "MAX_LEN" length
		text = []
		for te in text_file:
			i = 0
			j = 0
			tte = te.split()[1:]
			for t in tte:
				if t == '1.0' or t == '1' or t == '1.':
					text_one[j] = i+1
					j = j+1
				i = i+1
			text.append(text_one)
			text_one = [0] * MAX_LEN

		text = np.array(text)
		num_vocab = len(text_file[0].split()) + 1
		self.num_nodes = len(text)
		return text, num_vocab, self.num_nodes '''


	def load_text(self, text_file):
		#text_data = [line.strip() for line in text_file]
		text_data = [line.decode('utf-8').strip() for line in text_file] # Decode each line from bytes to string using 'utf-8'

		tokenizer = Tokenizer(oov_token=None) # Default: Tokenizer()
		tokenizer.fit_on_texts(text_data)

		text = tokenizer.texts_to_sequences(text_data)
		text = pad_sequences(text, maxlen=MAX_LEN, padding="post", truncating='post') # Default: pad_sequences(text, maxlen=MAX_LEN, padding="post")

		num_vocab = len(tokenizer.word_index) + 1  # +1 for padding token
		num_nodes = len(text)
		return text, num_vocab, num_nodes


	def negative_sample(self, edges):
		node1, node2 = list(zip(*edges))[0:2] # Original: zip(*edges)[0:2] --> Leads to error
		neg_sample_edges = []
		func = lambda: self.negative_table[random.randint(0, neg_table_size-1)]
		for i in range(len(edges)):
			neg_node = func()
			while self.label_dic[str(node1[i])] == self.label_dic[str(neg_node)] or self.label_dic[str(node2[i])] == \
					self.label_dic[str(neg_node)]:
				neg_node = func()
			neg_sample_edges.append(neg_node)

		return neg_sample_edges

	def generate_batches(self, mode=None):
		r_one_hot = np.zeros(tag_size, dtype=float)
		br_one_hot = np.ones(tag_size, dtype=float)
		num_batch = len(self.edges) // batch_size # Original: len(self.edges) / batch_size --> The result is a float which cannot be used for list slicing
		edge_l = self.edges
		for i in range(0, edge_num):
			r_one_hot[int(self.label_dic[str(edge_l[i][0])])] = 1.0
			r_one_hot[int(self.label_dic[str(edge_l[i][1])])] = 1.0
			br_one_hot[int(self.label_dic[str(edge_l[i][0])])] = Beta
			br_one_hot[int(self.label_dic[str(edge_l[i][1])])] = Beta
			edge_l[i].append(r_one_hot)
			edge_l[i].append(br_one_hot)
			r_one_hot = np.zeros(tag_size, dtype=float)
			br_one_hot = np.ones(tag_size, dtype=float)
		if mode == 'add':
			num_batch += 1
			edge_l.extend(edge_l[:(batch_size - len(self.edges) // batch_size)])
		if mode != 'add':
			random.shuffle(edge_l)


		sample_edges = edge_l[:num_batch * batch_size]
		sample_neg = self.negative_sample(sample_edges)



		shuffle_edges = []
		shuffle_relation = []
		shuffle_brelation = []
		# shuffle_edgetags = []
		for edge in sample_edges:
			shuffle_edges.append([edge[0], edge[1]])
			shuffle_relation.append(edge[2])
			shuffle_brelation.append(edge[3])
		batches_edges = []
		batches_re = []
		batches_neg = []
		batches_br = []
		for j in range(num_batch):
			batches_edges.append(shuffle_edges[j * batch_size:(j + 1) * batch_size])
			batches_re.append(np.array(shuffle_relation[j * batch_size:(j + 1) * batch_size]))
			batches_neg.append(sample_neg[j * batch_size:(j + 1) * batch_size])
			batches_br.append(np.array(shuffle_brelation[j * batch_size:(j + 1) * batch_size]))

		return batches_edges, batches_re, batches_neg, batches_br

# ***DeepEmLAN***

In [31]:
class Model:
    def __init__(self, data, order):
        tf.compat.v1.reset_default_graph() # With every new model all the variables used in a previous model will be reset
        self.cur_epoch = 0
        self.vocab_size = data.num_vocab
        self.num_nodes = data.num_nodes
        self.order = order
        self.data = data
        self.sess = tf.compat.v1.Session()
        self.n_hidden_1 = 64
        self.n_hidden_2 = 32
        self.output_size = tag_size

        cur_seed = random.getrandbits(32)
        initializer = tf.keras.initializers.GlorotNormal(seed=cur_seed)
        with tf.compat.v1.variable_scope("model", reuse=None, initializer=initializer):
            self.build_graph()
        self.sess.run(tf.compat.v1.global_variables_initializer())

    def close_session(self):
        self.sess.close()

    def build_graph(self):
            # '''hyperparameter'''
        with tf.name_scope('read_inputs') as scope:
            self.Text_a = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Ta' + str(self.order))
            self.Text_b = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tb' + str(self.order))
            self.Text_pos = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tpos'+ str(self.order))
            self.Text_neg = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tneg'+ str(self.order))
            self.Node_a = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n1' + str(self.order))
            self.Node_b = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n2' + str(self.order))
            self.Node_pa = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n4' + str(self.order))
            self.Node_na = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n5' + str(self.order))
            self.relation = tf.compat.v1.placeholder(tf.float32, [batch_size, tag_size], name='r' + str(self.order))
            self.brelation = tf.compat.v1.placeholder(tf.float32, [batch_size, tag_size], name='br' + str(self.order))

        with tf.name_scope('initialize_para') as scope:
            self.weights = {
                'encoder_h1'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([embed_size // 2, self.n_hidden_1])),
                'encoder_h2'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([self.n_hidden_1, self.n_hidden_2])),
                'encoder_h3'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([self.n_hidden_2, self.output_size]))
            }
            cur_seed = random.getrandbits(32)
            self.biases = {
                'encoder_b1'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([self.n_hidden_1])),
                'encoder_b2'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([self.n_hidden_2])),
                'encoder_b3'+ str(self.order): tf.Variable(tf.compat.v1.random_normal([self.output_size]))
            }

        with tf.name_scope('initialize_embedding') as scope:
            cur_seed = random.getrandbits(32)
            self.context_embed = tf.compat.v1.get_variable(name="context_embeddings"+ str(self.order),
                                                 shape=[self.num_nodes, embed_size // 2],
                                                 initializer=tf.keras.initializers.GlorotNormal(seed=cur_seed))
            self.text_embed = tf.compat.v1.get_variable(name="text_embeddings"+ str(self.order),
                                              shape=[self.vocab_size, embed_size // 2],
                                              initializer=tf.keras.initializers.GlorotNormal(seed=cur_seed))

            self.node_embed = tf.compat.v1.get_variable(name="embeddings" + str(self.order),
                                              shape=[self.num_nodes, embed_size // 2],
                                              initializer=tf.keras.initializers.GlorotNormal(seed=cur_seed))

        with tf.name_scope('lookup_embeddings') as scope:
            self.TA = tf.nn.embedding_lookup(self.text_embed, self.Text_a)
            self.T_A = tf.expand_dims(self.TA, -1)

            self.TB = tf.nn.embedding_lookup(self.text_embed, self.Text_b)
            self.T_B = tf.expand_dims(self.TB, -1)

            self.Tpos = tf.nn.embedding_lookup(self.text_embed, self.Text_pos)
            self.T_POS = tf.expand_dims(self.Tpos, -1)

            self.TNEG = tf.nn.embedding_lookup(self.text_embed, self.Text_neg)
            self.T_NEG = tf.expand_dims(self.TNEG, -1)

            self.N_A = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.node_embed, self.Node_a), 1)
            self.N_B = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.node_embed, self.Node_b), 1)
            self.N_POS = tf.nn.embedding_lookup(self.node_embed, self.Node_pa)
            self.N_NEG = tf.nn.embedding_lookup(self.node_embed, self.Node_na)
            self.N_NEG_list = tf.split(self.N_NEG, negative_ratio, 1)

            self.pos_nb_context = tf.nn.l2_normalize(
                tf.nn.embedding_lookup(self.context_embed, tf.cast(self.Node_b, tf.int32)), 1)
            self.pos_ab = tf.nn.l2_normalize(
                tf.nn.embedding_lookup(self.node_embed, tf.cast(self.Node_pa, tf.int32)), 1)
            self.neg_ab = tf.nn.l2_normalize(
                tf.nn.embedding_lookup(self.node_embed, tf.cast(self.Node_na, tf.int32)), 1)
            self.neg_ab_context = tf.nn.l2_normalize(
                tf.nn.embedding_lookup(self.context_embed, tf.cast(self.Node_na, tf.int32)), 1)

        self.R_AB, self.l_predict = self.conv()
        self.loss = self.compute_loss(self.order)
        optimizer = tf.compat.v1.train.AdamOptimizer(lr)
        self.train_op = optimizer.minimize(self.loss)

    def train_one_epoch(self):
        loss_epoch = 0
        batches_edges, batches_re, batches_neg, batches_br = self.data.generate_batches()
        num_batch = len(batches_edges)
        for i in range(num_batch):
            node1, node2 = zip(*batches_edges[i])
            node_list4 = batches_neg[i]
            batch_r = batches_re[i]
            batch_br = batches_br[i]
            node_list3 = node2
            node1, node2, node_list3, node_list4 = np.array(node1), np.array(node2), \
                                                          np.array(node_list3), np.array(node_list4)
            node_list3 = np.transpose(node_list3)
            text1, text2 = self.data.text[node1], self.data.text[node2]
            text_pos = []
            text_neg = []
            for npp in node_list3:
                text_pos.append(self.data.text[npp])
            for nn in node_list4:
                text_neg.append(self.data.text[nn])

            feed_dict = {
                self.Text_a: text1,
                self.Text_b: text2,
                self.Text_pos: text_pos,
                self.Text_neg: text_neg,
                self.Node_a: node1,
                self.Node_b: node2,
                self.Node_pa: node_list3,
                self.Node_na: node_list4,
                self.relation: batch_r,
                self.brelation: batch_br

            }

            # run the graph
            _, loss_batch = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
            # print loss_batch
            loss_epoch += loss_batch
        #print str('order' + str(self.order) + ':'), self.cur_epoch, ' loss: ', loss_epoch
        self.cur_epoch += 1

    def encoder(self, x):
        layer_1 = tf.nn.tanh(tf.add(tf.matmul(tf.reshape(x,[batch_size, embed_size // 2]), self.weights['encoder_h1'+ str(self.order)]),
                                       self.biases['encoder_b1'+ str(self.order)]))
        layer_2 = tf.nn.tanh(tf.add(tf.matmul(layer_1, self.weights['encoder_h2'+ str(self.order)]),
                                       self.biases['encoder_b2'+ str(self.order)]))
        layer_3 = tf.reshape(tf.nn.tanh(tf.add(tf.matmul(layer_2, self.weights['encoder_h3'+ str(self.order)]),
                                       self.biases['encoder_b3'+ str(self.order)])),[batch_size,1,self.output_size])
        return layer_3

    ''' def conv(self):

        TA_norm = tf.sqrt(tf.reduce_sum(self.TA*self.TA, 1))
        TB_norm = tf.sqrt(tf.reduce_sum(self.TB*self.TB, 1))
        TA_TB = tf.reduce_sum(self.TA*self.TB, 1)
        # self.TN = tf.reshape(tf.split(self.TNEG, config.negative_ratio, 1)[0],
        #            [config.batch_size, int(config.embed_size / 2), int(config.embed_size / 2)])
        cosinAB = tf.divide(TA_TB, TA_norm*TB_norm+1e-8)

        cosin1 = tf.expand_dims(cosinAB, -1)


        self.u_A = tf.reshape(tf.matmul(self.TA, cosin1),   [batch_size, embed_size // 2])
        self.u_B = tf.reshape(tf.matmul(self.TB, cosin1),   [batch_size, embed_size // 2])
        self.u_P = tf.reshape(tf.matmul(self.Tpos, cosin1), [batch_size, embed_size // 2])
        self.u_N = tf.reshape(tf.matmul(self.TNEG, cosin1), [batch_size, embed_size // 2])


        R_AB = self.u_A + self.u_B
        self.R_AB = tf.reshape(R_AB, [batch_size, embed_size // 2])

        l_predict = self.encoder(tf.reshape(self.R_AB, [batch_size, 1, embed_size // 2]))
        return R_AB, l_predict '''


    # Used for debugging
    def conv(self):
        # Compute norms
        TA_norm = tf.sqrt(tf.reduce_sum(self.TA * self.TA, axis=1))
        TB_norm = tf.sqrt(tf.reduce_sum(self.TB * self.TB, axis=1))
        TA_TB = tf.reduce_sum(self.TA * self.TB, axis=1)

        ''' print("TA shape:", self.TA.shape)
        print("TB shape:", self.TB.shape)
        print("TA_norm shape:", TA_norm.shape)
        print("TB_norm shape:", TB_norm.shape)
        print("TA_TB shape:", TA_TB.shape) '''

        # Compute cosine similarity
        cosinAB = tf.divide(TA_TB, TA_norm * TB_norm + 1e-8)
        #print("cosinAB shape:", cosinAB.shape)

        # Expand dimensions
        cosin1 = tf.expand_dims(cosinAB, -1)
        #print("cosin1 shape:", cosin1.shape)

        # Compute transformed representations
        self.u_A = tf.reshape(tf.broadcast_to(tf.squeeze(tf.matmul(self.TA, cosin1), axis=-1)[..., tf.newaxis], [batch_size, MAX_LEN, embed_size // 2]), [batch_size * MAX_LEN, embed_size // 2])
        self.u_B = tf.reshape(tf.broadcast_to(tf.squeeze(tf.matmul(self.TB, cosin1), axis=-1)[..., tf.newaxis], [batch_size, MAX_LEN, embed_size // 2]), [batch_size * MAX_LEN, embed_size // 2])
        self.u_P = tf.reshape(tf.broadcast_to(tf.squeeze(tf.matmul(self.Tpos, cosin1), axis=-1)[..., tf.newaxis], [batch_size, MAX_LEN, embed_size // 2]), [batch_size * MAX_LEN, embed_size // 2])
        self.u_N = tf.reshape(tf.broadcast_to(tf.squeeze(tf.matmul(self.TNEG, cosin1), axis=-1)[..., tf.newaxis], [batch_size, MAX_LEN, embed_size // 2]), [batch_size * MAX_LEN, embed_size // 2])

        ''' print("u_A shape:", self.u_A.shape)
        print("u_B shape:", self.u_B.shape)
        print("u_P shape:", self.u_P.shape)
        print("u_N shape:", self.u_N.shape) '''

        # Compute final representation
        R_AB = self.u_A + self.u_B
        self.R_AB = tf.reshape(tf.broadcast_to(R_AB,  [batch_size, MAX_LEN, embed_size // 2]), [batch_size * MAX_LEN, embed_size // 2])
        #print("R_AB shape:", R_AB.shape)
        #print("self.R_AB shape:", self.R_AB.shape)

        # Pass through encoder
        l_predict = self.encoder(tf.reshape(tf.broadcast_to(self.R_AB, [batch_size, 1, embed_size // 2]) [batch_size, 1, embed_size // 2]))
        #print("l_predict shape:", l_predict.shape)


        return R_AB, l_predict

    def compute_loss(self,order):

        rho1 = 0.3
        rho2 = 0.1
        rho3 = 0.3
        p1 = tf.reduce_sum(tf.multiply(self.u_A, self.u_B), 1)
        p1 = tf.math.log(tf.nn.sigmoid(p1) + 0.001)
        p2 = tf.reduce_sum(tf.multiply(self.N_A, self.N_B), 1)
        p2 = tf.math.log(tf.nn.sigmoid(p2) + 0.001)

        p3 = tf.reduce_sum(tf.multiply(self.N_A, self.u_A), 1)
        p3 = tf.math.log(tf.nn.sigmoid(p3) + 0.001)
        p4 = tf.math.reduce_sum(tf.multiply(self.N_B, self.u_B), 1)
        p4 = tf.math.log(tf.nn.sigmoid(p4) + 0.001)


        for i in range(0, 1):
        # for i in range(0, config.negative_ratio):
            u_P1 = tf.reshape(tf.split(self.u_P, negative_ratio, 1)[i], [batch_size, embed_size // 2])
            u_N1 = tf.reshape(tf.split(self.u_N, negative_ratio, 1)[i],
                              [batch_size, embed_size // 2])
            p5 = tf.reduce_sum(tf.multiply(self.u_A, u_P1), 1)
            p5 = tf.math.log(tf.nn.sigmoid(p5) + 0.001)
            p6 = tf.reduce_sum(tf.multiply(self.u_B, u_P1), 1)
            p6 = tf.math.log(tf.nn.sigmoid(p6) + 0.001)
            p7 = tf.reduce_sum(tf.multiply(self.u_A, u_N1), 1)
            p7 = tf.math.log(tf.nn.sigmoid(-p7) + 0.001)
            p8 = tf.reduce_sum(tf.multiply(self.u_B, u_N1), 1)
            p8 = tf.math.log(tf.nn.sigmoid(-p8) + 0.001)
            p9 = tf.reduce_sum(tf.multiply(self.N_A, self.N_NEG), 1)
            p9 = tf.math.log(tf.nn.sigmoid(-p9) + 0.001)
            p10 = tf.reduce_sum(tf.multiply(self.N_B, self.N_NEG), 1)
            p10 = tf.math.log(tf.nn.sigmoid(-p10) + 0.001)

        p11 = tf.reduce_sum(tf.multiply(self.l_predict, self.relation))
        p11 = tf.math.log(tf.nn.sigmoid(p11) + 0.001)

        p_all = rho1*(p1 + p2 + p5 + p6 + p7 + p8 + p9 + p10) + rho2*(p3 + p4) + rho3 * p11

        temp_loss = -tf.reduce_sum(p_all+p11)
        self.sample_sum1 = tf.reduce_sum(tf.exp(tf.multiply(self.pos_ab, self.neg_ab)),
                                         axis=1)
        self.first_loss = tf.reduce_mean(-tf.reduce_sum(tf.multiply(self.N_A, self.N_B), axis=1) +
                                         tf.math.log(self.sample_sum1))
        self.sample_sum2 = tf.reduce_sum(
            tf.exp(tf.multiply(self.pos_ab, self.neg_ab_context)), axis=1)
        self.second_loss = tf.reduce_mean(-tf.reduce_sum(tf.multiply(self.N_A, self.pos_nb_context), axis=1) +
                                          tf.math.log(self.sample_sum2))
        loss = temp_loss + self.first_loss + self.second_loss


        return loss


    def get_embedding(self):
        vectors = {}
        available_vectors = {} # Nodes that have an embedding: 1 --- Nodes that don't: 0

        zero_list = []
        for i in range(0, embed_size):
            zero_list.append(0)
        zero_list = np.array(zero_list)


        embed = [[] for _ in range(self.data.num_nodes)]

        batches_edges, batches_re, batches_neg, batches_br = self.data.generate_batches(mode='add')
        num_batch = len(batches_edges)
        for i in range(num_batch):
            node1, node2 = zip(*batches_edges[i])
            node_list4 = batches_neg[i]
            batch_r = batches_re[i]
            batch_br = batches_br[i]

            node_list3 = node2
            node1, node2, node_list3, node_list4 = np.array(node1), np.array(node2), \
                                                                 np.array(node_list3), np.array(
                node_list4)
            node_list3 = np.transpose(node_list3)
            text1, text2 = self.data.text[node1], self.data.text[node2]
            text_pos = []
            text_neg = []
            for npp in node_list3:
                text_pos.append(self.data.text[npp])
            for nn in node_list4:
                text_neg.append(self.data.text[nn])


            feed_dict = {
                self.Text_a: text1,
                self.Text_b: text2,
                self.Text_pos: text_pos,
                self.Text_neg: text_neg,
                self.Node_a: node1,
                self.Node_b: node2,
                self.Node_pa: node_list3,
                self.Node_na: node_list4,
                self.relation: batch_r,
                self.brelation: batch_br

            }
            uA, uB, rAB, NA, NB = self.sess.run([self.u_A, self.u_B, self.R_AB, self.N_A, self.N_B], feed_dict=feed_dict)

            for i in range(batch_size):
                embed[node1[i]].append(list(NA[i])+list(rAB[i]))  #
                embed[node2[i]].append(list(NB[i])+list(rAB[i]))  #

        for i in range(self.data.num_nodes):
            if embed[i]: # If an embedding exists for node i
                tmp=np.sum(embed[i],axis=0)/len(embed[i])
                vectors[i]=tmp
                available_vectors[i] = 1
                #file.write(' '.join(map(str,tmp))+'\n')
            else:
                vectors[i]=zero_list
                available_vectors[i] = 0

        return vectors, available_vectors

# ***Negative table***

In [12]:
def InitNegTable(edges):
	a_list,b_list=zip(*edges)
	a_list=list(a_list)
	b_list=list(b_list)
	node=a_list
	node.extend(b_list)

	node_degree={}
	for i in node:
		if i in node_degree:
			node_degree[i]+=1
		else:
			node_degree[i]=1
	sum_degree=0
	for i in node_degree.values():
		sum_degree+=pow(i,0.75)

	por=0
	cur_sum=0
	vid=-1
	neg_table=[]
	degree_list=list(node_degree.values())
	node_id=list(node_degree.keys())
	for i in range(neg_table_size):
		if(((i+1)/float(neg_table_size))>por):
			cur_sum+=pow(degree_list[vid+1],NEG_SAMPLE_POWER)
			por=cur_sum/sum_degree
			vid+=1
		neg_table.append(node_id[vid])
	return neg_table

# ***Classify***

In [13]:
class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            probs_[:] = 0
            probs_[labels] = 1
            all_labels.append(probs_)
        return np.asarray(all_labels)


class Classifier(object):

    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        self.binarizer.fit(Y_all)
        # X_train = [self.embeddings[x] for x in X]
        X_train = [self.embeddings[int(x)] for x in X] # For each node in X, take its embedding
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def evaluate(self, X, Y):
        top_k_list = [len(l) for l in Y] # For each label in Y, take its size (multi-label)
        Y_ = self.predict(X, top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro", "macro"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        return results

    def predict(self, X, top_k_list):
        X_ = np.asarray([self.embeddings[int(x)] for x in X])
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        state = np.random.get_state()
        training_size = int(train_precent * len(X)) # Set the ratio based on the size of X
        np.random.seed(seed)
        shuffle_indices = np.random.permutation(np.arange(len(X))) # Shuffle the indices of X (X contains all nodes)

        # Access the values of X and Y based on the shuffled indices

        # X_train and Y_train will have "training_size" number of values of X and Y
        X_train = [X[shuffle_indices[i]] for i in range(training_size)]
        Y_train = [Y[shuffle_indices[i]] for i in range(training_size)]

        # X_test and Y_test will have "len(X) - training_size" number of values of X and Y
        X_test = [X[shuffle_indices[i]] for i in range(training_size, len(X))]
        Y_test = [Y[shuffle_indices[i]] for i in range(training_size, len(X))]

        self.train(X_train, Y_train, Y) # Y has the labels of all nodes
        np.random.set_state(state)
        return self.evaluate(X_test, Y_test)



def load_embeddings(filename):
    fin = open(filename, 'r')
    node_num, size = [int(x) for x in fin.readline().strip().split()]
    vectors = {}
    while 1:
        l = fin.readline()
        if l == '':
            break
        vec = l.strip().split(' ')
        assert len(vec) == size + 1
        vectors[vec[0]] = [float(x) for x in vec[1:]]
    fin.close()
    assert len(vectors) == node_num
    return vectors

def read_node_label(filename):
    fin = open(filename, 'r')
    X = []
    Y = []
    XY_dic = {}
    X_Y_dic = {}
    while 1:
        l = fin.readline()
        if l == '':
            break
        # vec = l.strip().split('\t')
        vec = l.strip().split(' ')
        X.append(vec[0])
        Y.append(vec[1:])
        X_Y_dic[str(vec[0])] = str(vec[1:][0])
        XY_dic.setdefault(str(vec[1:][0]), []).append(str(vec[0]))
    fin.close()
    return X, Y, XY_dic, X_Y_dic

# ***Train***

Code that needs to be executed only once

In [14]:
def get_key(dict, value):
    return [k for k, v in dict.items() if v == value]


# Store the label of each abstract
label_dic = {}
with open(f'{parent_path}/{categories_file}', 'r') as f:
  labels = f.readlines()


"""
The "node_tag" text file has to be in the following format: node_id node_label1 node_label2 ...
"""
for la in labels:
  label_dic[la.split()[0]] = la.split()[1:][0] # la.split()[0] = the node id ----- la.split()[1:][0] = The label of that node. If a node has many labels, take the first

zero_list = []

# Place "embed_size * 2" (400) zeros in "zero_list"
for i in range(0, embed_size):
  zero_list.append(0)

## ***Single Execution***

In [15]:
# Save embeddings with a unique name
#embed_file = f"{parent_path}/Results/DeepEmLAN/embed_link_pred_{graph_file.split('.')[0]}_{data_text_file.split('.')[0]}.txt"
embed_file = f"{parent_path}/Results/DeepEmLAN/embed_node_clf_{graph_file.split('.')[0]}_{data_text_file.split('.')[0]}.txt"

In [None]:
# Create the edge list. Store the unique nodes in the list "nodes"
with open(f'{parent_path}/{graph_file}', 'r') as f:
  eedges = f.readlines()

edge_list = []
nodes = [] # "nodes" will contain all the unique nodes of the graph
for ee in eedges:
  edge_list.append(list(ee.split()))
for ll in edge_list:
  for ed in ll:
    if ed not in nodes:
      nodes.append(ed)
    else:
      continue


# Create the dataset. It seems that dataSet() takes the full graph. Perhaps because it is meant for node classification and not for link prediction.
# In link prediction, other NE methods split the graph and the dataSet() takes only a portion of the graph
data = dataSet(f'{parent_path}/{data_text_file}', f'{parent_path}/{graph_file}', label_dic)


# Train the model for "num_epoch" epochs
model = Model(data, 1)


start_time = datetime.now()
for i in range(num_epoch):
  model.train_one_epoch()

vectors, available_vectors = model.get_embedding() # Get the node embeddings



# Store all the node ids with the same label as node "ii".
node_nei_list = {}
one_node_edges = []
for ii in nodes: # For each node "ii"
  for ed in edge_list: # For each edge "ed"
    if ii in ed: # If node "ii" is in "ed". This takes the one-hop neighborhood of "ii"
      if label_dic[ii] == label_dic[ed[0]] and ii not in one_node_edges: # If the label of "ii" is the same as the label of "ed[0]" and "ii" not in "one_node_edges"
        one_node_edges.append(ed[0])
      if label_dic[ii] == label_dic[ed[1]] and ii not in one_node_edges:
        one_node_edges.append(ed[1])
    else:
      pass
  node_nei_list[ii] = one_node_edges # Node "ii" has the same label with nodes in "one_node_edges"
  one_node_edges = []

# Change the vector representation of each node
new_vector = {}
one_node_new_vec = []
for ve in vectors.keys(): # For each node that has an embedding in "vectors"
  if str(ve) in node_nei_list:
    for nnl in node_nei_list[str(ve)]: # Take the nodes with the same label as node "ve"
      one_node_new_vec.append(vectors[int(nnl)]) # Append the embeddings of all nodes with the same label as "ve"
    #one_node_new_vec = np.array(one_node_new_vec).sum(axis=0)/len(node_nei_list[str(ve)]) # Take the mean


    if one_node_new_vec: # Check if one_node_new_vec has elements before calculating the mean to avoid ZeroDivisionError
      one_node_new_vec = np.array(one_node_new_vec).sum(axis=0)/len(node_nei_list[str(ve)]) # Take the mean
    else: # Assign zero_list if the node has no neighbors with the same label
      one_node_new_vec = zero_list

    # The new vector for node "ve" will be the mean of all the embeddings of the one-hop neighbors with the same label as "ve"
    new_vector[ve] = one_node_new_vec
    one_node_new_vec = []
  else:
    new_vector[ve] = zero_list

end_time = datetime.now()
print(f'Time: {((end_time - start_time).total_seconds()) / 60.0}')
model.close_session()


# For link prediction
''' with open(embed_file, 'wb') as f:
  for node_id, node_vec in new_vector.items():
    if available_vectors[node_id] == 1:
        f.write((' '.join(map(str, node_vec)) + '\n').encode())
    else:
        f.write('\n'.encode()) '''

# For node classification
with open(embed_file, 'wb') as f:
  for node_id, node_vec in new_vector.items():
    f.write((' '.join(map(str, node_vec)) + '\n').encode())

# Log completion
with open(f'{parent_path}/Results/DeepEmLAN/{log_file}', 'a') as f:
    f.write(f'Embeddings saved to: {embed_file}\n')

gc.collect()


### ***Link Prediction*** (For both single and multiple executions)

In [None]:
embed_files = [[f'{parent_path}/Results/DeepEmLAN/embed_link_pred_graph_data.txt']]

# Initialize a log file to store the AUC results
with open(f'{parent_path}/Results/DeepEmLAN/{link_pred_results_file}', "a") as f:
    f.write("Embed File\tAUC Value\n")

for tgfi, tgf in enumerate(test_graph_files):
  for ef in embed_files[tgfi]:
      node2vec = {}

      # Load the embeddings from the current embed file
      with open(ef, 'rb') as f:
          for i, j in enumerate(f):
              if j.decode().strip():
                  node2vec[i] = list(map(float, j.strip().decode().split(' ')))

      # Load the edges from the test graph file
      with open(f'{parent_path}/{tgf}', 'rb') as f:
          edges = [list(map(int, i.strip().decode().split())) for i in f]

      nodes = list(set([i for j in edges for i in j])) # All the unique nodes in "edges"

      # Calculate AUC
      a = 0
      b = 0
      for i, j in edges:
          if i in node2vec.keys() and j in node2vec.keys():
              dot1 = np.dot(node2vec[i], node2vec[j])
              random_node = random.sample(nodes, 1)[0]
              while random_node == j or random_node not in node2vec.keys():
                  random_node = random.sample(nodes, 1)[0]
              dot2 = np.dot(node2vec[i], node2vec[random_node])
              if dot1 > dot2:
                  a += 1
              elif dot1 == dot2:
                  a += 0.5
              b += 1

      auc_value = float(a) / b if b > 0 else 0
      print(f"AUC value for {ef.split('/')[-1]}: {auc_value}")

      # Log the result
      with open(f'{parent_path}/Results/DeepEmLAN/{link_pred_results_file}', "a") as f:
          f.write(f"{ef}\t{tgf}\t{auc_value}\n")

      gc.collect()

### ***Node Classification*** (For both single and multiple executions)

In [None]:
embed_files = [f'{parent_path}/Results/DeepEmLAN/embed_node_clf_graph_data.txt',
               f'{parent_path}/Results/DeepEmLAN/embed_node_clf_graph_YAKE.txt',
               f'{parent_path}/Results/DeepEmLAN/embed_node_clf_graph_YAKE10.txt',
               f'{parent_path}/Results/DeepEmLAN/embed_node_clf_graph_PositionRank.txt',
               f'{parent_path}/Results/DeepEmLAN/embed_node_clf_graph_PositionRank10.txt']

with open(f'{parent_path}/{categories_file}', 'r') as f:
  tags = f.readlines() # "tags" will be a 2D list. Each sublist will have the form: nodeID     label

if train_classifier:

  clf_test_len = len(nodes) # The number of nodes will be the same in each run since we're using the whole graph and thus, all of its nodes

  for ef in embed_files:
    X = []
    Y = []
    new_vector = get_vectors_from_file(ef)

    for jk in range(0, clf_test_len):
      if str(jk) in nodes: # If the index "jk" is a node
        tag_list = tags[jk].strip().split() # For node "jk", take this info: jk     label
        # Y.append([(int)(i) for i in tags])
        lli = [str(i) for i in tag_list] # For node "jk", lli will contain all of its labels
        if len(lli) != 0:
          if np.array(new_vector[jk]).any() != np.array(zero_list).any(): # If there is no zero value in the embedding of "jk"
            X.append(jk)
            Y.append(lli[1:][0]) # Take the first label (if there are multiple) of node "jk"

    # This part of the code uses only the X and Y lists created above
    mi = {}
    ma = {}
    li1 = []
    li2 = []
    with open(f'{parent_path}/Results/DeepEmLAN/{node_clf_results_file}', 'a') as f:
      f.write(f"{ef.split('/')[-1]} \n")
      print(ef.split('/')[-1])
      for i in range(0, len(clf_ratio)): # Experiment with each ratio
        for j in range(0, clf_num): # clf_num = 5

          clf = Classifier(vectors=new_vector, # All node embeddings
                          clf=LogisticRegression(max_iter=1000))

          result = clf.split_train_evaluate(X, Y, clf_ratio[i])

          # Results
          li1.append(result['micro'])
          li2.append(result['macro'])

        mi[str(str(clf_ratio[i]) + '-micro')] = sum(li1) / clf_num
        ma[str(str(clf_ratio[i]) + '-macro')] = sum(li2) / clf_num

        print(mi)
        print(ma)
        print()

        f.writelines(str(str(mi)+str(ma)))
        f.write('\n')

        # Reinitialize the dictionaries and lists
        mi = {}
        ma = {}
        li1 = []
        li2 = []

    gc.collect()

## ***Multiple Executions***

In [32]:
for gf in ['graph.txt']: #for gf in split_graph_files: # For link prediction. For node classification just use: for gf in ['graph.txt']:

  # Create the edge list. Store the unique nodes in the list "nodes"
  with open(f'{parent_path}/{gf}', 'r') as f:
    eedges = f.readlines()

  edge_list = []
  nodes = []
  for ee in eedges:
    edge_list.append(list(ee.split()))
  for ll in edge_list:
    for ed in ll:
      if ed not in nodes:
        nodes.append(ed)
      else:
        continue

  for t, txtf in enumerate(data_text_files):

    MAX_LEN = MAX_LENS[t]
    print(f'The maximum length is: {MAX_LEN}')

    # Create the dataset. It seems that dataSet() takes the full graph. Perhaps because it is meant for node classification and not for link prediction.
    # In link prediction, other NE methods split the graph and the dataSet() takes only a portion of the graph
    data = dataSet(f'{parent_path}/{txtf}', f'{parent_path}/{gf}', label_dic)

    # Logging the execution details
    with open(f'{parent_path}/Results/DeepEmLAN/{log_file}', 'a') as f:
        f.write(f'Processing graph: {gf}, text: {txtf}\n')

    print(f'Processing graph: {gf}, text: {txtf}')

    # Train the model for "num_epoch" epochs
    model = Model(data, 1)

    start_time = datetime.now()
    for i in range(num_epoch):
      model.train_one_epoch()

    vectors, available_vectors = model.get_embedding() # Get the node embeddings


    # Store all the node ids with the same label as node "ii".
    node_nei_list = {}
    one_node_edges = []
    for ii in nodes: # For each node "ii"
      for ed in edge_list: # For each edge "ed"
        if ii in ed: # If node "ii" is in "ed". This takes the one-hop neighborhood of "ii"
          if label_dic[ii] == label_dic[ed[0]] and ii not in one_node_edges: # If the label of "ii" is the same as the label of "ed[0]" and "ii" not in "one_node_edges"
            one_node_edges.append(ed[0])
          if label_dic[ii] == label_dic[ed[1]] and ii not in one_node_edges:
            one_node_edges.append(ed[1])
        else:
          pass
      node_nei_list[ii] = one_node_edges # Node "ii" has the same label with nodes in "one_node_edges"
      one_node_edges = []

    # Change the vector representation of each node
    new_vector = {}
    one_node_new_vec = []
    for ve in vectors.keys(): # For each node that has an embedding in "vectors"
      if str(ve) in node_nei_list:
        for nnl in node_nei_list[str(ve)]: # Take the nodes with the same label as node "ve"
          one_node_new_vec.append(vectors[int(nnl)]) # Append the embeddings of all nodes with the same label as "ve"
        #one_node_new_vec = np.array(one_node_new_vec).sum(axis=0)/len(node_nei_list[str(ve)]) # Take the mean


        if one_node_new_vec: # Check if one_node_new_vec has elements before calculating the mean to avoid ZeroDivisionError
          one_node_new_vec = np.array(one_node_new_vec).sum(axis=0)/len(node_nei_list[str(ve)]) # Take the mean
        else: # Assign zero_list if the node has no neighbors with the same label
          one_node_new_vec = zero_list

        # The new vector for node "ve" will be the mean of all the embeddings of the one-hop neighbors with the same label as "ve"
        new_vector[ve] = one_node_new_vec
        one_node_new_vec = []
      else:
        new_vector[ve] = zero_list

    end_time = datetime.now()
    print(f'Time: {((end_time - start_time).total_seconds()) / 60.0} min')
    model.close_session()

    with open(f'{parent_path}/Results/DeepEmLAN/{log_file}', 'a') as f:
      f.write(f'Time: {((end_time - start_time).total_seconds()) / 60.0} min\n')

    # Save embeddings with a unique name
    #embed_file = f"{parent_path}/Results/DeepEmLAN/embed_link_pred_{gf.split('.')[0]}_{tf.split('.')[0]}.txt"
    embed_file = f"{parent_path}/Results/DeepEmLAN/embed_node_clf_{gf.split('.')[0]}_{txtf.split('.')[0]}.txt"

    ''' with open(embed_file, 'wb') as f:
      for node_id, node_vec in new_vector.items():
        if available_vectors[node_id] == 1:
            f.write((' '.join(map(str, node_vec)) + '\n').encode())
        else:
            f.write('\n'.encode()) '''

    with open(embed_file, 'wb') as f:
      for node_id, node_vec in new_vector.items():
        f.write((' '.join(map(str, node_vec)) + '\n').encode())

    # Log completion
    with open(f'{parent_path}/Results/DeepEmLAN/{log_file}', 'a') as f:
        f.write(f'Embeddings saved to: {embed_file}\n')

    gc.collect()


The maximum length is: 411
Processing graph: graph.txt, text: data.txt


ValueError: Dimensions must be equal, but are 26304 and 411 for '{{node model/BroadcastTo_4}} = BroadcastTo[T=DT_FLOAT, Tidx=DT_INT32](model/add_1, model/BroadcastTo_4/shape)' with input shapes: [26304,100], [3] and with input tensors computed as partial shapes: input[1] = [64,411,100].

***IMPORTANT*** ==> Because the maximum length is always the same (100), the computation time, given different data text files, will always be approximately the same.


***Hypothesis*** ==> When using the entire abstract, which consists of more that 100(MAX_LEN) words, we see a reduction in performance when compared to the keywords/keyphrases. This happens because the maximum words in all lines of the keyword files are less than 100 so we don't lose any information whereas the maximum words in all lines of the original data file (data.txt) exceed 100. Even if there are abstracts in data.txt with less than 100 words, they must be very few.