# ***Libraries & Tools***

In [73]:
import argparse
import os
import random
import numpy as np
import tensorflow as tf
from math import pow
from datetime import datetime
from pathlib import Path

In [None]:
!pwd

# ***Global Variables***

In [74]:
MAX_LEN = 300 # Default value
neg_table_size = 1000000
NEG_SAMPLE_POWER = 0.75
batch_size = 64
num_epoch = 200
embed_size = 200
lr = 1e-3


datasetName = "cora"
ratio = 0.75
rho = "1.0,0.3,0.3"

In [88]:
max_word_count = 0
min_word_count = float('inf')

with open('/content/datasets/%s/data.txt' % datasetName, 'r') as file:
    for line in file:
        word_count = len(line.split())

        if word_count > max_word_count:
            max_word_count = word_count

        if word_count < min_word_count:
            min_word_count = word_count

print("Max word count:", max_word_count)
print("Min word count:", min_word_count)
MAX_LEN = max_word_count

Max word count: 410
Min word count: 30


# ***DataSet***

In [76]:
class dataSet:
    def __init__(self, text_path, graph_path):

        text_file, graph_file = self.load(text_path, graph_path)

        self.edges = self.load_edges(graph_file)

        self.text, self.num_vocab, self.num_nodes = self.load_text(text_file)

        self.negative_table = InitNegTable(self.edges)

    def load(self, text_path, graph_path):
        text_file = open(text_path, 'rb').readlines()
        for a in range(0, len(text_file)):
            text_file[a] = str(text_file[a])
        graph_file = open(graph_path, 'rb').readlines()

        return text_file, graph_file

    def load_edges(self, graph_file):
        edges = []
        for i in graph_file:
            edges.append(list(map(int, i.strip().decode().split('\t'))))

        print("Total load %d edges." % len(edges))

        return edges

    def load_text(self, text_file):
        """
        Adapting with adapt(text_data):

        vectorize_layer.adapt(text_data) analyzes text_data, builds a vocabulary, and assigns a unique integer ID to each word based on its frequency (most frequent words get lower IDs).
        Transforming with vectorize_layer(text_data):

        This maps each word in text_data to its corresponding integer token ID, producing a 2D array where each row represents a sequence of token IDs for a given input line, padded or truncated to max_len.
        """
        vectorize_layer = tf.keras.layers.TextVectorization(
            max_tokens=None,  # Set a limit if needed
            output_mode='int',
            output_sequence_length=MAX_LEN
        )

        text_data = [line.strip() for line in text_file]

        vectorize_layer.adapt(text_data)

        text = vectorize_layer(text_data).numpy()

        num_vocab = len(vectorize_layer.get_vocabulary())
        print(f'Vocabulary: {num_vocab}')
        num_nodes = len(text)

        return text, num_vocab, num_nodes

    def negative_sample(self, edges):
        node1, node2 = zip(*edges)
        sample_edges = []
        func = lambda: self.negative_table[random.randint(0, neg_table_size - 1)]
        for i in range(len(edges)):
            neg_node = func()
            while node1[i] == neg_node or node2[i] == neg_node:
                neg_node = func()
            sample_edges.append([node1[i], node2[i], neg_node])

        return sample_edges

    def generate_batches(self, mode=None):

        num_batch = len(self.edges) // batch_size
        edges = self.edges
        # if mode == 'add':
        #     num_batch += 1
        #     edges.extend(edges[:(config.batch_size - len(self.edges) // config.batch_size)])
        if mode != 'add':
            random.shuffle(edges)
        sample_edges = edges[:num_batch * batch_size]
        sample_edges = self.negative_sample(sample_edges)

        batches = []
        for i in range(num_batch):
            batches.append(sample_edges[i * batch_size:(i + 1) * batch_size])
        # print sample_edges[0]
        return batches


# ***CANE***

In [77]:
class Model:
    def __init__(self, vocab_size, num_nodes, rho):
        rho = rho.split(",")
        self.rho1 = float(rho[0])
        self.rho2 = float(rho[1])
        self.rho3 = float(rho[2])
        # '''hyperparameter'''
        with tf.name_scope('read_inputs') as scope:
          """
          self.Text_a = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='Ta')
          self.Text_b = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='Tb')
          self.Text_neg = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='Tneg')
          self.Node_a = tf.keras.Input(shape=(1,), dtype=tf.int32, name='n1')  # or shape=(,) for a single int
          self.Node_b = tf.keras.Input(shape=(1,), dtype=tf.int32, name='n2')
          self.Node_neg = tf.keras.Input(shape=(1,), dtype=tf.int32, name='n3')
          """
          self.Text_a = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Ta')
          self.Text_b = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tb')
          self.Text_neg = tf.compat.v1.placeholder(tf.int32, [batch_size, MAX_LEN], name='Tneg')
          self.Node_a = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n1')
          self.Node_b = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n2')
          self.Node_neg = tf.compat.v1.placeholder(tf.int32, [batch_size], name='n3')

        with tf.name_scope('initialize_embedding') as scope:
            self.text_embed = tf.Variable(tf.random.truncated_normal([vocab_size, embed_size // 2], stddev=0.3))
            self.node_embed = tf.clip_by_norm(tf.Variable(tf.random.truncated_normal([num_nodes, embed_size // 2], stddev=0.3)), clip_norm=1, axes=1)

        with tf.name_scope('lookup_embeddings') as scope:
            #self.text_embed_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_size // 2, name='text_embedding')
            #self.node_embed_layer = tf.keras.layers.Embedding(input_dim=num_nodes, output_dim=embed_size // 2, name='node_embedding')

            self.TA = tf.nn.embedding_lookup(self.text_embed, self.Text_a)#self.text_embed_layer(self.Text_a)
            #print(f'\nTA: {self.TA} \n{type(self.TA)} {self.TA.shape}')
            self.T_A = tf.expand_dims(self.TA, -1)

            self.TB = tf.nn.embedding_lookup(self.text_embed, self.Text_b)#self.text_embed_layer(self.Text_b)
            self.T_B = tf.expand_dims(self.TB, -1)

            self.TNEG = tf.nn.embedding_lookup(self.text_embed, self.Text_neg)#self.text_embed_layer(self.Text_neg)
            self.T_NEG = tf.expand_dims(self.TNEG, -1)

            self.N_A = tf.nn.embedding_lookup(self.node_embed, self.Node_a)#self.node_embed_layer(self.Node_a)
            self.N_B = tf.nn.embedding_lookup(self.node_embed, self.Node_b)#self.node_embed_layer(self.Node_b)
            self.N_NEG = tf.nn.embedding_lookup(self.node_embed, self.Node_neg)#self.node_embed_layer(self.Node_neg)

        self.convA, self.convB, self.convNeg = self.conv()
        self.loss = self.compute_loss()

    def conv(self):
        W2 = tf.Variable(tf.random.truncated_normal([2, embed_size // 2, 1, 100], stddev=0.3))
        rand_matrix = tf.Variable(tf.random.truncated_normal([100, 100], stddev=0.3))

        convA = tf.nn.conv2d(self.T_A, W2, strides=[1, 1, 1, 1], padding='VALID')
        convB = tf.nn.conv2d(self.T_B, W2, strides=[1, 1, 1, 1], padding='VALID')
        convNEG = tf.nn.conv2d(self.T_NEG, W2, strides=[1, 1, 1, 1], padding='VALID')

        hA = tf.tanh(tf.squeeze(convA))
        hB = tf.tanh(tf.squeeze(convB))
        hNEG = tf.tanh(tf.squeeze(convNEG))

        tmphA = tf.reshape(hA, [batch_size * (MAX_LEN - 1), embed_size // 2])
        ha_mul_rand = tf.reshape(tf.matmul(tmphA, rand_matrix),
                                 [batch_size, MAX_LEN - 1, embed_size // 2])
        r1 = tf.matmul(ha_mul_rand, hB, adjoint_b=True)
        r3 = tf.matmul(ha_mul_rand, hNEG, adjoint_b=True)
        att1 = tf.expand_dims(tf.stack(r1), -1)
        att3 = tf.expand_dims(tf.stack(r3), -1)

        att1 = tf.tanh(att1)
        att3 = tf.tanh(att3)

        pooled_A = tf.reduce_mean(att1, 2)
        pooled_B = tf.reduce_mean(att1, 1)
        pooled_NEG = tf.reduce_mean(att3, 1)

        a_flat = tf.squeeze(pooled_A)
        b_flat = tf.squeeze(pooled_B)
        neg_flat = tf.squeeze(pooled_NEG)

        w_A = tf.nn.softmax(a_flat)
        w_B = tf.nn.softmax(b_flat)
        w_NEG = tf.nn.softmax(neg_flat)

        rep_A = tf.expand_dims(w_A, -1)
        rep_B = tf.expand_dims(w_B, -1)
        rep_NEG = tf.expand_dims(w_NEG, -1)

        hA = tf.transpose(hA, perm=[0, 2, 1])
        hB = tf.transpose(hB, perm=[0, 2, 1])
        hNEG = tf.transpose(hNEG, perm=[0, 2, 1])

        rep1 = tf.matmul(hA, rep_A)
        rep2 = tf.matmul(hB, rep_B)
        rep3 = tf.matmul(hNEG, rep_NEG)

        attA = tf.squeeze(rep1)
        attB = tf.squeeze(rep2)
        attNEG = tf.squeeze(rep3)

        return attA, attB, attNEG

    def compute_loss(self):
        p1 = tf.reduce_sum(tf.multiply(self.convA, self.convB), 1)
        p1 = tf.math.log(tf.nn.sigmoid(p1) + 0.001)

        p2 = tf.reduce_sum(tf.multiply(self.convA, self.convNeg), 1)
        p2 = tf.math.log(tf.nn.sigmoid(-p2) + 0.001)

        p3 = tf.reduce_sum(tf.multiply(self.N_A, self.N_B), 1)
        p3 = tf.math.log(tf.nn.sigmoid(p3) + 0.001)

        p4 = tf.reduce_sum(tf.multiply(self.N_A, self.N_NEG), 1)
        p4 = tf.math.log(tf.nn.sigmoid(-p4) + 0.001)

        p5 = tf.reduce_sum(tf.multiply(self.convB, self.N_A), 1)
        p5 = tf.math.log(tf.nn.sigmoid(p5) + 0.001)

        p6 = tf.reduce_sum(tf.multiply(self.convNeg, self.N_A), 1)
        p6 = tf.math.log(tf.nn.sigmoid(-p6) + 0.001)

        p7 = tf.reduce_sum(tf.multiply(self.N_B, self.convA), 1)
        p7 = tf.math.log(tf.nn.sigmoid(p7) + 0.001)

        p8 = tf.reduce_sum(tf.multiply(self.N_B, self.convNeg), 1)
        p8 = tf.math.log(tf.nn.sigmoid(-p8) + 0.001)

        rho1 = self.rho1
        rho2 = self.rho2
        rho3 = self.rho3
        temp_loss = rho1 * (p1 + p2) + rho2 * (p3 + p4) + rho3 * (p5 + p6) + rho3 * (p7 + p8)
        loss = -tf.reduce_sum(temp_loss)
        return loss

# ***Negative Sample***

In [78]:
def InitNegTable(edges):
    a_list, b_list = zip(*edges)
    a_list = list(a_list)
    b_list = list(b_list)
    node = a_list
    node.extend(b_list)

    node_degree = {}
    for i in node:
        if i in node_degree:
            node_degree[i] += 1
        else:
            node_degree[i] = 1
    sum_degree = 0
    for i in node_degree.values():
        sum_degree += pow(i, 0.75)

    por = 0
    cur_sum = 0
    vid = -1
    neg_table = []
    degree_list = list(node_degree.values())
    node_id = list(node_degree.keys())
    for i in range(neg_table_size):
        if ((i + 1) / float(neg_table_size)) > por:
            cur_sum += pow(degree_list[vid + 1], NEG_SAMPLE_POWER)
            por = cur_sum / sum_degree
            vid += 1
        neg_table.append(node_id[vid])
    print(len(neg_table))
    return neg_table


# ***Run***

In [79]:
def prepareData(datasetName, ratio):
  f = open('/content/datasets/%s/graph.txt' % datasetName, 'rb')
  edges = [i for i in f]
  selected = int(len(edges) * float(ratio))
  selected = selected - selected % batch_size
  selected = random.sample(edges, selected)
  remain = [i for i in edges if i not in selected]
  try:
    Path('temp').mkdir(parents=True, exist_ok=True)  # parents=True allows creating parent directories; exist_ok=True avoids errors if the directory already exists
    print(f"Directory '{directory}' created successfully.")
  except Exception as e:
    print(f"An error occurred: {e}")

  fw1 = open('temp/graph.txt', 'wb')
  fw2 = open('temp/test_graph.txt', 'wb')

  for i in selected:
      fw1.write(i)
  for i in remain:
      fw2.write(i)

In [80]:
prepareData(datasetName, ratio)

In [89]:
# load data
dataset_name = datasetName
graph_path = os.path.join('/content/temp/graph.txt')
text_path = os.path.join("/content", "datasets", dataset_name, 'data.txt')

data = dataSet(text_path, graph_path)

Total load 3904 edges.
Vocabulary: 16169
1000000


In [90]:
with tf.Graph().as_default():
    sess = tf.compat.v1.Session()
    with sess.as_default():
        model = Model(data.num_vocab, data.num_nodes, rho)
        opt = tf.compat.v1.train.AdamOptimizer(lr)#tf.keras.optimizers.Adam(learning_rate=lr)
        train_op = opt.minimize(model.loss)#opt.minimize(model.loss, var_list=model.trainable_variables)
        sess.run(tf.compat.v1.global_variables_initializer())
        time = 0

        # training
        print('start training.......')

        for epoch in range(num_epoch):
            start = datetime.now()
            loss_epoch = 0
            batches = data.generate_batches()
            h1 = 0
            num_batch = len(batches)
            for i in range(num_batch):
                batch = batches[i]

                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]

                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3
                }

                """
                # Directly call the model with inputs as arguments
                with tf.GradientTape() as tape:
                    model_outputs = model([text1, text2, text3, node1, node2, node3], training=True)
                    loss_batch = model.compute_loss()  # Adjust compute_loss for easy access

                grads = tape.gradient(loss_batch, model.trainable_variables)
                opt.apply_gradients(zip(grads, model.trainable_variables))

                loss_epoch += loss_batch.numpy()  # Convert tensor to a scalar for logging
                """
                # run the graph
                _, loss_batch = sess.run([train_op, model.loss], feed_dict=feed_dict)

                loss_epoch += loss_batch

            end = datetime.now()
            time += (end - start).total_seconds()
            print('epoch: ', epoch + 1, ' loss: ', loss_epoch)

        print(f'Time: {time}')
        # Saving embeddings
        with open('temp/embed.txt', 'wb') as file:
            batches = data.generate_batches(mode='add')
            num_batch = len(batches)
            embed = [[] for _ in range(data.num_nodes)]

            for i in range(num_batch):
                batch = batches[i]
                node1, node2, node3 = zip(*batch)
                node1, node2, node3 = np.array(node1), np.array(node2), np.array(node3)
                text1, text2, text3 = data.text[node1], data.text[node2], data.text[node3]

                feed_dict = {
                    model.Text_a: text1,
                    model.Text_b: text2,
                    model.Text_neg: text3,
                    model.Node_a: node1,
                    model.Node_b: node2,
                    model.Node_neg: node3
                }

                # Fetch embeddings
                #convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B])
                convA, convB, TA, TB = sess.run([model.convA, model.convB, model.N_A, model.N_B], feed_dict=feed_dict)

                for j in range(batch_size):
                    em = list(TA[j])
                    embed[node1[j]].append(em)
                    em = list(TB[j])
                    embed[node2[j]].append(em)


            for i in range(data.num_nodes):
                if embed[i]:
                    tmp = np.mean(embed[i], axis=0)
                    file.write((' '.join(map(str, tmp)) + '\n').encode())
                else:
                    file.write('\n'.encode())

start training.......
epoch:  1  loss:  13871.87629699707
epoch:  2  loss:  10191.769454956055
epoch:  3  loss:  10159.646377563477
epoch:  4  loss:  10121.372207641602
epoch:  5  loss:  10063.07632446289
epoch:  6  loss:  9990.905639648438
epoch:  7  loss:  9899.683502197266
epoch:  8  loss:  9746.915420532227
epoch:  9  loss:  9550.246963500977
epoch:  10  loss:  9268.257263183594
epoch:  11  loss:  8989.82534790039
epoch:  12  loss:  8732.20588684082
epoch:  13  loss:  8545.261978149414
epoch:  14  loss:  8402.643768310547
epoch:  15  loss:  8218.621681213379
epoch:  16  loss:  8014.382919311523
epoch:  17  loss:  7916.400215148926
epoch:  18  loss:  7764.685127258301
epoch:  19  loss:  7670.103096008301
epoch:  20  loss:  7541.582809448242
epoch:  21  loss:  7386.544410705566
epoch:  22  loss:  7223.188362121582
epoch:  23  loss:  7068.006080627441
epoch:  24  loss:  6991.621826171875
epoch:  25  loss:  6908.595809936523
epoch:  26  loss:  6746.548187255859
epoch:  27  loss:  6766.

In [91]:
node2vec = {}
# dataset_name = "zhihu"
f = open('temp/embed.txt', 'rb')
for i, j in enumerate(f):
    if j.decode() != '\n':
        node2vec[i] = list(map(float, j.strip().decode().split(' ')))
f1 = open(os.path.join('temp/test_graph.txt'), 'rb')
edges = [list(map(int, i.strip().decode().split('\t'))) for i in f1]
nodes = list(set([i for j in edges for i in j]))
a = 0
b = 0
for i, j in edges:
    if i in node2vec.keys() and j in node2vec.keys():
        dot1 = np.dot(node2vec[i], node2vec[j])
        random_node = random.sample(nodes, 1)[0]
        while random_node == j or random_node not in node2vec.keys():
            random_node = random.sample(nodes, 1)[0]
        dot2 = np.dot(node2vec[i], node2vec[random_node])
        if dot1 > dot2:
            a += 1
        elif dot1 == dot2:
            a += 0.5
        b += 1

print("Auc value:", float(a) / b)

Auc value: 0.9487847222222222
