# ***Libraries & Tools***

In [None]:
import time
import networkx as nx
import numpy as np
import torch
import re
import math
import random
import gc

from torch import nn
from torch.nn import init
from torch.nn import functional as F

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Index of the CUDA card every model and tensor in this notebook is placed on.
gpu_id = 2
gpu = torch.device(f'cuda:{gpu_id}')

# ***Global Variables & General Functionality***

In [None]:
# --- Dataset location -------------------------------------------------------
# Every input file and every result file below is resolved relative to parent_path.
dataset_name = "arxiv"
parent_path = f'Datasets/{dataset_name}/graph-v2'
graph_file = 'graph.txt'  # edge list consumed by MyGraph
categories_file = 'group-v2.txt'  # node labels, read by read_node_label and the node-classification cell

# Text file used by the single-execution training cell, and the full list of
# text files whose mean word counts are collected into MAX_LENS.
data_text_file  = "data-v3-500.txt" 
data_text_files = ["data-v3-500.txt", "data-v3-500C.txt", "YAKE10.txt", "YAKE5.txt", "RAKE10.txt", "RAKE5.txt", "RAKE10C.txt", "RAKE5C.txt", "TFIDF10.txt", "TFIDF5.txt", "PosR5.txt",
                   "PosR10.txt", "TextR5.txt", "TextR10.txt", "TopicR5.txt", "TopicR10.txt"]

vocab_file = 'vocab.txt'  # one token per line; must contain '<eos>' (used as padding by read_word_code)

# --- Output file names ------------------------------------------------------
# NOTE(review): log_file is not referenced in the visible part of the notebook.
log_file               = 'Net2Net-NE_Execution_Logs.txt'
link_pred_results_file = 'Net2Net-NE_Link_Pred_Res.txt'
node_clf_results_file  = 'Net2Net-NE_Node_Clf_Res.txt'


# --- Link-prediction graph splits --------------------------------------------
# test_graph_files[k] is evaluated against the k-th group of embedding files
# in the link-prediction cell.
# NOTE(review): the split_graph_* names and test_graph_file are not referenced
# in the visible cells; presumably they are the training counterparts - confirm.
split_graph_file  = 'sgraph15.txt' 
split_graph_files = ['sgraph15.txt', 'sgraph45.txt', 'sgraph75.txt']
test_graph_file   = 'tgraph85.txt' 
test_graph_files  = ['tgraph85.txt', 'tgraph55.txt', 'tgraph25.txt']


# --- Text encoder input ------------------------------------------------------
word_num = 12619  # number of rows of the word-embedding table built by ContentCNN
MAX_LEN = 300 # Default value for single execution
MAX_LENS = [] # List to hold values for multiple executions

# --- Model / optimisation hyper-parameters ------------------------------------
word_emb_dim = 500  # width of each word embedding
conv_dim = 500  # output width of ContentCNN
kernel_num = 200  # number of filters per kernel size
kernel_sizes = [1, 2, 3, 4, 5]  # convolution window heights (in tokens)
conv_drop = 0.2  # dropout probability applied after pooling
enc_dim = 500  # embed_dim handed to both EgoEncoder layers
batch_size = 64
epoch_num = 50
l_rate = 1e-3 
# --- Node-classification evaluation -------------------------------------------
clf_ratio = [0.15, 0.45, 0.75]  # training fractions tried by the classifier
clf_num = 5  # repetitions averaged per training fraction
train_classifier = True  # gate for the node-classification cell

In [None]:
# Record, for every text file, the mean number of word tokens per line
# (rounded up) in MAX_LENS. Swap `data_text_files` for a one-element list to
# process a single file only.
word_pattern = re.compile(r"\b\w+\b")
for txtf in data_text_files:
    with open(f'{parent_path}/{txtf}', 'r', encoding='utf-8') as file:
        words_per_line = [len(word_pattern.findall(line)) for line in file]

    total_lines = len(words_per_line)
    total_word_count = sum(words_per_line)
    if total_lines > 0:
        mean_word_count = total_word_count / total_lines
    else:
        mean_word_count = 0  # empty file: nothing to average

    rounded_mean = math.ceil(mean_word_count)
    MAX_LENS.append(int(rounded_mean))
    print(f'=== {txtf} ===')
    print("Mean word count:", rounded_mean)
    print()

In [None]:
# Notebook display of the collected lengths; entries follow the order in which
# the word-count loop above visited the text files.
MAX_LENS

In [None]:
# For single execution: use the length measured for `data_text_file`, the file
# the training cell actually reads. MAX_LENS[-1] belongs to the *last* file
# visited by the word-count loop, which only matches when that loop was run on
# a single file; that case is kept as the fallback.
if len(MAX_LENS) == len(data_text_files) and data_text_file in data_text_files:
    MAX_LEN = MAX_LENS[data_text_files.index(data_text_file)]
else:
    MAX_LEN = MAX_LENS[-1]

In [None]:
def get_vectors_from_file(file_path):
    """Read whitespace-separated floats, one vector per line.

    Returns a dict mapping the 0-based line number to the list of floats found
    on that line (an empty list for a blank line).
    """
    with open(f'{file_path}', "r", encoding='utf-8') as handle:
        return {
            row: [float(token) for token in text.strip().split()]
            for row, text in enumerate(handle)
        }

In [None]:
# All-zero integer vector with one entry per word-embedding dimension.
zero_list = np.array([0] * word_emb_dim)

# ***Classify***

In [None]:
class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that predicts a fixed number of labels per sample."""

    def predict(self, X, top_k_list):
        """Return a 0/1 indicator matrix with the k most probable labels set.

        Args:
            X: samples accepted by ``predict_proba``.
            top_k_list: for each row of ``X``, how many labels to switch on.

        Returns:
            Array with one row per sample; the entries indexed by the selected
            ``classes_`` values are 1, all others 0.
        """
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            # `argsort()[-0:]` is the whole array, so k == 0 must be handled
            # explicitly or every label would be switched on.
            if k > 0:
                labels = self.classes_[probs_.argsort()[-k:]].tolist()
            else:
                labels = []
            probs_[:] = 0
            if labels:
                probs_[labels] = 1
            all_labels.append(probs_)
        return np.asarray(all_labels)


class Classifier(object):
    """Multi-label classifier over precomputed node embeddings.

    ``vectors`` is indexed by node key; ``clf`` is the base estimator that is
    wrapped in a ``TopKRanker``.
    """

    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the label binarizer on ``Y_all`` and the ranker on ``(X, Y)``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def evaluate(self, X, Y):
        """Return the micro-averaged F1 of the predictions for nodes ``X``.

        Each node is asked for exactly as many labels as it truly has.
        """
        labels_per_node = [len(node_labels) for node_labels in Y]
        predicted = self.predict(X, labels_per_node)
        expected = self.binarizer.transform(Y)
        return f1_score(expected, predicted, average="micro")

    def predict(self, X, top_k_list):
        """Predict a label indicator matrix for nodes ``X``."""
        features = np.asarray([self.embeddings[node] for node in X])
        return self.clf.predict(features, top_k_list=top_k_list)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle with ``seed``, train on the leading fraction, score the rest.

        The global NumPy random state is saved before seeding and restored
        after training, so callers' random streams are unaffected.
        """
        saved_state = np.random.get_state()

        training_size = int(train_precent * len(X))
        np.random.seed(seed)
        order = np.random.permutation(np.arange(len(X)))
        train_positions = order[:training_size]
        test_positions = order[training_size:]

        X_train = [X[pos] for pos in train_positions]
        Y_train = [Y[pos] for pos in train_positions]
        X_test = [X[pos] for pos in test_positions]
        Y_test = [Y[pos] for pos in test_positions]

        self.train(X_train, Y_train, Y)
        np.random.set_state(saved_state)
        return self.evaluate(X_test, Y_test)


def load_embeddings(filename):
    """Load embeddings from a text file with a ``<node_num> <size>`` header.

    Every following line is ``<key> <v1> ... <v_size>`` separated by single
    spaces. Returns a dict mapping the key (kept as a string) to a list of
    floats.

    Raises:
        AssertionError: if a row does not have ``size`` values or the number of
            distinct keys differs from ``node_num``.
    """
    vectors = {}
    # `with` guarantees the handle is released even when a row fails to parse.
    with open(filename, 'r') as fin:
        node_num, size = [int(x) for x in fin.readline().strip().split()]
        for line in fin:
            vec = line.strip().split(' ')
            assert len(vec) == size + 1
            vectors[vec[0]] = [float(x) for x in vec[1:]]
    assert len(vectors) == node_num
    return vectors


def read_node_label(filename):
    """Read ``<node> <label>`` pairs from a space-separated text file.

    Only lines with exactly two fields are kept; lines with no label or with
    several labels are skipped.

    Returns:
        ``(X, Y)`` where ``X`` is the list of node ids and ``Y`` the parallel
        list of single-element label lists.
    """
    X = []
    Y = []
    # `with` guarantees the handle is released even when int() raises.
    with open(filename, 'r') as fin:
        for line in fin:
            vec = line.strip().split(' ')
            if len(vec) == 2:
                X.append(int(vec[0]))
                Y.append([int(v) for v in vec[1:]])
    return X, Y


# ***Utilities***

In [None]:
# Read node features from file
def read_node_fea(feature_path):
    """Load a float32 feature matrix from a whitespace-separated text file.

    The first field of every line is ignored; the remaining fields are parsed
    as floats and become one row of the returned array.
    """
    # `with` guarantees the handle is released even when float() raises.
    with open(feature_path, 'r') as fin:
        fea = [np.array([float(x) for x in line.split()[1:]]) for line in fin]
    return np.array(fea, dtype='float32')


def read_word_code(text_path, voca_path):
    """Encode every line of a text file as a list of vocabulary indices.

    Args:
        text_path: UTF-8 text file, one document per line, tokens separated by
            whitespace.
        voca_path: UTF-8 vocabulary file, one token per line; a token's code is
            its 0-based line number. Must contain ``'<eos>'``.

    Returns:
        ``(content_code, pad_code)``: the per-line lists of codes (not padded
        or truncated; an empty line yields an empty list) and the code of
        ``'<eos>'``, which callers use for padding.

    Raises:
        KeyError: if ``'<eos>'`` or any document token is missing from the
            vocabulary.
    """
    # Both files are closed deterministically; previously the text handle was
    # never closed.
    with open(voca_path, 'r', encoding='utf-8') as fin:
        words = [line.strip() for line in fin]
    word_map = {word: code for code, word in enumerate(words)}
    pad_code = word_map['<eos>']

    with open(text_path, 'r', encoding='utf-8') as fin:
        # split() (not split(' ')) so blank lines and repeated spaces do not
        # produce '' tokens that are absent from the vocabulary.
        content_code = [[word_map[w] for w in line.split()] for line in fin]
    return content_code, pad_code


def fetch(content_code, ids, max_len, pad_code):
    """Return the documents of ``ids`` cut or padded to exactly ``max_len`` codes.

    Args:
        content_code: sequence of per-document code lists, indexed by node id.
        ids: node ids to look up, in output order.
        max_len: length of every returned list.
        pad_code: code appended to documents shorter than ``max_len``.

    Returns:
        A list of new lists; ``content_code`` itself is never modified.
    """
    code = []
    for node_id in ids:
        # Slicing copies, so padding below cannot leak back into the stored
        # document (the previous in-place extend() permanently padded it).
        doc_code = content_code[node_id][:max_len]
        doc_code += [pad_code] * (max_len - len(doc_code))
        code.append(doc_code)

    return code


def node_classification(hidden, idx, label, ratio):
    """Train logistic regression on a ``ratio`` split of ``idx`` and return micro-F1."""
    classifier = Classifier(vectors=hidden, clf=LogisticRegression())
    return classifier.split_train_evaluate(idx, label, ratio)


def exclusive_combine(*in_list):
    """Merge the items of several iterables into one duplicate-free list.

    Called with a single argument that is itself an iterable of iterables
    (or with no argument at all, which yields an empty list).
    """
    groups = list(*in_list)
    unique_items = {item for group in groups for item in group}
    return list(unique_items)


def identity_map(n_list):
    """Map each element of ``n_list`` to its position (last position wins on duplicates)."""
    return {node: position for position, node in enumerate(n_list)}


def agg_mean(M, id_dict, keys):
    """Average the rows of ``M`` selected by ``keys`` into a single 1 x D row.

    ``id_dict`` translates every key into its row index in ``M``.
    """
    rows = [id_dict[key] for key in keys]
    return M[rows, :].mean(dim=0, keepdim=True)


def agg_max(M, id_dict, keys):
    """Take the column-wise maximum over the rows of ``M`` selected by ``keys``.

    ``id_dict`` translates every key into its row index in ``M``; the result
    keeps a leading dimension of size 1.
    """
    rows = [id_dict[key] for key in keys]
    return M[rows, :].max(dim=0, keepdim=True).values

# ***Graph***

In [None]:
class MyGraph(object):
    """Undirected graph stored as an adjacency dictionary.

    Attributes (all set by ``__init__``):
        neighbor_dict: node id -> list of neighbouring node ids.
        node_list: sorted list of all node ids.
        node_num: number of nodes.
    """

    def __init__(self, path, edgelist=True):
        """Build the graph from a whitespace-separated edge list file.

        Args:
            path: file whose lines start with two integer node ids.
            edgelist: when False the file is not read and the graph is empty.
        """
        self.neighbor_dict = {}
        if edgelist:
            # `with` releases the handle even when a line fails to parse.
            with open(path, 'r') as fin:
                for line in fin:
                    e = line.split()
                    if not e:
                        continue  # tolerate blank lines
                    # update_edge already records both directions.
                    self.update_edge(int(e[0]), int(e[1]))

        # Convert each node's neighbours from a set to a list
        self.neighbor_dict = {
            node: list(neighbors) for node, neighbors in self.neighbor_dict.items()
        }

        self.node_list = sorted(self.neighbor_dict)
        self.node_num = len(self.node_list)

    def update_edge(self, i, j):
        """Record the undirected edge ``i - j`` (only valid while neighbours are sets)."""
        self.neighbor_dict.setdefault(i, set()).add(j)
        self.neighbor_dict.setdefault(j, set()).add(i)

    def get_batches(self, batch_size):
        """Split a random permutation of the nodes into batches.

        All batches have ``batch_size`` nodes except possibly the last, which
        holds the remainder. ``node_list`` is sorted again before returning.
        """
        # np.random.seed(1)
        np.random.shuffle(self.node_list)
        batch_list = [
            self.node_list[start: start + batch_size]
            for start in range(0, self.node_num, batch_size)
        ]

        self.node_list.sort()
        return batch_list

    def get_neighbors(self, in_list):
        """Return the duplicate-free union of the neighbours of ``in_list``."""
        neighbors = [self.neighbor_dict[i] for i in in_list]
        return exclusive_combine(neighbors)

    def diffuse(self, step, nodes):
        """Grow ``nodes`` by one neighbourhood hop per step.

        Returns a list of ``step + 1`` node lists, starting with ``nodes``
        itself; each later entry adds the neighbours of the previous one.
        """
        cur_list = nodes
        scale_list = [cur_list]
        for _ in range(step):
            neighbors = self.get_neighbors(cur_list)
            cur_list = exclusive_combine([cur_list, neighbors])
            scale_list.append(cur_list)
        return scale_list  # From now to the past

    def statistic(self):
        """Return ``(max, min, mean)`` of the node degrees."""
        neigh_num = [len(self.neighbor_dict[n]) for n in self.node_list]
        return np.max(neigh_num), np.min(neigh_num), np.mean(neigh_num)

# ***Models***

In [None]:
class MeanAggregator(nn.Module):
    """Average the features of each node together with those of its neighbours."""

    def __init__(self, features, cur_device, gcn=False):
        """Store the feature callable, the device for the mask, and the gcn flag."""
        super().__init__()
        self.features = features
        self.device = cur_device
        self.gcn = gcn

    def forward(self, nodes, to_neighs):
        """Return one mean feature row per entry of ``nodes``.

        ``to_neighs[k]`` is the neighbour list of ``nodes[k]``; the node itself
        is appended to that list before averaging.
        """
        samp_neighs = [to_neighs[k] + [nodes[k]] for k in range(len(to_neighs))]

        unique_nodes_list = exclusive_combine(samp_neighs)
        column_of = {node: col for col, node in enumerate(unique_nodes_list)}

        # One (row, column) pair per membership of a node in a neighbourhood
        row_indices = []
        column_indices = []
        for row, members in enumerate(samp_neighs):
            for member in members:
                row_indices.append(row)
                column_indices.append(column_of[member])

        # Binary membership mask, then row-normalised so each row averages
        mask = torch.zeros(len(samp_neighs), len(column_of), requires_grad=False, device=self.device)
        mask[row_indices, column_indices] = 1
        mask = mask / mask.sum(1, keepdim=True)

        embed_matrix = self.features(unique_nodes_list)
        return mask.mm(embed_matrix)  # node_num * fea_dim

In [None]:
class EgoEncoder(nn.Module):
    """Encode nodes by projecting their aggregated neighbourhood features."""

    def __init__(self, features, feature_dim, embed_dim, graph, aggregator, base_model=None):
        """
        Args:
            features: callable producing input features for a list of nodes.
            feature_dim: width of the aggregated features.
            embed_dim: width of the produced embeddings.
            graph: object exposing ``neighbor_dict`` (node -> neighbour list).
            aggregator: object whose ``forward(nodes, to_neighs)`` returns a
                ``len(nodes) x feature_dim`` tensor.
            base_model: optional lower-level module, stored as an attribute so
                that it is registered with this module.
        """
        super(EgoEncoder, self).__init__()

        self.features = features
        self.feat_dim = feature_dim
        self.embed_dim = embed_dim
        self.graph = graph
        self.aggregator = aggregator
        if base_model is not None:
            self.base_model = base_model

        self.weight = nn.Parameter(torch.FloatTensor(self.feat_dim, embed_dim))
        # xavier_uniform_ is the non-deprecated spelling of the same initialiser.
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """Return ``tanh(aggregate(nodes) @ weight)``, shape ``len(nodes) x embed_dim``."""
        to_neighs = [self.graph.neighbor_dict[node] for node in nodes]
        neigh_feats = self.aggregator.forward(nodes, to_neighs)
        # The projection result used to be discarded, so `weight` never took
        # part in the output (and never received gradients).
        combined = neigh_feats.mm(self.weight)
        combined = torch.tanh(combined)
        return combined  # node_num * emb_dim

In [None]:
class ContentCNN(nn.Module):
    """Text CNN: word embeddings -> parallel convolutions -> max-pool -> projection."""

    def __init__(self, word_num, word_emb_dim, conv_dim, kernel_num, kernel_sizes, dropout, cur_device):
        """
        Args:
            word_num: number of rows of the word-embedding table.
            word_emb_dim: width of each word embedding.
            conv_dim: width of the output vector.
            kernel_num: number of filters per kernel size.
            kernel_sizes: window heights (in tokens) of the convolutions.
            dropout: dropout probability applied to the pooled features.
            cur_device: device on which input code tensors are created.
        """
        super(ContentCNN, self).__init__()
        self.word_embeddings = nn.Embedding(word_num, word_emb_dim)

        # CNN with different kernel sizes
        self.conv_list = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, word_emb_dim)) for K in kernel_sizes])

        self.dropout = nn.Dropout(dropout)
        self.weight = nn.Parameter(torch.FloatTensor(len(kernel_sizes) * kernel_num, conv_dim))
        self.device = cur_device

        # xavier_uniform_ is the non-deprecated spelling of the same initialiser.
        init.xavier_uniform_(self.word_embeddings.weight)
        init.xavier_uniform_(self.weight)

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` + ReLU to ``x`` (N, Ci, W, D) and max-pool over positions -> (N, Co)."""
        x_conv = conv(x)
        x_act = F.relu(x_conv).squeeze(3)  # (N, Co, W)
        x_pool = F.max_pool1d(x_act, x_act.size(2)).squeeze(2)
        return x_pool

    def forward(self, node_batch):
        """Encode a batch of equally long code lists into ``(N, conv_dim)`` vectors.

        Every list must be at least as long as the largest kernel size.
        """
        # Build the index tensor directly on the configured device; unlike a
        # hard-coded .cuda() call this also works when the device is the CPU.
        query = torch.as_tensor(node_batch, dtype=torch.long, device=self.device)
        x = self.word_embeddings(query)  # (N, W, D)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        pooled = [self.conv_and_pool(x, conv) for conv in self.conv_list]  # [(N, Co), ...]*len(Ks)

        x = torch.cat(pooled, 1)

        x = self.dropout(x)  # (N, len(Ks)*Co)
        logit = x.mm(self.weight)
        logit = torch.tanh(logit)
        return logit

# ***Net2Net-NE***

In [None]:
class Net2Net(nn.Module):
    """Scores every graph node from an encoder embedding; trained with cross-entropy."""

    def __init__(self, global_graph, features, encoder):
        """
        Args:
            global_graph: object exposing ``node_num`` (number of output classes).
            features: feature module/callable, stored on the model.
            encoder: module mapping a list of nodes to ``len(nodes) x embed_dim``
                embeddings and exposing ``embed_dim``.
        """
        super(Net2Net, self).__init__()
        self.graph = global_graph
        self.node_num = self.graph.node_num
        self.embed_dim = encoder.embed_dim
        self.features = features
        self.encoder = encoder
        self.xent = nn.CrossEntropyLoss()

        self.weight = nn.Parameter(torch.FloatTensor(self.embed_dim, self.node_num))
        # xavier_uniform_ is the non-deprecated spelling of the same initialiser.
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """Return the ``len(nodes) x node_num`` score matrix."""
        embeds = self.encoder(nodes)
        scores = embeds.mm(self.weight)
        return scores

    def loss(self, nodes, labels):
        """Cross-entropy between the scores of ``nodes`` and integer ``labels``.

        ``labels`` may be shaped ``(B,)`` or ``(B, 1)``.
        """
        scores = self.forward(nodes)
        # reshape(-1) always yields a 1-D target; squeeze() would collapse a
        # batch of one node to a 0-d tensor, which does not match (1, C) scores.
        return self.xent(scores, labels.reshape(-1))

    def evaluate(self, b_list, lab, ratio):
        """Embed all batches and return one micro-F1 per training ratio.

        Switches the model to eval mode (it is not switched back here).
        """
        self.eval()
        hidden = []
        idx = []
        # Inference only: no autograd graph needs to be kept.
        with torch.no_grad():
            for bat in b_list:
                h = self.encoder(bat)
                hidden.extend(h.detach().cpu().numpy())
                idx.extend(bat)

        node_labels = [lab[i] for i in idx]
        positions = np.arange(len(lab))
        return [node_classification(hidden, positions, node_labels, r) for r in ratio]

# ***Train (Single Execution)***

In [None]:
start = time.time()

# Read graph
graph = MyGraph(f'{parent_path}/{graph_file}')

# Read node labels
_, labels = read_node_label(f'{parent_path}/{categories_file}')

# Read node content (abstracts) and vocabulary of contents
node_content, pad_code = read_word_code(f'{parent_path}/{data_text_file}', f'{parent_path}/{vocab_file}')

# Text encoder shared by both aggregation levels
features = ContentCNN(word_num, word_emb_dim, conv_dim, kernel_num, kernel_sizes, conv_drop, gpu)

# First level: aggregate the text features of each node's neighbourhood
agg1 = MeanAggregator(lambda nodes: features(fetch(node_content, nodes, MAX_LEN, pad_code)), gpu)
enc1 = EgoEncoder(lambda nodes: features(fetch(node_content, nodes, MAX_LEN, pad_code)), conv_dim, enc_dim, graph, agg1)

# Second level: aggregate the first-level embeddings
agg2 = MeanAggregator(lambda nodes: enc1(nodes), gpu)
enc2 = EgoEncoder(lambda nodes: enc1(nodes), enc1.embed_dim, enc_dim, graph, agg2, base_model=enc1)

c2n = Net2Net(graph, features, enc2)
c2n.cuda(gpu)

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, c2n.parameters()), lr=l_rate)

for e in range(epoch_num):
    avg_loss = []
    c2n.train()
    batch_list = graph.get_batches(batch_size)
    for batch in batch_list:
        optimizer.zero_grad()
        # Each node's own id is its target class
        loss = c2n.loss(batch, torch.tensor(batch, dtype=torch.int64, device=gpu))
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.item())

    # Node classification results
    # (`clf_ratio` is the configured list of training ratios; the name
    # `class_ratio` used here before is not defined anywhere.)
    f1_micro = c2n.evaluate(batch_list, labels, clf_ratio)
    minute = np.around((time.time() - start) / 60)
    ls = np.mean(avg_loss)
    print('Epoch:', e, 'loss:', ls, 'mi-F1:', np.around(f1_micro, 3), 'time:', minute, 'mins.')


## Link Prediction (For both single and multiple executions)


In [None]:
# One group of embedding files per training split (15% / 45% / 75% graphs);
# inside a group there is one file per text variant, always in the same order.
embed_files = [
    [
        f'{parent_path}/Results/Net2Net-NE/embed_link_pred_{split_name}_{text_variant}.txt'
        for text_variant in (
            'data-v3-500', 'data-v3-500C', 'YAKE10', 'PosR10', 'PosR5', 'YAKE5',
            'RAKE10', 'RAKE5', 'RAKE10C', 'RAKE5C', 'TFIDF5', 'TFIDF10',
            'TextR10', 'TextR5', 'TopicR10', 'TopicR5',
        )
    ]
    for split_name in ('sgraph15', 'sgraph45', 'sgraph75')
]


link_pred_results_path = f'{parent_path}/Results/Net2Net-NE/{link_pred_results_file}'

# Initialize a log file to store the AUC results
# (three columns, matching the rows written below)
with open(link_pred_results_path, "a") as f:
    f.write("Embed File\tTest Graph\tAUC Value\n")

for tgfi, tgf in enumerate(test_graph_files):
    # Load the edges from the test graph file once per test graph; they do not
    # depend on the embedding file. Blank lines are ignored.
    with open(f'{parent_path}/{tgf}', 'rb') as f:
        edges = [list(map(int, line.decode().split())) for line in f if line.strip()]

    nodes = list({n for edge in edges for n in edge})  # All the unique nodes in "edges"

    for ef in embed_files[tgfi]:
        # Load the embeddings from the current embed file. The key is the line
        # number; blank lines are skipped but still consume their line number.
        node2vec = {}
        with open(ef, 'rb') as f:
            for row, raw in enumerate(f):
                if raw.strip():
                    node2vec[row] = np.array([float(tok) for tok in raw.decode().split()])

        # Nodes that can serve as the random (negative) endpoint
        candidates = [n for n in nodes if n in node2vec]

        # Calculate AUC: a true edge counts 1 when it scores above a random
        # pair that shares its first endpoint, 0.5 on ties.
        a = 0
        b = 0
        # With fewer than two candidates no endpoint different from `j` can
        # exist, and the sampling loop below could never terminate.
        if len(candidates) > 1:
            for i, j in edges:
                if i in node2vec and j in node2vec:
                    dot1 = np.dot(node2vec[i], node2vec[j])
                    random_node = random.choice(candidates)
                    while random_node == j:
                        random_node = random.choice(candidates)
                    dot2 = np.dot(node2vec[i], node2vec[random_node])
                    if dot1 > dot2:
                        a += 1
                    elif dot1 == dot2:
                        a += 0.5
                    b += 1

        auc_value = float(a) / b if b > 0 else 0
        print(f"AUC value for {ef.split('/')[-1]}: {auc_value}")

        # Log the result
        with open(link_pred_results_path, "a") as f:
            f.write(f"{ef}\t{tgf}\t{auc_value}\n")

        gc.collect()

## Node Classification (For both single and multiple executions)


In [None]:
# Whole-graph embedding files used for node classification, one per text variant.
embed_files = [
    f'{parent_path}/Results/Net2Net-NE/embed_node_clf_graph_{text_variant}.txt'
    for text_variant in (
        'data-v3-500', 'data-v3-500C', 'YAKE10', 'PosR10', 'PosR5', 'YAKE5',
        'RAKE10', 'RAKE5', 'RAKE10C', 'RAKE5C', 'TFIDF5', 'TFIDF10',
        'TextR10', 'TextR5', 'TopicR10', 'TopicR5',
    )
]

with open(f'{parent_path}/{categories_file}', 'r') as f:
    tags = f.readlines()  # One raw line per node, of the form: nodeID     label


def _split_train_f1(vectors, X, Y, train_ratio, seed):
    """Train on a seeded random split and return {'micro': ..., 'macro': ...} F1.

    Same split/train/predict steps as Classifier.split_train_evaluate, but
    reports both averages (that method only returns the micro score).
    """
    clf = Classifier(vectors=vectors, clf=LogisticRegression())

    # Seed only for the permutation, then restore the caller's random state
    rng_state = np.random.get_state()
    np.random.seed(seed)
    order = np.random.permutation(len(X))
    np.random.set_state(rng_state)

    training_size = int(train_ratio * len(X))
    train_pos, test_pos = order[:training_size], order[training_size:]
    clf.train([X[p] for p in train_pos], [Y[p] for p in train_pos], Y)

    X_test = [X[p] for p in test_pos]
    Y_test = [Y[p] for p in test_pos]
    Y_pred = clf.predict(X_test, [len(node_labels) for node_labels in Y_test])
    Y_true = clf.binarizer.transform(Y_test)
    return {avg: f1_score(Y_true, Y_pred, average=avg) for avg in ('micro', 'macro')}


if train_classifier:

    # The node set is the same in each run since we're using the whole graph.
    # It is read from the graph file itself: the `nodes` variable left behind
    # by the link-prediction cell holds ints from a *test* graph, so the old
    # `str(jk) in nodes` check never matched.
    graph_nodes = MyGraph(f'{parent_path}/{graph_file}').node_list

    # First label of every labelled node; line "jk" of the label file is
    # taken to describe node "jk".
    first_label = {}
    for jk in graph_nodes:
        if jk < len(tags):
            tag_list = tags[jk].strip().split()  # [nodeID, label, ...]
            if len(tag_list) > 1:
                first_label[jk] = tag_list[1]

    for ef in embed_files:
        X = []
        Y = []
        new_vector = get_vectors_from_file(ef)

        for jk, label in first_label.items():
            # Keep the node only if its embedding exists and is not all zeros
            if jk in new_vector and np.any(new_vector[jk]):
                X.append(jk)
                # Wrap the label in a list: a bare string would be split into
                # single characters by MultiLabelBinarizer.
                Y.append([label])

        # This part of the code uses only the X and Y lists created above.
        # Results go next to this model's other outputs (Net2Net-NE, not DeepEmLAN).
        with open(f'{parent_path}/Results/Net2Net-NE/{node_clf_results_file}', 'a') as f:

            f.write(f"{ef.split('/')[-1]} \n")
            print(ef.split('/')[-1])

            if not X:
                print('No labelled node with a non-zero embedding; skipping.')
                f.write('\n')
                continue

            for ratio in clf_ratio:  # Experiment with each ratio
                li1 = []
                li2 = []
                for j in range(clf_num):
                    # A different seed per repetition, otherwise all clf_num
                    # runs use the identical split and the average is moot.
                    result = _split_train_f1(new_vector, X, Y, ratio, seed=j)
                    li1.append(result['micro'])
                    li2.append(result['macro'])

                mi = {f'{ratio}-micro': sum(li1) / clf_num}
                ma = {f'{ratio}-macro': sum(li2) / clf_num}

                print(mi)
                print(ma)
                print()

                f.write(f'{mi}{ma}\n')

        gc.collect()