# Load Dataset

In [None]:
import csv
from tqdm import tqdm
from tqdm import trange

# Containers shared by the three split-loading cells.
Index = []        # every node id, in the order rows are read
Title = {}        # node id -> CSV column 2
Year = {}         # node id -> CSV column 3 (kept as the raw string)
Abstract = {}     # node id -> CSV column 4
train_Index = []  # node ids read from train_data.csv
valid_Index = []  # filled by the validation cell
test_Index = []   # filled by the test cell

# newline='' is what the csv module asks for: the reader then handles line
# endings itself, so quoted fields that contain line breaks are parsed intact.
with open('./train_data.csv', 'r', newline='') as f:
    reader = list(csv.reader(f))  # read everything first so the bar has a total
    for r in trange(len(reader)):
        row = reader[r]
        node_id = int(row[0])  # column 0 is used as the integer node id
        Index.append(node_id)
        train_Index.append(node_id)
        Title[node_id] = row[2]
        Year[node_id] = row[3]
        Abstract[node_id] = row[4]

In [None]:
# Validation split: same column layout as the training file.
# newline='' lets the csv reader handle line endings itself, so quoted fields
# that contain line breaks are parsed intact.
with open('./valid_data.csv', 'r', newline='') as f:
    reader = list(csv.reader(f))  # read everything first so the bar has a total
    for r in trange(len(reader)):
        row = reader[r]
        node_id = int(row[0])  # column 0 is used as the integer node id
        Index.append(node_id)
        valid_Index.append(node_id)
        Title[node_id] = row[2]
        Year[node_id] = row[3]
        Abstract[node_id] = row[4]

In [None]:
# Test split: same column layout as the training file.
# newline='' lets the csv reader handle line endings itself, so quoted fields
# that contain line breaks are parsed intact.
with open('./test_data.csv', 'r', newline='') as f:
    reader = list(csv.reader(f))  # read everything first so the bar has a total
    for r in trange(len(reader)):
        row = reader[r]
        node_id = int(row[0])  # column 0 is used as the integer node id
        Index.append(node_id)
        test_Index.append(node_id)
        Title[node_id] = row[2]
        Year[node_id] = row[3]
        Abstract[node_id] = row[4]

In [None]:
# Map each node id to its integer class label; one "<node>\t<label>" pair per line.
Label = {}
with open('./labels.txt','r') as file:
    for line in file:
        # Strip only the line terminator: slicing off the last character would
        # eat a digit of the label when the final line has no trailing newline.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)
        temp1,temp2 = line.split('\t')
        Label[int(temp1)] = int(temp2)

# Calculate the Number of Tokens per Node

In [None]:
from transformers import AutoTokenizer
model_name = "Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def calculate_token_num(text):
    """Return how many ids `tokenizer.encode` produces for `text`."""
    return len(tokenizer.encode(text))

# One count per node; Title and Abstract are looked up by the positions
# 0 .. len(Title)-1 and concatenated without a separator.
node_token_num = [
    calculate_token_num(Title[node] + Abstract[node])
    for node in trange(len(Title))
]
print(len(node_token_num))

In [None]:
import numpy as np
# Summary statistics of the per-node token counts.
tokens = np.array(node_token_num)
max_tokens, min_tokens, mean_tokens = tokens.max(), tokens.min(), tokens.mean()
print(f"Max #node tokens: {max_tokens}")
print(f"Min #node tokens: {min_tokens}")
print(f"Avg. #node tokens: {mean_tokens}")

# Calculate Node Degrees

In [None]:

# Undirected adjacency lists, indexed by node id (one list per entry of Title).
Graph=[[] for i in range(len(Title))]
Self_to_Self_node_count = 0  # lines whose two endpoints are the same node
Edge_Num = 0                 # grows by 2 per line: each line is stored in both directions
with open('./graph.txt','r') as file:
    for line in file:
        # Strip only the line terminator (not a blind last-character slice) so a
        # final line without a newline is parsed like every other line.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)
        start,end,_=line.split('\t')  # third column is not used
        start, end = int(start), int(end)
        Edge_Num+=2
        if end == start: Self_to_Self_node_count+=1
        Graph[start].append(end)
        Graph[end].append(start)
print("Self-Self:", Self_to_Self_node_count)
print("# Nodes:", len(Graph))
print("# Edges:", Edge_Num)

In [None]:
import numpy as np
from tqdm import trange

# Raw degree of every node = length of its adjacency list.
node_degree_num = [len(Graph[i]) for i in trange(len(Graph))]
# Number of nodes without any neighbour.
count = node_degree_num.count(0)

degrees_before = np.array(node_degree_num)

print(f"Max #node degrees_before: {degrees_before.max()}")
print(f"Min #node degrees_before: {degrees_before.min()}")
print(f"Avg. #node degrees_before: {degrees_before.mean()}")

print(f"#node degree=0: {count}")
print(f"#node degree=0/#total node: {count/len(Graph)}")

In [None]:
import numpy as np
from tqdm import trange

# Per split: how many nodes have an empty adjacency list.
count_train = sum(1 for node in train_Index if not Graph[int(node)])
count_valid = sum(1 for node in valid_Index if not Graph[int(node)])
count_test = sum(1 for node in test_Index if not Graph[int(node)])

print(f"#node degree=0 on training: {count_train}")
print(f"#node degree=0/#total node on training: {count_train/len(train_Index)}")

print(f"#node degree=0 on validation: {count_valid}")
print(f"#node degree=0/#total node on validation: {count_valid/len(valid_Index)}")

print(f"#node degree=0 on testing: {count_test}")
print(f"#node degree=0/#total node on testing: {count_test/len(test_Index)}")

In [None]:
import numpy as np
from tqdm import trange

node_degree_num = [len(Graph[i]) for i in trange(len(Graph))]

# Compressed degree: ceil(log(1 + degree)), so a node with no neighbours gets 0.
degrees = np.ceil(np.log1p(np.array(node_degree_num)))

print(f"Max #node degrees: {degrees.max()}")
print(f"Min #node degrees: {degrees.min()}")
print(f"Avg. #node degrees: {degrees.mean()}")

# Calculate Node H (H = #tokens × ceil(log(degree + 1)))

In [None]:
# Per-node score H = T * ceil(log(D + 1)): token count times the compressed degree.
H = np.multiply(tokens, degrees)
print("# Nodes:", len(H))
h_max, h_min, h_mean, h_sum = H.max(), H.min(), H.mean(), H.sum()
print(f"Max node H: {h_max}")
print(f"Min node H: {h_min}")
print(f"Avg. node H: {h_mean}")
print(f"Sum node H: {h_sum}")

# Generate Each Node's One-Hop Neighbors and Random Walks

In [None]:
# The cells below iterate over train_nodes. This binds a second name to the
# same list object as train_Index; no copy is made.
train_nodes=train_Index

In [None]:

# For every training node keep its int(Avg_degree) highest-H neighbours, plus any
# further neighbour whose H is at least half of the central node's H.
Avg_degree = degrees_before.mean()
top_k = int(Avg_degree)
Neighs = {node: [] for node in train_nodes}

for central_node in train_nodes:
    half_central_H = H[central_node] / 2  # H*/2

    # Neighbour -> H. Building a dict also drops repeated neighbours while
    # keeping the position of their first occurrence.
    neigh_H = {neigh: H[neigh] for neigh in Graph[central_node]}

    # Highest H first; the sort is stable, so ties keep adjacency-list order.
    ranked = sorted(neigh_H, key=neigh_H.get, reverse=True)

    kept = ranked[:top_k]
    kept.extend(neigh for neigh in ranked[top_k:] if neigh_H[neigh] >= half_central_H)
    Neighs[central_node] = kept


# Statistics over the selected neighbour lists.
neigh_counts = [len(Neighs[central_node]) for central_node in train_nodes]
count = neigh_counts.count(0)
avg_count = sum(neigh_counts)
print(f"#Central Node's obtained Neighbor=0: {count}")
print(f"#Central Node's obtained Neighbor=0: {count/len(train_nodes)*100}%")
print(f"Avg. #Central Node's obtained Neighbor: {avg_count/len(train_nodes)}")
print(f"Avg. #Central Node's obtained Neighbor (#/=0): {avg_count/(len(train_nodes)-count)}")


In [None]:

import random
# Random-walk sampling: a training node that kept fewer than int(Avg_degree)
# one-hop neighbours is topped up with random walks starting at that node.
RW_SEED = 0          # fixed seed so regenerating the dataset yields the same walks
MAX_PATH_NODES = 10  # walks with more nodes than this (start node included) are discarded

RWs = {}
for node in train_nodes: RWs[node] = []
start_idx = train_nodes

rand=random.Random(RW_SEED)
max_walk_num = int(Avg_degree)
alpha=0.15         # probability of stopping before each step
path_length=10000  # hard cap on the number of steps of a single walk
for line in tqdm(range(len(start_idx))):
    central_node = start_idx[line]

    # A node without neighbours cannot start a walk.
    if len(Graph[central_node])==0: continue

    # Walks only fill the gap between the kept neighbours and max_walk_num.
    walk_num = max_walk_num - len(Neighs[central_node])
    if walk_num <= 0: continue

    node_paths = []
    while len(node_paths) < walk_num:
        temp_path=[central_node]
        for i in range(path_length):
            cur = temp_path[-1]
            # Stop at a dead end, or with probability alpha.
            if len(Graph[cur])==0 or rand.random()<alpha: break
            # One uniform draw among the current node's neighbours.
            temp_path.append(rand.choice(Graph[cur]))
        # Keep only walks that moved at least once and are not too long.
        if 2 <= len(temp_path) <= MAX_PATH_NODES: node_paths.append(temp_path)
    RWs[central_node] = node_paths

In [None]:

# Statistics over the sampled random walks.
walk_counts = [len(RWs[central_node]) for central_node in train_nodes]
count = walk_counts.count(0)
avg_count = sum(walk_counts)
print(f"#Central Node's obtained RWs=0: {count}")
print(f"#Central Node's obtained RWs=0: {count/len(train_nodes)*100}%")
print(f"Avg. #Central Node's obtained RWs: {avg_count/len(train_nodes)}")
print(f"Avg. #Central Node's obtained RWs (#/=0): {avg_count/(len(train_nodes)-count)}")

# Generate Node Classification Instructions

In [None]:
def get_subgraph(central_node, Neighs, RWs):
    """Render the textual "compact graph description" of one paper.

    Reads the module-level ``Title`` and ``Abstract`` dicts.

    Args:
        central_node: id of the target paper.
        Neighs: sequence of one-hop neighbour ids selected for the paper.
        RWs: sequence of random-walk paths, each a sequence of node ids.

    Returns:
        str: only the title/abstract line when both ``Neighs`` and ``RWs``
        are empty. Otherwise that line followed by an "Ego graph nodes"
        section, a "One-hop neighbors" section (rendered as ``{}`` when there
        are no neighbours) and, if there are walks, a "Random walks" section.
    """
    Input =  "The compact graph description of this PAPER is listed as follows:\n"
    Input += "Title: {" + Title[central_node] + "} Abstract: {" + Abstract[central_node] + "}"

    if len(Neighs)==0 and len(RWs)==0: return Input
    Input += " "

    #---------------------------------------------------------------------
    # Ego graph nodes: the central node first, then every other distinct node
    # that appears among the neighbours or on a random walk.
    temp_nodes = []
    for node in Neighs:
        if node == central_node: continue
        temp_nodes.append(node)
    for path in RWs:
        for node in path:
            if node == central_node: continue
            temp_nodes.append(node)
    graph_node = [central_node] + list(set(temp_nodes))

    # node2index maps a node id to the 1-based number shown in the text.
    node2index = {}
    entries = []
    for j,node in enumerate(graph_node):
        node2index[node] = str(j+1)
        entries.append("(" + str(j+1) + ") '" + Title[node] + "'")
    Input += "Ego graph nodes: {PAPER: [" + ", ".join(entries) + "]}\n"

    #---------------------------------------------------------------------
    # Neighbourhood information. join() also closes the braces when Neighs is
    # empty, so the section stays well-formed when only random walks exist.
    neigh_refs = ["(" + node2index[node] + ")" for node in Neighs]
    Input += "One-hop neighbors: {" + ", ".join(neigh_refs) + "}\n"

    #---------------------------------------------------------------------
    # Random walks, labelled A, B, C, ... in the order given.
    if len(RWs)>0:
        walks = []
        for j,path in enumerate(RWs):
            steps = " cited ".join("(" + node2index[node] + ")" for node in path)
            walks.append(chr(ord('A')+j) + ". " + steps)
        Input += "Random walks: {" + "; ".join(walks) + "}"
    return Input

In [None]:
# Class names in file order; the list position is later used as the label index.
NC_Type = []
with open('./label_text.dat','r') as file:
    for line in file:
        # Strip only the line terminator: slicing off the last character would
        # truncate the last class name when the file has no trailing newline.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines (e.g. at the end of the file)
        temp1,temp2 = line.split('\t')  # first column is not used
        NC_Type.append(temp2)

In [None]:
# Numbered list of all categories: "{1. <name> 2. <name> ...}".
numbered = [str(i+1) + ". " + L for i, L in enumerate(NC_Type)]
type_text = "{" + " ".join(numbered)
if numbered: type_text += "}"  # the brace is closed together with the last category
type_text

In [None]:

# The prompt is identical for every node, so it is built once.
Instruction = "Given the target PAPER with the compact graph description in Arxiv dataset, which of the following subcategories of computer science does this PAPER belong to: " + type_text + \
    ". Directly give the most likely category of this PAPER."

# One record per training node: prompt, graph description, expected answer, node id.
texts = []
for central_node in train_nodes:
    label = Label[central_node]
    texts.append({
        "instruction": Instruction,
        "input": get_subgraph(central_node, Neighs[central_node], RWs[central_node]),
        # Categories are numbered from 1 in the prompt, labels from 0.
        "output": str(label+1) + ". " + NC_Type[label],
        "index": int(central_node),
    })

In [None]:
import json
# Write all instruction records as one JSON array, indented by two spaces.
with open("nc_arxiv.json", "w") as out_file:
    json.dump(texts, out_file, indent=2)