# arxiv-2023 conversion script

## Data

In [57]:
import os
import torch
import pandas as pd
import numpy
import json

In [19]:
# Root directory of the arxiv-2023 dataset; every path below is relative to it.
base_path = "./LLM/dataset/arxiv_2023"


def _split_mask(split_ids, size):
    """Return a boolean tensor of length ``size`` marking the ids in ``split_ids``.

    Position ``i`` is True exactly when ``i`` occurs in ``split_ids``.
    """
    # Build the set once so each membership test is O(1); testing against the
    # raw list would rescan it for every node.
    members = set(split_ids)
    # dtype is pinned so that an empty graph still yields a boolean mask.
    return torch.tensor([node in members for node in range(size)],
                        dtype=torch.bool)


# Load processed data.
# NOTE: torch.load unpickles its input -- only point it at files you trust.
edge_index = torch.load(os.path.join(base_path, "processed", "edge_index.pt"))

# Load raw data.
# The raw edge list is left unloaded; the processed edge_index above is used instead.
# edge_df = pd.read_csv(os.path.join(base_path, "raw", "edge.csv.gz"), compression='gzip')
titles_df = pd.read_csv(os.path.join(base_path, "raw", "titles.csv.gz"), compression='gzip')
abstracts_df = pd.read_csv(os.path.join(base_path, "raw", "abstracts.csv.gz"), compression='gzip')
ids_df = pd.read_csv(os.path.join(base_path, "raw", "ids.csv.gz"), compression='gzip')
labels_df = pd.read_csv(os.path.join(base_path, "raw", "labels.csv.gz"), compression='gzip')

# Load split data (one CSV of node ids per split).
train_id_df = pd.read_csv(os.path.join(base_path, "split", "train.csv.gz"), compression='gzip')
val_id_df = pd.read_csv(os.path.join(base_path, "split", "valid.csv.gz"), compression='gzip')
test_id_df = pd.read_csv(os.path.join(base_path, "split", "test.csv.gz"), compression='gzip')

# The node count is taken from the ids table: one row per node.
num_nodes = len(ids_df)
titles = titles_df['titles'].tolist()
abstracts = abstracts_df['abstracts'].tolist()
ids = ids_df['ids'].tolist()
labels = labels_df['labels'].tolist()
train_id = train_id_df['train_id'].tolist()
val_id = val_id_df['val_id'].tolist()
test_id = test_id_df['test_id'].tolist()

# Precomputed node features and labels.
features = torch.load(os.path.join(base_path, "processed", "features.pt"))

y = torch.load(os.path.join(base_path, "processed", "labels.pt"))

# Per-split boolean masks over node positions 0..num_nodes-1.
train_mask = _split_mask(train_id, num_nodes)
val_mask = _split_mask(val_id, num_nodes)
test_mask = _split_mask(test_id, num_nodes)

In [45]:
from gli.io import save_graph, Attribute

# Node-level attributes stored with the graph. Each Attribute takes
# (name, data, description, data type, data format).
node_attrs = [
    # The node-classification task refers to "Node/NodeFeature" and
    # "Node/NodeLabel", so both must be saved with the graph.
    Attribute(
        "NodeFeature",
        numpy.array(features),
        "Node features of each article(node)",
        "float",  # assumes features.pt holds floating-point values -- TODO confirm
        "Tensor",
    ),
    Attribute(
        "NodeLabel",
        numpy.array(y),
        "Label of each article(node)",
        "int",  # assumes labels.pt holds integer class ids -- TODO confirm
        "Tensor",
    ),
    Attribute(
        "Titles",
        numpy.array(titles),
        "Title of each node",
        "str",
        "Tensor",
    ),
    Attribute(
        "Abstracts",
        numpy.array(abstracts),
        "Abstract of each article(node)",
        "str",
        "Tensor",
    ),
    Attribute(
        "Ids",
        # Ids are stored as strings; the loop variable avoids shadowing the
        # builtin ``id``.
        numpy.array([str(paper_id) for paper_id in ids]),
        "Id of each article(node)",
        "str",
        "Tensor",
    ),
]

# Write the graph files and keep the metadata returned by GLI.
metadata = save_graph(
    name="arxiv-2023",
    # edge_index is transposed so that each row holds one edge.
    edge=numpy.array(edge_index).T,
    num_nodes=num_nodes,
    node_attrs=node_attrs,
    description="ARXIV-2023 dataset.",
    cite=(
        "@misc{huang2023llms,\n"
        "title={Can LLMs Effectively Leverage Graph Structural Information: When and Why},\n"
        "author={Jin Huang and Xingjian Zhang and Qiaozhu Mei and Jiaqi Ma},\n"
        "year={2023},\n"
        "eprint={2309.16595},\n"
        "archivePrefix={arXiv},\n"
        "primaryClass={cs.LG}\n"
        "}"
    ),
)


In [61]:
# Display the node count (the length of ids_df) as a quick sanity check.
num_nodes

33868

In [56]:
from gli.io import save_task_node_classification

# Gather the task definition in one place before handing it to GLI.
# The feature/target entries name node attributes by their "Node/<name>" path.
task_spec = dict(
    name="arxiv-2023",
    description="Node classification on arxiv-2023 dataset.",
    feature=["Node/NodeFeature"],
    target="Node/NodeLabel",
    num_classes=40,
    # The split masks are converted to NumPy arrays for saving.
    train_set=numpy.array(train_mask),
    val_set=numpy.array(val_mask),
    test_set=numpy.array(test_mask),
    task_id="1",
)

# Write the task files and keep whatever GLI returns.
task_data = save_task_node_classification(**task_spec)


In [62]:
# Display the shape of the loaded edge index tensor.
edge_index.shape

torch.Size([2, 305672])