In [1]:
import csv

# 1. Export the edge list.
# Every non-blank line of cora.cites must hold exactly two whitespace-separated
# node IDs; they are copied to cora.csv under a 'from'/'to' header, keeping the
# column order of the input line.
with open('cora.cites', 'r') as f_in, open('cora.csv', 'w', newline='') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['from', 'to'])  # header row
    for line in f_in:
        fields = line.split()
        if not fields:
            # Blank / whitespace-only line (e.g. a trailing empty line):
            # skip it instead of failing on the tuple unpack below.
            continue
        # A line with a field count other than two is malformed and still
        # raises ValueError here rather than being written silently.
        src, dst = fields
        writer.writerow([src, dst])

# 2. Export the ground-truth labels.

# Collect every (node, label) pair and the set of distinct label strings.
label_set = set()
node_label_pairs = []

with open('cora.content', 'r') as f_in:
    for line in f_in:
        parts = line.split()
        if not parts:
            # Skip blank lines; indexing parts[0] would raise IndexError.
            continue
        # First field is the node ID, last field is the class label; the
        # fields in between are not used by this export.
        node_id, label = parts[0], parts[-1]
        node_label_pairs.append((node_id, label))
        label_set.add(label)

# Map each label string to an integer. Sorting first makes the numbering
# deterministic across runs (set iteration order is not).
label_to_id = {label: idx for idx, label in enumerate(sorted(label_set))}

# Write cora-labels.csv with integer-encoded labels, in input-file order.
with open('cora-labels.csv', 'w', newline='') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['node', 'label'])  # header row
    writer.writerows(
        (node_id, label_to_id[label]) for node_id, label in node_label_pairs
    )


In [3]:
import csv

# 1. Collect every node ID.
node_set = set()

# From the edge file. The parsed edges are kept in memory so that cora.cites
# is read only once and the exported edges are guaranteed to be exactly the
# ones the node mapping was built from.
edges = []
with open('cora.cites', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank / whitespace-only line: skip instead of failing on unpack.
            continue
        # A line with a field count other than two still raises ValueError.
        src, dst = fields
        edges.append((src, dst))
        node_set.add(src)
        node_set.add(dst)

# From the content file: first field is the node ID, last field is the label.
node_label_pairs = []
label_set = set()
with open('cora.content', 'r') as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Skip blank lines; indexing parts[0] would raise IndexError.
            continue
        node_id, label = parts[0], parts[-1]
        node_label_pairs.append((node_id, label))
        label_set.add(label)
        node_set.add(node_id)

# 2. Build the mapping from original node ID to a contiguous index 0..N-1.
# NOTE(review): the IDs are strings, so sorted() orders them lexicographically
# (e.g. '1033' sorts before '35'), not numerically. The mapping is still
# deterministic and one-to-one; confirm whether numeric order is expected.
sorted_nodes = sorted(node_set)
node_to_new_id = {node_id: idx for idx, node_id in enumerate(sorted_nodes)}

# 3. Export the edge list using the remapped IDs.
with open('cora.csv', 'w', newline='') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['from', 'to'])  # header row
    writer.writerows(
        (node_to_new_id[src], node_to_new_id[dst]) for src, dst in edges
    )

# 4. Map each label string to an integer; sorting keeps the numbering
# deterministic across runs.
label_to_id = {label: idx for idx, label in enumerate(sorted(label_set))}

# 5. Export the labels with remapped node IDs, ordered by the new node ID.
# Nodes that occur only in cora.cites receive an ID above but have no entry
# in node_label_pairs, so they get no row in the labels file.
mapped_labels = sorted(
    (node_to_new_id[node_id], label_to_id[label])
    for node_id, label in node_label_pairs
)

with open('cora-labels.csv', 'w', newline='') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['node', 'label'])  # header row
    writer.writerows(mapped_labels)
