In [1]:
import csv
import re
from sklearn.preprocessing import LabelEncoder

# Input files. Both are tab-separated and start with two header lines.
node_file = 'Pubmed-Diabetes.NODE.paper.tab'
edge_file = 'Pubmed-Diabetes.DIRECTED.cites.tab'

# Paper ids and their raw class labels, kept in file order.
paper_ids = []
labels = []

# Class tag expected in the second column of a node row.
# NOTE(review): this script originally matched only `cat=`. The LINQS release of
# the dataset appears to use `cat=` in the header line and `label=N` in the data
# rows, so both tags are accepted here -- confirm against the actual file.
label_pattern = re.compile(r'(?:cat|label)=(\d)')

with open(node_file, 'r', encoding='utf-8') as f:
    # Stream the file instead of loading it whole just to drop the header.
    for line_no, line in enumerate(f):
        if line_no < 2:  # skip the two header lines
            continue
        parts = line.strip().split('\t')
        if len(parts) < 2:
            # Blank or truncated line: there is no label column to inspect,
            # and indexing parts[1] would raise IndexError.
            continue
        label_match = label_pattern.search(parts[1])
        if label_match:
            # Rows without a recognizable label are skipped entirely, so
            # paper_ids and labels always stay the same length.
            paper_ids.append(parts[0].strip())
            labels.append(int(label_match.group(1)))

# Map each paper id to its position in paper_ids, and let LabelEncoder
# renumber the raw labels as consecutive integers starting at 0.
paper_id_to_idx = {pid: idx for idx, pid in enumerate(paper_ids)}
encoded_labels = LabelEncoder().fit_transform(labels)

# Write the encoded labels to CSV: a header row, then one (node_id, label)
# row per paper, where node_id is the label's position in encoded_labels.
with open('pubmed-labels.csv', 'w', newline='', encoding='utf-8') as labels_out:
    label_writer = csv.writer(labels_out)
    label_writer.writerow(['node_id', 'label'])
    label_writer.writerows(enumerate(encoded_labels))

# Read the citation edges and save them as a CSV of (source, target) node
# indices. Edges whose endpoints are not both in paper_id_to_idx are dropped.
#
# NOTE(review): this loop originally required each row to have exactly two
# tab-separated fields. The LINQS release of the dataset appears to use four
# (`edge_id`, `paper:SRC`, `|`, `paper:DST`), which that check would skip
# entirely. The endpoints are therefore taken from the `paper:` references
# found anywhere on the line -- confirm against the actual file.
paper_ref = re.compile(r'paper:(\d+)')

with open('pubmed.csv', 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['source', 'target'])
    with open(edge_file, 'r', encoding='utf-8') as f_in:
        # Stream the file instead of loading it whole just to drop the header.
        for line_no, line in enumerate(f_in):
            if line_no < 2:  # skip the two header lines
                continue
            endpoints = paper_ref.findall(line)
            if len(endpoints) != 2:
                # Not a well-formed "source cites target" row.
                continue
            src_id, dst_id = endpoints
            if src_id in paper_id_to_idx and dst_id in paper_id_to_idx:
                writer.writerow([paper_id_to_idx[src_id], paper_id_to_idx[dst_id]])

# Report the output files under the names they are actually written to
# ('pubmed-labels.csv' and 'pubmed.csv'); the message previously named
# files this script never creates.
print("CSV 文件保存成功：pubmed-labels.csv 和 pubmed.csv")


CSV 文件保存成功：pubmed_labels.csv 和 pubmed_edges.csv
