In [None]:
import os
import glob
import csv
from keybert import KeyBERT
import hashlib
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from collections import defaultdict
import torch

In [16]:
root_dir = "spider"
conferences = ["AAAI", "CVPR", "ICML", "KDD", "NeurIPS1"]
output_csv = "triplets.csv"


In [19]:
kw_model = KeyBERT()
bert_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda" if torch.cuda.is_available() else "cpu")
triplets = []

In [20]:
def extract_keywords(text, top_n=5):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    return [kw[0] for kw in keywords]

In [21]:
def hash_filename(filepath):
    return hashlib.md5(filepath.encode()).hexdigest()

In [26]:
for conf in conferences:
    txt_dir = os.path.join(root_dir, conf, "txt")
    for txt_path in glob.glob(os.path.join(txt_dir, "*.txt")):
        with open(txt_path, "r", encoding='utf-8', errors='ignore') as f:
            text = f.read()
        paper_id = hash_filename(txt_path)  # 避免路径过长
        keywords = extract_keywords(text)

        # 添加关系: 论文属于会议weeew
        triplets.append((paper_id, "published_in", conf))

        # 添加关系: 论文包含关键词
        for kw in keywords:
            triplets.append((paper_id, "has_keyword", kw))

        # 手动映射方式（已废弃）
        # field_dict = {
        #     "vision": "Computer Vision", "image": "Computer Vision", "transformer": "Deep Learning",
        #     "reinforcement": "Reinforcement Learning", "policy": "Reinforcement Learning", "language": "Natural Language Processing",
        #     "text": "Natural Language Processing", "graph": "Graph Learning", "music": "Creative AI",
        # }
        # for kw in keywords:
        #     for k, v in field_dict.items():
        #         if k in kw.lower():
        #             triplets.append((kw, "belongs_to", v))
        #             breakd

# 自动聚类关键词并添加 belongs_to 三元组
all_keywords = list(set([t for (_, r, t) in triplets if r == "has_keyword"]))
keyword_embeddings = bert_model.encode(all_keywords, batch_size=64, convert_to_tensor=True, device=bert_model.device)


In [27]:
n_clusters = 6 # 分6个主题，可调
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(keyword_embeddings.cpu().numpy())

In [28]:
topic_names = {
    0: "Natural Language Processing",
    1: "Computer Vision",
    2: "Deep Learning",
    3: "Reinforcement Learning",
    4: "Graph Learning",
    5: "Creative AI"
}

In [29]:
cluster_map = {kw: topic_names.get(int(label), f"Topic_{label}") for kw, label in zip(all_keywords, labels)}

In [30]:
for h, r, t in list(triplets):
    if r == "has_keyword" and t in cluster_map:
        triplets.append((t, "belongs_to", cluster_map[t]))

In [31]:
with open(output_csv, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["head", "relation", "tail"])
    for h, r, t in triplets:
        writer.writerow([h, r, t])

print(f"已完成三元组构建，共生成 {len(triplets)} 条，保存在 {output_csv}")


已完成三元组构建，共生成 13068 条，保存在 triplets.csv
