In [2]:
import pandas as pd
import json
import ast
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
from Bio import Entrez
import re

def load_icd9_mappings(diagnoses_path, procedures_path):
    """Build a single ICD-9 ``code -> long title`` lookup from two CSV files.

    Both files are read with every column as ``str`` and must contain the
    columns ``icd9_code`` and ``long_title``. Surrounding whitespace is
    stripped from codes and titles.

    Args:
        diagnoses_path: Path of the diagnosis dictionary CSV.
        procedures_path: Path of the procedure dictionary CSV.

    Returns:
        A dict mapping code to title. Diagnosis entries are inserted first,
        so when the same code occurs in both files the procedure title
        replaces the diagnosis title.
    """
    # Read both tables up front so a missing file fails before any merging.
    tables = [
        pd.read_csv(csv_path, dtype=str)
        for csv_path in (diagnoses_path, procedures_path)
    ]
    icd9_dict = {}
    for table in tables:
        codes = table['icd9_code'].str.strip()
        titles = table['long_title'].str.strip()
        icd9_dict.update(zip(codes, titles))
    return icd9_dict

def is_pure_number(s):
    """Return True when *s* consists only of an integer or decimal number.

    Surrounding whitespace is ignored, so ``'40391'`` and ``' 250.00 '`` are
    pure numbers, while ``'V1582'``, ``'12.'`` and ``''`` are not.
    """
    candidate = s.strip()
    matched = re.fullmatch(r'\d+(\.\d+)?', candidate)
    return matched is not None

def _is_missing(value):
    """Return True for ``None`` and float NaN (an empty cell read by pandas)."""
    return value is None or (isinstance(value, float) and value != value)


def _split_terms(text):
    """Split *text* on ``,``, ``，`` and ``;`` into stripped, non-empty parts."""
    unified = str(text).replace(",", ";").replace("，", ";")
    return [part.strip() for part in unified.split(";") if part.strip()]


def _normalize_codes(raw):
    """Turn an ICD-9 code field (sequence, scalar, or text form) into a list."""
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)) and all(
            isinstance(item, str) for item in parsed
        ):
            raw = list(parsed)
        else:
            # Either not a Python literal ("V1582", zero-padded "0389",
            # "4019; 4280") or a literal holding numbers ("4019" parses to a
            # non-iterable int, "[0389]" is a syntax error). Splitting the
            # original text keeps each code exactly as written.
            bare = raw.strip("[](){} ")
            raw = [part.strip("'\" ") for part in _split_terms(bare)]
    elif not isinstance(raw, (list, tuple, set)):
        raw = [raw]
    return [code for code in raw if code and not _is_missing(code)]


def _title_for_code(code, icd9_dict):
    """Return the title for *code*, or the code text when it is unknown."""
    if isinstance(code, float) and code.is_integer():
        # A numeric CSV column containing blanks is read as float ("4019.0").
        code = int(code)
    text = str(code).strip()
    # Prefer the code as written; fall back to the zero-stripped form that
    # the original lookup used.
    for key in (text, text.lstrip("0")):
        title = icd9_dict.get(key)
        if isinstance(title, str) and title:
            return title
    return text


def extract_keywords_from_structured_report(
    structured_report,
    icd9_code_field=None,
    icd9_dict=None
):
    """Collect search keywords (diagnosis / procedure names) for one report.

    Sources, in order:

    1. The free-text ``"diagnosis"`` (or ``"诊断"``) field, split on commas
       and semicolons.
    2. ``"main_diagnosis_icd9"`` and ``"main_procedure_icd9"``, each code
       translated to its title through *icd9_dict*.
    3. Only when nothing was found so far: *icd9_code_field*, translated the
       same way.

    Args:
        structured_report: A dict-like report, or a JSON string encoding one.
            Anything else (invalid JSON, JSON that is not an object, NaN) is
            treated as an empty report.
        icd9_code_field: Optional fallback codes, as a delimited string, a
            sequence, or a single scalar. ``None`` and NaN are ignored.
        icd9_dict: Mapping of ICD-9 code to title. ``None`` means no
            translation is available, so untranslated numeric codes are
            dropped by the final filter.

    Returns:
        De-duplicated keywords in first-seen order, excluding empty strings
        and purely numeric entries (codes that could not be translated).
    """
    icd9_dict = icd9_dict or {}

    if isinstance(structured_report, str):
        try:
            report = json.loads(structured_report)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass.
            report = {}
    else:
        report = structured_report
    if not callable(getattr(report, "get", None)):
        # e.g. JSON that decoded to a list/number, or a NaN cell.
        report = {}

    keywords = []

    # Free-text diagnosis takes priority.
    diag = report.get("diagnosis", "") or report.get("诊断", "")
    if diag and not _is_missing(diag):
        diag_items = diag if isinstance(diag, (list, tuple)) else [diag]
        for item in diag_items:
            keywords += _split_terms(item)

    # ICD-9 diagnosis codes, then ICD-9 procedure codes.
    for field in ("main_diagnosis_icd9", "main_procedure_icd9"):
        raw_codes = report.get(field, None)
        if _is_missing(raw_codes):
            continue
        keywords += [
            _title_for_code(code, icd9_dict)
            for code in _normalize_codes(raw_codes)
        ]

    # Fallback field, used only when the report itself yielded nothing.
    if not keywords and not _is_missing(icd9_code_field):
        keywords += [
            _title_for_code(code, icd9_dict)
            for code in _normalize_codes(icd9_code_field)
        ]

    # De-duplicate (order preserved) and drop empties / untranslated numbers.
    return [
        k for k in dict.fromkeys(keywords)
        if k and not is_pure_number(k)
    ]

def search_pubmed(query, retmax=5):
    """Search PubMed and return the abstract text of the top hits.

    Runs ``Entrez.esearch`` for *query*, then one ``Entrez.efetch`` per
    returned PMID (``rettype="abstract"``, ``retmode="text"``).

    Args:
        query: PubMed search term.
        retmax: Maximum number of PMIDs requested from the search.

    Returns:
        A list of ``{"pmid": ..., "summary": ...}`` dicts in the order the
        search returned them. On any exception the error is printed and an
        empty list is returned, so results fetched before the failure are
        discarded (best-effort behaviour kept from the original).
    """
    # NOTE(review): placeholder contact address; it is assigned on every call.
    Entrez.email = "your_email@example.com"
    try:
        handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
        try:
            record = Entrez.read(handle)
        finally:
            # The search handle was never closed before.
            handle.close()

        summaries = []
        for pmid in record["IdList"]:
            fetch = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
            try:
                text = fetch.read()
            finally:
                # Close even when read() raises, so the handle is not leaked.
                fetch.close()
            summaries.append({"pmid": pmid, "summary": text})
        return summaries
    except Exception as e:
        print(f"PubMed检索异常: {e}")
        return []

def get_embedder(model_path):
    """Load a tokenizer/model pair and return a ``text -> vector`` function.

    Args:
        model_path: Name or path passed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.

    Returns:
        A function ``embed(text)`` that tokenizes *text* (truncated to 256
        tokens), runs the model without gradient tracking, averages
        ``last_hidden_state`` over dimension 1 and returns the first row as
        a NumPy array.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    encoder = AutoModel.from_pretrained(model_path)

    def embed(text):
        """Return the mean-pooled last hidden state of *text*."""
        encoded = tok(text, return_tensors="pt", truncation=True, max_length=256)
        with torch.no_grad():
            hidden = encoder(**encoded).last_hidden_state
            pooled = hidden.mean(dim=1)
        first_row = pooled[0]
        return first_row.numpy()

    return embed

def rerank_by_similarity(query, candidates, embedder, topk=3):
    """Return the *topk* candidates most similar to *query*.

    Args:
        query: Text to compare against.
        candidates: Sequence of dicts; each must have a ``"summary"`` entry,
            which is the text that gets embedded.
        embedder: Callable mapping a string to an embedding vector.
        topk: Maximum number of candidates to return.

    Returns:
        Up to *topk* candidate dicts, ordered by descending cosine similarity
        between their summary embedding and the query embedding. An empty
        list when there are no candidates.
    """
    if not candidates:
        return []

    query_vec = embedder(query)
    cand_vecs = []
    for cand in candidates:
        cand_vecs.append(embedder(cand["summary"]))
    if not cand_vecs:
        return []

    scores = cosine_similarity([query_vec], cand_vecs)[0]
    # Negated score as the key gives a descending sort.
    ranked = sorted(zip(candidates, scores), key=lambda pair: -pair[1])
    return [cand for cand, _score in ranked[:topk]]

def enrich_structured_report_with_pubmed(
    input_path,
    output_path,
    icd9_diag_path,
    icd9_proc_path,
    model_name,
    topk=3
):
    """Append related PubMed abstracts to every structured report in a CSV.

    For each row of *input_path*, keywords are extracted from the
    ``structured_report`` column (with the optional ``icd9_code`` column as
    fallback). The first two keywords are OR-ed into a PubMed query, and the
    hits are re-ranked against all keywords. The report text plus a
    literature addendum is stored in a new ``structured_report_rag`` column.
    The whole table is written to *output_path* once, after the last row.

    Args:
        input_path: CSV containing a ``structured_report`` column.
        output_path: Destination CSV path.
        icd9_diag_path: ICD-9 diagnosis dictionary CSV.
        icd9_proc_path: ICD-9 procedure dictionary CSV.
        model_name: Model name or path handed to ``get_embedder``.
        topk: Number of abstracts to keep per report after re-ranking.

    Any exception raised while processing a row is printed, and that row
    receives a "failed" addendum instead of aborting the run.
    """
    frame = pd.read_csv(input_path)
    code_titles = load_icd9_mappings(icd9_diag_path, icd9_proc_path)
    embed = get_embedder(model_name)

    rag_reports = []
    for idx, row in frame.iterrows():
        try:
            terms = extract_keywords_from_structured_report(
                row["structured_report"],
                row.get("icd9_code", None),
                icd9_dict=code_titles,
            )
            print(f"第{idx}条关键词: {terms}")
            if not terms:
                rag_reports.append(str(row["structured_report"]) + "\n\n【相关文献补充】未找到诊断/操作关键词，无法检索文献。")
                continue

            # Only the first two keywords are searched; the slice also covers
            # the case where fewer than three exist.
            hits = search_pubmed(" OR ".join(terms[:2]), retmax=5)
            print(f"第{idx}条PubMed结果数: {len(hits)}")

            if not hits:
                addendum = "\n\n【相关文献补充】未检索到相关文献。\n"
            else:
                # Re-ranking uses every keyword, not just the two searched.
                best = rerank_by_similarity("; ".join(terms), hits, embed, topk=topk)
                if best:
                    body = "".join(
                        f"PMID: {item['pmid']}\n摘要: {item['summary']}\n---\n"
                        for item in best
                    )
                else:
                    body = "未找到高相关性文献。\n"
                addendum = "\n\n【相关文献补充】\n" + body
            rag_reports.append(str(row["structured_report"]) + addendum)
        except Exception as exc:
            print(f"第{idx}条补充失败: {exc}")
            rag_reports.append(str(row["structured_report"]) + "\n\n【相关文献补充】增强失败。")

    frame["structured_report_rag"] = rag_reports
    frame.to_csv(output_path, index=False)
    print(f"RAG增强结构化报告已保存到: {output_path}")

# Usage example
if __name__ == "__main__":
    # Hard-coded absolute Windows paths; adjust them before running elsewhere.
    # The model is loaded from a local directory rather than by hub name.
    local_model_path = r"C:\Users\Administrator\Desktop\1\all-MiniLM-L6-v2"

    # ICD-9 dictionaries and the input reports sit in the same folder.
    icd9_diag_path = r"C:\Users\Administrator\Desktop\1\20237431-马金丽\代码\anonymized\d_icd_diagnoses.csv"
    icd9_proc_path = r"C:\Users\Administrator\Desktop\1\20237431-马金丽\代码\anonymized\d_icd_procedures.csv"
    input_path = r"C:\Users\Administrator\Desktop\1\20237431-马金丽\代码\anonymized\structured_emr.csv"

    # The result is written under a different directory tree than the inputs.
    output_path = r"C:\Users\Administrator\Desktop\1\mimic-iii-clinical-database-demo-1.4\anonymized\structured_emr_rag.csv"

    enrich_structured_report_with_pubmed(
        input_path=input_path,
        output_path=output_path,
        icd9_diag_path=icd9_diag_path,
        icd9_proc_path=icd9_proc_path,
        model_name=local_model_path,
    )

第0条关键词: ['Sepsis', 'Infection and inflammatory reaction due to other vascular device, implant, and graft', 'Revision of ureterointestinal anastomosis', 'Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage V or end stage renal disease', 'Atrial fibrillation', 'Congestive heart failure, unspecified', 'Partial esophagectomy', 'Esophagectomy, not otherwise specified', 'Unspecified Eustachian tube disorder', 'Internal fixation of bone without fracture reduction, unspecified site', 'Kidney dialysis as the cause of abnormal reaction of patient, or of later complication, without mention of misadventure at time of procedure', 'Infection with microorganisms resistant to penicillins', 'Diverticulitis of colon (without mention of hemorrhage)', 'Anemia of other chronic disease', 'Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled', 'Long-term (current) use of insulin', 'Anticoagulants causing adverse effects in 

KeyboardInterrupt: 

In [None]:
#将以上获得的关键词加入到文件中