In [None]:
import os
import time

import pandas as pd
from openai import OpenAI

def _resolve_table_path(data_dir, filename):
    """Return the path of *filename* under *data_dir*, ignoring letter case.

    The exact name is preferred. If it does not exist, the directory listing
    is searched for a name that differs only in case, so the loader works on
    case-sensitive filesystems regardless of how the CSV files were named.
    When nothing matches, the exact path is returned so that pandas raises
    its usual error for it.
    """
    exact = f"{data_dir}/{filename}"
    if os.path.exists(exact):
        return exact
    try:
        entries = os.listdir(data_dir)
    except OSError:
        # Not a readable local directory; let pandas report the problem.
        return exact
    wanted = filename.lower()
    for entry in entries:
        if entry.lower() == wanted:
            return os.path.join(data_dir, entry)
    return exact


def _join_by_admission(table, column, key_dtype):
    """Collapse *column* into one "; "-separated string per ``hadm_id``.

    Rows whose ``hadm_id`` or *column* value is missing are skipped so the
    joined text never contains the literal "nan". When no usable rows remain,
    an empty frame with a ``hadm_id`` column of *key_dtype* is returned so a
    later merge does not change the dtype of the key.
    """
    present = table.dropna(subset=["hadm_id", column])
    if present.empty:
        return pd.DataFrame(
            {
                "hadm_id": pd.Series(dtype=key_dtype),
                column: pd.Series(dtype=object),
            }
        )
    joined = present.groupby("hadm_id")[column].apply(
        lambda values: "; ".join(values.astype(str))
    )
    return joined.reset_index()


def load_and_merge_mimic3_tables(data_dir):
    """Load the MIMIC-III CSV tables and merge them into one row per admission.

    Reads ``ADMISSIONS.csv``, ``diagnoses_icd.csv``, ``procedures_icd.csv``
    and, when present, ``prescriptions.csv`` from *data_dir* (file names are
    matched case-insensitively). Diagnoses, procedures and drugs are each
    collapsed into a single "; "-separated string per ``hadm_id`` and
    left-joined onto the admissions table.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        The admissions DataFrame with three extra columns: ``icd9_code``
        (diagnosis codes), ``icd9_code_proc`` (procedure codes) and ``drug``.
        Admissions without matching rows get NaN in the respective column.

    Raises:
        FileNotFoundError: If a required table (admissions, diagnoses or
            procedures) is missing.
    """
    # ICD-9 codes are identifiers, not numbers: read them as text so that
    # leading zeros (e.g. "0066") survive instead of being parsed as integers.
    icd_as_text = {"icd9_code": str}

    # 1. Admissions
    adm = pd.read_csv(_resolve_table_path(data_dir, "ADMISSIONS.csv"))
    # 2. Diagnoses
    dx = pd.read_csv(
        _resolve_table_path(data_dir, "diagnoses_icd.csv"), dtype=icd_as_text
    )
    # 3. Procedures
    proc = pd.read_csv(
        _resolve_table_path(data_dir, "procedures_icd.csv"), dtype=icd_as_text
    )
    # 4. Prescriptions (optional): only a missing or empty file is tolerated;
    # any other failure (bad encoding, parse error, ...) is surfaced.
    try:
        pre = pd.read_csv(_resolve_table_path(data_dir, "prescriptions.csv"))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        pre = pd.DataFrame(columns=["hadm_id"])

    key_dtype = adm["hadm_id"].dtype

    # Aggregate each detail table to one row per admission.
    dx_grouped = _join_by_admission(dx, "icd9_code", key_dtype)
    proc_grouped = _join_by_admission(proc, "icd9_code", key_dtype)
    if "drug" in pre.columns:
        pre_grouped = _join_by_admission(pre, "drug", key_dtype)
    else:
        pre_grouped = pd.DataFrame(
            {
                "hadm_id": pd.Series(dtype=key_dtype),
                "drug": pd.Series(dtype=object),
            }
        )

    # Left-join everything onto the admissions table. The procedure codes
    # collide with the diagnosis column name and therefore get "_proc".
    adm = adm.merge(dx_grouped, how="left", on="hadm_id", suffixes=("", "_dx"))
    adm = adm.merge(proc_grouped, how="left", on="hadm_id", suffixes=("", "_proc"))
    adm = adm.merge(pre_grouped, how="left", on="hadm_id", suffixes=("", "_pre"))
    return adm

def _emr_field(row, key):
    """Return ``row[key]`` for display, mapping absent/None/NaN values to ''."""
    value = row.get(key, "")
    # Left joins leave NaN in rows without matches; without this check the
    # literal text "nan" would end up in the prompt.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value


def build_emr_text(row):
    """Assemble the core admission fields of one row into a plain-text record.

    Args:
        row: A mapping-like object exposing ``get(key, default)`` (a pandas
            Series or a dict) holding one merged admission.

    Returns:
        Ten "label：value" lines joined by newlines. Fields that are absent
        from *row*, None or NaN are rendered as an empty value.
    """
    fields = [
        ("入院时间", "admittime"),
        ("出院时间", "dischtime"),
        ("入院方式", "admission_type"),
        # This line previously repeated admission_type (copy-paste slip).
        # NOTE(review): admission_location is assumed to be the intended
        # column for this label - confirm against the ADMISSIONS schema.
        ("入院科室", "admission_location"),
        ("出院方式", "discharge_location"),
        ("主要诊断icd9", "icd9_code"),
        ("主要操作icd9", "icd9_code_proc"),
        ("入院前病史摘要", "history_of_present_illness"),
        ("用药记录", "drug"),
        ("出院小结", "diagnosis"),
    ]
    return "\n".join(f"{label}：{_emr_field(row, key)}" for label, key in fields)

def call_deepseek_structured_api(text, client):
    """Ask the chat model to turn free-text EMR content into structured JSON.

    Args:
        text: The plain-text medical record to structure.
        client: An OpenAI-compatible client exposing
            ``chat.completions.create``.

    Returns:
        The content of the first returned choice, or an empty string when the
        request (or reading its result) raises; the error is printed.
    """
    instruction = (
        "请将以下住院电子病历文本结构化，输出JSON格式。"
        "字段包括：主诉、现病史、体征、辅助检查、诊断、治疗过程、治疗建议、疗效。"
    )
    user_message = f"{instruction}\n原文：{text}\n输出："
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": user_message},
    ]
    request_options = {
        "model": "deepseek-chat",
        "messages": conversation,
        "stream": False,
        "max_tokens": 1024,
        "temperature": 0.2,
    }

    # Best effort: a failed call must not abort the surrounding batch, so any
    # error is reported and turned into an empty result.
    try:
        completion = client.chat.completions.create(**request_options)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        print(f"API调用失败: {err}")
        return ""

def batch_generate_structured_emr(data_dir, output_path, api_key, base_url, limit=10, sleep_time=2):
    """Structure the first admissions with the chat API and save them as CSV.

    Loads and merges the tables found in *data_dir*, builds a text record for
    each selected admission, sends it to the API and writes the selected rows
    plus a ``structured_report`` column to *output_path*.

    Args:
        data_dir: Directory containing the MIMIC-III CSV files.
        output_path: Destination CSV path; a missing parent directory is
            created.
        api_key: API key passed to the OpenAI-compatible client.
        base_url: Base URL of the API endpoint.
        limit: Maximum number of admissions to process. ``None`` processes
            every admission; zero or a negative value processes none.
        sleep_time: Seconds to wait between two consecutive API calls.
    """
    adm = load_and_merge_mimic3_tables(data_dir)

    # Select the rows first so that prompt text is only built for admissions
    # that are actually sent, and so that an empty table simply yields an
    # empty result instead of iterating over column names.
    if limit is None:
        adm_subset = adm.copy()
    else:
        adm_subset = adm.iloc[:max(int(limit), 0)].copy()

    client = OpenAI(api_key=api_key, base_url=base_url)

    structured_reports = []
    total = len(adm_subset)
    for position, (_, row) in enumerate(adm_subset.iterrows(), start=1):
        print(f"处理第 {position} 条电子病历...")
        text = build_emr_text(row)
        structured_reports.append(call_deepseek_structured_api(text, client))
        # Throttle between requests only; there is nothing to wait for after
        # the final one.
        if position < total and sleep_time:
            time.sleep(sleep_time)

    adm_subset["structured_report"] = structured_reports

    # to_csv does not create missing directories, so do it here.
    if isinstance(output_path, (str, os.PathLike)):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
    adm_subset.to_csv(output_path, index=False)
    print(f"结构化电子病历已保存到: {output_path}")

if __name__ == "__main__":
    # Example invocation: processes the first 10 admissions of the MIMIC-III
    # demo data set and writes the structured reports to a CSV file.
    root_dir = r"C:\Users\YourUsername\Documents"  # Replace with your actual path.
    data_dir = fr"{root_dir}\mimic-iii-clinical-database-demo-1.4\anonymized"
    output_path = fr"{root_dir}\anonymized\anonymized\structured_emr.csv"
    # Prefer the DEEPSEEK_API_KEY environment variable so the real key never
    # has to be written into the source; the placeholder remains the fallback.
    api_key = os.environ.get("DEEPSEEK_API_KEY", "your_api_key")
    base_url = "https://api.deepseek.com"
    batch_generate_structured_emr(data_dir, output_path, api_key, base_url, limit=10, sleep_time=2)

ImportError: cannot import name 'Sequence' from 'typing_extensions' (c:\ProgramData\anaconda3\envs\ecg_env\lib\site-packages\typing_extensions.py)