In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from openai import OpenAI
import re

In [None]:
def get_ds_res(query, api_key=None, base_url=None, model="deepseek-v3"):
    """Send one user message to an OpenAI-compatible chat endpoint and return the reply text.

    Credentials are no longer embedded in the notebook. Pass them explicitly,
    or leave them as ``None`` so the ``OpenAI`` client reads the
    ``OPENAI_API_KEY`` / ``OPENAI_BASE_URL`` environment variables itself.

    Args:
        query: Prompt text, sent as the only message with role ``"user"``.
        api_key: API key for the endpoint; ``None`` defers to the client's
            environment-variable lookup.
        base_url: Base URL of the OpenAI-compatible service; ``None`` defers
            to the client's environment-variable lookup / built-in default.
        model: Model name forwarded to the chat-completions call.

    Returns:
        The ``content`` field of the first choice's message, exactly as the
        API returned it (no parsing is done here).

    Raises:
        Whatever the ``openai`` client raises for missing credentials or
        failed requests; nothing is caught here.
    """
    client = OpenAI(api_key=api_key, base_url=base_url)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": query,
            }
        ],
        model=model,
        temperature=0.5,
        presence_penalty=0,
        frequency_penalty=0,
        top_p=1,
        # JSON mode: the service is asked to reply with a JSON object.
        # NOTE(review): in this mode replies are normally bare JSON, without
        # a markdown code fence - confirm against the provider's docs.
        response_format={
            'type': 'json_object'
        }
    )
    return chat_completion.choices[0].message.content

In [None]:
def ehr2prompt(ehr, start_row=1):
    """Render EHR rows as prompt strings of the form ``患者信息：列名：值，列名：值，…``.

    Args:
        ehr: DataFrame with one record per row. The first column is rendered
            as integer text (e.g. ``12.0`` becomes ``"12"``).
        start_row: Non-negative positional index of the first row to render.
            The default of 1 keeps the historical behaviour of skipping
            positional row 0.
            NOTE(review): presumably row 0 is a non-patient row (units or a
            sub-header); if it is a real patient, pass ``start_row=0``.

    Returns:
        A list with one prompt string per rendered row, in row order. Missing
        values (NaN / None / NaT) are written as ``未知``.
    """
    column_names = ehr.columns.tolist()
    prompt_list = []

    for row_idx in range(start_row, len(ehr)):
        row_data = ehr.iloc[row_idx].tolist()

        # First column -> integer text. Plain ``int`` is used instead of
        # ``np.int32`` so values above 2**31 - 1 are not wrapped, and a
        # missing value is left for the generic "未知" handling below
        # instead of raising.
        if row_data and not pd.isna(row_data[0]):
            row_data[0] = str(int(row_data[0]))

        pieces = ["患者信息："]
        for column_name, value in zip(column_names, row_data):
            value_text = "未知" if pd.isna(value) else str(value)
            pieces.append(f"{column_name}：{value_text}，")
        prompt_list.append("".join(pieces))
    return prompt_list

In [None]:
def qa2prompt(qa):
    """Serialize the question set as JSON text and label it for the prompt.

    Args:
        qa: Any JSON-serializable object holding the question set.

    Returns:
        The string ``问题集：`` followed by the JSON text of ``qa``;
        non-ASCII characters are kept verbatim rather than escaped.
    """
    serialized = json.dumps(qa, ensure_ascii=False)
    return f"问题集：{serialized}"

In [None]:
# Read the EHR spreadsheet into a DataFrame, then turn its rows into
# per-patient prompt strings.
ehr = pd.read_excel(io="../../dataset/EHR.xlsx")
ehr_prompt_list = ehr2prompt(ehr=ehr)

In [None]:
# Load the summarised question set and render it as the shared prompt section.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous `json.load(open(...))` never closed it).
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm how this file was written before pinning encoding="utf-8".
with open("../../dataset/summaried_qa.json", 'r') as qa_file:
    summaried_qa = json.load(qa_file)
qa_prompt = qa2prompt(summaried_qa)

In [None]:
# The task instruction and the output example are the same for every patient,
# so they are assembled once and appended to each patient-specific prompt.
task_suffix = (
    "请根据'患者信息'，从'问题集'的'病情诊断'，'康复指导'和'心理干预'中挑选出最符合患者情况且最可能同时出现的问题各1条，构成一组患者问题集合。挑选10次，形成10组各不相同的患者问题集合并用json输出。\n"
    '''EXAMPLE JSON OUTPUT:
        [
            ['病情诊断问题1', '康复指导问题1', '心理干预问题1'],
            ['病情诊断问题2', '康复指导问题2', '心理干预问题2'],
            ...
            ['病情诊断问题10', '康复指导问题10', '心理干预问题10']
        ]'''
)

# One model call per patient; result[k] is the raw reply for ehr_prompt_list[k].
result = []
for ehr_prompt in tqdm(ehr_prompt_list):
    prompt = "\n".join([ehr_prompt, qa_prompt, task_suffix])
    result.append(get_ds_res(prompt))

In [None]:
def _extract_json_payload(raw):
    """Parse a model reply into Python data, or return None if that is impossible.

    A ```json ... ``` fenced section is preferred when present; otherwise the
    whole reply is parsed as JSON. The fallback matters because get_ds_res
    requests response_format {'type': 'json_object'}.
    NOTE(review): JSON-mode replies are expected to arrive as bare JSON without
    a fence - confirm against the provider's docs.
    """
    if not isinstance(raw, str):
        # e.g. the API returned no content for this request
        return None
    fenced = re.search(r"```json(.*?)```", raw, re.DOTALL)
    candidate = fenced.group(1) if fenced else raw
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None


# Pair every parsable reply with the patient prompt that produced it.
# Replies that cannot be parsed are printed for manual inspection and skipped,
# so a single malformed reply no longer aborts the whole pass.
vp_info = []
for i, raw_reply in enumerate(result):
    question_list = _extract_json_payload(raw_reply)
    if question_list is None:
        print(raw_reply)
        continue
    vp_info.append({
        "ehr_prompt": ehr_prompt_list[i],
        "question_list": question_list
    })

In [None]:
# Persist the collected virtual-patient records as JSON, keeping non-ASCII
# characters verbatim instead of \u-escaping them.
# NOTE(review): no explicit encoding is passed, so the file is written in the
# platform default encoding - confirm this matches what downstream readers expect.
with open('../../dataset/vp_info.json', 'w') as out_fh:
    out_fh.write(json.dumps(vp_info, ensure_ascii=False))