In [5]:
import pandas as pd
import os
import json
import requests

In [6]:
sub_data = pd.read_excel("../data/sampled_df_with_generated_questions.xlsx")

In [7]:
def gpt_4_1(headers, prompt):

    url = "https://apim.stanfordhealthcare.org/openai-eastus2/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview"
    payload = json.dumps({
        "model": "gpt-4.1", 
        "messages": [{"role": "user", "content": prompt}]
    })
    response = requests.request("POST", url, headers=headers, data=payload)
    message_content = response.json()["choices"][0]["message"]["content"]
    return message_content


In [8]:
def gemini_2_5_pro(headers, prompt):
    url = "https://apim.stanfordhealthcare.org/gemini-25-pro/gemini-25-pro"
    payload = json.dumps({
        "contents": [{"role": "user", "parts": [{"text": prompt}]}]
    })
    response = requests.request("POST", url, headers=headers, data=payload)
    message_content = response.json()[0]['candidates'][0]['content']['parts'][0]['text']
    return message_content

In [9]:
my_key = os.getenv("HEALTHREX_API_KEY")
# Common Headers (Used for all models)
headers = {'Ocp-Apim-Subscription-Key': my_key, 'Content-Type': 'application/json'}
def parser_LLM(prompt, model = "gpt-4.1", 
             headers = headers):
    message_content = ""
    if model == "gpt-4.1":
        message_content = gpt_4_1(headers, prompt)
    if model == "gemini-2.5-pro":
        message_content = gemini_2_5_pro(headers, prompt)
    return message_content
def evaluator_LLM(evaluation_prompt, model = "gpt-4.1", headers = headers):
    message_content = ""
    if model == "gpt-4.1":
        message_content = gpt_4_1(headers, evaluation_prompt)
    if model == "gemini-2.5-pro":
        message_content = gemini_2_5_pro(headers, evaluation_prompt)
    return message_content





In [10]:
def prompt_preparation(result, actual_response, subject, LLM_response):
    result_json = json.loads(result)
    result_json["message_subject"] = subject
    result_json["LLM-generated_response"] = LLM_response
    result_json["actual_response"] = actual_response
    evaluation_prompt = evaluator_LLM(result_json)
    return evaluation_prompt


In [11]:
patient_message = sub_data["Patient Message"][0]

In [12]:
actual_response = sub_data["Actual Response Sent to Patient"][0]

In [13]:
notes = sub_data["Prompt Sent to LLM"][0]

In [14]:
subject = sub_data["Subject"][0]

In [15]:
LLM_response = sub_data["Suggested Response from LLM"][0]

In [16]:
def prompt_preparation(notes, patient_message):
  prompt = f"""You are an expert clinical data extractor. Given the unstructured clinical information below, extract and parse it into the structured JSON format provided.

  **Follow these rules carefully:**

  - **Field Fidelity:** Populate each field using only the explicit information provided in the clinical data. Use ‚ÄúUnknown‚Äù or "" for missing or unclear fields, per the template.
  - **Information Granularity:** For list fields (e.g., "history_of_present_illness", "past_medical_history"), enter each bullet, sentence, or clinically distinct concept as a separate item.
  - **Relevance:** Include all clinically relevant complaints or concerns discussed, including those introduced in direct messages by the patient, as symptoms/chief complaints or in new "assessment_and_plan" issues.
  - **Physical Exam:** Record each PE subfield as completely as possible. If side-specific findings are present (e.g., right/left ear), include these in the most granular field appropriate.
  - **Assessment and Plan:** Enter each active issue, including newly raised complaints from the patient, along with provider instructions, recommended follow-up, or referral steps. If a complaint is new (e.g., from a patient message, not the prior note), include your clinical response as an entry.
  - **Instructions:** General instructions (e.g., when to follow up, how to schedule) should be recorded in "general_guidelines"; pharmacy details as specified.
  - **Patient Message:** Always copy the patient‚Äôs message verbatim.
  - **Additional Notes:** Include any clinical details, context, or provider action plans not clearly fitting in the other structured fields.

  **Strict Guidelines:**
  - Do not infer or hallucinate any data not clearly present.
  - Do not summarize or condense the patient's clinical complaints or history; preserve their language and details in the output.
  - Fields with multiple possible entries (e.g., medications, history, complaints) should be output as complete arrays.

  ### Clinical Information:
  {notes}

  ### Structured JSON Template:
  {{
    "provider_info": {{
      "provider_name": "",
      "department_specialty": "",
      "department_name": "",
      "department_phone": "",
      "primary_care_provider": ""
    }},
    "patient_info": {{
      "patient_name": "",
      "patient_age": ""
    }},
    "visit_info": {{
      "visit_date": "",
      "visit_type": "",
      "location": {{
        "patient": "",
        "provider": ""
      }},
      "chief_complaint": "",
      "history_of_present_illness": [],
      "active_problems": [
        {{
          "problem": "",
          "code": ""
        }}
      ],
      "past_medical_history": [
        {{
          "condition": "",
          "diagnosed": "",
          "medication": "",
          "note": ""
        }}
      ],
      "physical_exam": {{
        "general": "",
        "HEENT": "",
        "respiratory": "",
        "neurological": "",
        "cardiovascular": "",
        "gastrointestinal": "",
        "musculoskeletal": "",
        "skin": "",
        "psych": ""
      }},
      "assessment_and_plan": [
        {{
          "issue": "",
          "instructions": []
        }}
      ]
    }},
    "instructions": {{
      "general_guidelines": [],
      "pharmacy_info": {{
        "default_pharmacy": {{
          "name": "",
          "address": "",
          "phone": "",
          "fax": ""
        }}
      }}
    }},
    "additional_notes": "", 
    "patient_message": {patient_message}
  }}

  Respond ONLY with the completed JSON. No additional explanation or commentary."""
  return prompt


In [17]:
def evaluation_prompt_preparation(result_json):
  evaluation_prompt = f"""
  Comprehensive Evaluation Prompt
  You are a clinical quality-assessment assistant tasked with evaluating responses generated by an AI model to patient messages.

  Your evaluation involves:

  Message Categorization: Classify the patient‚Äôs message into a category based on the subject title and the message content.

  Response Evaluation: Provide quantitative scores (0-10) for each defined evaluation dimension.

  Error Identification: Clearly identify specific errors, categorize them (Clinical Error vs Non-Clinical Error), assign a severity score (1-5), and localize the exact text excerpt of each error.

  üìå Scoring Definitions
  Response Quality Dimensions (Score: 0-10):
  Clinical Accuracy (0-10):

  0: Completely incorrect or dangerous advice.

  5: Partially correct but incomplete or with moderate mistakes.

  10: Completely accurate, clinically appropriate, and safe.

  Urgency Recognition (0-10):

  0: Urgency completely unrecognized or incorrectly interpreted, potentially dangerous.

  5: Moderately appropriate urgency recognition, partial omission of critical issues.

  10: Completely correct urgency identification and action advice.

  Professional Consultation Guidance (0-10):

  0: No guidance or completely incorrect guidance.

  5: Partial or incomplete professional guidance.

  10: Excellent and appropriate consultation guidance.

  Sensitivity & Clarity (0-10):

  0: Rude, dismissive, or entirely unclear.

  5: Moderately empathetic, some unclear language or slight dismissiveness.

  10: Fully empathetic, clear, and supportive language.

  Error Severity Definitions (Severity Score: 1-5):
  1: Minor issue unlikely to affect patient care or experience.

  2: Noticeable but unlikely to significantly affect clinical outcome.

  3: Significant error that could lead to patient confusion or delay in care.

  4: Serious error, potentially impacting patient health negatively.

  5: Critical or potentially lethal error requiring immediate correction.

  üìë Structured Output Template
  Return your response strictly in this JSON format:
  {{
    "message_categorization": {{
      "subject": "[provided subject title]",
      "type": "[Appointment Request | Medication Request | Test Result Inquiry | Clinical Advice Request | Referral Request | Administrative Request | General Inquiry| Other (specify)]",
      "reasoning": "[brief reasoning for the chosen category]"
    }},
    "response_evaluation": {{
      "clinical_accuracy": {{
        "score": "[0-10]",
        "reasoning": "[brief reasoning]"
      }},
      "urgency_recognition": {{
        "score": "[0-10]",
        "reasoning": "[brief reasoning]"
      }},
      "professional_consultation_guidance": {{
        "score": "[0-10]",
        "reasoning": "[brief reasoning]"
      }},
      "sensitivity_clarity": {{
        "score": "[0-10]",
        "reasoning": "[brief reasoning]"
      }}
    }},
    "errors_identified": [
      {{
        "type": "[Clinical Error | Non-Clinical Error]",
        "severity": "[1-5]",
        "description": "[brief clear description of the error]",
        "text_excerpt": "[exact problematic text excerpt from response]",
        "error_in_physician_response": "[Yes | No]",
        "reason_for_error_in_physician_response": "[exact text excerpt from actual physician response from the result_json to explain why this error is/isn't in physician response]"
      }}
    ]
  }}

  Task Instructions
  Given the structured data below, perform your evaluation exactly as specified above:
  {result_json}

  Rules:
  Focus solely on evaluating the quality, appropriateness, accuracy, and clarity of the LLM-generated response.

  Do NOT evaluate the physician‚Äôs actual response (it's provided only for reference as a ground truth).

  Be precise, objective, and adhere strictly to the provided scoring scales and categories.

  If there are no identifiable errors, return "errors_identified": [].

  Do not generate additional narrative commentary outside the JSON structure.
  """
  return evaluation_prompt


In [18]:
parse_prompt = prompt_preparation(notes, patient_message)
result = parser_LLM(parse_prompt)

In [19]:
print(result)

{
  "provider_info": {
    "provider_name": "Belanger, Ann Marie",
    "department_specialty": "Family Medicine",
    "department_name": "Alameda Family Physicians",
    "department_phone": "510-521-2300",
    "primary_care_provider": "Belanger, Ann Marie"
  },
  "patient_info": {
    "patient_name": "Skylar Howell",
    "patient_age": "29 Y"
  },
  "visit_info": {
    "visit_date": "9/16/2024",
    "visit_type": "Ambulatory Visit",
    "location": {
      "patient": "",
      "provider": ""
    },
    "chief_complaint": "Preventative Care",
    "history_of_present_illness": [
      "Patient presents with Preventative Care",
      "Pt has no other issues or concerns"
    ],
    "active_problems": [
      {
        "problem": "Encounter to establish care",
        "code": ""
      },
      {
        "problem": "Preventative health care",
        "code": ""
      },
      {
        "problem": "Anxiety and depression",
        "code": ""
      },
      {
        "problem": "PTSD (post-tra

In [20]:
evaluation_prompt = evaluation_prompt_preparation(result)
evaluation = evaluator_LLM(evaluation_prompt)
print(evaluation)

{
  "message_categorization": {
    "subject": "Hearing concerns and request for testing after military service",
    "type": "Clinical Advice Request",
    "reasoning": "The patient is expressing concern about worsening hearing, noting a possible connection to military service, and is specifically asking for advice regarding hearing testing."
  },
  "response_evaluation": {
    "clinical_accuracy": {
      "score": "10",
      "reasoning": "The response appropriately addresses the concern by recommending formal audiologic evaluation and provides clear follow-up actions, aligning with standard of care for new/progressive hearing loss."
    },
    "urgency_recognition": {
      "score": "10",
      "reasoning": "The response identifies the urgency, offering explicit instruction to seek urgent in-person evaluation if sudden worsening or severe symptoms develop."
    },
    "professional_consultation_guidance": {
      "score": "10",
      "reasoning": "Provides clear advice for audiology

In [21]:
print(patient_message)

Hi Dr Belanger,I'm reaching out because I forgot to mention a concern that my husband and I both have about my hearing. It seems to be getting worse, fairly quickly, to us & I wonder if it's due to my military service. Could we arrange for me to have testing done? thanks& I look forward to hearing back from you :)-Skylar


In [22]:
print(actual_response.replace("<10>", "\n"))

Hi Skylar, I've ordered this for your to be done at the Hearing Zone here in Alameda, if you prefer Stanford in Emeryville let me know. Dr. Belanger 


In [23]:
print(LLM_response.replace("<10>", "\n"))

Hi Skylar,

Thank you for reaching out. I'm sorry to hear about your concerns with your hearing. Given the rapid changes you're experiencing, it would be a good idea to have this evaluated further. I will review your request and arrange for a referral to an audiologist for comprehensive hearing testing. 

Please keep an eye on your MyHealth account for the referral details. If you have any other questions or need further assistance, feel free to reach out.

Best regards,
