#### using actual notes and questions from the eConsult_QA table and test LLM performance on  the icd 10 extract 
### check description table for the chief complaints


In [12]:
# Import the API and other necessary libraries
import sys
sys.path.append('..')  # Add parent directory to path to import the API

# Data manipulation and display
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns in the output
from IPython.display import Image, display
import random
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from dotenv import load_dotenv

# Google BigQuery related imports
from google.cloud import bigquery
from google.cloud.bigquery import dbapi
%load_ext google.cloud.bigquery

# Local API imports
from api.bigquery_api import BigQueryAPI
from importlib import reload
from api import bigquery_api
reload(bigquery_api)

# Initialize BigQuery client
client = bigquery.Client("som-nero-phi-jonc101")

# import LLM API: use langgraph as of now
# from groq import Groq
from typing import TypedDict, Annotated, Sequence
from langgraph.graph import StateGraph, END
from langchain_core.messages import HumanMessage, AIMessage
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableLambda
import operator
import json
import re
import logging
import datetime
import requests
load_dotenv()






The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery




True

# Query

In [2]:
%%bigquery --use_rest_api eConsult_QA
select * from som-nero-phi-jonc101.Digital_Medical_Con.eConsult_QA limit 100

Query is running:   0%|          |

Downloading:   0%|          |

In [54]:
eConsult_QA.iloc[[random.randint(0, len(eConsult_QA))]]

Unnamed: 0,anon_id,jittered_note_date,question,answer,prov_type,dept_specialty,dept_name
2,JC701846,2024-04-11 17:30:00,My Clinical Question (condition: growth/lesion...,- based on chart review only Recommendatio...,PHYSICIAN,DERMATOLOGY,DERMATOLOGY HOOVER


In [3]:
%%bigquery --use_rest_api top_200_icd10_codes
select distinct icd10, dx_name, dm.specialty, count(icd10) as count from som-nero-phi-jonc101.shc_core_2024.diagnosis as dx
JOIN `som-nero-phi-jonc101.shc_core_2024.dep_map` dm
  ON dx.dept_id = dm.department_id
where dm.specialty IN ('Infectious Diseases', 'Endocrinology', 'Hematology')
group by icd10,dx_name,dm.specialty
order by count desc
limit 10

Query is running:   0%|          |

Downloading:   0%|          |

In [4]:
top_200_icd10_codes

Unnamed: 0,icd10,dx_name,specialty,count
0,M81.0,Age-related osteoporosis without current patho...,Endocrinology,133009
1,Z79.4,Long term (current) use of insulin,Endocrinology,90259
2,E11.65,Type 2 diabetes mellitus with hyperglycemia,Endocrinology,88002
3,E03.9,"Hypothyroidism, unspecified type",Endocrinology,85715
4,E03.9,"Hypothyroidism, unspecified",Endocrinology,80150
5,E78.5,"Hyperlipidemia, unspecified",Endocrinology,77241
6,I10,Essential (primary) hypertension,Endocrinology,74292
7,E55.9,"Vitamin D deficiency, unspecified",Endocrinology,73312
8,B20,Human immunodeficiency virus (HIV) disease (CM...,Infectious Diseases,61123
9,Z94.81,Bone marrow transplant status,Hematology,60425


In [5]:
top_200_icd10_codes_cleaned = top_200_icd10_codes[['icd10']].drop_duplicates()[:400]

# Logging Setup

In [9]:
# Set up logging
class NonEmptyFileHandler(logging.FileHandler):
    def __init__(self, filename, mode='a', encoding=None, delay=False):
        super().__init__(filename, mode, encoding, delay=True)
        self.filename = filename
        self._has_logged = False

    def emit(self, record):
        if not self._has_logged:
            self._has_logged = True
            self._open()
        super().emit(record)

    def close(self):
        if self._has_logged:
            super().close()
        else:
            # If no logs were written, remove the empty file
            try:
                os.remove(self.filename)
            except OSError:
                pass

# Create logs directory if it doesn't exist
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# Set up logging with the custom handler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        NonEmptyFileHandler(os.path.join(log_dir, f'clinical_workflow_{datetime.datetime.now().strftime("%Y%m%d")}.log')),
        logging.StreamHandler()
    ]
)

In [10]:
def log_stage(stage_name: str, input_data: dict, output_data: dict):
    """Log the input and output of each stage."""
    input_copy = input_data.copy()
    output_copy = output_data.copy()
    
    if 'icd10_codes' in input_copy and isinstance(input_copy['icd10_codes'], pd.DataFrame):
        input_copy['icd10_codes'] = input_copy['icd10_codes'].to_string()
    if 'icd10_codes' in output_copy and isinstance(output_copy['icd10_codes'], pd.DataFrame):
        output_copy['icd10_codes'] = output_copy['icd10_codes'].to_string()
    
    logging.info(f"\n{'='*50}")
    logging.info(f"Stage: {stage_name}")
    logging.info(f"Input: {json.dumps(input_copy, indent=2)}")
    logging.info(f"Output: {json.dumps(output_copy, indent=2)}")
    logging.info(f"{'='*50}\n")

# Build Langgraph to parse patient info

In [14]:
# api_key = os.getenv("GROQ_API_KEY")
api_key = os.getenv("HEALTHREX_API_KEY")
headers = {'Ocp-Apim-Subscription-Key': api_key, 'Content-Type': 'application/json'}
url = "https://apim.stanfordhealthcare.org/openai-eastus2/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview" 
def query_llm(my_question):
    payload = json.dumps({
        "model": "gpt-4.1", 
        "messages": [{"role": "user", "content": my_question}]
    })
    response = requests.request("POST", url, headers=headers, data=payload)
    message_content = response.json()["choices"][0]["message"]["content"]
    print(message_content)
    return message_content


In [15]:
def clean_output(output):
    """
    Clean up the output by removing content wrapped in <think> tags and extracting only the actual response.
    """
    # If the output is already a DataFrame, return it directly
    if isinstance(output, pd.DataFrame):
        return output
    
    # Remove all content between <think> tags
    cleaned_output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
    
    # Remove any leading/trailing whitespace
    cleaned_output = cleaned_output.strip()
    
    return cleaned_output

    


In [28]:
# Define the state type
class ClinicalState(TypedDict):
    clinical_question: str
    clinical_notes: str
    icd10_codes: pd.DataFrame
    patient_age: int | None
    patient_gender: str | None
    icd10_code: str | None
    rationale: str | None
    error: str | None
    retry_count: int      
    stopped: bool | None  # <--- to flag if stopped

In [29]:
def stopper_node(state: dict) -> dict:
    state = state.copy()
    state['stopped'] = True
    state['error'] = f"Stopped after {state.get('retry_count', 0)} retries. Manual review required."
    log_stage("stopper_node", state, state)
    return state


In [25]:
def extract_patient_info(state: dict) -> dict:
    """Extract patient age and gender from clinical notes."""
    input_state = state.copy()
    
    # llm = ChatGroq(
    #     model_name="Deepseek-R1-Distill-Llama-70b",
    #     temperature=0.3,
    #     api_key=api_key
    # )
    # llm = ChatOpenAI(model="gpt-4", temperature=0.3)
    prompt = f"""
    Extract the patient's age and gender from the following clinical notes.
    Return ONLY a JSON object with 'age' and 'gender' fields.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with {{ and end with }}.
    
    Example of expected format:
    {{"age": 55, "gender": "male"}}
    
    Clinical Notes: {state.get('clinical_notes')}
    """
    logging.info(f"LLM Prompt for extract_patient_info:\n{prompt}")
    
    
    response = query_llm(prompt)
    logging.info(f"LLM Response for extract_patient_info:\n{response}")

    try:
        # Clean the response to handle markdown code block
        content = clean_output(response)        
        info = json.loads(content)
        # print("Parsed JSON:", info)  # Debug print
        state['patient_age'] = info['age']
        state['patient_gender'] = info['gender']
    except Exception as e:
        # print("Error occurred:", str(e))  # Debug print
        state['error'] = f"Failed to extract patient information: {str(e)}"
    
    log_stage("extract_patient_info", input_state, state)
    return state


In [51]:
def match_icd10_code(state: dict) -> dict:
    """Match clinical information to ICD-10 code."""

    # if state.get('error'):
    #     return state
    
    state['error'] = None
    state['retry_count'] += 1
    
    input_state = state.copy()

        
    # llm = ChatGroq(
    #     model="Deepseek-R1-Distill-Llama-70b",
    #     api_key= api_key
    # )
    # llm = ChatOpenAI(model="gpt-4", temperature=0.3)
    
    prompt = f"""
    Match the clinical information to the most appropriate ICD-10 code from the provided list.
    Return ONLY a JSON object with exactly two fields: 'icd10_code' and 'rationale'.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with {{ and end with }}.

    Example of expected format:
    {{"icd10_code": "xxx", "rationale": "xxxxx"}}

    
    Clinical Question: {state.get('clinical_question')}
    Clinical Notes: {state.get('clinical_notes')}
    Patient Age: {state.get('patient_age')}
    Patient Gender: {state.get('patient_gender')}
    
    Available ICD-10 Codes:
    {state.get('icd10_codes')}
    """
    logging.info(f"LLM Prompt for match_icd10_code:\n{prompt}")
    # response = llm.invoke([HumanMessage(content=prompt)])
    response = query_llm(prompt)
    logging.info(f"LLM Response for match_icd10_code:\n{response}")
    try:
        output = clean_output(response)
        match = json.loads(output)
        # print("now is printing match")
        # print(match)
        state['icd10_code'] = match['icd10_code']
        state['rationale'] = match['rationale']
        # print("now is printing state")
        # print(state)
    except:
        state['error'] = "Failed to match ICD-10 code"
    
        
    log_stage("match_icd10_code", input_state, state)
    return state

In [42]:
def validate_icd10_code_exists(state: dict) -> dict:
    """Validate if the ICD-10 code exists in the provided list."""
    input_state = state.copy()
    # Check if the code is in the provided list
    valid_codes = state['icd10_codes']['icd10'].tolist()
    if state.get('icd10_code') not in valid_codes:
        logging.warning(f"Invalid code {state.get('icd10_code')}, will rerun matching...")
        print(f"Invalid code {state.get('icd10_code')}, will rerun matching...")
        state['error'] = f"Invalid code {state.get('icd10_code')}, not in provided list"
        state['icd10_code'] = None
        state['rationale'] = None
        # return state
    else:
        # Clear any previous errors if validation passes
        state['error'] = None
    log_stage("validate_icd10_code_exists", input_state, state)
    return state

In [43]:
def validate_icd10_clinical_match(state: dict) -> dict:
    """Validate if the matched ICD-10 code is clinically appropriate."""
    input_state = state.copy()
    # llm = ChatGroq(
    #     model_name="Deepseek-R1-Distill-Llama-70b",
    #     temperature=0.3,
    #     api_key= api_key
    # )
    # llm = ChatOpenAI(model="gpt-4", temperature=0.3)
    
    prompt = f"""
    Validate if the matched ICD-10 code is appropriate for the clinical case.
    Return ONLY a JSON object with exactly two fields: 'is_valid' (boolean) and 'reason' (string).
    DO NOT include any other text, thinking process, or explanation.

    Example of expected format:
    {{"is_valid": true, "reason": "The code I10 matches the patient's hypertension diagnosis"}}
    or
    {{"is_valid": false, "reason": "The code I10 is too general for this specific case"}}

    Current Match:
    ICD-10 Code: {state.get('icd10_code')}
    Rationale: {state.get('rationale')}

    Clinical Question: {state.get('clinical_question')}
    Clinical Notes: {state.get('clinical_notes')}
    Patient Age: {state.get('patient_age')}
    Patient Gender: {state.get('patient_gender')}
    
    Available ICD-10 Codes:
    {state['icd10_codes'].to_string()}
    """
    logging.info(f"LLM Prompt for validate_icd10_clinical_match:\n{prompt}")
    
    # response = llm.invoke([HumanMessage(content=prompt)])
    response = query_llm(prompt)
    logging.info(f"LLM Response for validate_icd10_clinical_match:\n{response}")
    try:
        output = clean_output(response)
        validation = json.loads(output)
        logging.info(f"Validation result: {validation}")
        
        if not validation['is_valid']:
            print("Invalid match, will rerun matching...")
            state['error'] = f"Invalid match: {validation['reason']}"
            state['icd10_code'] = None
            state['rationale'] = None
            return state
        else:
             # Clear any previous errors if validation passes
            state['error'] = None
    except Exception as e:
        logging.error(f"Validation error: {str(e)}")
        state['error'] = f"Failed to validate ICD-10 code: {str(e)}"
        return state
    
   
    log_stage("validate_icd10_clinical_match", input_state, state)
    return state

In [52]:
MAX_RETRIES = 3
def create_clinical_graph() -> StateGraph:
    workflow = StateGraph(dict)
    
    # Add nodes
    workflow.add_node("extract_patient_info", RunnableLambda(extract_patient_info))
    workflow.add_node("match_icd10_code", RunnableLambda(match_icd10_code))
    workflow.add_node("validate_icd10_code_exists", RunnableLambda(validate_icd10_code_exists))
    workflow.add_node("validate_icd10_clinical_match", RunnableLambda(validate_icd10_clinical_match))
    # Add stopper node
    workflow.add_node("stopper", RunnableLambda(stopper_node))

  
    # Add basic edges
    workflow.add_edge("extract_patient_info", "match_icd10_code")
    workflow.add_edge("match_icd10_code", "validate_icd10_code_exists")

    # Helper to increment retry count
    def check_and_route(state, next_success):
        if state.get("error"):
            # Only increment retry_count when a retry will actually happen
            if state.get("retry_count", 0) >= MAX_RETRIES:
                return "stopper"
            return "match_icd10_code"
        else:
            return next_success

        
    # Conditional for code existence validation
    workflow.add_conditional_edges(
        "validate_icd10_code_exists",
        lambda x: check_and_route(x, "validate_icd10_clinical_match"),
        {
            "match_icd10_code": "match_icd10_code",
            "validate_icd10_clinical_match": "validate_icd10_clinical_match",
            "stopper": "stopper"
        }
    )

    # Conditional for clinical validation
    workflow.add_conditional_edges(
        "validate_icd10_clinical_match",
        lambda x: check_and_route(x, END),
        {
            "match_icd10_code": "match_icd10_code",
            END: END,
            "stopper": "stopper"
        }
    )
    
    # # Define conditional edges
    # workflow.add_conditional_edges(
    #     "validate_icd10_code_exists",
    #     lambda x: "match_icd10_code" if x.get("error") else "validate_icd10_clinical_match",
    #     {
    #         "match_icd10_code": "match_icd10_code",
    #         "validate_icd10_clinical_match": "validate_icd10_clinical_match"
    #     }
    # )
    
    # # ensure it goes back to match_icd10_code when validation fails
    # workflow.add_conditional_edges(
    #     "validate_icd10_clinical_match",
    #     lambda x: "match_icd10_code" if x.get("error") else END,  # When error, go back to matching
    #     {
    #         "match_icd10_code": "match_icd10_code",  # Map the return value to the actual node
    #         END: END
    #     }
    # )
    
    # Set entry point
    workflow.set_entry_point("extract_patient_info")
    
    return workflow.compile()

In [49]:
# Example usage
def process_clinical_case(clinical_question: str, clinical_notes: str, icd10_codes_df: pd.DataFrame) -> dict:
    # Create the graph
    """Process a clinical case through the workflow."""
    logging.info(f"\n{'='*50}")
    logging.info("Starting new clinical case processing")
    logging.info(f"Clinical Question: {clinical_question}")
    logging.info(f"Clinical Notes: {clinical_notes}")
    logging.info(f"{'='*50}\n")
    graph = create_clinical_graph()
    
    # Initialize state
    initial_state = {
        "clinical_question": clinical_question,
        "clinical_notes": clinical_notes,
        "icd10_codes": icd10_codes_df,
        "patient_age": None,
        "patient_gender": None,
        "icd10_code": None,
        "rationale": None,
        "error": None,
        "retry_count": 1,
        "stopped": False
    }
    # print("Initial state:", initial_state)  # Debug print
    
    # Run the graph
    config = {"recursion_limit": 100}  # Increase from default 25 to 100
    result = graph.invoke(initial_state, config=config)
    result = graph.invoke(initial_state)
    clean_result = {
        "patient_age": result.get("patient_age"),
        "patient_gender": result.get("patient_gender"),
        "icd10_code": result.get("icd10_code"),
        "rationale": result.get("rationale"),
        "error": result.get("error"),
        "retry_count": result.get("retry_count"),
        "stopped": result.get("stopped")
    }
    # Log final result
    logging.info(f"\n{'='*50}")
    logging.info("Final Result:")
    logging.info(json.dumps(clean_result, indent=2))
    logging.info(f"{'='*50}\n")
    
    # Return the result
    return clean_result


In [55]:
# clinical_question ="Could this patient's chronic upper abdominal discomfort and iron deficiency anemia indicate a peptic ulcer or upper GI malignancy, and is EGD indicated?"

# clinical_notes = "47-year-old male with no significant past medical history presents with 3-month history of epigastric discomfort, early satiety, and unintentional 10 lb weight loss. Denies NSAID use, alcohol, or overt GI bleeding. Labs show iron deficiency anemia (Hgb 10.5, MCV 74, ferritin 12). Physical exam unremarkable. Concern for peptic ulcer disease or less likely gastric cancer. Seeking input on need for upper endoscopy."
# Process the case
clinical_question = "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is indicated?"

clinical_notes = (
    "52‑year‑old female with rheumatoid arthritis on methotrexate and low‑dose prednisone presents with "
    "6‑week history of daily fevers up to 102°F, drenching night sweats, and a 12‑lb unintentional weight loss. "
    "Initial blood cultures and chest X‑ray were unrevealing. She denies cough, dyspnea, or focal pain. "
    "Lab results show mild anemia (Hgb 11.2), elevated ESR (85 mm/hr), and CRP (12 mg/dL). "
    "CT chest/abdomen reveals multiple small pulmonary nodules and hepatosplenic lesions. "
    "Concern for disseminated non‑tuberculous mycobacteria vs. histoplasmosis. Input on biopsy site selection and empiric therapy is requested."
)
result = process_clinical_case(clinical_question, clinical_notes, top_200_icd10_codes_cleaned)
print(json.dumps(result, indent=2))

2025-06-04 02:17:06,844 - INFO - 
2025-06-04 02:17:06,846 - INFO - Starting new clinical case processing
2025-06-04 02:17:06,846 - INFO - Clinical Question: In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is indicated?
2025-06-04 02:17:06,848 - INFO - Clinical Notes: 52‑year‑old female with rheumatoid arthritis on methotrexate and low‑dose prednisone presents with 6‑week history of daily fevers up to 102°F, drenching night sweats, and a 12‑lb unintentional weight loss. Initial blood cultures and chest X‑ray were unrevealing. She denies cough, dyspnea, or focal pain. Lab results show mild anemia (Hgb 11.2), elevated ESR (85 mm/hr), and CRP (12 mg/dL). CT chest/abdomen reveals multiple small pulmonary nodules and hepatosplenic lesions. Concern for disseminated non‑tuberculous mycobacteria vs. histoplasmosis. Input on bio

{"age": 52, "gender": "female"}


2025-06-04 02:17:10,783 - INFO - LLM Response for match_icd10_code:
{"icd10_code": "B20", "rationale": "The patient's persistent fevers, night sweats, weight loss, and multiple organ involvement in the context of immunosuppression are most consistent with an opportunistic infection, and of the available codes, B20 (HIV disease resulting in infectious and parasitic diseases) is the only code related to disseminated infectious processes in immunocompromised hosts. While the patient does not have a known HIV diagnosis, no other listed codes fit the clinical scenario of systemic infection. Therefore, B20 is the closest ICD-10 code from the list to represent a disseminated infectious process in an immunocompromised host."}
2025-06-04 02:17:10,785 - INFO - 
2025-06-04 02:17:10,786 - INFO - Stage: match_icd10_code
2025-06-04 02:17:10,786 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this re

{"icd10_code": "B20", "rationale": "The patient's persistent fevers, night sweats, weight loss, and multiple organ involvement in the context of immunosuppression are most consistent with an opportunistic infection, and of the available codes, B20 (HIV disease resulting in infectious and parasitic diseases) is the only code related to disseminated infectious processes in immunocompromised hosts. While the patient does not have a known HIV diagnosis, no other listed codes fit the clinical scenario of systemic infection. Therefore, B20 is the closest ICD-10 code from the list to represent a disseminated infectious process in an immunocompromised host."}


2025-06-04 02:17:12,529 - INFO - LLM Response for validate_icd10_clinical_match:
{"is_valid": false, "reason": "The code B20 specifically refers to HIV disease with resulting infections; the patient does not have HIV, so this code is not appropriate for her immunosuppression due to rheumatoid arthritis therapy."}
2025-06-04 02:17:12,530 - INFO - Validation result: {'is_valid': False, 'reason': 'The code B20 specifically refers to HIV disease with resulting infections; the patient does not have HIV, so this code is not appropriate for her immunosuppression due to rheumatoid arthritis therapy.'}
2025-06-04 02:17:12,534 - INFO - LLM Prompt for match_icd10_code:

    Match the clinical information to the most appropriate ICD-10 code from the provided list.
    Return ONLY a JSON object with exactly two fields: 'icd10_code' and 'rationale'.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with { and end with }.

    Example of expected forma

{"is_valid": false, "reason": "The code B20 specifically refers to HIV disease with resulting infections; the patient does not have HIV, so this code is not appropriate for her immunosuppression due to rheumatoid arthritis therapy."}
Invalid match, will rerun matching...


2025-06-04 02:17:14,683 - INFO - LLM Response for match_icd10_code:
{"icd10_code": "B20", "rationale": "The patient's immunosuppressed status and systemic symptoms with concern for disseminated infection match B20 (HIV disease resulting in infectious and parasitic diseases), as this is the closest code among those provided relating to disseminated opportunistic infections, although the patient does not have diagnosed HIV; the other codes do not pertain to infectious etiologies."}
2025-06-04 02:17:14,686 - INFO - 
2025-06-04 02:17:14,686 - INFO - Stage: match_icd10_code
2025-06-04 02:17:14,687 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is indicated?",
  "clinical_notes": "52\u2011year\u2011old female with rheumatoid arthritis on methotrexate and low\u2011dose prednisone pre

{"icd10_code": "B20", "rationale": "The patient's immunosuppressed status and systemic symptoms with concern for disseminated infection match B20 (HIV disease resulting in infectious and parasitic diseases), as this is the closest code among those provided relating to disseminated opportunistic infections, although the patient does not have diagnosed HIV; the other codes do not pertain to infectious etiologies."}


2025-06-04 02:17:16,633 - INFO - LLM Response for validate_icd10_clinical_match:
{"is_valid": false, "reason": "The code B20 is specific for HIV disease; the patient is immunosuppressed from medications but does not have HIV, so this code is inappropriate for her diagnosis."}
2025-06-04 02:17:16,634 - INFO - Validation result: {'is_valid': False, 'reason': 'The code B20 is specific for HIV disease; the patient is immunosuppressed from medications but does not have HIV, so this code is inappropriate for her diagnosis.'}
2025-06-04 02:17:16,637 - INFO - LLM Prompt for match_icd10_code:

    Match the clinical information to the most appropriate ICD-10 code from the provided list.
    Return ONLY a JSON object with exactly two fields: 'icd10_code' and 'rationale'.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with { and end with }.

    Example of expected format:
    {"icd10_code": "xxx", "rationale": "xxxxx"}


    Clinical Question: 

{"is_valid": false, "reason": "The code B20 is specific for HIV disease; the patient is immunosuppressed from medications but does not have HIV, so this code is inappropriate for her diagnosis."}
Invalid match, will rerun matching...


2025-06-04 02:17:18,956 - INFO - LLM Response for match_icd10_code:
{"icd10_code": "B20", "rationale": "The patient's presentation is suggestive of a disseminated opportunistic infection, which is commonly encountered in immunocompromised individuals. Although she is not explicitly described as HIV-positive, among the given codes, B20 (HIV disease resulting in infectious and parasitic diseases) most closely matches the scenario of an immunosuppressed state with disseminated mycobacterial or fungal infection. None of the other codes relate to infections or immunosuppression, making B20 the most appropriate choice."}
2025-06-04 02:17:18,957 - INFO - 
2025-06-04 02:17:18,958 - INFO - Stage: match_icd10_code
2025-06-04 02:17:18,959 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is

{"icd10_code": "B20", "rationale": "The patient's presentation is suggestive of a disseminated opportunistic infection, which is commonly encountered in immunocompromised individuals. Although she is not explicitly described as HIV-positive, among the given codes, B20 (HIV disease resulting in infectious and parasitic diseases) most closely matches the scenario of an immunosuppressed state with disseminated mycobacterial or fungal infection. None of the other codes relate to infections or immunosuppression, making B20 the most appropriate choice."}


2025-06-04 02:17:20,511 - INFO - LLM Response for validate_icd10_clinical_match:
{"is_valid": false, "reason": "The code B20 is specific for HIV disease resulting in infectious and parasitic diseases, but the patient is immunocompromised due to rheumatoid arthritis treatment, not HIV, making B20 inappropriate for this case."}
2025-06-04 02:17:20,511 - INFO - Validation result: {'is_valid': False, 'reason': 'The code B20 is specific for HIV disease resulting in infectious and parasitic diseases, but the patient is immunocompromised due to rheumatoid arthritis treatment, not HIV, making B20 inappropriate for this case.'}
2025-06-04 02:17:20,513 - INFO - 
2025-06-04 02:17:20,514 - INFO - Stage: stopper_node
2025-06-04 02:17:20,514 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is

{"is_valid": false, "reason": "The code B20 is specific for HIV disease resulting in infectious and parasitic diseases, but the patient is immunocompromised due to rheumatoid arthritis treatment, not HIV, making B20 inappropriate for this case."}
Invalid match, will rerun matching...


2025-06-04 02:17:21,641 - INFO - LLM Response for extract_patient_info:
{"age": 52, "gender": "female"}
2025-06-04 02:17:21,643 - INFO - 
2025-06-04 02:17:21,643 - INFO - Stage: extract_patient_info
2025-06-04 02:17:21,644 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is indicated?",
  "clinical_notes": "52\u2011year\u2011old female with rheumatoid arthritis on methotrexate and low\u2011dose prednisone presents with 6\u2011week history of daily fevers up to 102\u00b0F, drenching night sweats, and a 12\u2011lb unintentional weight loss. Initial blood cultures and chest X\u2011ray were unrevealing. She denies cough, dyspnea, or focal pain. Lab results show mild anemia (Hgb 11.2), elevated ESR (85\u202fmm/hr), and CRP (12\u202fmg/dL). CT chest/abdomen reveals multiple small pulm

{"age": 52, "gender": "female"}


2025-06-04 02:17:24,262 - INFO - LLM Response for match_icd10_code:
{"icd10_code": "B20", "rationale": "The patient's presentation with persistent fever, night sweats, weight loss, multiple small pulmonary nodules, and hepatosplenic lesions on immunosuppression raises concern for an opportunistic infection. B20 (HIV disease) is the only infectious disease code provided and is used for immunosuppression-related infections not otherwise specified by other codes in this list. While she is not documented as HIV-positive, none of the other available codes represent mycobacterial or fungal infections; hence, B20 most closely matches the clinical scenario of immunosuppression and opportunistic infection risk."}
2025-06-04 02:17:24,265 - INFO - 
2025-06-04 02:17:24,265 - INFO - Stage: match_icd10_code
2025-06-04 02:17:24,266 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent dissem

{"icd10_code": "B20", "rationale": "The patient's presentation with persistent fever, night sweats, weight loss, multiple small pulmonary nodules, and hepatosplenic lesions on immunosuppression raises concern for an opportunistic infection. B20 (HIV disease) is the only infectious disease code provided and is used for immunosuppression-related infections not otherwise specified by other codes in this list. While she is not documented as HIV-positive, none of the other available codes represent mycobacterial or fungal infections; hence, B20 most closely matches the clinical scenario of immunosuppression and opportunistic infection risk."}


2025-06-04 02:17:25,783 - INFO - LLM Response for validate_icd10_clinical_match:
{"is_valid": false, "reason": "The code B20 refers specifically to HIV disease, but the patient has immunosuppression due to rheumatoid arthritis treatment and not HIV, so B20 is not appropriate for this case."}
2025-06-04 02:17:25,784 - INFO - Validation result: {'is_valid': False, 'reason': 'The code B20 refers specifically to HIV disease, but the patient has immunosuppression due to rheumatoid arthritis treatment and not HIV, so B20 is not appropriate for this case.'}
2025-06-04 02:17:25,787 - INFO - 
2025-06-04 02:17:25,787 - INFO - Stage: stopper_node
2025-06-04 02:17:25,787 - INFO - Input: {
  "clinical_question": "In a patient with persistent fever, night sweats, and weight loss despite broad-spectrum antibiotics, could this represent disseminated mycobacterial infection or an atypical fungal process, and what diagnostic workup is indicated?",
  "clinical_notes": "52\u2011year\u2011old female with r

{"is_valid": false, "reason": "The code B20 refers specifically to HIV disease, but the patient has immunosuppression due to rheumatoid arthritis treatment and not HIV, so B20 is not appropriate for this case."}
Invalid match, will rerun matching...
{
  "patient_age": 52,
  "patient_gender": "female",
  "icd10_code": null,
  "rationale": null,
  "error": "Stopped after 4 retries. Manual review required.",
  "retry_count": 4,
  "stopped": true
}


In [45]:
# # Example usage
# clinical_question = "Could this patient's recurrent, exertional chest pain with recent ECG abnormalities suggest underlying ischemic heart disease, and would further cardiac workup (e.g., stress testing or angiography) be appropriate at this time?"

# clinical_notes = """55-year-old male with a history of hypertension and hyperlipidemia presents with 2-month history of intermittent chest discomfort described as a pressure-like sensation localized to the left chest, occasionally radiating to the jaw, occurring primarily during brisk walking or stair climbing. Denies associated nausea, diaphoresis, or syncope. Symptoms improve with rest. No prior cardiac history. Vital signs stable. Physical exam unremarkable. Recent resting ECG showed nonspecific ST changes. Lipid panel elevated; LDL 145 mg/dL. Concerned about possible stable angina. Requesting input on next steps for diagnostic evaluation and whether referral to cardiology is appropriate."""

# # print("Clinical notes before processing:", clinical_notes)  # Debug print

# # Process the case
# result = process_clinical_case(clinical_question, clinical_notes, top_200_icd10_codes_cleaned)
# print(json.dumps(result, indent=2))

In [46]:
# Initialize the API
api = BigQueryAPI()

2025-05-12 01:47:52,472 - INFO - Successfully initialized BigQuery client for project som-nero-phi-jonc101


In [47]:
result

{'patient_age': 52,
 'patient_gender': 'female',
 'icd10_code': 'Z79.4',
 'rationale': 'The patient is on long-term (current) use of methotrexate and low-dose prednisone, which is represented by the ICD-10 code Z79.4.',
 'error': None}

In [48]:
api.get_orders(
    params=result,
    result_type='proc',
    limit=100
)


2025-05-12 01:47:52,510 - INFO - Building query for params={'patient_age': 52, 'patient_gender': 'female', 'icd10_code': 'Z79.4', 'rationale': 'The patient is on long-term (current) use of methotrexate and low-dose prednisone, which is represented by the ICD-10 code Z79.4.', 'error': None}, type=proc, year=2024
2025-05-12 01:47:52,531 - INFO - Executing BigQuery query...
2025-05-12 01:47:55,968 - INFO - Query completed successfully. Returned 75 rows.


Unnamed: 0,itemId,description,order_procdepartment,encounter_department,patientRate,encounterRate,nPatientscohortItem,nEncounterscohortItem,nPatientsCohortTotal,nEncountersCohortTotal
0,LABUALB,"ALBUMIN WITH CREATININE, URINE (RANDOM)",Endocrinology,Endocrinology,27.11,10.63,810,1990,2988,18715
1,LABA1C,HEMOGLOBIN A1C,Endocrinology,Endocrinology,26.77,16.12,800,3017,2988,18715
2,LABMETB,"METABOLIC PANEL, BASIC",Endocrinology,Endocrinology,22.49,10.12,672,1894,2988,18715
3,LABMETC,"METABOLIC PANEL, COMPREHENSIVE",Endocrinology,Endocrinology,19.44,6.66,581,1247,2988,18715
4,LABLPDC,LIPID PANEL WITH CALCULATED LDL,Endocrinology,Endocrinology,15.76,5.75,471,1076,2988,18715
...,...,...,...,...,...,...,...,...,...,...
70,LABGLNF,"GLUCOSE NON-FASTING, SERUM/PLASMA",Endocrinology,Endocrinology,0.37,0.06,11,11,2988,18715
71,LABANTITG,THYROGLOBULIN AB ULTRA-SENSITIVE,Endocrinology,Endocrinology,0.37,0.06,11,12,2988,18715
72,EXTK,POTASSIUM (MANUAL ENTRY),Endocrinology,Endocrinology,0.37,0.06,11,11,2988,18715
73,LABT3,"T3, TOTAL",Endocrinology,Endocrinology,0.37,0.07,11,14,2988,18715


In [55]:
api.get_orders(
    params=result,
    result_type='med',
    limit=10
)

2025-05-12 11:31:22,483 - INFO - Building query for params={'patient_age': 52, 'patient_gender': 'female', 'icd10_code': 'Z79.4', 'rationale': 'The patient is on long-term (current) use of methotrexate and low-dose prednisone, which is represented by the ICD-10 code Z79.4.', 'error': None}, type=med, year=2024
2025-05-12 11:31:22,512 - INFO - Executing BigQuery query...
2025-05-12 11:31:30,606 - INFO - Query completed successfully. Returned 10 rows.


Unnamed: 0,itemId,description,department,patientRate,encounterRate,nPatientscohortItem,nEncounterscohortItem,nPatientsCohortTotal,nEncountersCohortTotal
0,28995,METFORMIN 500 MG PO TB24,Endocrinology,5.79,1.46,173,274,2988,18715
1,201231,LANTUS SOLOSTAR U-100 INSULIN 100 UNIT/ML (3 M...,Endocrinology,5.66,1.38,169,258,2988,18715
2,233242,TRULICITY 1.5 MG/0.5 ML SC PNIJ,Endocrinology,4.45,0.95,133,177,2988,18715
3,203639,HUMALOG KWIKPEN INSULIN 100 UNIT/ML SC INPN,Endocrinology,4.45,1.13,133,212,2988,18715
4,24398,"METFORMIN 1,000 MG PO TABS",Endocrinology,4.02,1.2,120,224,2988,18715
5,233241,TRULICITY 0.75 MG/0.5 ML SC PNIJ,Endocrinology,4.02,0.75,120,140,2988,18715
6,232958,JARDIANCE 10 MG PO TABS,Endocrinology,4.02,0.84,120,157,2988,18715
7,232959,JARDIANCE 25 MG PO TABS,Endocrinology,3.25,0.82,97,154,2988,18715
8,239231,BASAGLAR KWIKPEN U-100 INSULIN 100 UNIT/ML (3 ...,Endocrinology,2.91,0.99,87,186,2988,18715
9,247917,TRULICITY 3 MG/0.5 ML SC PNIJ,Endocrinology,2.74,0.61,82,115,2988,18715


In [50]:
# def find_closest_icd10_code(description, icd10_codes_df):
#     """Find the closest matching ICD-10 code based on description similarity."""
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.metrics.pairwise import cosine_similarity
#     import numpy as np
    
#     # Create TF-IDF vectors
#     vectorizer = TfidfVectorizer()
#     descriptions = icd10_codes_df['description'].tolist()
#     tfidf_matrix = vectorizer.fit_transform(descriptions + [description])
    
#     # Calculate similarity
#     similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
#     best_match_idx = np.argmax(similarity_scores)
    
#     return icd10_codes_df.iloc[best_match_idx]['icd10_code']

# def parse_clinical_info(clinical_question, clinical_notes, icd10_codes_df, max_retries=3):
#     # ... (previous code) ...
    
#     try:
#         result = json.loads(response)
        
#         # Validate the ICD-10 code
#         if validate_icd10_code(result['icd10_code'], icd10_codes_df):
#             return result
#         else:
#             print(f"Attempt {attempt + 1}: Invalid ICD-10 code {result['icd10_code']}. Finding closest match...")
#             # Find the closest matching code
#             closest_code = find_closest_icd10_code(result['rationale'], icd10_codes_df)
#             result['icd10_code'] = closest_code
#             result['rationale'] += f"\nNote: Original code {result['icd10_code']} was not in the list. Using closest match {closest_code} instead."
#             return result
                
#     except json.JSONDecodeError:
#         print(f"Attempt {attempt + 1}: Failed to parse JSON response. Retrying...")
#         continue