#### Use actual notes and questions from the eConsult_QA table to test LLM performance on ICD-10 code extraction
### Check the description table for the chief complaints


In [1]:
# Import the API and other necessary libraries
import sys
sys.path.append('..')  # Add parent directory to path to import the API

# Data manipulation and display
import pandas as pd
pd.set_option('display.max_columns', None)  # Show all columns in the output
from IPython.display import Image, display
import random
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from dotenv import load_dotenv

# Google BigQuery related imports
from google.cloud import bigquery
from google.cloud.bigquery import dbapi
%load_ext google.cloud.bigquery

# Local API imports
from api.bigquery_api import BigQueryAPI
from importlib import reload
from api import bigquery_api
reload(bigquery_api)

# Initialize BigQuery client
client = bigquery.Client("som-nero-phi-jonc101")

# import LLM API: use langgraph as of now
# from groq import Groq
from typing import TypedDict, Annotated, Sequence
from langgraph.graph import StateGraph, END
from langchain_core.messages import HumanMessage, AIMessage
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableLambda
import operator
import json
import re
import logging
import datetime
import requests
load_dotenv()








True

# Query

In [2]:
# Load the eConsult cases; a later cell reads the "Question" and "Summary" columns.
eConsult_question = pd.read_csv("../real_data/icd.csv")

In [3]:
%%bigquery --use_rest_api top_limit_icd10_codes
select distinct icd10, dx_name, dm.specialty, count(icd10) as count from som-nero-phi-jonc101.shc_core_2024.diagnosis as dx
-- attach each diagnosis row to the specialty of its department
JOIN `som-nero-phi-jonc101.shc_core_2024.dep_map` dm
  ON dx.dept_id = dm.department_id
-- restrict to the three specialties under study
where dm.specialty IN ('Infectious Diseases', 'Endocrinology', 'Hematology')
-- one output row per (code, name, specialty) with its row count
group by icd10,dx_name,dm.specialty
-- most frequent first; the next cell relies on this order when de-duplicating by code
order by count desc
limit 10000

Query is running:   0%|          |

Downloading:   0%|          |

In [4]:
# Keep one row per ICD-10 code (the first occurrence in query order).
top_icd10_codes_cleaned = top_limit_icd10_codes.drop_duplicates(subset="icd10", keep="first")

In [5]:
top_icd10_codes_cleaned

Unnamed: 0,icd10,dx_name,specialty,count
0,M81.0,Age-related osteoporosis without current patho...,Endocrinology,133009
1,Z79.4,Long term (current) use of insulin,Endocrinology,90259
2,E11.65,Type 2 diabetes mellitus with hyperglycemia,Endocrinology,88002
3,E03.9,"Hypothyroidism, unspecified type",Endocrinology,85715
5,E78.5,"Hyperlipidemia, unspecified",Endocrinology,77241
...,...,...,...,...
9977,F21,Schizotypal personality disorder (CMS-HCC),Infectious Diseases,37
9984,E13.319,Other specified diabetes mellitus with unspeci...,Endocrinology,37
9992,L91.8,Other hypertrophic disorders of the skin,Endocrinology,37
9994,I60.9,Subarachnoid hemorrhage (CMS-HCC),Hematology,37


# Logging Setup

In [6]:
# Set up logging
class NonEmptyFileHandler(logging.FileHandler):
    """File handler that avoids leaving an empty log file behind.

    The underlying file is opened lazily (on the first emitted record). When
    the handler is closed without ever having logged, an empty file at the
    target path is removed; a file that already has content is left alone.

    Args:
        filename: Path of the log file.
        mode: File open mode, forwarded to ``logging.FileHandler``.
        encoding: Text encoding, forwarded to ``logging.FileHandler``.
        delay: Accepted for signature compatibility only; opening is always
            deferred until the first record.
    """

    def __init__(self, filename, mode='a', encoding=None, delay=False):
        # Always defer opening so that no file is created until something is logged.
        super().__init__(filename, mode, encoding, delay=True)
        self.filename = filename  # kept for backward compatibility
        self._has_logged = False

    def emit(self, record):
        """Write ``record``; the parent class opens the stream on first use."""
        # FileHandler.emit() opens the stream itself while it is still None.
        # Calling self._open() here and dropping its return value (as before)
        # opened the file twice and leaked the first file object.
        self._has_logged = True
        super().emit(record)

    def close(self):
        """Close the handler and drop the log file if it ended up empty."""
        try:
            # Always run the parent bookkeeping, even when nothing was logged.
            super().close()
        finally:
            if not self._has_logged:
                self._remove_if_empty()

    def _remove_if_empty(self):
        """Delete the log file only when it exists and has zero bytes."""
        try:
            # baseFilename is the absolute path FileHandler resolved at init,
            # so a later change of working directory cannot misdirect this.
            if os.path.getsize(self.baseFilename) == 0:
                os.remove(self.baseFilename)
        except OSError:
            # Missing file or no permission: nothing to clean up.
            pass

# Create a fresh, timestamped log directory for this run.
# Single quotes inside the replacement field keep this f-string valid on
# Python versions older than 3.12 (which reject reusing the outer quote).
log_dir = f"../logs/clinical_workflow_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}"
os.makedirs(log_dir, exist_ok=True)

# Set up logging with the custom handler.
# force=True replaces handlers left over from a previous run of this cell;
# without it basicConfig() is a no-op once the root logger has handlers, and
# log records would keep going to the previous run's directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        NonEmptyFileHandler(os.path.join(log_dir, 'run.log')),
        logging.StreamHandler()
    ],
    force=True,
)

In [7]:
# Save the candidate code list alongside the run log (stage logs refer to this file).
top_icd10_codes_cleaned.to_csv(f"{log_dir}/top_icd10_codes_cleaned.csv", index=False)

In [8]:
def log_stage(stage_name: str, input_data: dict, output_data: dict):
    """Log a JSON summary of one workflow stage's input and output state.

    Args:
        stage_name: Label written in the summary header.
        input_data: State as it was when the stage started.
        output_data: State as returned by the stage.

    Neither dict is modified. A DataFrame stored under ``'icd10_codes'`` is
    replaced by a short placeholder in the logged copy instead of being dumped.
    """

    def _loggable(snapshot: dict) -> dict:
        """Return a shallow copy of ``snapshot`` that is safe to dump."""
        view = snapshot.copy()
        if isinstance(view.get('icd10_codes'), pd.DataFrame):
            view['icd10_codes'] = "check separate file for icd10_codes"
        return view

    input_copy = _loggable(input_data)
    output_copy = _loggable(output_data)

    logging.info(f"\n{'='*50}")
    logging.info(f"Summary of Stage: {stage_name}")
    # default=str keeps logging from raising TypeError when the state holds a
    # value json cannot encode natively (datetime, numpy scalar, ...).
    logging.info(f"Input: {json.dumps(input_copy, indent=2, default=str)}")
    logging.info(f"Output: {json.dumps(output_copy, indent=2, default=str)}")
    logging.info(f"{'='*50}\n")

# Build Langgraph to parse patient info

In [9]:
# api_key = os.getenv("GROQ_API_KEY")
api_key = os.getenv("HEALTHREX_API_KEY")
headers = {'Ocp-Apim-Subscription-Key': api_key, 'Content-Type': 'application/json'}
url = "https://apim.stanfordhealthcare.org/openai-eastus2/deployments/gpt-4.1/chat/completions?api-version=2025-01-01-preview" 
def query_llm(my_question, timeout=120):
    """Send a single-turn user prompt to the chat-completions endpoint.

    Args:
        my_question: Prompt text sent as the only user message.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: If ``HEALTHREX_API_KEY`` was not set when this cell ran.
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    if not api_key:
        # Fail with a clear message instead of an opaque header error.
        raise RuntimeError("HEALTHREX_API_KEY is not set; cannot query the LLM endpoint")

    payload = json.dumps({
        "model": "gpt-4.1", 
        "messages": [{"role": "user", "content": my_question}]
    })
    # Without a timeout, requests can wait forever on a stalled connection.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Surface HTTP errors (401, 429, 5xx) directly rather than as a KeyError
    # on a body that has no "choices".
    response.raise_for_status()
    message_content = response.json()["choices"][0]["message"]["content"]
    print(message_content)
    return message_content


In [10]:
def clean_output(output):
    """Reduce a raw LLM reply to the payload the callers try to parse.

    Steps applied to string input:
      1. drop every ``<think>...</think>`` section;
      2. strip surrounding whitespace;
      3. if the whole remainder is a single markdown code fence
         (```` ``` ```` or ```` ```json ````), return only the fenced content.

    Args:
        output: The LLM reply (str), or a DataFrame.

    Returns:
        The cleaned string; a DataFrame is returned unchanged.
    """
    # If the output is already a DataFrame, return it directly
    if isinstance(output, pd.DataFrame):
        return output

    # Remove all content between <think> tags
    cleaned_output = re.sub(r'<think>.*?</think>', '', output, flags=re.DOTALL)
    cleaned_output = cleaned_output.strip()

    # Models often wrap JSON in a markdown fence even when told not to, which
    # makes json.loads() fail. Unwrap only when the fence spans the whole
    # reply, so text that merely contains backticks is left untouched.
    fenced = re.fullmatch(
        r'```(?:[\w+-]*[ \t]*\r?\n)?(.*?)```',
        cleaned_output,
        flags=re.DOTALL,
    )
    if fenced:
        cleaned_output = fenced.group(1).strip()

    return cleaned_output

    


In [11]:
# Define the state type
class ClinicalState(TypedDict):
    """Keys carried through the clinical ICD-10 matching workflow.

    Note: in the visible code the graph is built with ``StateGraph(dict)``, so
    this class documents the expected keys rather than being enforced.
    """
    clinical_question: str  # question text interpolated into the LLM prompts
    clinical_notes: str  # note text interpolated into the LLM prompts
    icd10_codes: pd.DataFrame  # candidate codes; its 'icd10' column is the list of allowed codes
    patient_age: int | None  # filled by extract_patient_info from the LLM reply
    patient_gender: str | None  # filled by extract_patient_info from the LLM reply
    icd10_code: str | None  # set by match_icd10_code; reset to None when a validator rejects it
    rationale: str | None  # set together with icd10_code
    error: str | None  # message from the last failing stage; routing retries while this is set
    retry_count: int      # incremented by match_icd10_code every time it runs
    stopped: bool | None  # set to True by stopper_node when the workflow gives up

In [12]:
def stopper_node(state: dict) -> dict:
    """Terminal node: flag the case as stopped and record why.

    Works on a copy of ``state``; the copy is both logged and returned.
    """
    attempts = state.get('retry_count', 0)
    halted = {
        **state,
        'stopped': True,
        'error': f"Stopped after {attempts} retries. Manual review required.",
    }
    log_stage("stopper_node", halted, halted)
    return halted


In [13]:
def extract_patient_info(state: dict) -> dict:
    """Ask the LLM for the patient's age and gender and store them on ``state``.

    On success ``state['patient_age']`` and ``state['patient_gender']`` are
    filled from the parsed JSON reply. If the reply cannot be parsed or lacks a
    field, ``state['error']`` is set instead. The same ``state`` dict is
    mutated and returned.
    """
    # Shallow snapshot for the stage log, taken before any key is changed.
    state_before = state.copy()

    prompt = f"""
    Extract the patient's age and gender from the following clinical notes.
    Return ONLY a JSON object with 'age' and 'gender' fields.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with {{ and end with }}.
    
    Example of expected format:
    {{"age": 55, "gender": "male"}}
    
    Clinical Notes: {state.get('clinical_notes')}
    """
    logging.info(f"LLM Prompt for extract_patient_info:\n{prompt}")

    response = query_llm(prompt)
    logging.info(f"LLM Response for extract_patient_info:\n{response}")

    try:
        parsed = json.loads(clean_output(response))
        # Assigned one at a time: a reply missing 'gender' still records the age.
        state['patient_age'] = parsed['age']
        state['patient_gender'] = parsed['gender']
    except Exception as exc:
        state['error'] = f"Failed to extract patient information: {exc}"

    log_stage("extract_patient_info", state_before, state)
    return state


In [14]:
def match_icd10_code(state: dict) -> dict:
    """Ask the LLM to pick one ICD-10 code from ``state['icd10_codes']``.

    Each call counts as one attempt: ``state['error']`` is cleared and
    ``state['retry_count']`` is incremented before the LLM is queried.

    On success ``state['icd10_code']`` and ``state['rationale']`` are set from
    the reply. On any parsing failure both are reset to ``None`` and
    ``state['error']`` describes the failure, so a code left over from an
    earlier attempt can never be passed on to the validators.

    The same ``state`` dict is mutated and returned.
    """
    state['error'] = None
    # .get() matches how the routing helper and stopper_node read this counter.
    state['retry_count'] = state.get('retry_count', 0) + 1

    input_state = state.copy()

    raw_prompt = f"""
    Match the clinical information to the most appropriate ICD-10 code from the provided list.
    Return ONLY a JSON object with exactly two fields: 'icd10_code' and 'rationale'.
    DO NOT include any other text, thinking process, or explanation.
    The response should start with {{ and end with }}.

    Example of expected format:
    {{"icd10_code": "xxx", "rationale": "xxxxx"}}

    
    Clinical Question: {state.get('clinical_question')}
    Clinical Notes: {state.get('clinical_notes')}
    Patient Age: {state.get('patient_age')}
    Patient Gender: {state.get('patient_gender')}
    
    """
    # The code list is appended separately so the logged prompt stays short.
    prompt = raw_prompt + f"Available ICD-10 Codes: {state['icd10_codes'].to_string()}"
    logging.info(f"LLM Prompt for match_icd10_code:\n{raw_prompt} + available ICD-10 codes")
    response = query_llm(prompt)
    logging.info(f"LLM Response for match_icd10_code:\n{response}")
    try:
        match = json.loads(clean_output(response))
        icd10_code = match['icd10_code']
        rationale = match['rationale']
    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts, and
        # keep the reason instead of discarding it.
        logging.warning(f"Could not parse ICD-10 match response: {e}")
        state['error'] = f"Failed to match ICD-10 code: {e}"
        state['icd10_code'] = None
        state['rationale'] = None
    else:
        state['icd10_code'] = icd10_code
        state['rationale'] = rationale

    log_stage("match_icd10_code", input_state, state)
    return state

In [15]:
def validate_icd10_code_exists(state: dict) -> dict:
    """Check that the proposed code appears in the ``icd10`` column of the code list.

    A code found in the list clears ``state['error']``. Any other value sets an
    error message and resets ``icd10_code`` and ``rationale`` to ``None``.
    The same ``state`` dict is mutated and returned.
    """
    state_before = state.copy()
    proposed = state.get('icd10_code')
    allowed = state['icd10_codes']['icd10'].tolist()

    if proposed in allowed:
        # Validation passed: drop any error left by an earlier stage.
        state['error'] = None
    else:
        notice = f"Invalid code {proposed}, will rerun matching..."
        logging.warning(notice)
        print(notice)
        state['error'] = f"Invalid code {proposed}, not in provided list"
        state['icd10_code'] = None
        state['rationale'] = None

    log_stage("validate_icd10_code_exists", state_before, state)
    return state

In [16]:
def validate_icd10_clinical_match(state: dict) -> dict:
    """Ask the LLM whether the matched code fits the clinical case.

    Outcomes (the same ``state`` dict is mutated and returned):
      * valid   -> ``state['error']`` is cleared;
      * invalid -> ``state['error']`` carries the LLM's reason and
        ``icd10_code`` / ``rationale`` are reset to ``None``;
      * reply cannot be parsed -> ``state['error']`` describes the failure.

    Every outcome is written to the stage log.
    """
    input_state = state.copy()

    raw_prompt = f"""
    Validate if the matched ICD-10 code is appropriate for the clinical case.
    Return ONLY a JSON object with exactly two fields: 'is_valid' (boolean) and 'reason' (string).
    DO NOT include any other text, thinking process, or explanation.

    Example of expected format:
    {{"is_valid": true, "reason": "The code I10 matches the patient's hypertension diagnosis"}}
    or
    {{"is_valid": false, "reason": "The code I10 is too general for this specific case"}}

    Current Match:
    ICD-10 Code: {state.get('icd10_code')}
    Rationale: {state.get('rationale')}

    Clinical Question: {state.get('clinical_question')}
    Clinical Notes: {state.get('clinical_notes')}
    Patient Age: {state.get('patient_age')}
    Patient Gender: {state.get('patient_gender')}
    """
    # The code list is appended separately so the logged prompt stays short.
    prompt = raw_prompt + f"Available ICD-10 Codes: {state['icd10_codes'].to_string()}"
    logging.info(f"LLM Prompt for validate_icd10_clinical_match:\n{raw_prompt} + avaialble ICD-10 codes")

    response = query_llm(prompt)
    logging.info(f"LLM Response for validate_icd10_clinical_match:\n{response}")
    try:
        validation = json.loads(clean_output(response))
        logging.info(f"Validation result: {validation}")

        is_valid = validation['is_valid']
        if isinstance(is_valid, str):
            # A quoted "false" is a non-empty string and would otherwise be
            # truthy, i.e. silently accepted as a valid match.
            is_valid = is_valid.strip().lower() == "true"

        if is_valid:
            # Clear any previous errors if validation passes
            state['error'] = None
        else:
            print("Invalid match, will rerun matching...")
            state['error'] = f"Invalid match: {validation['reason']}"
            state['icd10_code'] = None
            state['rationale'] = None
    except Exception as e:
        logging.error(f"Validation error: {str(e)}")
        state['error'] = f"Failed to validate ICD-10 code: {str(e)}"

    # No early returns above: rejected and failed validations are summarised
    # in the stage log just like successful ones.
    log_stage("validate_icd10_clinical_match", input_state, state)
    return state

In [17]:
def create_clinical_graph(MAX_RETRIES = 3) -> StateGraph:
    """Build and compile the ICD-10 matching workflow.

    Flow::

        extract_patient_info -> match_icd10_code -> validate_icd10_code_exists
            -> validate_icd10_clinical_match -> END

    After either validator, a non-empty ``state['error']`` sends the case back
    to ``match_icd10_code``, unless ``state['retry_count']`` has reached
    ``MAX_RETRIES``, in which case it goes to ``stopper`` and then ends.

    Args:
        MAX_RETRIES: Maximum number of ``match_icd10_code`` runs. That node
            increments ``retry_count`` on every run, including the first.

    Returns:
        The compiled graph (the result of ``workflow.compile()``).
    """
    workflow = StateGraph(dict)

    # Add nodes
    workflow.add_node("extract_patient_info", RunnableLambda(extract_patient_info))
    workflow.add_node("match_icd10_code", RunnableLambda(match_icd10_code))
    workflow.add_node("validate_icd10_code_exists", RunnableLambda(validate_icd10_code_exists))
    workflow.add_node("validate_icd10_clinical_match", RunnableLambda(validate_icd10_clinical_match))
    workflow.add_node("stopper", RunnableLambda(stopper_node))

    # Add basic edges
    workflow.add_edge("extract_patient_info", "match_icd10_code")
    workflow.add_edge("match_icd10_code", "validate_icd10_code_exists")
    # Make termination after the stopper explicit instead of relying on how
    # the installed LangGraph version treats nodes without outgoing edges.
    workflow.add_edge("stopper", END)

    def check_and_route(state, next_node):
        """Pick the next node after a validator.

        This helper only reads the state; the counter itself is incremented
        inside ``match_icd10_code``.
        """
        if not state.get("error"):
            return next_node
        if state.get("retry_count", 0) >= MAX_RETRIES:
            return "stopper"
        return "match_icd10_code"

    # Conditional for code existence validation
    workflow.add_conditional_edges(
        "validate_icd10_code_exists",
        lambda x: check_and_route(x, "validate_icd10_clinical_match"),
        {
            "match_icd10_code": "match_icd10_code",
            "validate_icd10_clinical_match": "validate_icd10_clinical_match",
            "stopper": "stopper"
        }
    )

    # Conditional for clinical validation
    workflow.add_conditional_edges(
        "validate_icd10_clinical_match",
        lambda x: check_and_route(x, END),
        {
            "match_icd10_code": "match_icd10_code",
            END: END,
            "stopper": "stopper"
        }
    )
    workflow.set_entry_point("extract_patient_info")

    return workflow.compile()

In [18]:
def process_clinical_case(clinical_question: str, clinical_notes: str, icd10_codes_df: pd.DataFrame, MAX_RETRIES = 10) -> dict:
    """Process a clinical case through the workflow.

    Args:
        clinical_question: Question text for the case.
        clinical_notes: Free-text notes for the case.
        icd10_codes_df: Candidate codes; must contain an ``icd10`` column.
        MAX_RETRIES: Maximum number of matching attempts before the workflow
            stops and flags the case.

    Returns:
        A dict with ``patient_age``, ``patient_gender``, ``icd10_code``,
        ``rationale``, ``error``, ``retry_count`` and ``stopped``. The inputs
        and the code list are not included.
    """
    logging.info(f"\n{'='*50}")
    logging.info("Starting new clinical case processing")
    logging.info(f"Clinical Question: {clinical_question}")
    logging.info(f"Clinical Notes: {clinical_notes}")
    logging.info(f"{'='*50}\n")
    graph = create_clinical_graph(MAX_RETRIES)

    # Initialize state
    initial_state = {
        "clinical_question": clinical_question,
        "clinical_notes": clinical_notes,
        "icd10_codes": icd10_codes_df,
        "patient_age": None,
        "patient_gender": None,
        "icd10_code": None,
        "rationale": None,
        "error": None,
        "retry_count": 0,
        "stopped": False
    }

    # Each attempt visits up to three nodes (match + two validators), plus the
    # extraction node and the stopper. Keep the previous limit of 100 as a
    # floor and grow it with MAX_RETRIES, so a large retry budget ends at the
    # stopper rather than in a recursion-limit error.
    config = {"recursion_limit": max(100, 3 * MAX_RETRIES + 10)}
    result = graph.invoke(initial_state, config=config)

    # Keep only the fields callers need.
    reported_keys = (
        "patient_age",
        "patient_gender",
        "icd10_code",
        "rationale",
        "error",
        "retry_count",
        "stopped",
    )
    clean_result = {key: result.get(key) for key in reported_keys}

    # Log final result
    logging.info(f"\n{'='*50}")
    logging.info("Final Result:")
    logging.info(json.dumps(clean_result, indent=2))
    logging.info(f"{'='*50}\n")

    return clean_result


In [19]:
# clinical_question ="Could this patient's chronic upper abdominal discomfort and iron deficiency anemia indicate a peptic ulcer or upper GI malignancy, and is EGD indicated?"

# clinical_notes = "47-year-old male with no significant past medical history presents with 3-month history of epigastric discomfort, early satiety, and unintentional 10 lb weight loss. Denies NSAID use, alcohol, or overt GI bleeding. Labs show iron deficiency anemia (Hgb 10.5, MCV 74, ferritin 12). Physical exam unremarkable. Concern for peptic ulcer disease or less likely gastric cancer. Seeking input on need for upper endoscopy."
# Process the case
# Use the eConsult row at positional index 2 as the demo case.
clinical_question = eConsult_question["Question"].iloc[2]
clinical_notes = eConsult_question["Summary"].iloc[2]
# Only the `icd10` column is passed in, so the prompts list bare codes
# without their dx_name descriptions.
result = process_clinical_case(clinical_question, clinical_notes, top_icd10_codes_cleaned[["icd10"]])
print(json.dumps(result, indent=2))

2025-06-22 16:56:02,227 - INFO - 
2025-06-22 16:56:02,228 - INFO - Starting new clinical case processing
2025-06-22 16:56:02,229 - INFO - Clinical Question: Do these titers suggest positive infection and should we continue treatment?
2025-06-22 16:56:02,229 - INFO - Clinical Notes: Patient is a 31-year old woman who is 19 weeks pregnant. She had syphilis testing done 3 years ago that showed RPR reactive 1:2, FTA-ABS reactive and then RPR reactive 1:1, FTA ABS a few days later. Repeat testing now that showed RPR-reactive 1:1 and TPPA reactive. The patient was not aware of the testing from 3 years ago. She has no symptoms and is at low risk given she is monogamous with her partner who had negative syphilis testing a year ago. The patient was started on treatment with PCN G given she is pregnant.

2025-06-22 16:56:02,296 - INFO - LLM Prompt for extract_patient_info:

    Extract the patient's age and gender from the following clinical notes.
    Return ONLY a JSON object with 'age' and 'g

{"age": 31, "gender": "female"}


2025-06-22 16:56:07,320 - INFO - LLM Response for match_icd10_code:
{"icd10_code": "A53.9", "rationale": "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphilis, unspecified', which is used for confirmed syphilis cases without specification of stage or symptomatology."}
2025-06-22 16:56:07,321 - INFO - 
2025-06-22 16:56:07,322 - INFO - Summary of Stage: match_icd10_code
2025-06-22 16:56:07,322 - INFO - Input: {
  "clinical_question": "Do these titers suggest positive infection and should we continue treatment?",
  "clinical_notes": "Patient is a 31-year old woman who is 19 weeks pregnant. She had syphilis testing done 3 years ago that showed RPR reactive 1:2, FTA-ABS reactive and then RPR reactive 1:1, FTA ABS a few days later. Repeat testing now that showed RPR-reactive 1:1 and TPPA reactive. The patient was not aware of the testing from 3 yea

{"icd10_code": "A53.9", "rationale": "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphilis, unspecified', which is used for confirmed syphilis cases without specification of stage or symptomatology."}


2025-06-22 16:56:10,549 - INFO - LLM Response for validate_icd10_clinical_match:
{"is_valid": true, "reason": "The code A53.9 ('Syphilis, unspecified') matches the patient's confirmed syphilis diagnosis without symptoms or specified stage, which is appropriate in this pregnant patient with reactive serologic tests."}
2025-06-22 16:56:10,550 - INFO - Validation result: {'is_valid': True, 'reason': "The code A53.9 ('Syphilis, unspecified') matches the patient's confirmed syphilis diagnosis without symptoms or specified stage, which is appropriate in this pregnant patient with reactive serologic tests."}
2025-06-22 16:56:10,551 - INFO - 
2025-06-22 16:56:10,552 - INFO - Summary of Stage: validate_icd10_clinical_match
2025-06-22 16:56:10,553 - INFO - Input: {
  "clinical_question": "Do these titers suggest positive infection and should we continue treatment?",
  "clinical_notes": "Patient is a 31-year old woman who is 19 weeks pregnant. She had syphilis testing done 3 years ago that showed

{"is_valid": true, "reason": "The code A53.9 ('Syphilis, unspecified') matches the patient's confirmed syphilis diagnosis without symptoms or specified stage, which is appropriate in this pregnant patient with reactive serologic tests."}
{
  "patient_age": 31,
  "patient_gender": "female",
  "icd10_code": "A53.9",
  "rationale": "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphilis, unspecified', which is used for confirmed syphilis cases without specification of stage or symptomatology.",
  "error": null,
  "retry_count": 1,
  "stopped": false
}


In [20]:
# Initialize the API
api = BigQueryAPI()

2025-06-22 16:59:18,197 - INFO - Successfully initialized BigQuery client for project som-nero-phi-jonc101


In [29]:
# Copy of the workflow result with the gender blanked out; `result` itself is untouched.
# NOTE(review): presumably done so the order lookup is not restricted by gender — confirm against BigQueryAPI.get_orders.
result_new = {**result, "patient_gender": None}


In [32]:
result_new

{'patient_age': 31,
 'patient_gender': None,
 'icd10_code': 'A53.9',
 'rationale': "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphilis, unspecified', which is used for confirmed syphilis cases without specification of stage or symptomatology.",
 'error': None,
 'retry_count': 1,
 'stopped': False}

In [30]:
# Look up orders for the matched case via the project BigQuery API wrapper.
orders = api.get_orders(params=result_new)

2025-06-22 17:14:49,212 - INFO - Building query for params={'patient_age': 31, 'patient_gender': None, 'icd10_code': 'A53.9', 'rationale': "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphilis, unspecified', which is used for confirmed syphilis cases without specification of stage or symptomatology.", 'error': None, 'retry_count': 1, 'stopped': False}, type=lab, year=2024
2025-06-22 17:14:49,213 - INFO - Executing BigQuery query...
2025-06-22 17:14:55,568 - INFO - Query completed successfully. Returned 10 rows.
2025-06-22 17:14:55,572 - INFO - Building query for params={'patient_age': 31, 'patient_gender': None, 'icd10_code': 'A53.9', 'rationale': "The patient has serologic evidence of syphilis (RPR and treponemal test positive) without symptoms, and was started on penicillin G treatment during pregnancy. A53.9 is the ICD-10 code for 'Syphili

In [31]:
orders

Unnamed: 0,itemId,description,patientRate,encounterRate,nPatientscohortItem,nEncounterscohortItem,nPatientsCohortTotal,nEncountersCohortTotal,result_type
0,LABRPRQLT,RPR W/REFLEX TO TITER,25.16,16.37,80,120,318,733,lab
1,LABMETC,"METABOLIC PANEL, COMPREHENSIVE",18.87,10.91,60,80,318,733,lab
2,LABCLGC3,"CHLAMYDIA/GC, NAAT",18.55,11.05,59,81,318,733,lab
3,LABCBCD,CBC WITH DIFFERENTIAL,15.72,11.6,50,85,318,733,lab
4,LABC4C8,"CD4 / CD8 PANEL, BLOOD",14.47,10.5,46,77,318,733,lab
5,LABUA,URINALYSIS WITH MICROSCOPIC,10.38,5.73,33,42,318,733,lab
6,LABVD25H,"VITAMIN D, 25-HYDROXYVITAMIN",7.55,4.91,24,36,318,733,lab
7,LABHBSQT,HEP B SURF AB QUANT,6.92,3.41,22,25,318,733,lab
8,LABQFTBG,QUANTIFERON TEST FOR LATENT TB,6.6,3.27,21,24,318,733,lab
9,LABHCVA,HEPATITIS C AB IGG,6.29,3.41,20,25,318,733,lab


In [23]:
# Persist the final workflow result and the retrieved orders next to the run log.
# The frame was previously assigned to `c` while `result_df` was written out,
# which raises NameError on a fresh kernel.
result_df = pd.DataFrame([result])
result_df.to_csv(f"{log_dir}/result.csv")
orders.to_csv(f"{log_dir}/orders.csv", index=False)

In [24]:
# def find_closest_icd10_code(description, icd10_codes_df):
#     """Find the closest matching ICD-10 code based on description similarity."""
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.metrics.pairwise import cosine_similarity
#     import numpy as np
    
#     # Create TF-IDF vectors
#     vectorizer = TfidfVectorizer()
#     descriptions = icd10_codes_df['description'].tolist()
#     tfidf_matrix = vectorizer.fit_transform(descriptions + [description])
    
#     # Calculate similarity
#     similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
#     best_match_idx = np.argmax(similarity_scores)
    
#     return icd10_codes_df.iloc[best_match_idx]['icd10_code']

# def parse_clinical_info(clinical_question, clinical_notes, icd10_codes_df, max_retries=3):
#     # ... (previous code) ...
    
#     try:
#         result = json.loads(response)
        
#         # Validate the ICD-10 code
#         if validate_icd10_code(result['icd10_code'], icd10_codes_df):
#             return result
#         else:
#             print(f"Attempt {attempt + 1}: Invalid ICD-10 code {result['icd10_code']}. Finding closest match...")
#             # Find the closest matching code
#             closest_code = find_closest_icd10_code(result['rationale'], icd10_codes_df)
#             result['icd10_code'] = closest_code
#             result['rationale'] += f"\nNote: Original code {result['icd10_code']} was not in the list. Using closest match {closest_code} instead."
#             return result
                
#     except json.JSONDecodeError:
#         print(f"Attempt {attempt + 1}: Failed to parse JSON response. Retrying...")
#         continue