# MedQDx Evaluation

In [None]:
!pip install openai
!pip install -q transformers accelerate bitsandbytes
!pip install openai --upgrade



In [None]:
# Importing required libraries
import os
import re
import torch
import openai
import difflib
import pandas as pd
from tqdm import tqdm
from openai import AzureOpenAI
from google.colab import files
from IPython.display import display
from azure.core.credentials import AzureKeyCredential
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **MQD** - Mean Question-based Diagnostic Similarity

The MQD metric quantifies how diagnostic accuracy evolves as more information is gathered.

 We compute the similarity between the model’s predicted diagnosis and the ground-truth disease after the first question, then again after the second and third questions, and report the average similarity at each stage.

 This allows us to track how each additional question improves (or fails to improve) the model’s alignment with the true diagnosis.

In [None]:
# Load the MedQDx benchmark CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/MedQDx_benchmark.csv')

In [None]:
# Columns holding the similarity score recorded after each question round
similarity_cols = ['Similarity_1', 'Similarity_2', 'Similarity_3']

# Best similarity reached in any round, per row, and its mean over all rows
df['Max_Similarity'] = df[similarity_cols].max(axis=1)
mean_max_similarity = df['Max_Similarity'].mean()

# Column-wise mean: one average similarity per round
mean_similarity_rounds = df[similarity_cols].mean()

# Assemble (label, value) pairs, then build the summary table in one step
round_labels = [
    'Mean Similarity Round 1',
    'Mean Similarity Round 2',
    'Mean Similarity Round 3',
]
summary_rows = [('Mean of Max Similarity Across Rows', mean_max_similarity)]
for label, col in zip(round_labels, similarity_cols):
    summary_rows.append((label, mean_similarity_rounds[col]))

summary_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

# Render the summary table in the notebook output
display(summary_df)

Unnamed: 0,Metric,Value
0,Mean of Max Similarity Across Rows,0.65742
1,Mean Similarity Round 1,0.623759
2,Mean Similarity Round 2,0.620979
3,Mean Similarity Round 3,0.635151


## **ZDA** - Zero-Shot Diagnostic Accuracy
ZDA represents the model’s success rate at diagnosing cases in a zero-shot setting, i.e., without any task-specific training or in-context examples.

We evaluated diagnoses on three case types (100%, 80%, and 50% of the full information), converted each prediction to a binary outcome (“success” or “failure”) based on a similarity threshold between the predicted diagnosis and the ground-truth disease, and then calculated the success rate for each category.



In [None]:
pip install azure-core openai pandas tqdm

Collecting azure-core
  Downloading azure_core-1.34.0-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading azure_core-1.34.0-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.4/207.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-core
Successfully installed azure-core-1.34.0


In [None]:
# Azure OpenAI client setup.
# The endpoint and key are read from the environment first, so real credentials
# never have to be written into (and shared with) the notebook. The original
# placeholder strings remain as fallbacks, so nothing changes when the
# variables are not set.
AZURE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "MedQDx_ENDPOINT")
AZURE_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "MedQDx_API_key")
AZURE_API_VERSION = "2024-12-01-preview"
# Passed as `model=` on every chat-completion request.
AZURE_DEPLOYMENT_NAME = "gpt-4.1"

client = AzureOpenAI(
    api_version=AZURE_API_VERSION,
    azure_endpoint=AZURE_ENDPOINT,
    api_key=AZURE_API_KEY,
)


# Load the MedQDx benchmark CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/MedQDx_benchmark.csv')


# Zero shot diagnosis
def get_diagnosis(case_text: str, temperature: float = 0.0) -> str:
    """
    Ask the Azure OpenAI deployment for a single-term diagnosis of one case.

    Args:
        case_text: Patient case text inserted into the prompt.
        temperature: Sampling temperature forwarded to the API (default 0.0).

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response has no choices or the message content is None.

    Relies on the module-level ``client`` and ``AZURE_DEPLOYMENT_NAME``.
    Exceptions raised by the API call itself are not caught here.
    """
    prompt = (
        "You are a medical doctor. Based on the following patient case, "
        "provide exactly ONE concise diagnosis term (no explanations, no extra text):\n\n"
        f"{case_text}\n\n"
        "Diagnosis:"
    )
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful medical assistant."},
            {"role": "user", "content": prompt},
        ],
        max_completion_tokens=32,
        temperature=temperature,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        model=AZURE_DEPLOYMENT_NAME,
    )
    # The message content is optional in the API response; calling .strip() on
    # None would raise and abort the whole evaluation run. An empty prediction
    # is returned instead, which downstream similarity checks score as a miss.
    if not response.choices:
        return ""
    content = response.choices[0].message.content
    return (content or "").strip()


# Checking diagnosis - prognosis similarity
def is_similar(pred: str, true: str, threshold: float = 0.6) -> bool:
    """
    Case-insensitive fuzzy match between a predicted and a reference label.

    Both strings are lowercased and compared with difflib.SequenceMatcher;
    the result is True when the similarity ratio is at least ``threshold``
    (the comparison is inclusive) and False otherwise.
    """
    matcher = difflib.SequenceMatcher(a=pred.lower(), b=true.lower())
    return matcher.ratio() >= threshold

### Running MQD and ZDA on the full MedQDx benchmark

In [None]:
###  Main loop: getting predictions ###

# One prediction column per case variant, initialised to empty strings
case_levels = ["100", "80", "50"]
for level in case_levels:
    df[f"Pred_{level}%"] = ""

# Walk the benchmark row by row and query GPT-4.1 once per case variant,
# writing each answer back into the matching prediction column
progress = tqdm(df.iterrows(), total=len(df), desc="GPT-4.1 Diagnoses")
for row_label, record in progress:
    for level in case_levels:
        df.at[row_label, f"Pred_{level}%"] = get_diagnosis(record[f"{level}% Case"])


# Computing Zero‐Shot Diagnostic Accuracy (Similarity‐Based)
# The threshold is defined once so that the value used for scoring and the
# value shown in the printed header cannot drift apart.
zda_threshold = 0.65
accuracy_results = {}
for pct in ["100", "80", "50"]:
    total = len(df)
    # A prediction counts as correct when its similarity to the ground-truth
    # "prognosis" label reaches the threshold.
    correct_count = sum(
        is_similar(pred, true, threshold=zda_threshold)
        for pred, true in zip(df[f"Pred_{pct}%"], df["prognosis"])
    )
    # An empty benchmark would otherwise raise ZeroDivisionError; report 0.0 instead.
    accuracy = correct_count / total * 100 if total else 0.0
    accuracy_results[pct] = (correct_count, total, accuracy)


print(f"\nZero‐Shot Diagnostic Accuracy with Azure GPT-4.1 (Similarity‐Based, threshold={zda_threshold}):\n")
for pct, (count, total, acc) in accuracy_results.items():
    print(f"  {pct}% Case → {count}/{total} correct  (ZDA = {acc:.2f}%)")


# Per-case report: each prediction next to its similarity to the ground truth
print("\nDetailed Predictions and Similarity Scores:")
for idx, row in df.iterrows():
    true = row["prognosis"]
    # First segment identifies the case; one more segment is added per variant
    segments = [f"Case {idx+1}: True → {true}"]
    for pct in ["100", "80", "50"]:
        pred = row[f"Pred_{pct}%"]
        sim = difflib.SequenceMatcher(None, pred.lower(), true.lower()).ratio()
        segments.append(f"{pct}% Pred → {pred} (sim={sim:.2f})")
    print("   |  ".join(segments))


GPT-4.1 Diagnoses: 100%|██████████| 100/100 [05:40<00:00,  3.40s/it]


Zero‐Shot Diagnostic Accuracy with Azure GPT-4.1 (Similarity‐Based, threshold=0.65):

  100% Case → 51/100 correct  (ZDA = 51.00%)
  80% Case → 44/100 correct  (ZDA = 44.00%)
  50% Case → 37/100 correct  (ZDA = 37.00%)

Detailed Predictions and Similarity Scores:
Case 1: True → Chickenpox   |  100% Pred → Infectious mononucleosis (sim=0.18)   |  80% Pred → Infectious mononucleosis (sim=0.18)   |  50% Pred → Infectious mononucleosis (sim=0.18)
Case 2: True → Hypoglycemia   |  100% Pred → Hypoglycemia (sim=1.00)   |  80% Pred → Hypoglycemia (sim=1.00)   |  50% Pred → Hypoglycemia (sim=1.00)
Case 3: True → Varicose Veins   |  100% Pred → Chronic venous insufficiency (sim=0.43)   |  80% Pred → Chronic venous insufficiency (sim=0.43)   |  50% Pred → Chronic venous insufficiency (sim=0.43)
Case 4: True → Chronic Cholestasis   |  100% Pred → Obstructive jaundice (sim=0.21)   |  80% Pred → Obstructive jaundice (sim=0.21)   |  50% Pred → Cholestasis (sim=0.73)
Case 5: True → Hepatitis D   |  1


