In [None]:
!pip install -U datasets



In [None]:
import random
import re
import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import json

In [None]:
# selected_ids = [
#   "DxBench_40", "DxBench_50", "DxBench_60",
#   "DxBench_120", "DxBench_150",
#   "DxBench_200", "DxBench_240",
#   "DxBench_300", "DxBench_330",
#   "DxBench_370", "DxBench_400",
#   "DxBench_920"
# ]

In [None]:
# Prompt template that walks the model through four internal sub-questions
# (Q1-Q4) and asks it to print only a one-line JSON object with the keys
# "BEST" and "RANKED".
# `{symptom_line}` is the only placeholder. The prompt-writing loop in this
# notebook fills it with str.replace, so the literal JSON braces below are
# intentionally left unescaped (str.format would choke on them).
# The text starts and ends with a newline because of where the triple quotes sit.
TEMPLATE_LEAST_TO_MOST = """
Patient Symptoms:
{symptom_line}

You are a specialized medical AI assistant. Your responses must be:
1) Strictly based on established medical knowledge.
2) Confined to medical topics; if not medical, do not assess.
3) Helpful and cautious; no speculation beyond provided symptoms.

Internal Reasoning (DO NOT reveal):
Q1: Identify involved organ systems (explicit + implicit symptoms).
Q2: List ~5–7 plausible diseases for those systems.
Q3: Choose exactly 3 (or 2 if only two are credible) that best fit all symptoms; note supports vs. missing/contradictory findings.
Q4: Pick the single most plausible overall among those 2–3.

Final Output (PRINT EXACTLY ONE LINE — a single JSON object; no markdown, no extra words):
{"BEST":"<single disease>","RANKED":[["<disease1>",p1],["<disease2>",p2],["<disease3>",p3]]}

Hard rules:
- RANKED must contain 2 or 3 items ordered by descending probability; BEST must equal the first item’s disease.
- p1+p2(+p3) = 1.00 (two decimals). Use decimals (e.g., 0.55), not percentages.
- Disease names in standard English; no explanations or qualifiers.
- If the query is not medical: output {"BEST":"","RANKED":[]} only.
- If symptoms are insufficient for any credible differential: output {"BEST":"","RANKED":[]} only.
ASSISTANT:
"""

In [None]:
# Prompt template that spells out a three-step analysis (categorize symptoms,
# list candidate conditions, narrow to a 2-3 item differential), instructs the
# model to perform it without printing it, and asks for only a one-line JSON
# object with the keys "BEST" and "RANKED".
# `{symptom_line}` is the only placeholder. The prompt-writing loop in this
# notebook fills it with str.replace, so the literal JSON braces below are
# intentionally left unescaped (str.format would choke on them).
TEMPLATE_SINGLE_STEP_COT = """
Patient Symptoms:
{symptom_line}

You are a specialized medical AI assistant. Your responses must be:
1) Strictly based on established medical knowledge.
2) Confined to medical topics; if not medical, do not assess.
3) Helpful and cautious; no speculation beyond provided symptoms.

Task: Analyze the symptoms step-by-step to determine potential conditions.

Step 1 – Symptom Categorization:
Classify each explicit/implicit symptom by primary organ system(s).

Step 2 – Broad List of Potential Conditions:
Generate ~5–7 plausible diseases based on Step 1 (no ranking here).

Step 3 – Differential with Detailed Evaluation:
From Step 2, select exactly 2–3 diseases that best fit all symptoms; for each, assess supporting vs. missing/contradictory findings and estimate relative likelihoods.

IMPORTANT OUTPUT RULE:
- Perform Steps 1–3 INTERNALLY (do NOT print steps or reasoning).
- PRINT EXACTLY ONE LINE as a single JSON object (no markdown, no extra words):
{"BEST":"<single disease>","RANKED":[["<disease1>",p1],["<disease2>",p2],["<disease3>",p3]]}

Hard rules:
- RANKED has 2 or 3 items in descending probability; BEST equals the first item’s disease.
- p1+p2(+p3) = 1.00 with two decimals (e.g., 0.55).
- Standard English disease names only; no explanations/qualifiers.
- No clarifying questions in the output.
- If not medical or symptoms insufficient: output {"BEST":"","RANKED":[]} only.
ASSISTANT:
"""


In [None]:
# Prompt template with no reasoning scaffold: it asks the model to infer the
# most likely conditions directly and print only a one-line JSON object with
# the keys "BEST" and "RANKED".
# `{symptom_line}` is the only placeholder. The prompt-writing loop in this
# notebook fills it with str.replace, so the literal JSON braces below are
# intentionally left unescaped (str.format would choke on them).
TEMPLATE_ZERO_SHOT_DIRECT = """
Patient Symptoms:
{symptom_line}

You are a specialized medical AI assistant. Your responses must be:
1) Strictly based on established medical knowledge.
2) Confined to medical topics; if not medical, do not assess.
3) Helpful and cautious; no speculation beyond provided symptoms.

Task (Zero-Shot):
Based only on the symptoms above, directly infer the most likely conditions WITHOUT writing any reasoning, steps, or questions.

FINAL OUTPUT (PRINT EXACTLY ONE LINE — a single JSON object; no markdown, no extra words):
{"BEST":"<single disease>","RANKED":[["<disease1>",p1],["<disease2>",p2],["<disease3>",p3]]}

Hard rules:
- Perform any necessary reasoning internally; DO NOT print it.
- RANKED must contain 2 or 3 items in descending probability; BEST equals the first item’s disease.
- p1+p2(+p3) = 1.00 with two decimals (e.g., 0.55).
- Use standard English disease names only; no explanations/qualifiers.
- NO clarifying questions in the output.
- If the query is not medical OR symptoms are insufficient: output {"BEST":"","RANKED":[]} only.
ASSISTANT:
"""

In [None]:
# Registry of prompting techniques: technique name -> template string.
# Insertion order is the order in which techniques are later enumerated.
PROMPT_TEMPLATES = dict(
    least_to_most=TEMPLATE_LEAST_TO_MOST,
    zero_shot_direct=TEMPLATE_ZERO_SHOT_DIRECT,
    single_step_cot=TEMPLATE_SINGLE_STEP_COT,
)

In [None]:
def clean_and_map(example):
    """Turn one raw dataset row into the fields needed to build a prompt.

    Args:
        example: Mapping read via ``.get`` for the keys ``explicit_symptoms``,
            ``implicit_symptoms``, ``disease`` and ``id``. Each symptom field
            is expected to be a sequence of two-element ``[name, value]``
            entries; entries of any other shape are ignored. A missing or
            ``None`` symptom field is treated as empty.

    Returns:
        dict with three keys:
            ``symptom_line``: ``"Explicit: <json> \\nImplicit: <json>"``, where
                a side with no usable entries is rendered as ``Not provided``.
                When neither side has any usable entry, this is the empty
                string so that a truthiness filter on ``symptom_line`` can
                drop the row.
            ``label``: the value of ``example['disease']`` (``None`` if absent).
            ``id``: the value of ``example['id']`` (``None`` if absent).
    """
    def format_symptoms(symptom_list):
        # Build JSON-style dict: {"Symptom": <value>} from [name, value] pairs.
        # NOTE(review): values look like "True"/"False" flags -- confirm against the dataset.
        sym_dict = {
            sym[0]: sym[1]
            for sym in (symptom_list or [])
            if isinstance(sym, (list, tuple)) and len(sym) == 2
        }
        # json.dumps({}) is the truthy string "{}", so signal "nothing usable"
        # with "" instead; otherwise the "Not provided" fallback never fires.
        return json.dumps(sym_dict, ensure_ascii=False) if sym_dict else ""

    explicit_str = format_symptoms(example.get('explicit_symptoms'))
    implicit_str = format_symptoms(example.get('implicit_symptoms'))

    if explicit_str or implicit_str:
        symptom_line = (
            f"Explicit: {explicit_str or 'Not provided'} "
            f"\nImplicit: {implicit_str or 'Not provided'}"
        )
    else:
        # No symptoms at all: leave the line empty so the row can be filtered out.
        symptom_line = ""

    return {
        "symptom_line": symptom_line,
        "label": example.get('disease'),
        "id": example.get('id')  # Used for logging
    }

In [None]:
# Load the "DxBench" configuration of FreedomIntelligence/DxBench and keep
# its "en" split as the raw input for the rest of the notebook.
ds = load_dataset("FreedomIntelligence/DxBench", name="DxBench")
raw_dataset = ds["en"]
print("Original dataset size: {} rows".format(len(raw_dataset)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/977 [00:00<?, ?B/s]

DxBench_en.json:   0%|          | 0.00/732k [00:00<?, ?B/s]

DxBench_zh.json:   0%|          | 0.00/664k [00:00<?, ?B/s]

Generating en split:   0%|          | 0/1148 [00:00<?, ? examples/s]

Generating zh split:   0%|          | 0/1148 [00:00<?, ? examples/s]

Original dataset size: 1148 rows


In [None]:
# 1. Dataset Preprocessing: Clean and extract symptom lines
def _has_symptoms_and_label(row):
    """Truthy only when both `symptom_line` and `label` of the row are truthy."""
    return row['symptom_line'] and row['label']


# Map every raw row through clean_and_map, then keep only rows that pass the check above.
cleaned = raw_dataset.map(clean_and_map).filter(_has_symptoms_and_label)
print(f"After cleaning: {len(cleaned)} rows")

# df_clean = cleaned.to_pandas()

Map:   0%|          | 0/1148 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1148 [00:00<?, ? examples/s]

After cleaning: 1148 rows


In [None]:
# id_col = cleaned["id"]
# index_by_id = {v: i for i, v in enumerate(id_col)}
# indices = [index_by_id[i] for i in selected_ids if i in index_by_id]

# subset = cleaned.select(indices)
# print(f"Selected {len(subset)} rows for evaluation")

Selected 12 rows for evaluation


In [None]:
# Use the full cleaned dataset; the ID-based subset selection in the
# preceding cell is commented out.
subset = cleaned

In [None]:
import os
import time
import pandas as pd

# Rows that the prompt-writing loop will iterate over.
dataset_iter = subset

# Directory that receives the per-technique prompt files; created if missing.
out_dir = "per_method_outputs"
os.makedirs(out_dir, exist_ok=True)

# Technique names, in PROMPT_TEMPLATES insertion order.
techniques = list(PROMPT_TEMPLATES)

In [None]:
# Write one prompt file per technique (`prompts_<technique>.txt` in `out_dir`).
# Each example contributes a four-part record: its id, its label, the rendered
# prompt, and a line of 22 "=" characters as a separator.
for technique in techniques:
    template = PROMPT_TEMPLATES[technique]
    prm_path = os.path.join(out_dir, f"prompts_{technique}.txt")

    # Open each file once and in "w" mode: re-running this cell regenerates the
    # file instead of appending a duplicate copy of every record, and the file
    # is no longer re-opened for every single example.
    with open(prm_path, "w", encoding="utf-8") as f:
        for ex in dataset_iter:
            ex_id = ex.get("id")
            label = ex.get("label")
            symptom_line = ex.get("symptom_line")

            # str.replace rather than str.format: the templates contain
            # literal JSON braces that str.format would try to interpret.
            prompt = template.replace("{symptom_line}", symptom_line)

            f.write(f"{ex_id}\n")
            f.write(f"{label}\n")
            f.write(f"{prompt}\n")
            f.write("=" * 22 + "\n")
            # Flush per record so partial progress is on disk if the run is interrupted.
            f.flush()

            print(f"[LOG] id={ex_id} | method={technique}")

[LOG] id=DxBench_40 | method=zero_shot_direct
[LOG] id=DxBench_50 | method=zero_shot_direct
[LOG] id=DxBench_60 | method=zero_shot_direct
[LOG] id=DxBench_120 | method=zero_shot_direct
[LOG] id=DxBench_150 | method=zero_shot_direct
[LOG] id=DxBench_200 | method=zero_shot_direct
[LOG] id=DxBench_240 | method=zero_shot_direct
[LOG] id=DxBench_300 | method=zero_shot_direct
[LOG] id=DxBench_330 | method=zero_shot_direct
[LOG] id=DxBench_370 | method=zero_shot_direct
[LOG] id=DxBench_400 | method=zero_shot_direct
[LOG] id=DxBench_920 | method=zero_shot_direct
