<a href="https://colab.research.google.com/github/Keshvii/MedMCQA_Exp/blob/main/AdditionalMultipleModelAccuracyComparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1. Zero shot: OpenAI/gpt-4o-mini:  Acc=66.7
#2. COT: OpenAI-o4-mini:  Acc= 76.1
#3. COT microsoft/mai-ds-r1:free:  Acc=76.8
#4. Zero Shot:google/gemini-2.5-flash-lite-preview-06-17 : Acc=66.2
#5. Zero Shot:google/gemini-2.5-flash: Acc= 76.6
#6. Zero Shot: anthropic/claude-sonnet-4: Acc=78.0

Zero Shot: openai/gpt-4o-mini-2024-07-18: Acc=66.7

In [None]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI

# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

# --- 2. System Prompt ---
SYSTEM_PROMPT = (
    "You are a medical examination assistant. "
    "For every batch you will receive exactly 20 multiple-choice questions (MCQs) in the format:\n\n"
    "Q1: <question>\nA. <option>\nB. <option>\nC. <option>\nD. <option>\n\n"
    "Return ONLY one the answer from A,B,C or D (nothing else should be included in answer), for each question STRICTLY IN FORMAT:\n"
    "Q1: A, Q2: C, ..., Q10: B\n"
    "**IMPORTANT**Do not add explanations or extra text."
)

# --- 3. Ask LLM in Batches of 20 ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (

            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += "Write your answers now:\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
        max_tokens=150
    )

    return response.choices[0].message.content.strip()

# --- 4. Parse answer letters A/B/C/D into 0/1/2/3 ---
def parse_batch_answers(response_text, expected_len):
    matches = re.findall(r"Q\d+:\s*([A-D])", response_text, flags=re.IGNORECASE)
    preds = [ord(m.upper()) - 65 for m in matches[:expected_len]]
    return preds

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')
# validation_df.info()
# validation_df=validation_df.head(200)

# --- 6. Define Models to Evaluate ---
models = [
    "openai/gpt-4o-mini-2024-07-18",
]

# --- 7. Batch Evaluation for Each Model ---
batch_size = 30
results = {}

for model_name in models:
    print(f"\n--- Evaluating {model_name} ---")
    correct, total = 0, 0

    total_batches = (len(validation_df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):

        batch_df = validation_df.iloc[i:i + batch_size]
        golds = batch_df["cop"].astype(int).map({0:'A', 1:'B', 2:'C', 3:'D'}).tolist()

        try:

            raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
            matches = re.findall(r"Q\d+:\s*([A-D])", raw_response, flags=re.IGNORECASE)

            if len(matches) != len(golds):
                print(raw_response)
                print(f"Batch {i//batch_size + 1}: Expected {len(golds)} answers, got {len(matches)}. Skipping.")
                continue

            preds = [m.upper() for m in matches[:len(golds)]]
            correct += sum([int(p == g) for p, g in zip(preds, golds)])
            total += len(golds)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1} for model {model_name}: {e}")
            continue

    if total > 0:
        acc = correct / total
        results[model_name] = acc
        print(f"Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
    else:
        print(f"No batches processed for {model_name}")

# --- 8. Summary ---
print("\n=== Final Model Accuracies ===")
for model, acc in results.items():
    print(f"{model}: {acc:.3f}")


--- Evaluating openai/gpt-4o-mini-2024-07-18 ---


openai/gpt-4o-mini-2024-07-18 Progress:   0%|          | 0/140 [00:00<?, ?it/s]

Q1: A, Q2: C, Q3: C, Q4: B, Q5: A, Q6: B, Q7: B, Q8: A, Q9: D, Q10: C, Q11: C, Q12: B, Q13: A, Q14: B, Q15: D, Q16: A, Q17: C, Q18: D, Q19: B, Q20: A
Batch 31: Expected 30 answers, got 20. Skipping.
Q1: C, Q2: B, Q3: A, Q4: C, Q5: C, Q6: B, Q7: C, Q8: D, Q9: A, Q10: A, Q11: D, Q12: D, Q13: B, Q14: D, Q15: A, Q16: B, Q17: C, Q18: C, Q19: A, Q20: A
Batch 79: Expected 30 answers, got 20. Skipping.
Accuracy for openai/gpt-4o-mini-2024-07-18: 0.667 (2750/4123)

=== Final Model Accuracies ===
openai/gpt-4o-mini-2024-07-18: 0.667


COT:openai/o4-mini  : Acc= 76.1

In [None]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

# --- 2. Prompt & Settings ---
batch_size = 50
model_name = "openai/o4-mini"

SYSTEM_PROMPT_COT = (
    "You are a medical assistant answering MCQs. For each question, briefly explain your reasoning, then select the correct answer."
    " Format strictly as:\n"
    "Q1:\nExplanation: ...\nAnswer: B\n\n"
    "**Instructions:**\n"
    "- Keep explanations short (1 lines max).\n"
    "- Return only 'Qn:', 'Explanation:', and 'Answer:' lines.\n"
    "- No extra commentary or summary.\n"
)

# --- 3. Function to query LLM ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (
            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += (
        "Answer each question in this format:\n"
        "Q1:\nExplanation: ...\nAnswer: A\n"
        "Q2:\nExplanation: ...\nAnswer: B\n"
        f"... up to Q{len(questions_batch)}.\n"
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_COT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
    )

    return response.choices[0].message.content.strip()

# --- 4. Extract answers from CoT output ---
def extract_answers_from_cot(response_text, expected_len):
    matches = re.findall(r"Answer:\s*([A-D])", response_text, flags=re.IGNORECASE)
    if len(matches) != expected_len:
        print("⚠️ Incomplete answers. Response was:")
        print(response_text)
    return [ord(m.upper()) - 65 for m in matches[:expected_len]]

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')

# --- 6. Run Evaluation ---
correct, total = 0, 0
total_batches = (len(validation_df) + batch_size - 1) // batch_size
print(f"\n--- Evaluating {model_name} with Visible CoT Reasoning ---")

for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):
    batch_df = validation_df.iloc[i:i + batch_size]
    golds = batch_df["cop"].astype(int).tolist()

    try:
        raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
        print(f"\n--- Batch {i//batch_size + 1}")

        preds = extract_answers_from_cot(raw_response, len(golds))
        if len(preds) != len(golds):
            print(f"⚠️ Skipping batch {i//batch_size + 1} due to mismatch.")
            continue

        correct += sum([int(p == g) for p, g in zip(preds, golds)])
        total += len(golds)

    except Exception as e:
        print(f"❌ Error in batch {i//batch_size + 1}: {e}")
        continue

# --- 7. Print Accuracy ---
if total > 0:
    acc = correct / total
    print(f"\n✅ Final Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
else:
    print("❌ No batches processed.")


--- Evaluating openai/o4-mini with Visible CoT Reasoning ---


openai/o4-mini Progress:   0%|          | 0/84 [00:00<?, ?it/s]


--- Batch 1

--- Batch 2

--- Batch 3

--- Batch 4

--- Batch 5

--- Batch 6

--- Batch 7

--- Batch 8

--- Batch 9

--- Batch 10

--- Batch 11

--- Batch 12

--- Batch 13

--- Batch 14

--- Batch 15

--- Batch 16

--- Batch 17

--- Batch 18

--- Batch 19

--- Batch 20

--- Batch 21

--- Batch 22

--- Batch 23
⚠️ Incomplete answers. Response was:
Q1:
Explanation: Loss of nuclear chromatin staining is karyolysis.
Answer: B

Q2:
Explanation: Mollusc shells are classic examples of biologic mineral deposition.
Answer: D

Q3:
Explanation: Topical decolonization with bacitracin (substitute for mupirocin) is preferred.
Answer: A

Q4:
Explanation: Meissner’s corpuscles reside in the papillary dermis.
Answer: D

Q5:
Explanation: Increased flow murmurs occur in L→R shunts, not in Eisenmenger.
Answer: A

Q6:
Explanation: Spiral angle refers to the angle between the file’s long axis and its cutting edge.
Answer: B

Q7:
Explanation: AT1 receptors signal via Gq/PLC, not via K+ channel opening.
Answ

In [None]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4183 entries, 0 to 4182
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            4183 non-null   object
 1   question      4183 non-null   object
 2   opa           4183 non-null   object
 3   opb           4183 non-null   object
 4   opc           4182 non-null   object
 5   opd           4143 non-null   object
 6   cop           4183 non-null   int64 
 7   choice_type   4183 non-null   object
 8   exp           2206 non-null   object
 9   subject_name  4183 non-null   object
 10  topic_name    423 non-null    object
dtypes: int64(1), object(10)
memory usage: 359.6+ KB


COT: microsoft/mai-ds-r1:free : Acc=76.8

In [None]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

# --- 2. Prompt & Settings ---
batch_size = 20
model_name = "microsoft/mai-ds-r1:free"

SYSTEM_PROMPT_COT = (
    "You are a medical assistant answering MCQs. For each question, briefly explain your reasoning, then select the correct answer."
    " Format strictly as:\n"
    "Q1:\nExplanation: ...\nAnswer: B\n\n"
    "**Instructions:**\n"
    "- Keep explanations short (1 lines max).\n"
    "- Return only 'Qn:', 'Explanation:', and 'Answer:' lines.\n"
    "- No extra commentary or summary.\n"
)

# --- 3. Function to query LLM ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (
            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += (
        "Answer each question in this format:\n"
        "Q1:\nExplanation: ...\nAnswer: A\n"
        "Q2:\nExplanation: ...\nAnswer: B\n"
        f"... up to Q{len(questions_batch)}.\n"
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_COT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
    )

    return response.choices[0].message.content.strip()

# --- 4. Extract answers from CoT output ---
def extract_answers_from_cot(response_text, expected_len):
    matches = re.findall(r"Answer:\s*([A-D])", response_text, flags=re.IGNORECASE)
    if len(matches) != expected_len:
        print("⚠️ Incomplete answers. Response was:")
        print(response_text)
    return [ord(m.upper()) - 65 for m in matches[:expected_len]]

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')

# --- 6. Run Evaluation ---
correct, total = 0, 0
total_batches = (len(validation_df) + batch_size - 1) // batch_size
print(f"\n--- Evaluating {model_name} with Visible CoT Reasoning ---")

for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):
    batch_df = validation_df.iloc[i:i + batch_size]
    golds = batch_df["cop"].astype(int).tolist()

    try:
        raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
        print(f"\n--- Batch {i//batch_size + 1}")

        preds = extract_answers_from_cot(raw_response, len(golds))
        if len(preds) != len(golds):
            print(f"⚠️ Skipping batch {i//batch_size + 1} due to mismatch.")
            continue

        correct += sum([int(p == g) for p, g in zip(preds, golds)])
        total += len(golds)

    except Exception as e:
        print(f"❌ Error in batch {i//batch_size + 1}: {e}")
        continue

# --- 7. Print Accuracy ---
if total > 0:
    acc = correct / total
    print(f"\n✅ Final Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
else:
    print("❌ No batches processed.")


--- Evaluating microsoft/mai-ds-r1:free with Visible CoT Reasoning ---


microsoft/mai-ds-r1:free Progress:   0%|          | 0/210 [00:00<?, ?it/s]


--- Batch 1

--- Batch 2

--- Batch 3

--- Batch 4

--- Batch 5

--- Batch 6

--- Batch 7

--- Batch 8

--- Batch 9

--- Batch 10

--- Batch 11

--- Batch 12

--- Batch 13

--- Batch 14

--- Batch 15

--- Batch 16

--- Batch 17

--- Batch 18

--- Batch 19

--- Batch 20

--- Batch 21

--- Batch 22

--- Batch 23

--- Batch 24

--- Batch 25

--- Batch 26

--- Batch 27

--- Batch 28

--- Batch 29

--- Batch 30

--- Batch 31

--- Batch 32

--- Batch 33

--- Batch 34

--- Batch 35

--- Batch 36

--- Batch 37

--- Batch 38

--- Batch 39

--- Batch 40

--- Batch 41

--- Batch 42

--- Batch 43

--- Batch 44

--- Batch 45

--- Batch 46

--- Batch 47

--- Batch 48

--- Batch 49

--- Batch 50

--- Batch 51

--- Batch 52

--- Batch 53

--- Batch 54

--- Batch 55

--- Batch 56

--- Batch 57

--- Batch 58

--- Batch 59

--- Batch 60

--- Batch 61

--- Batch 62

--- Batch 63

--- Batch 64

--- Batch 65

--- Batch 66

--- Batch 67

--- Batch 68

--- Batch 69

--- Batch 70

--- Batch 71

--- Batch 72



Zero Shot:google/gemini-2.5-flash-lite-preview-06-17: Acc=66.2

In [None]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm


# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

# --- 2. System Prompt ---
SYSTEM_PROMPT = (
    "You are a medical examination assistant. "
    "For every batch you will receive exactly 20 multiple-choice questions (MCQs) in the format:\n\n"
    "Q1: <question>\nA. <option>\nB. <option>\nC. <option>\nD. <option>\n\n"
    "Return ONLY one the answer from A,B,C or D (nothing else should be included in answer), for each question STRICTLY IN FORMAT:\n"
    "Q1: A, Q2: C, ..., Q10: B\n"
    "**IMPORTANT**Do not add explanations or extra text."
)

# --- 3. Ask LLM in Batches of 20 ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (

            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += "Write your answers now:\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
        max_tokens=100
    )

    return response.choices[0].message.content.strip()

# --- 4. Parse answer letters A/B/C/D into 0/1/2/3 ---
def parse_batch_answers(response_text, expected_len):
    matches = re.findall(r"Q\d+:\s*([A-D])", response_text, flags=re.IGNORECASE)
    preds = [ord(m.upper()) - 65 for m in matches[:expected_len]]
    return preds

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')
# validation_df.info()
# validation_df=validation_df.head(200)

# --- 6. Define Models to Evaluate ---
models = [
    "google/gemini-2.5-flash-lite-preview-06-17",
]

# --- 7. Batch Evaluation for Each Model ---
batch_size = 10
results = {}

for model_name in models:
    print(f"\n--- Evaluating {model_name} ---")
    correct, total = 0, 0

    total_batches = (len(validation_df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):

        batch_df = validation_df.iloc[i:i + batch_size]
        golds = batch_df["cop"].astype(int).map({0:'A', 1:'B', 2:'C', 3:'D'}).tolist()

        try:

            raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
            matches = re.findall(r"Q\d+:\s*([A-D])", raw_response, flags=re.IGNORECASE)

            if len(matches) != len(golds):
                print(raw_response)
                print(f"Batch {i//batch_size + 1}: Expected {len(golds)} answers, got {len(matches)}. Skipping.")
                continue

            preds = [m.upper() for m in matches[:len(golds)]]
            correct += sum([int(p == g) for p, g in zip(preds, golds)])
            total += len(golds)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1} for model {model_name}: {e}")
            continue

    if total > 0:
        acc = correct / total
        results[model_name] = acc
        print(f"Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
    else:
        print(f"No batches processed for {model_name}")

# --- 8. Summary ---
print("\n=== Final Model Accuracies ===")
for model, acc in results.items():
    print(f"{model}: {acc:.3f}")


--- Evaluating google/gemini-2.5-flash-lite-preview-06-17 ---


google/gemini-2.5-flash-lite-preview-06-17 Progress:   0%|          | 0/419 [00:00<?, ?it/s]

Accuracy for google/gemini-2.5-flash-lite-preview-06-17: 0.662 (2769/4183)

=== Final Model Accuracies ===
google/gemini-2.5-flash-lite-preview-06-17: 0.662


Zero Shot:google/gemini-2.5-flash: Acc= 76.6

In [3]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm


# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

batch_size = 40
# --- 2. System Prompt ---
SYSTEM_PROMPT = (
    "You are a medical examination assistant. "
    "For every batch you will receive exactly {{batch_size}} multiple-choice questions (MCQs) in the format:\n\n"
    "Q1: <question>\nA. <option>\nB. <option>\nC. <option>\nD. <option>\n\n"
    "Return ONLY one the answer from A,B,C or D (nothing else should be included in answer), for each question STRICTLY IN FORMAT:\n"
    "Q1: A, Q2: C, ..., Q10: B\n"
    "**IMPORTANT**Do not add explanations or extra text."
)

# --- 3. Ask LLM in Batches of 20 ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (

            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += "Write your answers now:\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
        max_tokens=300
    )

    return response.choices[0].message.content.strip()

# --- 4. Parse answer letters A/B/C/D into 0/1/2/3 ---
def parse_batch_answers(response_text, expected_len):
    matches = re.findall(r"Q\d+:\s*([A-D])", response_text, flags=re.IGNORECASE)
    preds = [ord(m.upper()) - 65 for m in matches[:expected_len]]
    return preds

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')
# validation_df.info()
# validation_df=validation_df.head(200)

# --- 6. Define Models to Evaluate ---
models = [
    "google/gemini-2.5-flash",
]

# --- 7. Batch Evaluation for Each Model ---

results = {}

for model_name in models:
    print(f"\n--- Evaluating {model_name} ---")
    correct, total = 0, 0

    total_batches = (len(validation_df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):

        batch_df = validation_df.iloc[i:i + batch_size]
        golds = batch_df["cop"].astype(int).map({0:'A', 1:'B', 2:'C', 3:'D'}).tolist()

        try:

            raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
            matches = re.findall(r"Q\d+:\s*([A-D])", raw_response, flags=re.IGNORECASE)

            if len(matches) != len(golds):
                print(raw_response)
                print(f"Batch {i//batch_size + 1}: Expected {len(golds)} answers, got {len(matches)}. Skipping.")
                continue

            preds = [m.upper() for m in matches[:len(golds)]]
            correct += sum([int(p == g) for p, g in zip(preds, golds)])
            total += len(golds)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1} for model {model_name}: {e}")
            continue

    if total > 0:
        acc = correct / total
        results[model_name] = acc
        print(f"Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
    else:
        print(f"No batches processed for {model_name}")

# --- 8. Summary ---
print("\n=== Final Model Accuracies ===")
for model, acc in results.items():
    print(f"{model}: {acc:.3f}")


--- Evaluating google/gemini-2.5-flash ---


google/gemini-2.5-flash Progress:   0%|          | 0/105 [00:00<?, ?it/s]

Accuracy for google/gemini-2.5-flash: 0.766 (3203/4183)

=== Final Model Accuracies ===
google/gemini-2.5-flash: 0.766


Zero Shot: anthropic/claude-sonnet-4: Acc=78.0

In [4]:
import re
import time
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm


# --- 1. Setup OpenRouter client ---
OPEN_ROUTER_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPEN_ROUTER_KEY,
)

batch_size = 40
# --- 2. System Prompt ---
SYSTEM_PROMPT = (
    "You are a medical examination assistant. "
    "For every batch you will receive exactly {{batch_size}} multiple-choice questions (MCQs) in the format:\n\n"
    "Q1: <question>\nA. <option>\nB. <option>\nC. <option>\nD. <option>\n\n"
    "Return ONLY one the answer from A,B,C or D (nothing else should be included in answer), for each question STRICTLY IN FORMAT:\n"
    "Q1: A, Q2: C, ..., Q10: B\n"
    "**IMPORTANT**Do not add explanations or extra text."
)

# --- 3. Ask LLM in Batches of 20 ---
def ask_llm_batch_openrouter(questions_batch, model):
    prompt = ""
    for i, row in enumerate(questions_batch, start=1):
        prompt += (

            f"Q{i}: {row['question']}\n"
            f"A. {row['opa']}\n"
            f"B. {row['opb']}\n"
            f"C. {row['opc']}\n"
            f"D. {row['opd']}\n\n"
        )
    prompt += "Write your answers now:\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        top_p=0.95,
        max_tokens=300
    )

    return response.choices[0].message.content.strip()

# --- 4. Parse answer letters A/B/C/D into 0/1/2/3 ---
def parse_batch_answers(response_text, expected_len):
    matches = re.findall(r"Q\d+:\s*([A-D])", response_text, flags=re.IGNORECASE)
    preds = [ord(m.upper()) - 65 for m in matches[:expected_len]]
    return preds

# --- 5. Load Validation Data ---
validation_df = pd.read_csv('/content/validation.csv')
# validation_df.info()
# validation_df=validation_df.head(200)

# --- 6. Define Models to Evaluate ---
models = [
    "anthropic/claude-sonnet-4",
]

# --- 7. Batch Evaluation for Each Model ---

results = {}

for model_name in models:
    print(f"\n--- Evaluating {model_name} ---")
    correct, total = 0, 0

    total_batches = (len(validation_df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):

        batch_df = validation_df.iloc[i:i + batch_size]
        golds = batch_df["cop"].astype(int).map({0:'A', 1:'B', 2:'C', 3:'D'}).tolist()

        try:

            raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
            matches = re.findall(r"Q\d+:\s*([A-D])", raw_response, flags=re.IGNORECASE)

            if len(matches) != len(golds):
                print(raw_response)
                print(f"Batch {i//batch_size + 1}: Expected {len(golds)} answers, got {len(matches)}. Skipping.")
                continue

            preds = [m.upper() for m in matches[:len(golds)]]
            correct += sum([int(p == g) for p, g in zip(preds, golds)])
            total += len(golds)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1} for model {model_name}: {e}")
            continue

    if total > 0:
        acc = correct / total
        results[model_name] = acc
        print(f"Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
    else:
        print(f"No batches processed for {model_name}")

# --- 8. Summary ---
print("\n=== Final Model Accuracies ===")
for model, acc in results.items():
    print(f"{model}: {acc:.3f}")


--- Evaluating anthropic/claude-sonnet-4 ---


anthropic/claude-sonnet-4 Progress:   0%|          | 0/105 [00:00<?, ?it/s]

Accuracy for anthropic/claude-sonnet-4: 0.780 (3264/4183)

=== Final Model Accuracies ===
anthropic/claude-sonnet-4: 0.780
