Tree of Thought: llama3-70b-8192 (Acc = 68.9%)

In [4]:
import os
import re
import time

import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
# --- 1. Setup Groq client (OpenAI-compatible endpoint) ---
# The API key is read from the environment so that no credential is stored in the
# notebook itself. A key that has ever been saved inside a shared or committed
# notebook should be considered leaked and must be revoked and replaced.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail early with a clear message instead of a late authentication error
    # in the middle of the evaluation loop.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment "
        "(e.g. `%env GROQ_API_KEY=...` in a private cell) before running this notebook."
    )
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=GROQ_API_KEY,
)

# --- 2. System Prompt ---
# Tells the model to answer every MCQ in the batch with one letter, in the
# "Q<n>: <letter>" shape that the answer-extraction regex in this notebook expects.
# Adjacent string literals are concatenated with NO separator, so every fragment
# must end with its own space or newline.
# NOTE(review): any change to the prompt text can shift measured accuracy; re-run
# the evaluation after editing it.
SYSTEM_PROMPT_TOT = (
    "You are a medical assistant evaluating each MCQ with 3 reasoning paths (Path A, B, C). "
    "After evaluating all options under each path, choose the most consistent answer. "
    "Return ONLY one the answer from A,B,C or D (nothing else should be included in answer), for each question STRICTLY IN FORMAT:\n"
    "Q1: A, Q2: C, ..., Q10: B\n"
    "**IMPORTANT**Do not add explanations or extra text."
)


# --- 3. Ask LLM for one batch of questions ---
def ask_llm_batch_openrouter(questions_batch, model):
    """Send one batch of MCQs in a single chat request and return the raw reply text.

    The function name mentions OpenRouter, but the request is made through the
    module-level ``client``, so it goes to whatever endpoint that client targets.

    Args:
        questions_batch: Iterable of mapping-like rows; each row must provide the
            keys ``question``, ``opa``, ``opb``, ``opc`` and ``opd``.
        model: Model identifier forwarded to the chat-completions call.

    Returns:
        The reply text with surrounding whitespace removed, or an empty string
        when the returned message carries no text content.

    Raises:
        KeyError: If a row is missing one of the expected keys.
        Exception: Whatever the client raises for transport/API failures.
    """
    # Questions are numbered from 1 within the batch; the reply is expected to
    # reuse the same "Q<n>" labels.
    question_blocks = [
        f"Q{i}: {row['question']}\n"
        f"A. {row['opa']}\n"
        f"B. {row['opb']}\n"
        f"C. {row['opc']}\n"
        f"D. {row['opd']}\n\n"
        for i, row in enumerate(questions_batch, start=1)
    ]
    prompt = "".join(question_blocks) + "Write your answers now:\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT_TOT},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
    )

    # `content` is nullable in the chat-completions response schema; calling
    # `.strip()` on None would raise AttributeError, so fall back to "".
    content = response.choices[0].message.content
    return (content or "").strip()

# --- 4. Parse answer letters A/B/C/D into 0/1/2/3 ---
def parse_batch_answers(response_text, expected_len):
    """Extract "Q<n>: <letter>" answers from a reply and map the letters to 0-3.

    Args:
        response_text: Raw model reply. ``None`` or an empty string yields ``[]``.
        expected_len: Maximum number of answers to return; extra matches are dropped.

    Returns:
        A list of ints (A->0, B->1, C->2, D->3) in the order the answers appear in
        the text. The list may be shorter than ``expected_len`` when fewer answers
        are found; the question numbers themselves are not used for alignment.
    """
    if not response_text:
        return []
    # The trailing `\b` requires the letter to stand alone, so that the first
    # letter of a word (e.g. "Q3: Cannot determine") is not read as answer "C".
    letters = re.findall(r"Q\d+:\s*([A-D])\b", response_text, flags=re.IGNORECASE)
    return [ord(letter.upper()) - ord("A") for letter in letters[:expected_len]]

# --- 5. Load Validation Data ---
# The CSV location lives in one named constant so it is easy to spot and change.
# The full file is evaluated; for a quick smoke test, slice the frame after
# loading (for example with `.head(200)`).
VALIDATION_CSV_PATH = '/content/validation.csv'
validation_df = pd.read_csv(VALIDATION_CSV_PATH)

# --- 6. Define Models to Evaluate ---
# One model identifier per entry; each one is run through the evaluation loop.
models = ["llama3-70b-8192"]

# --- 7. Batch Evaluation for Each Model ---
# Number of questions packed into a single request.
batch_size = 40
# model name -> accuracy over the questions that were actually scored
results = {}

# Gold-label index -> answer letter; loop-invariant, so defined once.
COP_TO_LETTER = {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
# The trailing `\b` requires a stand-alone letter, so the first letter of a word
# (e.g. "Q3: Cannot determine") is not mistaken for an answer.
ANSWER_PATTERN = re.compile(r"Q\d+:\s*([A-D])\b", flags=re.IGNORECASE)

for model_name in models:
    print(f"\n--- Evaluating {model_name} ---")
    # `skipped` counts questions from batches that failed or came back malformed,
    # so that the reported accuracy can be read together with its coverage.
    correct, total, skipped = 0, 0, 0

    total_batches = (len(validation_df) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(validation_df), batch_size), desc=f"{model_name} Progress", total=total_batches):

        batch_df = validation_df.iloc[i:i + batch_size]
        golds = batch_df["cop"].astype(int).map(COP_TO_LETTER).tolist()

        try:
            raw_response = ask_llm_batch_openrouter(batch_df.to_dict("records"), model=model_name)
            matches = ANSWER_PATTERN.findall(raw_response)
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the run,
            # but the loss is recorded instead of silently disappearing.
            print(f"Error in batch {i//batch_size + 1} for model {model_name}: {e}")
            skipped += len(golds)
            continue

        # Answers are matched to questions by position, so a wrong count means the
        # alignment cannot be trusted and the whole batch is left unscored.
        if len(matches) != len(golds):
            print(raw_response)
            print(f"Batch {i//batch_size + 1}: Expected {len(golds)} answers, got {len(matches)}. Skipping.")
            skipped += len(golds)
            continue

        preds = [m.upper() for m in matches]
        correct += sum(p == g for p, g in zip(preds, golds))
        total += len(golds)

    if skipped:
        print(
            f"Skipped {skipped} of {len(validation_df)} questions for {model_name}; "
            "the accuracy below covers scored questions only."
        )

    if total > 0:
        acc = correct / total
        results[model_name] = acc
        print(f"Accuracy for {model_name}: {acc:.3f} ({correct}/{total})")
    else:
        print(f"No batches processed for {model_name}")

# --- 8. Summary ---
# One "<model>: <accuracy>" line per evaluated model, in insertion order of `results`.
print("\n=== Final Model Accuracies ===")
summary_lines = [f"{model}: {acc:.3f}" for model, acc in results.items()]
for summary_line in summary_lines:
    print(summary_line)


--- Evaluating llama3-70b-8192 ---


llama3-70b-8192 Progress:   0%|          | 0/105 [00:00<?, ?it/s]

Accuracy for llama3-70b-8192: 0.689 (2884/4183)

=== Final Model Accuracies ===
llama3-70b-8192: 0.689
