In [224]:
import json
import time
import os
import tempfile
import re
from groq import Groq


In [225]:
# ===== CONFIG =====
# Raw strings keep backslashes literal, so each path separator is written once
# (doubling them inside r"..." puts two separators into the actual path).
INPUT_FILENAME = r"C:\llm_runs\outputs\new_option\failed_questions.json"
OUTPUT_FILENAME = r"C:\llm_runs\outputs\failed_original_questions_gpt_oss20B.json"
#OUTPUT_FILENAME = r"C:\llm_runs\outputs\original_gpt_oss20B.json"

# Switch model here
MODEL = "openai/gpt-oss-20b"
# MODEL = "llama-3.1-8b-instant"

# Pause (in seconds) passed to time.sleep after each answered question.
SECONDS_BETWEEN_REQUESTS = 0.3


In [None]:
# Read the API key from the environment so it is never stored in the notebook.
api_key = os.environ.get("GROQ_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this notebook."
    )
client = Groq(api_key=api_key)

# Make sure the directory that will hold the results exists.
# os.makedirs("") raises, so skip it when OUTPUT_FILENAME has no directory part.
output_dir = os.path.dirname(OUTPUT_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


In [227]:
def load_data(filename):
    """Open ``filename`` as UTF-8 text and return the JSON document it contains."""
    with open(filename, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed


In [228]:
def save_data(filename, data):
    """Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

    The JSON is first written to a temporary file in the same directory and
    then moved over ``filename`` with ``os.replace``, so an existing file is
    only replaced once the new content has been written completely.

    Args:
        filename: Destination path. A bare file name writes to the current
            directory.
        data: JSON-serialisable object.

    Raises:
        TypeError: If ``data`` contains something ``json.dump`` cannot encode.
        OSError: If the temporary file cannot be created or moved.
    """
    dir_name = os.path.dirname(filename) or "."
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            dir=dir_name,
            delete=False
        ) as tmp:
            # Record the name first so cleanup works even if json.dump fails.
            tmp_name = tmp.name
            json.dump(data, tmp, indent=2, ensure_ascii=False)

        # The file is closed here, which Windows requires before it can be moved.
        os.replace(tmp_name, filename)
    except BaseException:
        # Includes KeyboardInterrupt (interrupting the kernel mid-save):
        # never leave a stray temp file next to the results.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise


In [229]:
def get_model_answer(question, options):
    """Ask the configured model to pick one option and return its index.

    The options are numbered from 0 in the prompt, and the model is asked to
    reply with the option number.

    Args:
        question: Question text inserted into the prompt.
        options: Iterable of option strings.

    Returns:
        The first standalone integer in the reply that is a valid index into
        ``options``, or -1 when the reply is empty or contains no such integer.
    """
    options = list(options)
    option_block = "\n".join(
        f"{i}. {opt}" for i, opt in enumerate(options)
    )

    prompt = f"""
You are answering a multiple choice question.
Select the MOST appropriate option.

Question:
{question}

Options:
{option_block}

Respond with the option number.
"""

    completion = client.chat.completions.create(
        model=MODEL,  # honour the config switch instead of a hard-coded name
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=10000  # IMPORTANT for GPT-OSS
    )

    if not completion.choices:
        return -1

    text = completion.choices[0].message.content or ""

    # Accept only numbers that actually index an option. Out-of-range numbers
    # elsewhere in the reply ("there are 10 candidates ...") are skipped
    # rather than returned as an answer.
    for match in re.finditer(r"\b[0-9]+\b", text):
        index = int(match.group())
        if index < len(options):
            return index

    return -1


In [230]:
def run_test_indexed():
    """Answer every question in INPUT_FILENAME that has no stored result yet.

    Results are keyed by the question's ``id`` (or its position in the input
    list when it has none) and written to OUTPUT_FILENAME after each answer.
    An existing output file is loaded first so already-answered IDs are
    skipped; if that file is not valid JSON the run starts from an empty
    result set.
    """
    questions = load_data(INPUT_FILENAME)

    answers_by_id = {}
    if os.path.exists(OUTPUT_FILENAME):
        try:
            answers_by_id = load_data(OUTPUT_FILENAME)
        except json.JSONDecodeError:
            answers_by_id = {}

    for position, item in enumerate(questions):
        key = str(item.get("id", position))

        if key not in answers_by_id:
            print(f"Testing ID {key}")

            predicted = get_model_answer(item["question"], item["options"])
            answers_by_id[key] = {
                "category": item.get("category"),
                "answer": predicted
            }

            # Persist after every question so an interrupted run can resume.
            save_data(OUTPUT_FILENAME, answers_by_id)
            time.sleep(SECONDS_BETWEEN_REQUESTS)

    print("Done.")


In [231]:
# Query the model for every question whose ID is not yet in OUTPUT_FILENAME;
# results are written back to that file after each answer.
run_test_indexed()


Testing ID 205
Testing ID 288
Testing ID 323
Testing ID 507
Testing ID 514
Testing ID 1975
Testing ID 2000
Testing ID 2124
Testing ID 2261
Testing ID 2979
Testing ID 3222
Testing ID 3234
Testing ID 3514
Testing ID 3594
Testing ID 3833
Testing ID 4571
Testing ID 4718
Testing ID 4786
Testing ID 5532
Testing ID 5735
Testing ID 6200
Testing ID 6316
Testing ID 6655
Testing ID 7429
Testing ID 8351
Testing ID 8942
Testing ID 8974
Testing ID 9144
Testing ID 9330
Testing ID 9697
Testing ID 9745
Testing ID 11294
Testing ID 11339
Done.


In [232]:
import json

# Paths
GT_PATH = "test_answers_en.json"
PRED_PATH = r"C:\llm_runs\outputs\original_gpt_oss20B.json"
# Label printed with the score; keep it in sync with the model behind PRED_PATH.
PRED_MODEL_LABEL = "GPT-OSS 20B"


# Load files
with open(GT_PATH, "r", encoding="utf-8") as f:
    gt = json.load(f)

with open(PRED_PATH, "r", encoding="utf-8") as f:
    pred = json.load(f)

correct = 0
total = 0
invalid = 0
missing = 0  # ground-truth IDs with no prediction at all

for qid, gt_entry in gt.items():
    if qid not in pred:
        missing += 1
        continue  # prediction missing

    gt_answer = gt_entry["answer"]
    pred_answer = pred[qid]["answer"]

    if pred_answer == -1:
        invalid += 1
        continue  # exclude from accuracy

    total += 1
    if pred_answer == gt_answer:
        correct += 1

# Accuracy is computed over answered questions only (-1 and missing excluded).
accuracy = correct / total if total > 0 else 0.0

print(f"Accuracy of {PRED_MODEL_LABEL} (excluding -1):", accuracy)
print("Total evaluated:", total)
print("Correct:", correct)
print("Invalid (-1):", invalid)
print("Missing predictions:", missing)


Accuracy of LLama 3.1 8B instruct(excluding -1): 0.8399754751686083
Total evaluated: 11417
Correct: 9590
Invalid (-1): 18


In [233]:
# Re-read the saved answers and list the IDs whose stored answer is the -1 sentinel.
with open(OUTPUT_FILENAME, mode="r", encoding="utf-8") as f:
    results = json.load(f)

failed_ids = [key for key, record in results.items() if record["answer"] == -1]


In [234]:
# Count the IDs collected in failed_ids (entries whose stored answer is -1).
num_failed = len(failed_ids)

print("Number of failed (-1) answers:", num_failed)

Number of failed (-1) answers: 19


In [235]:
# Show the header and the collected IDs on separate lines.
print("Failed question IDs:", failed_ids, sep="\n")


Failed question IDs:
['205', '288', '507', '1975', '2124', '2261', '2979', '3234', '3514', '3833', '5532', '5735', '6655', '8942', '8974', '9144', '9697', '11294', '11339']


In [221]:
import json
import os

# Paths
# Cell-specific names: reusing INPUT_FILENAME / OUTPUT_FILENAME here would
# overwrite the globals that run_test_indexed() reads, silently redirecting
# any later re-run of the benchmark to these files.
ALL_QUESTIONS_FILE = "test_en_reordered.json"   # original questions
ORIGINAL_RESULTS_FILE = r"C:\llm_runs\outputs\original_gpt_oss20B.json"
FAILED_QUESTIONS_FILE = r"C:\llm_runs\outputs\new_option\failed_questions.json"

# Load data
with open(ALL_QUESTIONS_FILE, "r", encoding="utf-8") as f:
    questions = json.load(f)

with open(ORIGINAL_RESULTS_FILE, "r", encoding="utf-8") as f:
    results = json.load(f)

# Collect failed IDs (entries whose stored answer is the -1 sentinel)
failed_ids = {
    qid for qid, entry in results.items()
    if entry.get("answer") == -1
}

print("Number of failed IDs:", len(failed_ids))

# Extract failed question objects
failed_questions = [
    q for q in questions
    if str(q.get("id")) in failed_ids
]

# Report failed IDs that have no matching question instead of dropping them silently.
unmatched_ids = failed_ids - {str(q.get("id")) for q in failed_questions}
if unmatched_ids:
    print("Warning: failed IDs with no matching question:", sorted(unmatched_ids))

# Save failed questions (create the target directory first if it is missing)
failed_dir = os.path.dirname(FAILED_QUESTIONS_FILE)
if failed_dir:
    os.makedirs(failed_dir, exist_ok=True)

with open(FAILED_QUESTIONS_FILE, "w", encoding="utf-8") as f:
    json.dump(failed_questions, f, indent=2, ensure_ascii=False)

print("Failed questions saved to:", FAILED_QUESTIONS_FILE)


Number of failed IDs: 33
Failed questions saved to: C:\llm_runs\outputs\new_option\failed_questions.json
