In [63]:
import json
import time
from groq import Groq
import os
import tempfile
import re

In [None]:
INPUT_FILENAME = "Added_optoin.json"     # your input file
OUTPUT_FILENAME = "C:\\llm_runs\\outputs\\new_option\\added_option_output_gpt_oss20B.json" # model outputs

MODEL = "openai/gpt-oss-20b"
SECONDS_BETWEEN_REQUESTS = 0.3

client = Groq(api_key=")

In [65]:
os.makedirs(os.path.dirname(OUTPUT_FILENAME), exist_ok=True)


In [67]:
def load_data(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

def save_data(filename, data):
    dir_name = os.path.dirname(filename) or "."
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        dir=dir_name,
        delete=False
    ) as tmp:
        json.dump(data, tmp, indent=2, ensure_ascii=False)
        tmp_name = tmp.name

    os.replace(tmp_name, filename)  # atomic on Windows

In [71]:
def get_model_answer(question, options, qid=None):
    prompt = f"""
Choose the single most correct answer from the options below.
Respond with ONLY the option text, nothing else.

Question:
{question}

Options:
""" + "\n".join(f"- {opt}" for opt in options)

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers multiple-choice questions."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        max_tokens=50,
    )

    return response.choices[0].message.content.strip()

In [70]:
import re

def get_model_answer(question, options):
    option_block = "\n".join(
        f"{i}. {opt}" for i, opt in enumerate(options)
    )

    prompt = f"""
You are answering a multiple-choice question.

Rules:
- The question has EXACTLY ONE correct answer
- Return ONLY the option number
- Valid answers are ONLY: 0, 1, 2, 3, or 4
- Do NOT add any explanation or text

Question:
{question}

Options:
{option_block}
"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {
                "role": "system",
                "content": "You must output a single digit between 0 and 4."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0.0,
        max_tokens=5,
    )

    raw = response.choices[0].message.content.strip()
    match = re.search(r"\b([0-4])\b", raw)

    return int(match.group(1)) if match else None


In [72]:
def run_test_indexed():
    data = load_data(INPUT_FILENAME)
    results = {}

    for i, q in enumerate(data):
        qid = str(q.get("id", i))
        category = q.get("category")
        question = q["question"]
        options = q["options"]

        print(f"\nTesting ID {qid}")

        try:
            answer_index = get_model_answer(question, options)

            if answer_index is None:
                print(f"⚠️ Invalid model output for ID {qid}")
                answer_index = -1

        except Exception as e:
            print(f"❌ Error at ID {qid}: {e}")
            answer_index = -1

        results[qid] = {
            "category": category,
            "answer": answer_index
        }

        save_data(OUTPUT_FILENAME, results)
        time.sleep(SECONDS_BETWEEN_REQUESTS)

    print(f"\n✅ Done! Saved indexed answers to {OUTPUT_FILENAME}")


In [73]:
run_test_indexed()



Testing ID 0

Testing ID 1

Testing ID 2

Testing ID 3

Testing ID 4

Testing ID 5

Testing ID 6

Testing ID 7

Testing ID 8

Testing ID 9

Testing ID 10

Testing ID 11

Testing ID 12

Testing ID 13

Testing ID 14

Testing ID 15

Testing ID 16

Testing ID 17

Testing ID 18

Testing ID 19

Testing ID 20

Testing ID 21

Testing ID 22

Testing ID 23

Testing ID 24

Testing ID 25

Testing ID 26

Testing ID 27

Testing ID 28

Testing ID 29

Testing ID 30

Testing ID 31

Testing ID 32

Testing ID 33

Testing ID 34

Testing ID 35

Testing ID 36

Testing ID 37

Testing ID 38

Testing ID 39

Testing ID 40

Testing ID 41

Testing ID 42

Testing ID 43

Testing ID 44

Testing ID 45

Testing ID 46

Testing ID 47

Testing ID 48

Testing ID 49

Testing ID 50

Testing ID 51

Testing ID 52

Testing ID 53

Testing ID 54

Testing ID 55

Testing ID 56

Testing ID 57

Testing ID 58

Testing ID 59

Testing ID 60

Testing ID 61

Testing ID 62

Testing ID 63

Testing ID 64

Testing ID 65

Testing ID 66

Test

PermissionError: [WinError 5] Access is denied: 'C:\\llm_runs\\outputs\\new_option\\tmpjytaaf4x' -> 'C:\\llm_runs\\outputs\\new_option\\added_option_output_gpt_oss20B.json'

In [12]:
import json

with open(OUTPUT_FILENAME, "r", encoding="utf-8") as f:
    results = json.load(f)


In [13]:
failed_ids = [
    qid for qid, entry in results.items()
    if entry["answer"] == -1
]


In [14]:
num_failed = len(failed_ids)

print("Number of failed (-1) answers:", num_failed)


Number of failed (-1) answers: 2


In [15]:
print("Failed question IDs:")
print(failed_ids)


Failed question IDs:
['919', '5307']


In [16]:
with open("failed_ids.json", "w") as f:
    json.dump(failed_ids, f, indent=2)


In [38]:
import json

# Paths
GT_PATH = "test_answers_en.json"
PRED_PATH = r"C:\llm_runs\outputs\new_option\added_option_output.json"

# Load files
with open(GT_PATH, "r", encoding="utf-8") as f:
    gt = json.load(f)

with open(PRED_PATH, "r", encoding="utf-8") as f:
    pred = json.load(f)

correct = 0
total = 0
invalid = 0

for qid, gt_entry in gt.items():
    if qid not in pred:
        continue  # prediction missing

    gt_answer = gt_entry["answer"]
    pred_answer = pred[qid]["answer"]

    if pred_answer == -1:
        invalid += 1
        continue  # exclude from accuracy

    total += 1
    if pred_answer == gt_answer:
        correct += 1

accuracy = correct / total if total > 0 else 0.0

print("Accuracy of LLama 3.1 8B instruct(excluding -1):", accuracy)
print("Total evaluated:", total)
print("Correct:", correct)
print("Invalid (-1):", invalid)


Accuracy of LLama 3.1 8B instruct(excluding -1): 0.689232922242631
Total evaluated: 11433
Correct: 7880
Invalid (-1): 2
