## Testing on the Original Dataset

In [1]:
import json
import time
from groq import Groq
import os
import tempfile

In [None]:
# --- Run configuration -------------------------------------------------------
INPUT_FILENAME = "first_-1.json"          # JSON file with the questions to test
OUTPUT_FILENAME = "first_-1_output.json"  # where the indexed model answers are written

MODEL = "llama-3.1-8b-instant"   # model id sent with every chat-completion request
SECONDS_BETWEEN_REQUESTS = 1     # pause (seconds) after each question

# Never commit a real key inside the notebook: read it from the environment.
# Set GROQ_API_KEY before starting the kernel; when it is missing, api_key is
# None and the Groq client reports the missing credential itself.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


In [23]:

# OUTPUT_FILENAME may be a bare file name. os.path.dirname() then returns "",
# and os.makedirs("") raises FileNotFoundError, so only create a directory
# when the path actually names one.
output_dir = os.path.dirname(OUTPUT_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


In [30]:
def load_data(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

def save_data(filename, data):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON without exposing partial files.

    The JSON is first written to a temporary file created in the destination
    directory (so the final rename stays on one filesystem) and then moved over
    ``filename`` with ``os.replace``.

    Args:
        filename: Destination path; a bare file name targets the current directory.
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable.
        OSError: If the temporary file cannot be created or moved into place.
    """
    dir_name = os.path.dirname(filename) or "."
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            dir=dir_name,
            delete=False
        ) as tmp:
            tmp_name = tmp.name
            json.dump(data, tmp, indent=2, ensure_ascii=False)

        # os.replace overwrites an existing destination on every platform.
        os.replace(tmp_name, filename)
    except BaseException:
        # delete=False means nobody else cleans up: remove the orphaned temp
        # file so failed saves do not pile up next to the output file.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

In [44]:
def load_data(filename):
    """Read ``filename`` as UTF-8 text and return its decoded JSON content."""
    with open(filename, mode="r", encoding="utf-8") as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

def save_data(filename, data):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON, replacing the file in one step.

    Opening ``filename`` directly in ``"w"`` mode truncates it before the new
    content is written, so a failure mid-write would destroy the previously
    saved results. The data is therefore written to a temporary file in the
    same directory and only moved over ``filename`` once it is complete.

    Args:
        filename: Destination path; a bare file name targets the current directory.
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable.
        OSError: If the temporary file cannot be created or moved into place.
    """
    target_dir = os.path.dirname(filename) or "."
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", dir=target_dir, delete=False
        ) as tmp:
            tmp_path = tmp.name
            json.dump(data, tmp, indent=2, ensure_ascii=False)
        # Same directory => same filesystem, so the replace is a plain rename.
        os.replace(tmp_path, filename)
    except BaseException:
        # Leave the existing output untouched and do not leak the temp file.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


In [45]:
def get_model_answer(question, options, qid=None):
    """Ask the chat model to pick one option and return its reply as stripped text.

    Args:
        question: Question text inserted into the prompt.
        options: Iterable of answer options; each is listed as ``- <option>``.
        qid: Accepted for the caller's convenience; not used by this function.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the response carries no text content.

    Raises:
        Whatever ``client.chat.completions.create`` raises; errors are not
        handled here.
    """
    prompt = f"""
Choose the single most correct answer from the options below.
Respond with ONLY the option text, nothing else.

Question:
{question}

Options:
""" + "\n".join(f"- {opt}" for opt in options)

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers multiple-choice questions."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
        max_tokens=50,
    )

    # message.content may be None; treat that as an empty answer instead of
    # failing with AttributeError on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()


In [46]:
def _normalize_answer(text):
    """Reduce an answer/option to a comparable form (bullet, quotes, final period, case)."""
    cleaned = str(text).strip()
    if cleaned.startswith("- "):  # the prompt lists options as "- <option>"
        cleaned = cleaned[2:]
    cleaned = cleaned.strip().strip("\"'`").strip()
    if cleaned.endswith("."):
        cleaned = cleaned[:-1].rstrip()
    return cleaned.casefold()


def _match_option_index(answer_text, options):
    """Return the index in ``options`` that ``answer_text`` designates, or -1 if none/ambiguous."""
    # An exact hit always wins, so every answer that matched before still
    # maps to the same index.
    if answer_text in options:
        return options.index(answer_text)

    wanted = _normalize_answer(answer_text)
    if not wanted:
        return -1
    candidates = [
        index for index, option in enumerate(options)
        if _normalize_answer(option) == wanted
    ]
    # Only accept the relaxed match when it is unambiguous.
    return candidates[0] if len(candidates) == 1 else -1


def run_test_indexed():
    """Query the model for every question in INPUT_FILENAME and store the chosen option index.

    For each record the model's reply is mapped to the index of the matching
    option. A reply that differs from an option only by the echoed "- " bullet,
    surrounding quotes, a trailing period or letter case still counts as that
    option; anything else, and any exception raised while querying, is
    recorded as -1. The results dict (keyed by the record's "id", or its
    position when no id is present) is written to OUTPUT_FILENAME after every
    question so progress is kept if the run stops early.
    """
    data = load_data(INPUT_FILENAME)
    results = {}  # qid -> {"category": ..., "answer": option index or -1}

    for i, q in enumerate(data):
        qid = str(q.get("id", i))   # JSON-safe key
        category = q.get("category")
        question = q["question"]
        options = q["options"]

        print(f"\nTesting ID {qid}")

        try:
            model_answer_text = get_model_answer(question, options, qid)
            answer_index = _match_option_index(model_answer_text, options)
            if answer_index == -1:
                print(f"⚠️ Answer not found in options for ID {qid}")

        except Exception as e:
            # Best effort: one failed question must not abort the whole run.
            print(f" Error at ID {qid}: {e}")
            answer_index = -1

        results[qid] = {
            "category": category,
            "answer": answer_index
        }

        # Save after every question (safe)
        save_data(OUTPUT_FILENAME, results)
        time.sleep(SECONDS_BETWEEN_REQUESTS)

    print(f"\n Done! Saved indexed answers to {OUTPUT_FILENAME}")


In [47]:
run_test_indexed()



Testing ID 7718

Testing ID 7719

Testing ID 7720

Testing ID 7721

Testing ID 7722
⚠️ Answer not found in options for ID 7722

Testing ID 7723

Testing ID 7724

Testing ID 7725

Testing ID 7726

Testing ID 7727

Testing ID 7728

Testing ID 7729
⚠️ Answer not found in options for ID 7729

Testing ID 7730

Testing ID 7731
⚠️ Answer not found in options for ID 7731

Testing ID 7732

Testing ID 7733
⚠️ Answer not found in options for ID 7733

Testing ID 7734

Testing ID 7735

Testing ID 7736
⚠️ Answer not found in options for ID 7736

Testing ID 7737

Testing ID 7738

Testing ID 7739
⚠️ Answer not found in options for ID 7739

Testing ID 7740

Testing ID 7741

Testing ID 7742

Testing ID 7743
⚠️ Answer not found in options for ID 7743

Testing ID 7744

Testing ID 7745
⚠️ Answer not found in options for ID 7745

Testing ID 7746

Testing ID 9382
⚠️ Answer not found in options for ID 9382

Testing ID 9383

Testing ID 9384

Testing ID 9385

Testing ID 9386

Testing ID 9387

Testing ID 9388
⚠