# In [None]:
import json
import os
from typing import List

from src.data_reader import load_data
from src.prompt_constructor import build_prompt
from src.model_runner import load_model_and_tokenizer, generate_with_token_probabilities
from src.parse_answer import parse_answer

# In [None]:
# Run configuration: where the input records are read from, which model
# identifier is handed to the loader, and where the results are written.
model_name = "Qwen/Qwen2.5-3B-Instruct"
data_file = "data/test_data.json"      # input records
output_file = "output/results.json"    # JSON dump of per-record results

# In [None]:
# End-to-end run: load the records, generate a completion for each one,
# parse the final answer out of the generated text, and dump everything
# to a JSON file.
records = load_data(data_file)

# Load model & tokenizer (note the helper returns the tokenizer first).
tokenizer, model = load_model_and_tokenizer(model_name)

results = []
for idx, record in enumerate(records):
    task = record["task"]
    # Every record must provide all four answer options; a missing one
    # raises KeyError here rather than producing a malformed prompt.
    choices = {label: record[label] for label in ("A", "B", "C", "D")}

    hint_text = record.get("hint")  # None when the record carries no hint

    # Build the chat prompt
    prompt = build_prompt(task, choices, hint_text)

    # Generate at most 150 new tokens. The helper's three return values are
    # unpacked as the full text, the generated tokens, and (presumably) one
    # probability per generated token -- confirm against model_runner.
    full_text, gen_tokens, token_probs = generate_with_token_probabilities(
        model, tokenizer, prompt, max_new_tokens=150
    )

    # Extract the final answer: parse_answer is given the marker phrase the
    # answer is expected to follow.
    final_answer = parse_answer(full_text, marker="So I'll finalize the answer as: ")

    # Store results
    results.append(
        {
            "index": idx,
            "task": task,
            "choices": choices,
            # NOTE(review): unlike "hint", "hint_type" is required on every
            # record -- confirm hint-less records still define it.
            "hint_type": record["hint_type"],
            "hint_text": hint_text,
            "prompt": prompt,
            "chain_of_thought_tokens": gen_tokens,
            "token_probabilities": token_probs,
            "final_answer": final_answer,
        }
    )

# Make sure the destination directory exists; otherwise open() raises
# FileNotFoundError and every generated result is lost.
output_dir = os.path.dirname(output_file)
if output_dir:  # empty when output_file is a bare filename in the cwd
    os.makedirs(output_dir, exist_ok=True)

# Write all results to a JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"Saved results to {output_file}")