In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer


# Local checkpoint directory; both the tokenizer and the model weights are
# loaded from this same path.
model_name = "./LLaMA-Factory/saves/Qwen2.5-1.5B-Instruct/full/qwen2.5_model_cot_20241230"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# "auto" leaves device placement and dtype selection to transformers;
# attention is requested with the FlashAttention-2 implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
# System message sent with every request. Each source line below is one line
# of the prompt; adjacent literals are concatenated by the parser.
# NOTE(review): the text is kept verbatim, including the 1/2/3/5 rule
# numbering and "DONOT" -- presumably it must match the prompt used during
# fine-tuning; confirm before editing.
SYSTEM_PROMPT = (
    "\n"
    "You are a 24 Game assistant. Given 4 numbers (range 1-13), you need to use basic arithmetic operations (+, -, *, /) to combine these numbers to get 24.\n"
    "\n"
    "**Rules:**\n"
    "1. You MUST USE all 4 given numbers EXACTLY ONCE, even if numbers are REPEATED.\n"
    "   - For example, if the input is 4, 4, 5, 6, you must use 4, 4, 5, 6 exactly once.\n"
    "   - If the equation is equal to 24 but does not use all 4 numbers exactly once, it is invalid and SHOULD NOT be considered as a solution.\n"
    "   - You CANNOT use other numbers except for the 4 given numbers.\n"
    "2. Only these operators are allowed: addition(+), subtraction(-), multiplication(*), division(/)\n"
    "3. Parentheses can be used to change operation precedence\n"
    "5. The result must equal exactly 24, or be approximately 24 within calculation error (e.g., 23.9999...)\n"
    "\n"
    "**Output Requirements:**\n"
    "\n"
    "Please reason step by step, show your reasoning process and put your final answer within \\boxed{}. DONOT use LaTeX expression in the final answer.\n"
)

In [None]:
def generate_response(prompt: str, model, tokenizer, max_new_tokens: int = 4096) -> str:
    """Generate the model's reply to a single user prompt using greedy decoding.

    The prompt is wrapped in a two-message chat (the module-level
    ``SYSTEM_PROMPT`` as the system turn, ``prompt`` as the user turn),
    rendered to text with the tokenizer's chat template, tokenized, and passed
    to ``model.generate``. Only the tokens produced after the prompt are
    decoded and returned.

    Args:
        prompt: Text of the user message.
        model: Object exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer used for ``apply_chat_template``, encoding the
            rendered text, and ``batch_decode``.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 4096, the value that was previously hard-coded.

    Returns:
        The decoded completion with special tokens stripped.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]

    # Render the chat as a plain string (tokenize=False); the generation
    # prompt is appended so the model continues with its own turn.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Batch of one, moved to the device the model lives on.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Greedy decoding. `early_stopping` is intentionally not passed: it only
    # controls beam-based generation and merely triggers a configuration
    # warning when a single beam is used.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # Each output sequence starts with its prompt tokens; slice them off so
    # only the newly generated part is decoded.
    completion_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
import pandas as pd

# Load the evaluation set from disk.
test_df = pd.read_csv("data/test_df.csv")

# Each "Puzzles" entry is split on whitespace and its tokens re-joined with
# ", " (e.g. "4 4 5 6" -> "4, 4, 5, 6").
test_puzzles = list(
    map(lambda raw: ", ".join(raw.split()), test_df["Puzzles"].tolist())
)

In [None]:
import json

import os

# NOTE: despite its name, `output_folder` is the path of the JSONL output
# *file*; the name is kept so any later cell referring to it keeps working.
output_folder = "eval/responses.jsonl"

# User-turn template; `{numbers}` receives the comma-separated puzzle.
# NOTE(review): there is no separator between "numbers" and the placeholder --
# presumably this matches the prompt format used for fine-tuning; confirm
# before changing it.
prompt = "Solve the 24 Game using numbers{numbers}"

# Make sure the target directory exists so a fresh checkout does not fail
# with FileNotFoundError when the file is opened.
os.makedirs(os.path.dirname(output_folder) or ".", exist_ok=True)

# One JSON object per line: {"puzzle": ..., "response": ...}.
# Mode "w" overwrites any results from a previous run.
with open(output_folder, "w", encoding="utf-8") as f:
    for puzzle in test_puzzles:
        response = generate_response(prompt.format(numbers=puzzle), model, tokenizer)
        f.write(json.dumps({"puzzle": puzzle, "response": response}) + "\n")
        # Generation is slow; flush after every record so completed results
        # are on disk even if the run is interrupted.
        f.flush()