In [1]:
from peft import PeftModel
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer

model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

ft_model = PeftModel.from_pretrained(
    base_model,
    './outputs/Long-CoT-Math-Inference-Finetuning/checkpoint-1869',
    dtype=torch.bfloat16,
    is_trainable=False,
    device_map="auto"
)


rl_model = AutoModelForCausalLM.from_pretrained(
    './outputs/Qwen2.5-0.5B-reasoning-GRPO/checkpoint-1868',
    device_map="auto"
)

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-10-10 12:27:59,782 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-10-10 12:28:01,460 - modelscope - INFO - Target directory already exists, skipping creation.


In [2]:
from datasets import load_dataset

SYSTEM_PROMPT = """You are Qwen, created by Alibaba Cloud. According to the question, please provide the user with detailed reasoning steps and answer in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

def extract_hash_answer(text: str) -> str | None:
    if '####' not in text:
        return None
    return text.split("####")[1].strip()

def create_prompt_formats(sample):
    text = tokenizer.apply_chat_template(
        sample['message'],
        tokenize=False,
        add_generation_prompt=True,
        )

    sample["prompt"] = text

    return sample

def get_gsm8k_question(split='train'):
    data = load_dataset(path='./../.cache/huggingface/datasets/gsm8k/main')[split]
    data = data.map(lambda x: {
        'message': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question'] + "\n"}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # map the dataset to the new format
    data = data.map(create_prompt_formats)
    return data

test_dataset = get_gsm8k_question('test')
test_dataset

Dataset({
    features: ['question', 'answer', 'message', 'prompt'],
    num_rows: 1319
})

In [3]:
import re

def strict_format_reward_func(prompts, completions, **kwargs):
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]

    return [1.0 if match else 0.0 for match in matches]

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1].split("</answer>")[0].strip()
    # strip for removing leading/trailing whitespace/newline
    return answer

def correctness_reward_func(prompts, completions, **kwargs):
    # prompts [batch_size, conversation_turns, {role, content}]
    # completions is a list of response dicts generated by the model [batch_size, [{role, content}]]
    # answer [batch_size]
    answer = kwargs.get('answer', [])
    responses = [completion[0]['content'] for completion in completions] # get every response content from each completion
    extracted_responses = [extract_xml_answer(response) for response in responses] # extract the answer from each response

    # compare the extracted responses with the ground truth answers
    return [1.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

In [4]:
from tqdm import tqdm

test_batch_size = 4
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size)

ft_model.eval()
ft_format_reward = 0
ft_correct_reward = 0

rl_model.eval()
rl_format_reward = 0
rl_correct_reward = 0

for batch in tqdm(test_dataloader):
    prompts = batch['prompt']
    answers = batch['answer']
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, padding_side='left').to(ft_model.device)

    ft_outputs = ft_model.generate(
        **inputs,
        max_new_tokens=512,
    )
    completions = tokenizer.batch_decode(ft_outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    completions = [[{'role': 'assistant', 'content': c}] for c in completions]

    ft_format_reward += sum(strict_format_reward_func(prompts, completions))
    ft_correct_reward += sum(correctness_reward_func(prompts, completions, answer=answers))

    rl_outputs = rl_model.generate(
        **inputs,
        max_new_tokens=512,
    )
    rl_completions = tokenizer.batch_decode(rl_outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    rl_completions = [[{'role': 'assistant', 'content': c}] for c in rl_completions]

    rl_format_reward += sum(strict_format_reward_func(prompts, rl_completions))
    rl_correct_reward += sum(correctness_reward_func(prompts, rl_completions, answer=answers))

print(f"RL Model - Strict Format Acc: {rl_format_reward/len(test_dataset):.4f}, Correctness Acc: {rl_correct_reward/len(test_dataset):.4f}")
print(f"FT Model - Strict Format Acc: {ft_format_reward/len(test_dataset):.4f}, Correctness Acc: {ft_correct_reward/len(test_dataset):.4f}")

100%|██████████| 330/330 [51:07<00:00,  9.30s/it]

RL Model - Strict Format Acc: 0.9697, Correctness Acc: 0.4428
FT Model - Strict Format Acc: 0.0129, Correctness Acc: 0.2456



