In [1]:
import tqdm
from modelscope import AutoModelForCausalLM, AutoTokenizer
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-09-29 12:22:15,681 - modelscope - INFO - Target directory already exists, skipping creation.
`torch_dtype` is deprecated! Use `dtype` instead!


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-09-29 12:22:17,714 - modelscope - INFO - Target directory already exists, skipping creation.


In [2]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [3]:
from datasets import load_dataset
data = load_dataset(path='./../.cache/huggingface/datasets/gsm8k/main')

In [4]:
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [5]:
data['train'][1]

{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
 'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'}

In [6]:
### construct data by Qwen format
prompt = data['train'][1]['question']
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True, # add new response prompt at the end
)
text

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nWeng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?<|im_end|>\n<|im_start|>assistant\n'

In [7]:
model_inputs = tokenizer(
    [text],
    return_tensors="pt",
).to(model.device)
model_inputs
# input_ids is indexed tokens
# attention_mask is 1 for real tokens and 0 for padding tokens
# token_type_ids is segment ids

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,     54,    826,  63759,
            400,     16,     17,    458,   6460,    369,  70583,  14810,     13,
          60033,     11,   1340,   1101,   1521,    220,     20,     15,   4420,
            315,  70583,  14810,     13,   2585,   1753,   1521,   1340,   7232,
             30, 151645,    198, 151644,  77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [8]:
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
) # generate up to 512 new tokens

In [9]:
generated_ids

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,     54,    826,  63759,
            400,     16,     17,    458,   6460,    369,  70583,  14810,     13,
          60033,     11,   1340,   1101,   1521,    220,     20,     15,   4420,
            315,  70583,  14810,     13,   2585,   1753,   1521,   1340,   7232,
             30, 151645,    198, 151644,  77091,    198,   1249,   8253,   1246,
           1753,    467,    826,  15303,    369,  70583,  14810,     11,    582,
           1184,    311,   1795,   1493,   7354,   1447,     16,     13,   3070,
          12012,    279,    882,    504,   4420,    311,   4115,     25,   1019,
            256,    481,   1205,   1414,    429,  17767,     16,  57758,   6460,
            374,   6144,    311,  17767,     21,     15,  57758,   4420,    624,
            256,    481,  15

In [10]:
response = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=False, # skip special tokens like <|im_start|>.
)
response

['<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nWeng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?<|im_end|>\n<|im_start|>assistant\nTo determine how much Weng earned for babysitting, we need to follow these steps:\n\n1. **Convert the time from minutes to hours:**\n   - We know that \\(1\\) hour is equal to \\(60\\) minutes.\n   - Therefore, \\(50\\) minutes can be converted to hours:\n     \\[\n     \\text{Time in hours} = \\frac{50 \\text{ minutes}}{60 \\text{ minutes per hour}} = \\frac{5}{6} \\text{ hours}\n     \\]\n\n2. **Calculate the earnings:**\n   - Weng earns $12 per hour.\n   - To find her total earnings, multiply the number of hours worked by her hourly wage:\n     \\[\n     \\text{Earnings} = \\left(\\frac{5}{6}\\right) \\text{ hours} \\times \\$12/\\text{hour}\n     \\]\n   - Perform the multiplication:\n     \\[\n     \\text{Earnings} = 

In [11]:
data['train'][1]['answer']

'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'

In [12]:
import wandb

wandb.init(project="GRPO_Math_Inference")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlogan-zh-cai[0m ([33mlogan-cai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
import re
import torch
from datasets import load_dataset, Dataset
from modelscope import AutoModelForCausalLM, AutoTokenizer
import trl.trainer.grpo_trainer as grpo_trainer
import trl.trainer.grpo_config as grpo_config

# define the system prompt
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# final output format
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

In [14]:
# extract the answer from the model output
def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1].split("</answer>")[0].strip()
    # strip for removing leading/trailing whitespace/newline
    return answer

# extract the answer from the dataset
def extract_hash_answer(text: str) -> str | None:
    if '####' not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_question(split='train'):
    data = load_dataset(path='./../.cache/huggingface/datasets/gsm8k/main')[split]
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # map the dataset to the new format
    return data

dataset = get_gsm8k_question('train')
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

In [15]:
dataset[1]

{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
 'answer': '10',
 'prompt': [{'content': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
   'role': 'user'}]}

In [16]:
# completely correct answer (xml format, interger, correct) 2 scores, otherwise 0 score
def correctness_reward_func(prompts, completions, **kwargs):
    # prompts [batch_size, conversation_turns, {role, content}]
    # completions is a list of response dicts generated by the model [batch_size, [{role, content}]]
    # answer [batch_size]
    answer = kwargs.get('answer', [])
    responses = [completion[0]['content'] for completion in completions] # get every response content from each completion
    q = prompts[0][-1]['content'] # first prompt's last turn's content (question)
    extracted_responses = [extract_xml_answer(response) for response in responses] # extract the answer from each response
    # print the first example for debugging
    print('-'*20, f'Question:\n{q}', f'\nAnswers:\n{answer[0]}', f'\nResponses:\n{responses[0]}', f'\nExtracted Responses:\n{extracted_responses[0]}')

    # compare the extracted responses with the ground truth answers
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

In [17]:
# answer is xml format, interger, 0.5 score, otherwise 0 score
def int_reward_func(prompts, completions, **kwargs):
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(response) for response in responses]

    return [0.5 if response.isdigit() else 0.0 for response in extracted_responses]

In [18]:
# answer that strictly follows format "<reasoning>...</reasoning><answer>...</answer>"(including '\n'), 0.5 score, otherwise 0 score
def strict_format_reward_func(prompts, completions, **kwargs):
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]

    return [0.5 if match else 0.0 for match in matches]

In [19]:
# answer that basically follows format "<reasoning>...</reasoning><answer>...</answer>"(no need to include '\n'), 0.5 score, otherwise 0 score
def soft_format_reward_func(prompts, completions, **kwargs):
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]['content'] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]

    return [0.5 if match else 0.0 for match in matches]

In [20]:
# label position reward
def count_xml(text):
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n<answer>\n")[-1]) * 0.001 # penalize long answer part
    if text.count("\n</answer>\n") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001 # penalize extra content after </answer>
    return count

# for batch
def xmlcount_reward_func(prompts, completions, **kwargs):
    contents = [completion[0]['content'] for completion in completions]

    return [count_xml(content) for content in contents]


In [21]:
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
output_dir = 'outputs/Qwen2.5-0.5B-reasoning-GRPO'
run_name = 'Qwen2.5-0.5B-GRPO-gsm8k'
training_args = grpo_config.GRPOConfig(
    output_dir=output_dir,
    run_name=run_name, # wandb run name
    learning_rate=5e-6, # lr for rl usually little
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine', # lr decay strategy
    logging_steps=1,
    bf16=True, # use bf16 to mix precision training, if possible
    per_device_train_batch_size=8, # total batch size is per_device_train_batch_size * num gpus
    gradient_accumulation_steps=4, # update model every 4 steps
    num_generations=8, # number of samples to generate per prompt by GRPO
    max_prompt_length=256, # max length for the prompt (question)
    max_completion_length=200, # max length for the completion (response)
    num_train_epochs=1,
    save_steps=100, # save model every 100 steps
    max_grad_norm=0.1, # gradient clipping
    log_on_each_node=False, # log only on the main node when using multiple nodes
    use_vllm=False, # use vllm for faster generation
    vllm_gpu_memory_utilization=0.3, # gpu memory utilization for vllm
    report_to='wandb', # report to wandb
)

In [22]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
trainer = grpo_trainer.GRPOTrainer(
    model=model,
    processing_class=tokenizer, # use tokenizer to process the data
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        int_reward_func,
        correctness_reward_func,
        strict_format_reward_func
    ],
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model()

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-09-29 12:22:27,400 - modelscope - INFO - Target directory already exists, skipping creation.
`torch_dtype` is deprecated! Use `dtype` instead!


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2025-09-29 12:22:28,555 - modelscope - INFO - Target directory already exists, skipping creation.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


-------------------- Question:
Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers? 
Answers:
100 
Responses:
To determine the minimum grade Ahmed needs to get to beat Emily, we first need to figure out the crucial information about the final assignment grades. The important things to know are:

1. The sum of all their grades adds up to 91 (since the final grade is worth the same amount as the other 8 assignments).
2. Emily's final grade is 90.

Let's denote:
- Ahmed's grades: \(A_1, A_2, A_3, A_4, A_5, A_6, A_7, A_8, A_9\)
- Emily's final grade: 90
- The average required total grade for Ahmed: 91
- Every grade is a whole number

To find out the minimum grade Ahmed needs, 

Step,Training Loss
1,0.0
2,0.0
3,0.0
4,-0.0035
5,0.0
6,0.0
7,0.0476
8,0.0
9,0.0
10,0.0


-------------------- Question:
In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard? 
Answers:
375 
Responses:
To find the total number of bones in the graveyard, we need to break down the problem step by step.

1. **Determine the number of adult women and adult men:**
   - Half of the skeletons are adult women.
   - Total skeletons = 20
   - Number of adult women = 20 / 2 = 10
   - Number of adult men = 20 - 10 = 10

2. **Calculate the number of bones per adult woman:**
   - A single adult woman has 20 bones.
   - Number of bones per adult woman = 20

3. **Calculate the number of bones per adult man:**
   - Each adult man has 5 more bones than an adult woman.
   - Number of bones per adult man = 20 (bones per adult woman) 