In [None]:
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import trl
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig, get_peft_model, TaskType

# System message placed at the start of every training prompt (see
# process_data). The text (in Chinese) instructs the model to reply with a
# brief <think>...</think> section followed by an <answer>...</answer> section
# that contains only the final numeric answer. The reward functions in this
# file check completions against this tag layout.
SYSTEM_PROMPT = """
按照如下格式生成，思考过程要简洁直接：
<think>
[简要分析问题，列出关键信息和解题步骤，不要重复题目内容]
</think>
<answer>
[仅写最终数字答案]
</answer>
"""

def process_data(data):
    """Attach a chat-style prompt and the reference answer to every example.

    Args:
        data: An object exposing ``map(fn)`` (presumably a Hugging Face
            ``Dataset`` -- confirm against the caller). Each example must
            provide the keys ``'question_zh-cn'`` and ``'answer_only'``.

    Returns:
        Whatever ``data.map`` returns; the mapper emits two fields:
        ``'prompt'`` (system message followed by the user question) and
        ``'answer'`` (copied from ``'answer_only'``).
    """
    def _to_chat_example(example):
        # System instruction first, then the question as the user turn.
        messages = [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': example['question_zh-cn']},
        ]
        return {'prompt': messages, 'answer': example['answer_only']}

    return data.map(_to_chat_example)

    
# One numeric literal: either a thousands-separated number ("1,000") or a plain
# run of digits, each with an optional decimal part ("3.5"). A trailing
# sentence period ("75.") is not consumed because the decimal part requires
# at least one digit after the dot.
_NUMBER_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?")
# A properly closed <answer>...</answer> block (case-insensitive, multi-line).
_CLOSED_ANSWER_RE = re.compile(r"<answer>\s*(.*?)\s*</answer>", re.I | re.S)
# An <answer> tag that is never closed: capture everything up to the end.
_OPEN_ANSWER_RE = re.compile(r"<answer>\s*(.*?)$", re.I | re.S)


def _last_number(fragment: str) -> str:
    """Return the last number in *fragment* without thousands separators, or ""."""
    numbers = _NUMBER_RE.findall(fragment)
    return numbers[-1].replace(",", "") if numbers else ""


def extract_answer(text: str) -> str:
    """Extract the model's final numeric answer from a completion.

    Lookup order:
      1. The first closed ``<answer>...</answer>`` block: return the last
         number inside it, or ``""`` if it holds no number. Text after
         ``</answer>`` is never inspected, so stray digits there cannot be
         mistaken for the answer.
      2. Otherwise, an unclosed ``<answer>`` tag (e.g. a truncated
         completion): return the last number between the tag and the end
         of the text.
      3. Otherwise return ``""``; numbers outside any answer tag (such as
         those restated from the question) are deliberately ignored.

    Numbers are returned whole: ``"1,000"`` becomes ``"1000"`` and ``"3.5"``
    stays ``"3.5"`` instead of being split at the separator.

    Args:
        text: Raw completion text.

    Returns:
        The extracted number as a string, or ``""`` when none is found.
    """
    closed = _CLOSED_ANSWER_RE.search(text)
    if closed:
        return _last_number(closed.group(1))

    unclosed = _OPEN_ANSWER_RE.search(text)
    if unclosed:
        return _last_number(unclosed.group(1))

    return ""

def mark_num(text):
    """Score how many of the four format markers appear exactly once.

    Each of ``<think>``, ``</think>``, ``<answer>`` and ``</answer>`` --
    immediately followed by a newline -- contributes 0.125 when it occurs
    exactly one time in *text*.

    Returns:
        The accumulated score, between 0 and 0.5.
    """
    markers = ("<think>\n", "</think>\n", "<answer>\n", "</answer>\n")
    score = 0
    for marker in markers:
        if text.count(marker) == 1:
            score += 0.125
    return score

def correctness_reward(prompts, completions, answer, **kwargs):
    """Reward completions whose extracted answer equals the reference answer.

    Args:
        prompts: Per-sample message lists; only the last message of the first
            prompt is read, for logging.
        completions: Per-sample lists whose first element is a dict with a
            ``'content'`` string.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        One float per sample: 2.0 when the extracted answer equals
        ``str(reference)``, otherwise 0.0.

    Side effects:
        Prints a short debug summary for the first sample of the batch.
    """
    texts = [item[0]['content'] for item in completions]
    predicted = [extract_answer(body) for body in texts]

    # Compact debug output: only the first sample of the batch is shown.
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"问题: {prompts[0][-1]['content'][:100]}...")  # first 100 chars only
    print(f"正确答案: {answer[0]}")

    # A closing </answer> tag is taken as the sign of a complete output.
    first_text = texts[0]
    if "</answer>" in first_text.lower():
        print("✓ 输出完整")
        # Show just the answer section.
        shown = re.search(r"<answer>(.*?)</answer>", first_text, re.I | re.S)
        if shown:
            print(f"模型答案部分: {shown.group(1).strip()}")
    else:
        print("✗ 输出被截断")
        print(f"输出长度: {len(first_text)} 字符")
        # Show only the last 100 characters.
        print(f"输出结尾: ...{first_text[-100:]}")

    print(f"提取答案: {predicted[0]}")

    rewards = []
    for guess, gold in zip(predicted, answer):
        rewards.append(2.0 if guess == str(gold) else 0.0)

    print(f"正确性奖励: {rewards[0]}")
    print(f"{divider}\n")

    return rewards

def digit_reward(completions, **kwargs):
    """Reward completions whose extracted answer consists only of digits.

    Returns:
        One float per completion: 0.5 when the extracted answer is a
        non-empty all-digit string, otherwise 0.0.
    """
    rewards = []
    for completion in completions:
        extracted = extract_answer(completion[0]['content'])
        # str.isdigit() is already False for the empty string.
        rewards.append(0.5 if extracted.isdigit() else 0.0)
    return rewards

def hard_format_reward(completions, **kwargs):
    """Strict format reward.

    Gives 0.5 to a completion that matches, from its very first character,
    the newline-delimited layout
    ``<think>\\n...\\n</think>\\n<answer>\\n...\\n</answer>\\n`` and 0.0 to
    everything else.
    """
    strict_layout = re.compile(
        r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>\n$", re.S
    )
    rewards = []
    for completion in completions:
        matched = strict_layout.match(completion[0]["content"])
        rewards.append(0.5 if matched else 0.0)
    return rewards

def soft_format_reward(completions, **kwargs):
    """Lenient format reward.

    Gives 0.5 to a completion that contains, anywhere in its text, a
    ``<think>...</think>`` section followed (with only whitespace in
    between) by an ``<answer>...</answer>`` section; 0.0 otherwise.
    """
    loose_layout = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.S)
    rewards = []
    for completion in completions:
        found = loose_layout.search(completion[0]["content"])
        rewards.append(0.5 if found else 0.0)
    return rewards

def mark_reward(completions, **kwargs):
    """Per-marker reward (a denser signal than the all-or-nothing format rewards).

    Returns:
        ``mark_num`` applied to the text of each completion, in order.
    """
    return [mark_num(completion[0]["content"]) for completion in completions]

def truncation_penalty(completions, **kwargs):
    """Penalize completions that never finish their answer section.

    Tag detection is case-insensitive. Per completion:
      * no ``<answer>`` tag at all          -> -2.0 (heaviest penalty)
      * ``<answer>`` without ``</answer>``  -> -1.0
      * both tags present                   ->  0.0 (no penalty)
    """
    def _penalty_for(content):
        lowered = content.lower()
        if "<answer>" not in lowered:
            # The answer section was never even opened.
            return -2.0
        if "</answer>" not in lowered:
            # Opened but never closed.
            return -1.0
        return 0.0

    return [_penalty_for(completion[0]["content"]) for completion in completions]

def length_penalty(completions, **kwargs):
    """Length-based shaping term that favours concise completions.

    Based on the character count of each completion:
      * up to 500 characters   -> +0.1 (small bonus)
      * 501 to 600 characters  -> -0.1
      * more than 600          -> -0.3
    """
    def _score(n_chars):
        if n_chars <= 500:
            return 0.1
        if n_chars <= 600:
            return -0.1
        return -0.3

    return [_score(len(completion[0]["content"])) for completion in completions]



# Script entry point: load the base model, tokenizer and dataset from local
# paths, configure GRPO training with the reward functions defined in this
# file, train, and save the result to output_dir.
if __name__ == '__main__':
    model_name = "/root/autodl-tmp/base_models/Qwen3-0.6B"
    model = AutoModelForCausalLM.from_pretrained(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fall back to the EOS token for padding when the tokenizer defines no
    # pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset("/root/autodl-tmp/llm_study/deepseek_learn/datasets/gsm8k_chinese")
    # Only the 'train' split is used; process_data adds 'prompt' and 'answer'.
    data = process_data(ds['train'])

    output_dir = "output_v3"

    training_args = GRPOConfig(
        output_dir=output_dir,
        learning_rate=5e-6,
        adam_beta1=0.9,
        adam_beta2=0.99,
        weight_decay=0.1,
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        logging_steps=1,
        bf16=True,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        generation_batch_size=8,
        num_generations=8,
        max_prompt_length=256,
        # NOTE(review): length_penalty counts characters while this limit is
        # presumably in tokens -- confirm the two thresholds are compatible.
        max_completion_length=768,
        num_train_epochs=1,
        save_steps=400,
        max_grad_norm=0.1,
        log_on_each_node=False,
        use_vllm=False,
        report_to="tensorboard",
        gradient_checkpointing=True,
        # Generation settings
        temperature=0.7,  # lower temperature for more deterministic generation
        top_p=0.9,
        # Stop string that would end generation at </answer> (currently disabled):
        # stop_strings=["</answer>\n"],
    )

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            truncation_penalty,  # truncation check first (negative reward)
            length_penalty,  # encourages concise output
            mark_reward,
            soft_format_reward,
            hard_format_reward,
            digit_reward,
            correctness_reward
        ],
        args=training_args,
        train_dataset=data,
    )

    trainer.train()
    trainer.save_model(output_dir)

[2025-10-06 12:55:40,851] [INFO] [real_accelerator.py:260:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'

[2025-10-06 12:55:41,590] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.



问题: 艾哈迈德和艾米丽正在进行一场比赛，看谁能获得班上最好的成绩。共有 9 项作业，艾哈迈德在课堂上得了 91 分。 Emily 的得分为 92。最终作业的价值与所有其他作业的价值相同。艾米丽的期末作业得了...
正确答案: 100
✗ 输出被截断
输出长度: 1182 字符
输出结尾: ...作业比90和92都高。但题目问的是艾哈迈德需要最低的成绩，也就是他需要的最低得分，才能击败艾米丽。如果所有作业的价值相同，那么艾哈迈德的最低得分就是91，因为他已经得了91。但这样艾米丽的期末作业是9
提取答案: 
正确性奖励: 0.0



Step,Training Loss
1,0.133
2,0.2115
3,0.1769
4,0.3161
5,0.1275
6,0.0418
7,0.249
8,-0.0839
9,0.374
10,0.4989



问题: 水箱上的水表显示水箱已充满 1/3 的水。为了填充水箱，添加了 16 加仑的水。水箱满后可容纳多少加仑的水？...
正确答案: 24
✗ 输出被截断
输出长度: 1072 字符
输出结尾: ... + 1/3，即50/3，但这样可能答案是50/3，即约16.666...加仑。但可能题目中的答案需要以整数呈现，这时候可能需要重新考虑。

或者，可能问题中的“已充满1/3”指的是水箱原本有1/3的
提取答案: 
正确性奖励: 0.0


问题: 学校食堂有15张桌子。每张桌子可容纳10人。通常，只有 1/10 的座位空着。通常有多少个座位？...
正确答案: 135
✗ 输出被截断
输出长度: 877 字符
输出结尾: ...座位空着”，那么空着的是总座位数的1/10，因此实际坐满的座位数是总座位数的9/10。因此，计算方法是总座位数乘以9/10，即150*9/10=135。所以答案应该是135。
</think>
135
提取答案: 
正确性奖励: 0.0


问题: Fiona 一小时内完成了 36 道数学题。雪莉在同一时间内完成了两倍的数学题，基亚娜完成了菲奥娜和雪莉数学题总和的一半。如果接下来的一小时内她们每个人都做了相同数量的问题，那么三个女孩在 2 小时内...
正确答案: 324
✓ 输出完整
模型答案部分: 324
提取答案: 324
正确性奖励: 2.0


问题: 墓地里有20具骷髅。这些骷髅中有一半是成年女性，其余的则由成年男性和儿童各占一半。如果一个成年女性身上有20块骨头，一个男性比这个多5块，而一个孩子的骨头数量是成年女性的一半，那么墓地里有多少块骨头？...
正确答案: 375
✓ 输出完整
模型答案部分: 325
提取答案: 325
正确性奖励: 0.0


问题: 林恩买了 7 本关于猫的书和 2 本关于太阳系的书。她还买了3本杂志。每本书售价 7 美元，每本杂志售价 4 美元。林恩总共花了多少钱？...
正确答案: 75
✗ 输出被截断
输出长度: 313 字符
输出结尾: ...需要检查一下计算是否正确。先算猫和太阳系的书，7*7=49，2*7=14，总和是63美元。然后杂志3*4=12美元，加起来总共是63+12=75美元。所以答案应该是75美元。
</think>

75
提取答案: 
正确性奖励: 0.0


问题: