In [1]:
!pip install peft trackio accelerate datasets transformers trl

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
import os
import shutil
import torch
from accelerate import PartialState
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from trl import (
    ModelConfig,
    PPOConfig,
    PPOTrainer,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import trl
import transformers
import torch
print("trl version: ", trl.__version__)
print("transformers version: ", transformers.__version__)
print("torch veion: ", torch.__version__)

trl version:  0.22.2
transformers version:  4.56.0
torch veion:  2.3.1+cu118


In [4]:
# 本地文件位置
reward_model_path = r"C:/Users/hhm18/Desktop/深度学习/env_DRL/model/reward"
save_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/model"
base_model_path = r"C:/Users/hhm18/Desktop/深度学习/env_DRL/model/QwenQwen2.5-0.5B-Instruct"

# # openbayes
# reward_model_path = r"./home/model/reward"
# save_path = "./home/model"
# base_model_path = r"./home/model/Qwen2-5-0-5b-instruct"

# script_args = {
#     "dataset_name": "trl-internal-testing/descriptiveness-sentiment-trl-style",
#     "dataset_config": None,
#     "dataset_train_split": "descriptiveness",
# }

training_args = PPOConfig(
    output_dir=save_path,
    learning_rate=3e-6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,    # 小的batch减少显存占用
    total_episodes=20,               # 可以先小一点
    num_ppo_epochs=1,
    num_mini_batches=1,
    push_to_hub=False,
    reward_model_path=reward_model_path,
    sft_model_path=base_model_path,
    logging_strategy="no",             # 🚫 关闭日志 python 3.10以上可以不需要一下兼容
    # eval_strategy="no",                # 🚫 不做 eval（想做就改成 "steps"）
    save_strategy="no",                # 🚫 不自动保存 checkpoint
    report_to="none",
    logging_steps=10,                   # 🚫 禁用所有集成
)

model_args = ModelConfig(
    model_name_or_path=base_model_path,
    trust_remote_code=True,     # 如果模型自带自定义 tokenizer 代码
    torch_dtype=torch.float16
)

In [5]:
# -----------------------------
# 模型和 Tokenizer
# -----------------------------
# torch_dtype = (
#     model_args.torch_dtype
#     if model_args.torch_dtype in ["auto", None]
#     else getattr(torch, model_args.torch_dtype)
# )

quantization_config = get_quantization_config(model_args)
model_kwargs = dict(
    revision=model_args.model_revision,
    attn_implementation=model_args.attn_implementation,
    torch_dtype=model_args.torch_dtype,
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    padding_side="left",
    trust_remote_code=model_args.trust_remote_code,
)

tokenizer.add_special_tokens({"pad_token": "[PAD]"})
if tokenizer.chat_template is None:
    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

In [6]:
print(repr(base_model_path))   # 应该显示合法绝对路径，不能多/少斜杠
print(os.path.exists(base_model_path))  # 必须 True
print(os.path.isfile(os.path.join(base_model_path, "config.json")))  # 必须 True

'C:/Users/hhm18/Desktop/深度学习/env_DRL/model/QwenQwen2.5-0.5B-Instruct'
True
True


In [7]:
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '[PAD]',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [8]:
value_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path,
    trust_remote_code=model_args.trust_remote_code,
    num_labels=1,
)

In [9]:
value_model

Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896, padding_idx=151645)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)

In [10]:
reward_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path,
    trust_remote_code=model_args.trust_remote_code,
    num_labels=1,
)
reward_model

Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896, padding_idx=151645)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)

In [11]:
policy = AutoModelForCausalLM.from_pretrained(
    training_args.sft_model_path,
    trust_remote_code=model_args.trust_remote_code,
)
# policy.gradient_checkpointing_enable()

In [12]:
from peft import LoraConfig, get_peft_model

# 正确构造 LoRA 配置
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# 应用到模型
policy_model = get_peft_model(policy, peft_config)
# 打印看看
print(policy_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=8

In [13]:
print(policy_model.is_gradient_checkpointing)  

False


In [14]:
peft_config = get_peft_config(model_args)
if peft_config is None:
    ref_policy = AutoModelForCausalLM.from_pretrained(
        training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code
    )
else:
    ref_policy = None

In [15]:
ref_policy

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [16]:
data_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/data"

dataset = load_dataset(data_path, data_files={
    "sentiment": "sentiment-00000-of-00001.parquet",
    "descriptiveness": "descriptiveness-00000-of-00001.parquet",
})

print(dataset)


DatasetDict({
    sentiment: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5480
    })
    descriptiveness: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 5425
    })
})


In [17]:
def prepare_dataset(dataset, tokenizer, text_field="prompt"):
    def tokenize(element):
        outputs = tokenizer(
            element[text_field],
            padding=False,
        )
        return {"input_ids": outputs["input_ids"]}
    
    return dataset.map(
        tokenize,
        batched=True,
        remove_columns=dataset.column_names,
    )


# with PartialState().local_main_process_first():  #分布式
train_dataset = prepare_dataset(dataset["descriptiveness"], tokenizer).select(range(100))
eval_dataset = prepare_dataset(dataset["sentiment"], tokenizer).select(range(50))

In [18]:
print(len(train_dataset), train_dataset,
      "/n", len(eval_dataset), eval_dataset)

100 Dataset({
    features: ['input_ids'],
    num_rows: 100
}) /n 50 Dataset({
    features: ['input_ids'],
    num_rows: 50
})


In [19]:
# -----------------------------
# 训练
# -----------------------------

# 🚫 禁用 trackio / wandb 等日志
os.environ["DISABLE_TRACKIO"] = "1"
os.environ["WANDB_DISABLED"] = "true"


trainer = PPOTrainer(
    args=training_args,
    processing_class=tokenizer,
    model=policy_model,
    ref_model=ref_policy,
    reward_model=reward_model,
    value_model=value_model,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    peft_config=peft_config,
)

trainer.train()

===training policy===


  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 

In [None]:
trainer.save_model(training_args.output_dir)