In [1]:
# train_dpo.py
import torch
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

In [2]:
# Local filesystem locations used by this run: the base checkpoint directory,
# the directory holding the preference data, and the training output directory.
# NOTE(review): these are absolute, user-specific paths; adjust them before
# running on another machine.
model_path = r"C:/Users/hhm18/Desktop/深度学习/env_DRL/model/QwenQwen2.5-0.5B-Instruct"
data_path = "C:/Users/hhm18/Desktop/深度学习/env_DRL/data"
save_path = "C:/Users/hhm18/Desktop/深度学习/model/Qwen-dpo"
# Backward-compatible alias for the original misspelled name, which the
# training-config cell still reads. Prefer `save_path` in new code.
save_pah = save_path

# Tokenizer that ships with the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ask bitsandbytes to load the base weights in 8-bit.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the causal LM from the same checkpoint with the 8-bit quantization
# config, requesting float16 as the load dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    dtype=torch.float16,
    quantization_config=quantization_config,
)

In [3]:
# Upper bound on the number of training rows; a small subset keeps the run short.
N_TRAIN_SAMPLES = 100

# Read the "train" split from train.jsonl.gz inside `data_path`.
dataset_full = load_dataset(
    data_path,
    data_files={"train": "train.jsonl.gz"},
    split="train",
)

# Keep the first N_TRAIN_SAMPLES rows. The bound is clamped to the dataset
# length so a file with fewer rows is used in full instead of making
# `select` fail on an out-of-range index.
dataset = dataset_full.select(range(min(N_TRAIN_SAMPLES, len(dataset_full))))

# Show the dataset summary and the loaded model architecture.
print(dataset, model)

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 100
}) Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear8bitLt(in_features=896, out_features=896, bias=True)
          (k_proj): Linear8bitLt(in_features=896, out_features=128, bias=True)
          (v_proj): Linear8bitLt(in_features=896, out_features=128, bias=True)
          (o_proj): Linear8bitLt(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear8bitLt(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear8bitLt(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm

In [4]:
# Training arguments for the DPO run.
dpo_config = DPOConfig(
    output_dir=save_pah,
    learning_rate=5e-5,
    per_device_train_batch_size=1,   # single GPU
    gradient_accumulation_steps=8,   # effective batch size = 1 * 8
    beta=0.1,
    # `max_steps` is the only run-length control here. A positive `max_steps`
    # overrides `num_train_epochs`, so the previous `num_train_epochs=2` was
    # silently ignored (the recorded run stopped at step 100, epoch 7.72).
    # To train for a fixed number of epochs instead, remove `max_steps` and
    # set `num_train_epochs`.
    max_steps=100,
    save_strategy="steps",
    save_steps=20,                   # checkpoint every 20 optimizer steps
    logging_steps=5,                 # log the training loss every 5 steps
    report_to="tensorboard",
    optim="paged_adamw_8bit",
)

# LoRA adapter settings: rank-4 adapters (alpha 8, dropout 0.05) attached to
# the attention query and value projections only; bias terms are not trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)

In [5]:
# Seed torch before the adapters are created. No earlier cell seeds the RNG,
# and LoRA's default initialisation draws the A matrices randomly, so without
# this the starting adapter weights differ on every kernel restart.
ADAPTER_INIT_SEED = 42
torch.manual_seed(ADAPTER_INIT_SEED)

# Wrap the quantized base model with the LoRA adapters described by peft_config.
# NOTE(review): PEFT's quantization guide suggests calling
# `prepare_model_for_kbit_training` on 8-bit models first; not added here
# because it changes training behavior. Confirm whether it is wanted.
model_peft = get_peft_model(model, peft_config)

# Count parameters on the wrapped model (the object that is actually trained),
# walking the parameter list once.
param_sizes = [(p.numel(), p.requires_grad) for p in model_peft.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
trainable_ratio = 100 * trainable_params / total_params

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Trainable parameters ratio: {trainable_ratio:.4f}%")

Total parameters: 494303104
Trainable parameters: 270336
Trainable parameters ratio: 0.0547%


In [6]:
# Build the DPO trainer around the LoRA-wrapped model. `peft_config` is not
# passed here because `model_peft` already went through `get_peft_model`.
trainer = DPOTrainer(
    args=dpo_config,
    model=model_peft,
    train_dataset=dataset,
    processing_class=tokenizer,
)

In [7]:
# Run DPO fine-tuning. The result is bound to a name so its metrics can be
# reused, and the bare name on the last line keeps it displayed as cell output.
train_result = trainer.train()
train_result

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
5,0.6998
10,0.6854
15,0.6912
20,0.6783
25,0.6791
30,0.6585
35,0.6553
40,0.6664
45,0.6703
50,0.631




TrainOutput(global_step=100, training_loss=0.643120551109314, metrics={'train_runtime': 1574.0753, 'train_samples_per_second': 0.508, 'train_steps_per_second': 0.064, 'total_flos': 0.0, 'train_loss': 0.643120551109314, 'epoch': 7.72})

![train loss](img/dpo%20train%20loss.png)