# Text to text DPO

在选择模型的数据集的过程中：首先尝试了 encoder-decoder 架构的 t5-small ，但是模型的生成效果不好。于是改用作业中 work 的 Qwen2.5-0.5B-Instruct。尝试了 Dahoas/rm-static，Anthropic/hh-rlhf 数据集，但是他们都是多轮对话，同时 string 长度较大，针对 Human 的提问手动设置 label 并不容易。因此我们选择：**Intel/orca_dpo_pairs** 数据集。其中有

此外，我们使用了 **Qwen-2.5-0.5B-Instruct** 模型

### 准备工作


In [None]:
import os
# 使用 hf-mirror 下载模型
os.environ["HF_HUB_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import wandb
wandb.login(key="My key")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device=torch.device("npu:0")
print("my device is:",device)

RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
partially initialized module 'torch_npu' has no attribute 'npu' (most likely due to a circular import)

### 设置模型、数据集和超参数

In [None]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
ref_model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # 参考策略
# dataset_name="Dahoas/rm-static"
dataset_name="Intel/orca_dpo_pairs" # 采用单轮对话
save_dir = "./qwen-ft"
max_length=512

batch_size = 8
learning_rate = 2e-5
num_epochs = 2
beta = 0.01  # DPO 温度参数

### 加载模型

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device) # 显存占用 50 G
ref_model = AutoModelForCausalLM.from_pretrained(ref_model_name).eval().to(device)
model.config.sliding_window = None  # 显式禁用
ref_model.config.sliding_window = None  # 显式禁用
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

RuntimeError: Initialize:build/CMakeFiles/torch_npu.dir/compiler_depend.ts:247 NPU function error: at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, precision_mode), error code is 500001
[ERROR] 2025-05-09-10:44:20 (PID:2869731, Device:0, RankID:-1) ERR00100 PTA call acl api failed
[Error]: The internal ACL of the system is incorrect.
        Rectify the fault based on the error information in the ascend log.


### 加载数据集

In [None]:
class PreferenceDataset(Dataset):
    def __init__(self, tokenizer, split="train", max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.dataset = load_dataset(dataset_name, split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        system_msg = sample["system"]
        user_msg = sample["question"]
        better_response = sample["chosen"]
        worse_response = sample["rejected"]

        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
        ]

        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        text_tokens = self.tokenizer(text, add_special_tokens=False)
        text_len = len(text_tokens["input_ids"])

        # 拼接 better
        better_full = text + better_response
        better_enc = self.tokenizer(
            better_full,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # 使用了 apply_chat_template，它会自动在结尾加上 <|assistant|>\n
        # 拼接 worse
        worse_full = text + worse_response
        worse_enc = self.tokenizer(
            worse_full,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids_better": better_enc["input_ids"].squeeze(0),
            "attention_mask_better": better_enc["attention_mask"].squeeze(0),
            "input_ids_worse": worse_enc["input_ids"].squeeze(0),
            "attention_mask_worse": worse_enc["attention_mask"].squeeze(0),
            # "text_ids": text_tokens["input_ids"].squeeze(0),
            "text_len": text_len  
        }
train_dataset = PreferenceDataset(tokenizer,split="train",max_length=max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
'''data set'''

print(train_dataset.dataset[0])
print("the length of the train dataset is:",len(train_dataset.dataset))


wandb.init(
    project="dpo-training",
    name="run_exp",
    config={
        "model_name": model_name,
        "ref_model_name": ref_model_name,
        "dataset_name": dataset_name,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
        "beta": beta,
        "max_length": max_length, # in dataset prompt 256+ response 256
    }
)

{'prompt': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:', 'response': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.', 'chosen': ' Yes, you can do that to help the cloth pick up even more dirt from the screen. Be sure to always use a clean, soft cloth, not a piece of scratchy, roughened, or textured material, and make sure it’s lint-free.', 'rejected': ' Yes, you can spray 



### 设置 Template

### DPO loss
$$
\mathcal{L}_{DPO}(\pi_{\theta};\pi_{ref})=-\mathbb{E}_{(x,y_w,y_l)\sim D}\left[\log \sigma(\beta\log \frac{\pi_{\theta}(y_w | x)}{\pi_{ref}(y_w | x)}-\beta\log \frac{\pi_{\theta}(y_l | x)}{\pi_{ref}(y_l | x)})\right]
$$
经过模型的输出：
$$
out\_worse.loss=\dfrac{1}{N}\sum\limits_{i=1}^N-\log P_{model}(y_i|x_i)
$$
例如我们在求 $\log \pi_{\theta}(y_w | x)$ 时使用：
$$
-out\_worse\_theta.loss
$$

In [None]:
def dpo_loss(logp_theta_w, logp_ref_w, logp_theta_l, logp_ref_l, beta):
    
    diff = beta * ((logp_theta_w - logp_ref_w) - (logp_theta_l - logp_ref_l))
    # 负对数 sigmoid
    loss = -F.logsigmoid(diff).mean()
    return loss

### Train

In [None]:
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    total_loss = 0.0
    total_batches = len(train_dataloader)
    for batch_idx, batch in enumerate(progress_bar):
        

        better_ids   = batch["input_ids_better"].to(device)      # [B, Lp]
        better_att  = batch["attention_mask_better"].to(device)
        worse_ids    = batch["input_ids_worse"].to(device)      # [B, Lc]
        worse_att   = batch["attention_mask_worse"].to(device)
        # prompt_ids   = batch["text_ids"].to(device)    # [B, Lr]
        prompt_len   = batch["text_len"].to(device)

        # make labels
        B, L = better_ids.shape
        w_labels = better_ids.clone()
        token_pos = torch.arange(L, device=device).unsqueeze(0).expand(B, L)
        w_labels[token_pos < prompt_len.unsqueeze(1)] = -100
        l_labels = worse_ids.clone()
        l_labels[token_pos < prompt_len.unsqueeze(1)] = -100

        # === Get better log-probs ===
        out_better = model(
            input_ids=better_ids,
            attention_mask=better_att,
            labels= w_labels,
            return_dict=True
        )
        # token_w = (token_pos >= prompt_len.unsqueeze(1)).sum(dim=1).float().clamp(min=1.)
        # average loss for every token
        logp_w_theta = -out_better.loss

        with torch.no_grad():
            out_better_ref = ref_model(
                input_ids=better_ids,
                attention_mask=better_att,
                labels= w_labels,
                return_dict=True
            )
            logp_w_ref = -out_better_ref.loss

        # === Get better log-probs ===
        out_worse = model(
            input_ids=worse_ids,
            attention_mask=worse_att,
            labels= l_labels,
            return_dict=True
        )
        # token_l = (token_pos >= prompt_len.unsqueeze(1)).sum(dim=1).float().clamp(min=1.)
        logp_l_theta = -out_worse.loss

        with torch.no_grad():
            out_worse_ref = ref_model(
                input_ids=worse_ids,
                attention_mask=worse_att,
                labels= l_labels,
                return_dict=True
            )
            logp_l_ref = -out_worse_ref.loss

        # ===== DPO loss + backward =====

        loss = dpo_loss(logp_w_theta, logp_w_ref, logp_l_theta, logp_l_ref, beta)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / (batch_idx + 1)

        # 更新进度条显示
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'avg_loss': f'{avg_loss:.4f}'
        })

        if batch_idx % 10 == 0:
            wandb.log({"loss": loss.item(), "avg_loss": avg_loss})

Epoch 0 - Loss: 0.2117
Epoch 0 - Loss: 0.8887
Epoch 0 - Loss: 5.6476
Epoch 0 - Loss: 0.1030
Epoch 0 - Loss: 2.2070
Epoch 0 - Loss: 2.2786
Epoch 0 - Loss: 0.0949
Epoch 0 - Loss: 1.1465
Epoch 0 - Loss: 0.4953
Epoch 0 - Loss: 8.6514
Epoch 0 - Loss: 0.0001
Epoch 0 - Loss: 10.8875
Epoch 0 - Loss: 18.6852


KeyboardInterrupt: 

### Save model

In [None]:
os.makedirs(save_dir, exist_ok=True)


wandb.save(os.path.join(save_dir, "pytorch_model.bin"))
wandb.finish()
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

### Generation

In [None]:
system_msg = "You are an AI assistant that helps people find information."
user_msg   = "Where would the best place to drive over the speed limit be?"

messages = [
    {"role": "system", "content": system_msg},
    {"role": "user", "content": user_msg}
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)

model.eval()
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    ref_output_ids = ref_model.generate(**inputs, max_new_tokens=100, do_sample=False)

output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
ref_output_text = tokenizer.decode(ref_output_ids[0], skip_special_tokens=True)

print("=== Model Output ===")
print(output_text)
print("\n=== Reference Model Output ===")
print(ref_output_text)