## 03: DPO

This section explains how to use Hugging Face (`trl`) to run Direct Preference Optimization (DPO)!

We will load the `HumanLLMs/Human-Like-DPO-Dataset` dataset with the `datasets` library.

In [None]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl.trainer.dpo_trainer import PreferenceCollator
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments

import torch
from typing import Union

from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the "train" split of the preference dataset used throughout this notebook.
train_dataset = load_dataset("HumanLLMs/Human-Like-DPO-Dataset", split="train")

Now, just train it directly. We will use the base model `Qwen/Qwen2-0.5B`.

In [None]:
# Load the base policy model and its tokenizer from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")

# Training configuration for trl's DPO trainer.
training_args = DPOConfig(
    output_dir="Qwen2-0.5B-DPO",
    logging_steps=10,
    # Short length cap to keep this demo cheap.
    # NOTE(review): presumably limits prompt + completion tokens -- confirm against the DPOConfig docs.
    max_length=64,
    per_device_train_batch_size=2,
)
# The raw dataset is handed to the trainer as-is; the tokenizer is passed as `processing_class`.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()

Test it

In [None]:
# Quick qualitative check: generate a short continuation with the trained model.
prompt = "What's your favorite actor?"

# Move the tokenized inputs to the device the model actually lives on rather than
# hard-coding "cuda", so the cell also works on CPU-only or non-default-device setups.
output = model.generate(**tokenizer(prompt, return_tensors="pt").to(model.device), max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))


## Deeper understanding!

Like the previous notebook, let's dive deep into the data and loss function

In [None]:
model_name = "Qwen/Qwen2.5-0.5B"  # or your preferred model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so batches can be padded; this matters for
# tokenizers that ship without a dedicated pad token.
tokenizer.pad_token = tokenizer.eos_token

Implement the tokenization (preprocessing) function

In [None]:
def preprocess_function(x, tokenizer):
    """
    Tokenize a single preference example into the fields expected by the
    `PreferenceCollator` in the `trl` library.

    The prompt and the two completions are tokenized separately (not as one
    concatenated string) and without automatically added special tokens, so the
    trainer can later tell prompt tokens apart from completion tokens. An EOS
    token is appended to each completion so the model learns where a response ends.

    Args:
        x: a single example; assumed to be a mapping with string fields
            "prompt", "chosen" and "rejected".
        tokenizer: a callable tokenizer that accepts `add_special_tokens` and
            returns a mapping with an "input_ids" list; its `eos_token_id`
            attribute is appended to the completions when it is not None.

    Returns:
        A dictionary with the following keys:
        - prompt_input_ids: input_ids for the prompt
        - chosen_input_ids: input_ids for the chosen option (plus EOS)
        - rejected_input_ids: input_ids for the rejected option (plus EOS)
    """
    def encode(text):
        # Copy into a fresh list so appending EOS never mutates tokenizer output.
        return list(tokenizer(text, add_special_tokens=False)["input_ids"])

    prompt_input_ids = encode(x["prompt"])
    chosen_input_ids = encode(x["chosen"])
    rejected_input_ids = encode(x["rejected"])

    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    if eos_token_id is not None:
        chosen_input_ids.append(eos_token_id)
        rejected_input_ids.append(eos_token_id)

    return {
        "prompt_input_ids": prompt_input_ids,
        "chosen_input_ids": chosen_input_ids,
        "rejected_input_ids": rejected_input_ids,
    }

In [None]:
# TODO: map and test it!

Test it and see the outputs

In [None]:
# TODO: map and test it!

Questions:
1. Do you notice anything interesting about the data format? (Yes, something is different. Why?)


In [None]:
def pad_to_length(
    tensor: torch.Tensor, length: int, pad_value: Union[int, float], dim: int = -1
) -> torch.Tensor:
    """
    Helper function (adapted from the `trl` library) to pad a tensor to a certain length.

    Padding is appended at the end of dimension `dim`. The result always keeps
    the dtype and device of `tensor`; `pad_value` is cast to that dtype.

    Args:
        tensor: input tensor
        length: desired size along `dim`
        pad_value: value to pad with
        dim: dimension to pad

    Returns:
        `tensor` itself (no copy) if it is already at least `length` long along
        `dim`, otherwise a new tensor of size `length` along `dim`.
    """
    current = tensor.size(dim)
    if current >= length:
        return tensor

    pad_size = list(tensor.shape)
    pad_size[dim] = length - current
    # `new_full` builds the filler directly in the tensor's dtype/device. Multiplying
    # `pad_value` by `torch.ones(...)` would promote e.g. bool masks to int64.
    padding = tensor.new_full(pad_size, pad_value)
    return torch.cat([tensor, padding], dim=dim)

Implement these

# DPO Loss Implementation Breakdown

## 1. Core DPO Loss Formula
The fundamental DPO loss being implemented is:
$$L_{DPO}(\theta) = -\log\sigma\Big(\beta\big[(r_\theta(x,y^+) - r_\theta(x,y^-)) - (r_{\text{ref}}(x,y^+) - r_{\text{ref}}(x,y^-))\big]\Big)$$

## 2. Log Probability Calculation
For each sequence, log probabilities are computed as:
$$r_\theta(x,y) = \frac{1}{|y|}\sum_{t=1}^{|y|} \log P_\theta(y_t|x,y_{<t})$$

## 3. Policy and Reference Model Comparison
The code computes four key components:
1. Chosen policy logprobs: $r_\theta(x,y^+)$
2. Rejected policy logprobs: $r_\theta(x,y^-)$
3. Chosen reference logprobs: $r_{\text{ref}}(x,y^+)$
4. Rejected reference logprobs: $r_{\text{ref}}(x,y^-)$

Then it calculates the log-ratios (differences of log-probabilities):
$$\text{policy\_ratio} = r_\theta(x,y^+) - r_\theta(x,y^-)$$
$$\text{ref\_ratio} = r_{\text{ref}}(x,y^+) - r_{\text{ref}}(x,y^-)$$

## 4. Final Loss Computation

$$L = -\mathbb{E}[\log\sigma(\beta((r_\theta(x,y^+) - r_\theta(x,y^-)) - (r_{\text{ref}}(x,y^+) - r_{\text{ref}}(x,y^-))))]$$



In [None]:
class DPOTrainer(Trainer):
    """Minimal Direct Preference Optimization trainer built on `transformers.Trainer`.

    The policy is the model handed to `Trainer`; a frozen copy of it, taken at
    construction time, serves as the reference model (`self.reference_model`).

    Each batch is expected to provide `prompt_input_ids`, `prompt_attention_mask`,
    `chosen_input_ids`, `chosen_attention_mask`, `rejected_input_ids` and
    `rejected_attention_mask`.
    NOTE(review): this assumes the batch comes from trl's preference collator,
    with prompts left-padded and completions right-padded -- confirm against the
    collator in use.

    Note: this class reuses the name of `trl.DPOTrainer`, so it shadows that
    import once defined.
    """

    def __init__(self, processing_class, beta=0.1,  *args, **kwargs):
        """
        Args:
            processing_class: tokenizer; its `pad_token_id` is used to pad completions.
            beta: scale applied to the policy-vs-reference log-ratio difference.
            *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
        """
        super().__init__(*args, **kwargs)
        self.beta = beta
        self.processing_class = processing_class  # tokenizer

        # Frozen snapshot of the initial policy: never updated, never in train mode.
        self.reference_model = deepcopy(self.model).eval()
        self.reference_model.requires_grad_(False)

    def compute_loss(self, model, inputs, **kwargs):
        """
        Implementation follows trl's DPOTrainer `compute_loss` method (with a slight modification).

        Args:
            model: the policy model being trained.
            inputs: batch dictionary (see the class docstring for the expected keys).
            **kwargs: extra arguments passed by `Trainer`; accepted and ignored.

        Returns:
            Scalar DPO loss averaged over the batch.
        """
        prompt_input_ids = inputs["prompt_input_ids"]
        prompt_attention_mask = inputs["prompt_attention_mask"]
        chosen_input_ids = inputs["chosen_input_ids"]
        chosen_attention_mask = inputs["chosen_attention_mask"]
        rejected_input_ids = inputs["rejected_input_ids"]
        rejected_attention_mask = inputs["rejected_attention_mask"]
        batch_size = prompt_input_ids.shape[0]

        # Repeat the prompt once per completion: rows [0, B) pair with the chosen
        # completions and rows [B, 2B) with the rejected ones.
        prompt_input_ids = torch.cat([prompt_input_ids, prompt_input_ids], dim=0)
        prompt_attention_mask = torch.cat(
            [prompt_attention_mask, prompt_attention_mask], dim=0
        )

        # Pad chosen/rejected completions to a common length along the sequence
        # dimension so they can be stacked along the batch dimension.
        max_completion_length = max(
            chosen_input_ids.shape[1], rejected_input_ids.shape[1]
        )
        pad_token_id = self.processing_class.pad_token_id
        if pad_token_id is None:
            # Any valid id works: padded positions are excluded by both masks.
            pad_token_id = 0
        completion_input_ids = torch.cat(
            [
                pad_to_length(chosen_input_ids, max_completion_length, pad_token_id),
                pad_to_length(rejected_input_ids, max_completion_length, pad_token_id),
            ],
            dim=0,
        )
        completion_attention_mask = torch.cat(
            [
                pad_to_length(chosen_attention_mask, max_completion_length, 0),
                pad_to_length(rejected_attention_mask, max_completion_length, 0),
            ],
            dim=0,
        )

        # Concatenate the prompt and completions along the sequence dimension.
        input_ids = torch.cat([prompt_input_ids, completion_input_ids], dim=1)
        attention_mask = torch.cat(
            [prompt_attention_mask, completion_attention_mask], dim=1
        )

        # Mask the prompt but not the completion for the loss
        # illustration: if the input is [p,p,c,c,c], the loss mask will be [0,0,1,1,1]
        loss_mask = torch.cat(
            [torch.zeros_like(prompt_attention_mask), completion_attention_mask],
            dim=1,
        )

        # Memory optimization: flush left to reduce memory usage.
        # The concatenated rows are padded on both sides (left-padded prompt,
        # right-padded completion). Rolling each row so that it starts at its first
        # real token moves all padding to the right:
        # [0, 0, 1, 2, 3, 0] -> [1, 2, 3, 0, 0, 0]
        for i in range(attention_mask.size(0)):
            nonzero_indices = torch.nonzero(attention_mask[i])
            if len(nonzero_indices) > 0:
                first_one_idx = nonzero_indices[0].item()
                input_ids[i] = torch.roll(input_ids[i], shifts=-first_one_idx)
                attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)
                loss_mask[i] = torch.roll(loss_mask[i], shifts=-first_one_idx)

        # After the roll, columns that are padding in every row form a suffix;
        # dropping them is what actually saves memory in the forward passes.
        empty_cols = attention_mask.sum(dim=0) == 0
        if empty_cols.any():
            first_empty_col = torch.nonzero(empty_cols)[0].item()
            input_ids = input_ids[:, :first_empty_col]
            attention_mask = attention_mask[:, :first_empty_col]
            loss_mask = loss_mask[:, :first_empty_col]

        # Forward pass through policy model
        policy_logits = model(
            input_ids=input_ids, attention_mask=attention_mask, use_cache=False
        ).logits

        # Forward pass through reference model (frozen, so no graph is needed)
        with torch.no_grad():
            ref_logits = self.reference_model(
                input_ids=input_ids, attention_mask=attention_mask, use_cache=False
            ).logits

        # Get log probabilities
        policy_logps = self._get_logprobs(
            policy_logits, input_ids, attention_mask, loss_mask
        )
        ref_logps = self._get_logprobs(ref_logits, input_ids, attention_mask, loss_mask)

        # Split logprobs into chosen and rejected
        chosen_policy_logps = policy_logps[:batch_size]
        rejected_policy_logps = policy_logps[batch_size:]
        chosen_ref_logps = ref_logps[:batch_size]
        rejected_ref_logps = ref_logps[batch_size:]

        # Compute policy and reference (log-)ratios
        policy_ratio = chosen_policy_logps - rejected_policy_logps
        ref_ratio = chosen_ref_logps - rejected_ref_logps

        # Compute the loss: -log(sigmoid(beta * (policy_ratio - ref_ratio))), batch mean
        loss = -torch.nn.functional.logsigmoid(
            self.beta * (policy_ratio - ref_ratio)
        ).mean()

        return loss

    def _get_logprobs(self, logits, input_ids, attention_mask, loss_mask):
        """Compute sequence-level log probabilities.
        Args:
            logits: Logits from the model. Shape: (batch_size, seq_len, vocab_size)
            input_ids: Input IDs. Shape: (batch_size, seq_len)
            attention_mask: Attention mask. Shape: (batch_size, seq_len)
            loss_mask: Loss mask for the prompt. Shape: (batch_size, seq_len)
        Returns:
            sequence_logprobs: Sequence-level log probabilities, averaged over the
                completion tokens of each row. Shape: (batch_size,)
        """
        # Shift for next token prediction: logits at position t score token t + 1
        shift_logits = logits[:, :-1, :]
        shift_labels = input_ids[:, 1:]

        # Get log probs of the tokens that actually occur in the sequence
        log_probs = torch.log_softmax(shift_logits, dim=-1)
        token_logps = torch.gather(
            log_probs, dim=2, index=shift_labels.unsqueeze(-1)
        ).squeeze(-1)

        # Only consider tokens that are part of the completion (not prompt, not padding)
        completion_mask = (loss_mask[:, 1:] * attention_mask[:, 1:]).to(
            token_logps.dtype
        )
        token_logps = token_logps * completion_mask

        # Normalize by sequence length; the clamp guards rows with no completion tokens
        sequence_logprobs = token_logps.sum(dim=-1) / completion_mask.sum(dim=-1).clamp(
            min=1
        )

        return sequence_logprobs


In [None]:
# Load a fresh policy model for the hand-written trainer.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

training_args = TrainingArguments(
    output_dir="./dpo_trained_model",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    # NOTE(review): bf16 presumably requires hardware with bfloat16 support -- confirm for your device.
    bf16=True,
    logging_steps=10,
    save_strategy="no",
    warmup_steps=100,
    # Keep every dataset column so the collator still receives the prompt/chosen/rejected fields.
    remove_unused_columns=False
)

# Initialize and run the DPO trainer.
# NOTE: `tokenized_train_dataset` and `collator` are not defined in the cells shown
# here; they have to be created in the earlier "TODO: map and test it!" cells.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    data_collator=collator,  # Use the PreferenceCollator
    beta=0.1,
    processing_class=tokenizer,
)

trainer.train()

In [None]:
# Prompt batch for a qualitative comparison.
# NOTE(review): `input` shadows the Python builtin of the same name; consider a
# different name such as `prompts` if nothing else depends on this one.
input = ["What's your favorite actor ?"]

# TODO: test the model and compare with the reference model (use model and trainer.reference_model)

In [None]:
# compare with the reference model
