# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code 
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

The DPO trainer expects a preference dataset in a specific format: the final dataset object should contain these 3 entries if you use the default `DPODataCollatorWithPadding` data collator.

The entries should be named:
- prompt
- chosen
- rejected

In [1]:
import os
import torch
# Expose only GPU index 2 to this process. This is set before torch first
# queries CUDA below. NOTE(review): the index is machine-specific — adjust
# for the host this notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Route outbound HTTP requests through a proxy.
# NOTE(review): hard-coded address, presumably specific to the author's
# network — confirm before reusing elsewhere.
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device(f"cuda:{1}" if torch.cuda.is_available() else "cpu")
# Bare expression: shows the selected device as the notebook cell output.
device

device(type='cuda')

In [2]:
torch.__version__

'2.2.2+cu121'

In [3]:
# Toy preference data illustrating the three-column layout: element i of each
# list forms one (prompt, chosen, rejected) triple, so all three lists have the
# same length. The same prompt may appear several times with different pairs.
# NOTE(review): this dict is not referenced again in the visible notebook —
# it appears to be illustrative only.
dpo_dataset_dict = {
    "prompt": [
        "hello",
        "how are you",
        "What is your name?",
        "What is your name?",
        "Which is the best programming language?",
        "Which is the best programming language?",
        "Which is the best programming language?",
    ],
    "chosen": [
        "hi nice to meet you",
        "I am fine",
        "My name is Mary",
        "My name is Mary",
        "Python",
        "Python",
        "Java",
    ],
    "rejected": [
        "leave me alone",
        "I am not fine",
        "Whats it to you?",
        "I dont have a name",
        "Javascript",
        "C++",
        "C++",
    ],
}

In [4]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer, DPOConfig

# 1. load a pretrained model and tokenizer

In [5]:
def get_model(model_name_or_path: str = "gpt2", ignore_bias_buffers: bool = False):
    """Load a policy model, a reference copy of it, and the matching tokenizer.

    Args:
        model_name_or_path: Identifier or path handed to every
            ``from_pretrained`` call. Defaults to ``"gpt2"``, so calling
            ``get_model()`` with no arguments behaves as before.
        ignore_bias_buffers: If true, record the names of the model's boolean
            buffers in ``model._ddp_params_and_buffers_to_ignore`` (a
            torch-distributed workaround). Defaults to ``False``.

    Returns:
        A ``(model, model_ref, tokenizer)`` tuple. ``model`` and ``model_ref``
        are loaded separately from the same checkpoint, so they do not share
        parameters.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    if ignore_bias_buffers:
        # torch distributed hack
        model._ddp_params_and_buffers_to_ignore = [
            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
        ]

    # Second, independent load: this copy is passed to the trainer as the
    # reference model.
    model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    if tokenizer.pad_token is None:
        # No pad token defined for this tokenizer: reuse EOS so batches can
        # be padded.
        tokenizer.pad_token = tokenizer.eos_token

    return model, model_ref, tokenizer

The DPO trainer expects a model of type `AutoModelForCausalLM`, whereas PPO expects `AutoModelForCausalLMWithValueHead` for the value function.

## 2. Load the Anthropic Helpful-Harmless dataset

In [6]:
def extract_anthropic_prompt(prompt_and_response):
    """Return the text up to and including the last ``\\n\\nAssistant:`` marker.

    Everything after the final marker is treated as the response and dropped.
    An ``AssertionError`` is raised when the marker does not occur at all.
    """
    marker = "\n\nAssistant:"
    marker_start = prompt_and_response.rfind(marker)
    assert marker_start >= 0, f"Prompt and response does not contain '{marker}'"
    # The prompt keeps the marker itself, so cut just past its end.
    prompt_end = marker_start + len(marker)
    return prompt_and_response[:prompt_end]

def get_hh(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset:
    """Load a split of ``Anthropic/hh-rlhf`` and reshape it for DPO training.

    Every row of the returned dataset gains (or has overwritten) three string
    columns:
        'prompt'   - the dialogue up to and including the last
                     ``\\n\\nAssistant:`` marker of the chosen sample,
        'chosen'   - the remainder of the chosen sample,
        'rejected' - the rejected sample with the same number of leading
                     characters removed.

    Args:
        split: Split name forwarded to ``load_dataset``.
        sanity_check: When true, keep only the first 1000 rows (or fewer if
            the split is smaller).
        silent: Accepted for interface compatibility; not used in this body.
        cache_dir: Forwarded to ``load_dataset``.
    """
    hh = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)
    if sanity_check:
        subset_size = min(len(hh), 1000)
        hh = hh.select(range(subset_size))

    def _to_preference_triplet(row) -> Dict[str, str]:
        shared_prompt = extract_anthropic_prompt(row["chosen"])
        cut = len(shared_prompt)
        # The rejected text is cut at the *chosen* prompt's length, i.e. this
        # assumes both samples start with the same prompt.
        return {
            "prompt": shared_prompt,
            "chosen": row["chosen"][cut:],
            "rejected": row["rejected"][cut:],
        }

    return hh.map(_to_preference_triplet)

In [7]:
# Load the train and test splits (in that order), both capped by the
# sanity-check limit inside get_hh.
sanity_check = True
train_dataset, eval_dataset = (
    get_hh(split_name, sanity_check=sanity_check) for split_name in ("train", "test")
)

In [8]:
train_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1000
})

In [9]:
eval_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1000
})

# 3. initialize training arguments:

In [10]:
# Baseline hyperparameters; the experiment cells further down override
# learning_rate, per_device_train_batch_size and beta per run.
learning_rate = 1e-3
per_device_train_batch_size = 8
gradient_accumulation_steps = 1
# Token-length limits forwarded to DPOConfig.
max_length = 512
max_prompt_length = 128
max_target_length = 128
# Ignore index for padded label positions. The convention (PyTorch
# cross-entropy ignore_index, TRL's own default) is -100; the previous value
# of 100 is an ordinary vocabulary id. This variable is not forwarded to
# DPOConfig in the visible cells.
label_pad_token_id = -100
max_steps = 1000
# instrumentation
sanity_check = True
# NOTE(review): None is not the same as the string "none" for report_to in
# transformers — confirm which reporting behaviour is intended.
report_to = None
gradient_checkpointing = None
beta = 0.1

In [11]:
# Shared training configuration. The experiment cells mutate this object in
# place before each run.
training_args = DPOConfig(
    # --- output / reporting ---
    output_dir="./test",
    report_to=report_to,
    logging_first_step=True,
    logging_steps=5,
    # --- evaluation schedule ---
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    # --- optimisation ---
    optim="rmsprop",
    learning_rate=learning_rate,
    warmup_steps=150,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    # bf16=True,
    # --- data handling ---
    remove_unused_columns=False,
    max_length=max_length,
    max_prompt_length=max_prompt_length,
    max_completion_length=max_target_length,
    # --- DPO-specific ---
    beta=beta,
)



# 4. initialize the DPO trainer

In [12]:
def get_dpo_trainer():
    """Build a ``DPOTrainer`` from the notebook's current global state.

    Reads the module-level ``model``, ``model_ref``, ``tokenizer``,
    ``training_args``, ``train_dataset`` and ``eval_dataset`` at call time,
    so it must be called after those names have been (re)assigned.

    Returns:
        The newly constructed trainer.
    """
    return DPOTrainer(
        model=model,
        ref_model=model_ref,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # NOTE(review): newer TRL releases accept this as `processing_class`
        # — confirm against the installed version.
        tokenizer=tokenizer,
    )

# 5. Train

I will experiment with three hyperparameters as follows:
- learning_rate (0.001, 0.01, 0.1)
- per_device_train_batch_size (2,4,8)
- beta (0.1,0.2,0.3)

In [14]:
# Run 0: learning rate 1e-3, batch size 8, beta 0.1.
learning_rate, per_device_train_batch_size, beta = 1e-3, 8, 0.1
output_dir = "./test0"

# Overwrite the shared config in place; get_dpo_trainer() passes it on as-is.
training_args.output_dir = output_dir
training_args.beta = beta
training_args.per_device_train_batch_size = per_device_train_batch_size
training_args.learning_rate = learning_rate

# Reload the models from the checkpoint so this run does not start from a
# previously trained policy.
model, model_ref, tokenizer = get_model()
dpo_trainer = get_dpo_trainer()

  dpo_trainer = DPOTrainer(


In [15]:
dpo_trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,1.3362,2.946879,-16.692886,-19.559902,0.604,2.867015,-283.705658,-334.188507,-30.103504,-27.820219
1000,0.0,4.476468,-30.020247,-34.729416,0.609,4.709165,-416.979248,-485.883606,-71.702415,-67.645248


TrainOutput(global_step=1000, training_loss=1.01318841723894, metrics={'train_runtime': 580.053, 'train_samples_per_second': 13.792, 'train_steps_per_second': 1.724, 'total_flos': 0.0, 'train_loss': 1.01318841723894, 'epoch': 8.0})

In [22]:
dpo_trainer.save_model('./model0.pth')

In [27]:
# Run 1: learning rate 1e-2, batch size 4, beta 0.2.
learning_rate, per_device_train_batch_size, beta = 1e-2, 4, 0.2
output_dir = "./test1"

# Overwrite the shared config in place; get_dpo_trainer() passes it on as-is.
training_args.output_dir = output_dir
training_args.beta = beta
training_args.per_device_train_batch_size = per_device_train_batch_size
training_args.learning_rate = learning_rate

# Reload the models from the checkpoint so this run does not start from a
# previously trained policy.
model, model_ref, tokenizer = get_model()
dpo_trainer = get_dpo_trainer()

  dpo_trainer = DPOTrainer(


In [28]:
dpo_trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,7.6119,10.2327,-36.70858,-45.741634,0.576,9.033051,-300.319702,-367.297668,-10.869419,-10.867895
1000,6.8943,9.678289,-40.634819,-49.387379,0.582,8.752555,-319.950897,-385.526367,-11.941029,-11.876305


TrainOutput(global_step=1000, training_loss=7.809128794193268, metrics={'train_runtime': 413.3771, 'train_samples_per_second': 9.676, 'train_steps_per_second': 2.419, 'total_flos': 0.0, 'train_loss': 7.809128794193268, 'epoch': 4.0})

In [29]:
dpo_trainer.save_model('./model1.pth')

In [30]:
# Run 2: learning rate 1e-1, batch size 2, beta 0.3.
learning_rate, per_device_train_batch_size, beta = 1e-1, 2, 0.3
output_dir = "./test2"

# Overwrite the shared config in place; get_dpo_trainer() passes it on as-is.
training_args.output_dir = output_dir
training_args.beta = beta
training_args.per_device_train_batch_size = per_device_train_batch_size
training_args.learning_rate = learning_rate

# Reload the models from the checkpoint so this run does not start from a
# previously trained policy.
model, model_ref, tokenizer = get_model()
dpo_trainer = get_dpo_trainer()

  dpo_trainer = DPOTrainer(


In [31]:
dpo_trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,203.0885,155.744827,-583.94989,-697.817749,0.553,113.867889,-2063.276367,-2464.648438,-3.729004,-3.728814
1000,3.723,15.010561,-57.645195,-70.634834,0.568,12.989642,-308.927399,-374.03891,-1.612762,-1.61333


TrainOutput(global_step=1000, training_loss=134.4879447426796, metrics={'train_runtime': 190.6906, 'train_samples_per_second': 10.488, 'train_steps_per_second': 5.244, 'total_flos': 0.0, 'train_loss': 134.4879447426796, 'epoch': 2.0})

In [None]:
dpo_trainer.save_model('./model2.pth')

# 6. Experimental Results

In [33]:
import pandas as pd

# One row per run; the loss columns are copied by hand from the step-1000
# rows of the three training logs.
pd.DataFrame(
    {
        "Learning Rate": [0.001, 0.01, 0.1],
        "Batch Size": [8, 4, 2],
        "Beta": [0.1, 0.2, 0.3],
        "Training Loss at 1000 steps": [0, 6.8943, 3.723],
        "Validation Loss at 1000 steps": [4.476468, 9.678289, 15.010561],
    }
)

Unnamed: 0,Learning Rate,Batch Size,Beta,Training Loss at 1000 steps,Validation Loss at 1000 steps
0,0.001,8,0.1,0.0,4.476468
1,0.01,4,0.2,6.8943,9.678289
2,0.1,2,0.3,3.723,15.010561


# 7. Upload Model to Hugging face

In [24]:
# Reload the run-0 model and tokenizer from the path used by
# dpo_trainer.save_model('./model0.pth').
# NOTE(review): despite the .pth suffix this path is handed to
# from_pretrained as a model location — presumably a directory; confirm.
model0 = AutoModelForCausalLM.from_pretrained("./model0.pth")
tokenizer0 = AutoTokenizer.from_pretrained("./model0.pth")

In [25]:
# Publish the model and its tokenizer to the same Hugging Face Hub repository.
# NOTE(review): the repository id is hard-coded to the author's account.
model0.push_to_hub('kaung-nyo-lwin/dpo_gpt2_nlp_a5')
tokenizer0.push_to_hub('kaung-nyo-lwin/dpo_gpt2_nlp_a5')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/kaung-nyo-lwin/dpo_gpt2_nlp_a5/commit/c47f92bbfdbc659dcedd6251e6b508a28a7c49df', commit_message='Upload tokenizer', commit_description='', oid='c47f92bbfdbc659dcedd6251e6b508a28a7c49df', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaung-nyo-lwin/dpo_gpt2_nlp_a5', endpoint='https://huggingface.co', repo_type='model', repo_id='kaung-nyo-lwin/dpo_gpt2_nlp_a5'), pr_revision=None, pr_num=None)

# 8. Discussion

Due to limited compute resources, the model was trained on a subset of the dataset (1,000 examples per split), so it may not perform well on new data. I tested three hyperparameter configurations and uploaded the model with the best performance (learning rate 0.001, batch size 8, beta 0.1). Since increasing the batch size is limited by GPU memory, I experimented to see how training performance declines with decreasing batch size and increasing learning rate and beta value. According to the experimental results, the decline in training performance is significant when the batch size is decreased and the learning rate and beta value are increased. Note that all three hyperparameters were changed together in each run, so the individual effect of each one cannot be isolated from these results.