# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code 
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

The DPO trainer expects a preference dataset in a specific format: if you use the default `DPODataCollatorWithPadding` data collator, the final dataset object should contain these 3 entries.

The entries should be named:
- prompt
- chosen
- rejected

In [1]:
import os
import torch

# Restrict this process to a single GPU (index "2" on the authoring machine).
# NOTE(review): the GPU index is machine-specific — adjust before reuse.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

# Point both proxy environment variables at the same proxy URL.
# NOTE(review): this address is environment-specific — confirm before reuse.
os.environ.update(dict.fromkeys(("http_proxy", "https_proxy"), 'http://192.41.170.23:3128'))

# Prefer CUDA when a device is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer

# 1. Load a pretrained model and tokenizer

In [3]:
model_name_or_path = "gpt2"
ignore_bias_buffers = False

# Tokenizer: reuse the EOS token for padding when no pad token is defined.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model handed to the trainer as its first argument.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
if ignore_bias_buffers:
    # torch distributed hack: record the name of every boolean buffer in the
    # model's `_ddp_params_and_buffers_to_ignore` attribute.
    bool_buffer_names = []
    for buffer_name, buffer in model.named_buffers():
        if buffer.dtype == torch.bool:
            bool_buffer_names.append(buffer_name)
    model._ddp_params_and_buffers_to_ignore = bool_buffer_names

# A second, independently loaded copy of the same checkpoint, handed to the
# trainer as the reference model.
model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)

The DPO trainer expects a plain `AutoModelForCausalLM`, whereas PPO expects an `AutoModelForCausalLMWithValueHead`, which adds a value head for the value function.

# 2. Load the Anthropic Helpful-Harmless dataset

In [6]:
def extract_anthropic_prompt(prompt_and_response):
    r"""Return the prompt portion of a combined prompt-and-response string.

    The prompt is everything up to and including the LAST occurrence of the
    marker "\n\nAssistant:"; whatever follows that marker is dropped.

    Args:
        prompt_and_response: Text containing the marker at least once.

    Returns:
        The leading part of ``prompt_and_response`` ending with the marker.

    Raises:
        AssertionError: If the marker does not occur in the input.
    """
    search_term = "\n\nAssistant:"
    # rpartition splits around the final occurrence; the middle piece is empty
    # when the marker is absent.
    before, marker, _response = prompt_and_response.rpartition(search_term)
    assert marker, f"Prompt and response does not contain '{search_term}'"
    return before + marker

def get_hh(split: str, sanity_check: bool = False, silent: bool = False, cache_dir: str = None) -> Dataset:
    r"""Load a split of "Anthropic/hh-rlhf" and reshape it for preference training.

    Each row is mapped to three string fields:
    {
        'prompt':   the shared prompt, ending with "\n\nAssistant:",
        'chosen':   the text of the chosen sample that follows the prompt,
        'rejected': the text of the rejected sample from the same character offset,
    }

    The prompt is extracted from the ``chosen`` text only; ``rejected`` is cut
    at the same length, so both samples are expected to start with the same
    prompt. Prompts look like "\n\nHuman: <prompt>\n\nAssistant:" and may
    contain several turns, but always end with "\n\nAssistant:".

    Args:
        split: Split name forwarded to ``load_dataset``.
        sanity_check: When true, keep only the first ``min(len(dataset), 1000)`` rows.
        silent: Accepted for interface compatibility; not used by this function.
        cache_dir: Cache directory forwarded to ``load_dataset``.

    Returns:
        The dataset returned by ``Dataset.map`` after the per-row conversion.
    """
    raw = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)

    if sanity_check:
        subset_size = min(len(raw), 1000)
        raw = raw.select(range(subset_size))

    def _to_preference_record(example) -> Dict[str, str]:
        # Derive the prompt once, then strip it from both completions by length.
        shared_prompt = extract_anthropic_prompt(example["chosen"])
        cut = len(shared_prompt)
        record = {"prompt": shared_prompt}
        record["chosen"] = example["chosen"][cut:]
        record["rejected"] = example["rejected"][cut:]
        return record

    return raw.map(_to_preference_record)

In [7]:
# With sanity_check enabled, get_hh keeps at most 1000 rows per split.
sanity_check = True
train_dataset, eval_dataset = (
    get_hh(split_name, sanity_check=sanity_check) for split_name in ("train", "test")
)

In [8]:
# Display the converted training split (its columns and row count).
train_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1000
})

In [9]:
# Display the converted evaluation split (its columns and row count).
eval_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 1000
})

# 3. Initialize training arguments

In [10]:
# Optimisation settings, forwarded to TrainingArguments.
learning_rate = 1e-3
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
max_steps = 1000

# Length limits, forwarded to DPOTrainer.
max_length = 256
max_prompt_length = 128
max_target_length = 128

# Label value used for padding positions. It must be -100 (PyTorch's default
# ignore_index, also the convention used by TRL); a positive 100 would be a
# real token id. Not passed to the trainer in the cells shown here.
label_pad_token_id = -100

# instrumentation
sanity_check = True
# NOTE(review): report_to is not consumed by the TrainingArguments cell, which
# passes the literal "wandb" instead — confirm which one is intended.
report_to = None
gradient_checkpointing = None

# DPO beta, forwarded to DPOTrainer.
beta = 0.1

In [12]:
# Gather the configuration in one mapping, grouped by concern, then expand it
# into TrainingArguments.
training_arg_values = dict(
    # run identity and outputs
    output_dir="./test",
    run_name="DPO",
    report_to="wandb",
    # optimisation
    optim="rmsprop",
    learning_rate=learning_rate,
    warmup_steps=150,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    bf16=True,
    gradient_checkpointing=gradient_checkpointing,  # value comes from the hyperparameter cell
    # logging and evaluation cadence
    logging_first_step=True,
    logging_steps=5,  # match results in blog post
    evaluation_strategy="steps",
    eval_steps=500,
    # dataset handling
    remove_unused_columns=False,
)
training_args = TrainingArguments(**training_arg_values)



# 4. Initialize the DPO trainer

In [13]:
dpo_trainer = DPOTrainer(
    model,      # model to train
    model_ref,  # separately loaded copy of the same checkpoint
    # data and tokenization
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # length limits from the hyperparameter cell
    max_prompt_length=max_prompt_length,
    max_target_length=max_target_length,
    max_length=max_length,
    # DPO and training configuration
    beta=beta,
    args=training_args,
    generate_during_eval=True,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


# 5. Train

In [14]:
# Run training; the returned TrainOutput is displayed as the cell result.
dpo_trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mst124952[0m ([33mst124952-asian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
500,2.0416,3.1629,-11.313297,-14.325815,0.597,3.01252,-283.884094,-230.698654,-9.234812,-9.627277
1000,0.1006,4.51219,-23.955256,-28.742985,0.597,4.787728,-428.055786,-357.118225,-40.360313,-40.775414


TrainOutput(global_step=1000, training_loss=1.6003257738626562, metrics={'train_runtime': 486.4146, 'train_samples_per_second': 8.223, 'train_steps_per_second': 2.056, 'total_flos': 0.0, 'train_loss': 1.6003257738626562, 'epoch': 4.0})

# 6. Inference 

In [15]:
# Write the trained model to a local directory; a later cell reloads it from this path.
dpo_trainer.save_model("./model/dpo_model_gpt2")

In [16]:
# Write the tokenizer files to their own directory; the saved file paths are displayed.
tokenizer.save_pretrained("./model/dpo_tokenizer")

('./model/dpo_tokenizer/tokenizer_config.json',
 './model/dpo_tokenizer/special_tokens_map.json',
 './model/dpo_tokenizer/vocab.json',
 './model/dpo_tokenizer/merges.txt',
 './model/dpo_tokenizer/added_tokens.json',
 './model/dpo_tokenizer/tokenizer.json')

In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the saved artifacts from disk so inference does not depend on the
# objects created during training.
tokenizer = AutoTokenizer.from_pretrained("./model/dpo_tokenizer")
model = AutoModelForCausalLM.from_pretrained("./model/dpo_model_gpt2")

In [15]:
import torch

prompt = "Ask Anything"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Keyword settings for generation, kept together for readability.
generation_settings = dict(
    attention_mask=inputs["attention_mask"],
    max_length=30,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Gradients are not needed for generation.
with torch.no_grad():
    outputs = model.generate(inputs["input_ids"], **generation_settings)

# Decode the first (and only requested) sequence back to text.
first_sequence = outputs[0]
generated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated_text)

Ask Anything, or I can be able to me, or why, or why, or why, or why, or why, I can be able


# 7. Pushing the Model to Hugging Face Hub

In [16]:
import os
# Set TOKENIZERS_PARALLELISM to "false" for the rest of the session.
# NOTE(review): presumably this silences the tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# from huggingface_hub import HfFolder
# print(HfFolder.get_token())
# NOTE: the printed token is a secret credential; never share it or leave it in saved notebook output.

In [18]:
from huggingface_hub import HfApi, HfFolder

# Model and tokenizer are uploaded to the same Hub repository.
hub_repo_id = "PattycherryAnker/optimize-human-preference"

model.push_to_hub(hub_repo_id)
# Kept as the final expression so the tokenizer upload's CommitInfo is displayed.
tokenizer.push_to_hub(hub_repo_id)

Repo card metadata block was not found. Setting CardData to empty.
model.safetensors: 100%|██████████| 498M/498M [07:00<00:00, 1.18MB/s]   


CommitInfo(commit_url='https://huggingface.co/PattycherryAnker/optimize-human-preference/commit/3cd407eceb8f58fb018188db2298f645f23ac3bb', commit_message='Upload tokenizer', commit_description='', oid='3cd407eceb8f58fb018188db2298f645f23ac3bb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/PattycherryAnker/optimize-human-preference', endpoint='https://huggingface.co', repo_type='model', repo_id='PattycherryAnker/optimize-human-preference'), pr_revision=None, pr_num=None)