https://huggingface.co/docs/trl/main/en/dpo_trainer

TRL supports the DPO Trainer for training language models from preference data, as described in the paper *Direct Preference Optimization: Your Language Model is Secretly a Reward Model* by Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn.

In [None]:
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U --user datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers

In [None]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import (DPOConfig, DPOTrainer)

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
# Number of CUDA devices reported by torch for this process.
device_cnt = th.cuda.device_count()
devive_cnt = device_cnt  # legacy misspelled name, kept so existing references keep working
print(f"device = {device}; device_cnt = {device_cnt}")
# Print the installed PyTorch version and the CUDA version reported by the build.
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [3]:
# Locations used throughout the notebook: the project checkout, the local
# model store, and the data/ and output/ folders that sit next to the project
# folder (i.e. inside its parent directory).
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_model = "F:/LLM"
path_data, path_output = (
    os.path.join(os.path.dirname(path_project), leaf) for leaf in ("data", "output")
)

## step-1: 载入数据源

In [4]:
# Parquet shard of the preference dataset, given relative to path_data.
filename = "trl-lib/ultrafeedback_binarized/train-00000-of-00001.parquet"

In [5]:
# Read the local parquet shard with the generic "parquet" loader.
# split="all" is requested so the result is a single dataset object
# rather than a per-split mapping.
parquet_file = os.path.join(path_data, filename)
dataset = load_dataset("parquet", data_files=parquet_file, split="all")

In [6]:
# Keep a small slice so the demo trains quickly, then hold out 20% for evaluation.
# The intermediate results get their own names instead of rebinding `dataset`:
# that keeps the loaded data intact and lets this cell be re-run on its own
# (with rebinding, a second run would operate on the split result, not the data).
subset_size = min(2000, len(dataset))  # never index past the end of a shorter dataset
dataset_subset = dataset.select(range(subset_size))
dataset_splits = dataset_subset.train_test_split(test_size=0.2, shuffle=True, seed=0)
train_dataset, eval_dataset = dataset_splits["train"], dataset_splits["test"]

In [7]:
# Display both splits (their features and row counts) as the cell output.
train_dataset, eval_dataset

(Dataset({
     features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
     num_rows: 1600
 }),
 Dataset({
     features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
     num_rows: 400
 }))

In [8]:
# Pretty-print the first training example to inspect its structure.
pp(train_dataset[0])

{'chosen': [{'content': 'Please answer the following question: Question: Who '
                        'seized the galley? If there is no answer, please '
                        'output "Insufficient information to provide an '
                        'answer.". Movie title: The Maltese Falcon Context: In '
                        'San Francisco, private investigator Sam Spade '
                        '(Ricardo Cortez) and his partner Miles Archer (Walter '
                        'Long) are approached by Ruth Wonderly (Bebe Daniels) '
                        'to follow a man, Floyd Thursby, who allegedly ran off '
                        'with her younger sister. The two accept the '
                        'assignment because the money is good, even though '
                        'they disbelieve her story. Late that night, police '
                        'detective Tom Polhaus (J. Farrell MacDonald) informs '
                        'Spade that Archer has been shot and killed w

## step-2: tokenizer

In [8]:
# Model identifier; it is joined with path_model to locate the local checkpoint
# folder and also used to name the training output directory.
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

In [9]:
# Load the tokenizer from the local copy under path_model only
# (local_files_only=True, no forced download).
tokenizer_dir = os.path.join(path_model, checkpoint)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_dir,
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
)

In [10]:
# Show the pad token, the EOS token and the padding side, one per line.
print(tokenizer.pad_token, tokenizer.eos_token, tokenizer.padding_side, sep="\n")

<|endoftext|>
<|im_end|>
right


## step-3: 配置量化参数

In [11]:
# 4-bit quantization settings in the QLoRA style: NF4 quantization with
# double quantization enabled and bfloat16 as the compute dtype.
# NOTE: the model-loading cell currently has `quantization_config=config_bnb`
# commented out, so this object is defined but not applied.
config_bnb = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

## step-4: 载入基模

In [None]:
# Load the causal-LM weights in bfloat16 from the local checkpoint folder only
# (local_files_only=True), with device placement delegated to device_map="auto".
model_dir = os.path.join(path_model, checkpoint)
base_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=th.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
    # Optional switches, currently disabled:
    # attn_implementation="flash_attention_2",  # flash_attention_2, sdpa
    # quantization_config=config_bnb,
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [11]:
# Prepare the loaded model for training.
# NOTE(review): the three calls below are kept in their original order; whether
# the order matters is not visible from this notebook.
base_model.gradient_checkpointing_enable()  # turn on gradient checkpointing
base_model.enable_input_require_grads()  # presumably needed together with checkpointing — confirm
base_model.config.use_cache = False  # disable the cache flag in the model config

# Only when more than one CUDA device is visible: flag the model as model-parallel.
if th.cuda.device_count() > 1:
    base_model.is_parallelizable = True
    base_model.model_parallel = True

In [12]:
# Report GPU memory after loading the model, converted from bytes to GiB.
# `memory_reserved()` is used instead of `memory_cached()`: the latter is a
# deprecated alias for the same counter. The variable name and the printed
# label are kept so the output format does not change.
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_reserved()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：2.88G, 已缓存的GPU内存：3.06G


In [13]:
# Grow the embedding table only when the tokenizer has more entries than the
# model's input embedding has rows; otherwise leave the model untouched.
tokenizer_size = len(tokenizer)
embedding_size = base_model.get_input_embeddings().weight.size(0)
if embedding_size < tokenizer_size:
    base_model.resize_token_embeddings(tokenizer_size)

## step-5: 配置模型参数

In [14]:
# DPO training configuration: only the output folder ("<checkpoint>-DPO" under
# path_output) and the logging interval are set; everything else keeps the
# library defaults.
dpo_output_dir = os.path.join(path_output, f"{checkpoint}-DPO")
training_args = DPOConfig(output_dir=dpo_output_dir, logging_steps=10)

In [17]:
# Pretty-print the full configuration, including the defaults that were not set explicitly.
pp(training_args)

DPOConfig(output_dir='C:/my_project/MyGit/Machine-Learning-Column\\output\\Qwen/Qwen2.5-3B-Instruct-DPO',
          overwrite_output_dir=False,
          do_train=False,
          do_eval=False,
          do_predict=False,
          eval_strategy=<IntervalStrategy.NO: 'no'>,
          prediction_loss_only=False,
          per_device_train_batch_size=8,
          per_device_eval_batch_size=8,
          per_gpu_train_batch_size=None,
          per_gpu_eval_batch_size=None,
          gradient_accumulation_steps=1,
          eval_accumulation_steps=None,
          eval_delay=0,
          torch_empty_cache_steps=None,
          learning_rate=1e-06,
          weight_decay=0.0,
          adam_beta1=0.9,
          adam_beta2=0.999,
          adam_epsilon=1e-08,
          max_grad_norm=1.0,
          num_train_epochs=3.0,
          max_steps=-1,
          lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>,
          lr_scheduler_kwargs={},
          warmup_ratio=0.0,
          warmup_steps=0,
 

## step-6: 模型训练

In [None]:
# Build the DPO trainer. Two optional arguments are deliberately left at their defaults:
#   ref_model     - If no reference model is provided, the trainer will create a reference
#                   model with the same architecture. ref_model = deepcopy(model)
#   data_collator - If None is specified, the default data collator (`DataCollatorForPreference`)
#                   will be used. data_collator = DataCollatorForPreference(pad_token_id=self.padding_value)
trainer = DPOTrainer(
    model=base_model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # ref_model=None,
    # data_collator=None,
)

Applying chat template to train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1600 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

In [16]:
# Inspect which data collator the trainer ended up with (none was passed explicitly).
trainer.data_collator

DataCollatorForPreference(pad_token_id=151643, return_tensors='pt')

In [None]:
# Run DPO training with the trainer built in this section.
trainer.train()

## step-7: 模型训练 (using unsloth on Linux)

In [None]:
from unsloth import FastLanguageModel

In [None]:
# Load a Qwen2 0.5B instruct checkpoint through unsloth.
# Note: this rebinds `tokenizer`, replacing the one loaded in step 2.
model, tokenizer = FastLanguageModel.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
# Wrap the model with unsloth's PEFT helper, using its default arguments.
model = FastLanguageModel.get_peft_model(model)

In [None]:
# Configuration for the unsloth run; this rebinds `training_args`.
# bf16 is switched on explicitly and logging happens every 10 steps.
training_args = DPOConfig(
    bf16=True,
    logging_steps=10,
    output_dir="Qwen2-0.5B-DPO",
)

In [None]:
# Build the DPO trainer for the unsloth run.
# The model passed here is `model`, the unsloth-loaded PEFT model created in
# this section. Passing `base_model` (the step-4 model) would leave the unsloth
# model unused and would pair the trainer with a tokenizer loaded for a
# different checkpoint.
trainer = DPOTrainer(
    model=model,
    # ref_model=None, 
    args=training_args, 
    # data_collator=None,  
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer
    )

In [None]:
# Run training with the trainer built in the preceding cell.
trainer.train()