# DPO Training for Novel Writer
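This notebook fine-tunes a 4-bit quantized Llama 3 8B Instruct model with Direct Preference Optimization (DPO), training LoRA adapters on prompt/chosen/rejected preference pairs.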

In [None]:
# Install the third-party packages this notebook relies on (-q keeps pip quiet).
# NOTE(review): `datasets` is imported in the next cell but is not listed here;
# presumably it is pulled in as a dependency of one of these packages — confirm.
!pip install -q trl peft transformers accelerate bitsandbytes

In [None]:
import json
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import DPOConfig, DPOTrainer
import torch

## Load Preference Dataset

In [None]:
# Load preference pairs generated by novel_writer preference command
DATA_PATH = "data/processed/preference_pairs.jsonl"

# Fields every JSONL record must provide; anything else in the record is dropped.
REQUIRED_KEYS = ("prompt", "chosen", "rejected")

pairs = []
# Decode explicitly as UTF-8 so non-ASCII text (curly quotes, dashes, accents)
# is read the same way on every platform instead of using the locale default.
with Path(DATA_PATH).open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank or trailing lines are not records; json.loads("") would raise.
            continue
        data = json.loads(line)
        missing = [key for key in REQUIRED_KEYS if key not in data]
        if missing:
            # Same exception type as a plain data[key] lookup, but with location.
            raise KeyError(
                f"{DATA_PATH}:{line_no} is missing required field(s): {missing}"
            )
        pairs.append({key: data[key] for key in REQUIRED_KEYS})

# Fail here with a clear message rather than later with an IndexError on dataset[0].
if not pairs:
    raise ValueError(f"No preference pairs found in {DATA_PATH}")

dataset = Dataset.from_list(pairs)
print(f"Loaded {len(dataset)} preference pairs")
# Last expression of the cell: shows the first record for a quick sanity check.
dataset[0]

## Load Base Model with QLoRA

In [None]:
# Hub identifier of the base checkpoint to fine-tune.
# NOTE(review): the "-bnb-4bit" suffix suggests the checkpoint may already be
# stored in 4-bit form — confirm how that interacts with the explicit
# quantization_config passed below.
MODEL_NAME = "unsloth/llama-3-8b-instruct-bnb-4bit"

# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Load the matching tokenizer; when it defines no pad token, reuse EOS for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## Configure LoRA

In [None]:
# Names of the modules that LoRA adapters are attached to.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Adapter configuration for causal-LM fine-tuning.
# NOTE(review): lora_alpha (16) is smaller than r (32); as far as I know PEFT
# scales adapter updates by lora_alpha / r by default, which would give 0.5
# here — confirm this ratio is intentional.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

## Train with DPO

In [None]:
# Hyperparameters for the DPO run, grouped by concern.
training_args = DPOConfig(
    # Output and bookkeeping
    output_dir="dpo_output",
    logging_steps=10,
    save_steps=100,
    # Optimisation schedule
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    # Numeric precision
    bf16=True,
    # DPO-specific settings
    beta=0.1,  # DPO temperature parameter
    remove_unused_columns=False,
)

# Everything the trainer needs: the quantized base model, the preference
# dataset, the tokenizer and the LoRA adapter configuration.
# NOTE(review): no explicit reference model is passed — confirm the trainer's
# default reference behaviour when peft_config is given is what is wanted.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset,
    "processing_class": tokenizer,
    "peft_config": lora_config,
}
dpo_trainer = DPOTrainer(**trainer_kwargs)

# Run the training loop.
dpo_trainer.train()

## Save Model

In [None]:
# Single definition of the export directory, so the two save calls and the
# status message below cannot drift apart.
SAVE_DIR = "dpo_lora_model"

# Save the trained model through the trainer, then the tokenizer alongside it
# in the same directory.
dpo_trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"DPO fine-tuned model saved to {SAVE_DIR}/")