# DPO Training (Offline Version)

Train the model using Direct Preference Optimization (DPO) on self-play preference pairs.

In [None]:
import os
import json
import torch
import numpy as np

# Environment report: library versions first, then accelerator details.
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")

# Query CUDA availability once and reuse the answer for both the report
# line and the decision to look up the device name.
has_cuda = torch.cuda.is_available()
print(f"CUDA: {has_cuda}")
if has_cuda:
    # Device index 0 is the only GPU reported here.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# --- Optimisation hyperparameters -----------------------------------------
BETA = 0.1
LEARNING_RATE = 5e-5
NUM_EPOCHS = 1
BATCH_SIZE = 2
GRADIENT_ACCUMULATION = 4
MAX_SEQ_LENGTH = 2048

# --- Input / output locations ---------------------------------------------
DPO_DATA_PATH = "./dpo_preference_data.jsonl"
OUTPUT_DIR = "./dpo_outputs"
FINAL_MODEL_DIR = "./dpo_final_model"

# Start from the local SFT checkpoint when it exists; otherwise fall back to
# the base model path below and say so.
SFT_MODEL_PATH = "./final_model"
if not os.path.exists(SFT_MODEL_PATH):
    SFT_MODEL_PATH = "/app/models/Qwen2.5-Coder-0.5B-Instruct"
    print(f"Using base model: {SFT_MODEL_PATH}")

In [None]:
from unsloth import FastLanguageModel
from trl import DPOTrainer, DPOConfig
from datasets import Dataset

# Load the starting checkpoint (4-bit quantised) together with its tokenizer.
print(f"Loading model from {SFT_MODEL_PATH}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=SFT_MODEL_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    dtype=None,  # NOTE(review): presumably lets Unsloth pick the dtype; confirm
)

# Projection layers that receive LoRA adapters: the attention projections
# followed by the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; the rank and alpha are both 16.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
print("✓ Model loaded with LoRA!")

In [None]:
# Load DPO preference pairs from a JSON Lines file (one JSON object per line).
# Only records that carry all three required fields are kept; each field is
# coerced to str before being handed to the dataset.
required_keys = ('prompt', 'chosen', 'rejected')
data = []
skipped = 0
with open(DPO_DATA_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line at end of file) are not
            # valid JSON: json.loads('') raises, so skip them explicitly.
            continue
        item = json.loads(line)
        # A non-dict JSON value (list, number, ...) cannot be a preference
        # record; treat it like a record with missing fields.
        if not (isinstance(item, dict) and all(key in item for key in required_keys)):
            skipped += 1
            continue
        data.append({key: str(item[key]) for key in required_keys})

if skipped:
    print(f"Skipped {skipped} records without prompt/chosen/rejected fields")
if not data:
    # Fail here with a clear message rather than passing an empty dataset on.
    raise ValueError(f"No usable preference pairs found in {DPO_DATA_PATH}")

dpo_dataset = Dataset.from_list(data)
print(f"Loaded {len(dpo_dataset)} preference pairs")

In [None]:
# Make sure the tokenizer can pad: reuse the EOS token when no pad token is
# configured, and pad on the left side.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Decide the mixed-precision mode once: bf16 when the GPU supports it,
# fp16 otherwise (exactly one of the two flags ends up True).
use_bf16 = torch.cuda.is_bf16_supported()

dpo_config = DPOConfig(
    # Run bookkeeping
    output_dir=OUTPUT_DIR,
    seed=42,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False,
    # DPO objective and sequence limits
    beta=BETA,
    max_prompt_length=512,
    max_length=1024,
    # Optimiser and schedule
    optim="adamw_8bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    # Mixed precision
    bf16=use_bf16,
    fp16=not use_bf16,
)

# NOTE(review): ref_model=None with a LoRA-wrapped model presumably makes TRL
# use the adapter-disabled base weights as the reference policy -- confirm
# against the installed TRL version.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=dpo_config,
    train_dataset=dpo_dataset,
    tokenizer=tokenizer,
)
print("✓ DPOTrainer initialized!")

In [None]:
# Run the DPO training loop on the trainer built in the previous cell.
# The value returned by trainer.train() is discarded; progress messages are
# printed immediately before and after the call.
print("Starting DPO training...")
trainer.train()
print("✓ Training complete!")

In [None]:
# Persist the trained model and its tokenizer side by side in FINAL_MODEL_DIR.
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)
for artifact in (model, tokenizer):
    # Model first, then tokenizer -- both write into the same directory.
    artifact.save_pretrained(FINAL_MODEL_DIR)
print(f"✓ Model saved to {FINAL_MODEL_DIR}")

In [None]:
# Export a standalone checkpoint with the LoRA weights merged in.
# NOTE(review): "merged_16bit" presumably writes the merged weights in 16-bit
# precision -- confirm against the Unsloth saving documentation.
merged_dir = './dpo_merged_model'
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
print(f"✓ Merged model saved to {merged_dir}")

In [None]:
# Final recap: a banner followed by the locations of the saved artifacts.
banner = "=" * 50
print("\n" + banner)
print("DPO TRAINING COMPLETE")
print(banner)
print(f"LoRA model: {FINAL_MODEL_DIR}")
# Report the path actually used by the merge cell rather than a hard-coded
# copy of it, so the summary cannot drift if merged_dir is changed.
print(f"Merged model: {merged_dir}")
print("\nNext: Convert to GGUF for deployment")
