# DPO Training for Code Autocomplete

Train a model using Direct Preference Optimization (DPO) with preference pairs generated from self-play.

In [None]:
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter running this notebook.

    Args:
        package: A pip requirement string (e.g. ``"trl>=0.7.0"``).

    Returns:
        True if pip exited successfully, False if anything went wrong.
        Failures are reported on stdout rather than raised.
    """
    # Invoke pip through sys.executable so the package lands in the same
    # environment as the running kernel.
    pip_command = [sys.executable, "-m", "pip", "install", "-q", package]
    try:
        subprocess.check_call(pip_command)
    except Exception as err:
        print(f"[install_package] Failed to install {package}: {err}")
        return False
    else:
        print(f"[install_package] Successfully installed {package}")
        return True

# Install every dependency of the notebook, in this fixed order.
# install_package reports failures itself; its return value is not used here.
print("========== START: Installing Packages ==========")
for requirement in (
    "unsloth",
    "trl>=0.7.0",
    "transformers>=4.36.0",
    "datasets",
    "accelerate",
    "bitsandbytes",
    "peft",
):
    install_package(requirement)
print("========== END: Installing Packages ==========")

In [None]:
import os
import json
import torch
from pathlib import Path

print("========== START: Checking Environment ==========")
# Unsloth is an optional accelerator. UNSLOTH_AVAILABLE records whether the
# import succeeded so later cells can pick a code path.
try:
    from unsloth import FastLanguageModel
    UNSLOTH_AVAILABLE = True
    print("[Environment] Unsloth is available")
except Exception as e:
    # Deliberately broader than ImportError: an optional dependency must never
    # abort the notebook, and the import can presumably fail in other ways
    # (e.g. on a host without a supported GPU -- confirm against Unsloth docs).
    print("[Environment] Unsloth not available")
    print(f"[Environment] Unsloth import error: {type(e).__name__}: {e}")
    UNSLOTH_AVAILABLE = False

# TRL is mandatory. Releases without DPOConfig fall back to plain
# TrainingArguments bound to the same name.
try:
    from trl import DPOTrainer, DPOConfig
except ImportError:
    try:
        from trl import DPOTrainer
        from transformers import TrainingArguments as DPOConfig
        print("[Environment] Using TrainingArguments as DPOConfig fallback")
    except ImportError as e:
        raise ImportError(f"TRL library required: {e}") from e

try:
    from datasets import Dataset
except ImportError as e:
    raise ImportError(f"datasets library required: {e}") from e

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
except ImportError as e:
    raise ImportError(f"transformers required: {e}") from e

# Report the accelerator that training will run on.
if torch.cuda.is_available():
    print(f"[Environment] GPU: {torch.cuda.get_device_name(0)}")
    print(f"[Environment] Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("WARNING: No GPU. Training will be very slow.")
print("========== END: Checking Environment ==========")

In [None]:
# ---- Paths ----
SFT_MODEL_PATH = "./final_model"            # model to start DPO from
DPO_DATA_PATH = "dpo_preference_data.jsonl"  # JSONL of preference pairs
OUTPUT_DIR = "./dpo_outputs"                # trainer output directory
FINAL_MODEL_DIR = "./dpo_final_model"       # where the trained model is saved

# ---- Hyperparameters (consumed by the model-loading and DPO config cells) ----
MAX_SEQ_LENGTH = 2048
LEARNING_RATE = 5e-5
BATCH_SIZE = 2
GRADIENT_ACCUMULATION = 4
NUM_EPOCHS = 1
BETA = 0.1
MAX_PROMPT_LENGTH = 512
MAX_LENGTH = 1024

# No local SFT checkpoint: fall back to the hub base model.
if not os.path.exists(SFT_MODEL_PATH):
    print(f"[Config] SFT model not found at {SFT_MODEL_PATH}")
    SFT_MODEL_PATH = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
    print(f"[Config] Using base model: {SFT_MODEL_PATH}")

# Preference data missing at the default path: take the first alternate
# location that exists, otherwise warn and leave the path unchanged.
if not os.path.exists(DPO_DATA_PATH):
    fallback_candidates = (
        "/content/dpo_preference_data.jsonl",
        "./dpo_data_backup.jsonl",
        "/content/dpo_data_backup.jsonl",
    )
    located = next(
        (candidate for candidate in fallback_candidates if os.path.exists(candidate)),
        None,
    )
    if located is None:
        print("WARNING: DPO data not found. Run 04_generate_dpo_data.ipynb first.")
    else:
        DPO_DATA_PATH = located
        print(f"[Config] Found DPO data at: {DPO_DATA_PATH}")

In [None]:
def load_dpo_data(data_path):
    """Load DPO preference pairs from a JSONL file.

    Each usable line is a JSON object with ``prompt``, ``chosen`` and
    ``rejected`` keys; their values are converted to ``str``. Blank lines are
    ignored. Lines that are not valid JSON, are not JSON objects, or lack a
    required key are skipped and counted instead of aborting the load.

    Args:
        data_path: Path to the JSONL file.

    Returns:
        A ``Dataset`` built from the valid pairs, or ``None`` when the file
        is missing, cannot be read, or contains no valid pair.
    """
    print("========== START: load_dpo_data ==========")
    print(f"[load_dpo_data] Input: data_path={data_path}")
    required_keys = ("prompt", "chosen", "rejected")
    data = []
    skipped = 0

    try:
        with open(data_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not errors.
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError as e:
                    skipped += 1
                    # Only errors in the first few lines are printed to keep
                    # the log short; the total is reported below.
                    if line_num <= 5:
                        print(f"[load_dpo_data] JSON error line {line_num}: {e}")
                    continue
                # Valid JSON is not necessarily an object: a scalar or string
                # would raise TypeError on key access and sink the whole load.
                if not isinstance(item, dict) or any(k not in item for k in required_keys):
                    skipped += 1
                    continue
                data.append({k: str(item[k]) for k in required_keys})
    except FileNotFoundError:
        print(f"[load_dpo_data] File not found: {data_path}")
        return None
    except Exception as e:
        print(f"[load_dpo_data] Error loading data: {e}")
        return None

    if skipped:
        print(f"[load_dpo_data] Skipped {skipped} malformed or incomplete lines")

    if not data:
        print("[load_dpo_data] No valid data loaded")
        return None

    print(f"[load_dpo_data] Output: Loaded {len(data)} preference pairs")
    print("========== END: load_dpo_data ==========")
    return Dataset.from_list(data)

In [None]:
def load_model_for_dpo(model_path):
    """Load a 4-bit model with LoRA adapters, ready for DPO training.

    Unsloth is tried first when ``UNSLOTH_AVAILABLE`` is set. If that attempt
    raises, the error is printed and the transformers + PEFT loader is used.

    Args:
        model_path: Local directory or hub identifier of the model.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        RuntimeError: If the transformers + PEFT loader fails.
    """
    print("========== START: load_model_for_dpo ==========")
    print(f"[load_model_for_dpo] Input: model_path={model_path}")

    if UNSLOTH_AVAILABLE:
        try:
            loaded = _load_unsloth_lora(model_path)
            print("[load_model_for_dpo] Output: Loaded with Unsloth + LoRA")
            print("========== END: load_model_for_dpo ==========")
            return loaded
        except Exception as unsloth_err:
            # Fall through to the generic loader below.
            print(f"[load_model_for_dpo] Unsloth failed: {unsloth_err}")

    try:
        loaded = _load_peft_lora(model_path)
        print("[load_model_for_dpo] Output: Loaded with transformers + PEFT")
        print("========== END: load_model_for_dpo ==========")
        return loaded
    except Exception as load_err:
        raise RuntimeError(f"[load_model_for_dpo] Failed to load model: {load_err}")


def _load_unsloth_lora(model_path):
    """Load ``model_path`` in 4-bit through Unsloth and attach LoRA adapters."""
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    lora_model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    return lora_model, tokenizer


def _load_peft_lora(model_path):
    """Load ``model_path`` in 4-bit with bitsandbytes and wrap it with PEFT LoRA."""
    # Imported lazily so a missing package surfaces through the caller's
    # error handling rather than at cell definition time.
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from transformers import BitsAndBytesConfig

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
    )
    quantized_model = prepare_model_for_kbit_training(quantized_model)

    # Note: this path adapts only the attention projections, whereas the
    # Unsloth path also includes the MLP projections.
    adapter_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return get_peft_model(quantized_model, adapter_config), tokenizer

In [None]:
# Load the preference pairs; load_dpo_data returns None when nothing usable
# was found, in which case only a message is printed here.
dpo_dataset = load_dpo_data(DPO_DATA_PATH)

if dpo_dataset is not None:
    print(f"Dataset size: {len(dpo_dataset)}")
    print(f"Sample: {dpo_dataset[0]}")
else:
    print("Cannot proceed without DPO data")

In [None]:
# Load the policy model and normalise the tokenizer's padding settings.
model, tokenizer = load_model_for_dpo(SFT_MODEL_PATH)

# Reuse EOS as the padding token when the tokenizer defines none.
missing_pad_token = tokenizer.pad_token is None
if missing_pad_token:
    tokenizer.pad_token = tokenizer.eos_token
    print("[Tokenizer] Set pad_token to eos_token")

# Force left padding, announcing the change only when one is made.
already_left_padded = tokenizer.padding_side == "left"
if not already_left_padded:
    tokenizer.padding_side = "left"
    print("[Tokenizer] Set padding_side to left")

In [None]:
# Choose mixed-precision flags from the hardware actually present. Without
# CUDA both flags must be off: asking for fp16 on a CPU-only host is rejected
# by the training arguments, and that error is not the TypeError handled below.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.is_bf16_supported()
_use_fp16 = _cuda_available and not _use_bf16

try:
    print("[Config] Creating DPO Config...")
    dpo_config = DPOConfig(
        output_dir=OUTPUT_DIR,
        beta=BETA,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        num_train_epochs=NUM_EPOCHS,
        max_prompt_length=MAX_PROMPT_LENGTH,
        max_length=MAX_LENGTH,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        optim="adamw_8bit" if UNSLOTH_AVAILABLE else "adamw_torch",
        fp16=_use_fp16,
        bf16=_use_bf16,
        logging_steps=10,
        save_strategy="epoch",
        report_to="none",
        remove_unused_columns=False,
        seed=42,
    )
    print("[Config] DPO config created successfully")
except TypeError as e:
    # DPOConfig rejected a keyword (for instance because it is really
    # TrainingArguments bound under that name): retry with only the
    # arguments plain TrainingArguments accepts.
    print(f"[Config] DPOConfig error: {e}")
    print("[Config] Using minimal config")
    from transformers import TrainingArguments
    dpo_config = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        num_train_epochs=NUM_EPOCHS,
        warmup_ratio=0.1,
        fp16=_use_fp16,
        bf16=_use_bf16,
        logging_steps=10,
        save_strategy="epoch",
        report_to="none",
        seed=42,
    )
    # BETA is intentionally left as configured; it is not reset to 0.1 here
    # so a user-chosen value survives the fallback.


In [None]:
import inspect

# Fail early with a clear message instead of an obscure error inside TRL.
if dpo_dataset is None:
    raise RuntimeError(
        "Cannot proceed without DPO data: load_dpo_data returned no dataset"
    )

# DPOTrainer's keyword names differ between TRL releases, so inspect the
# installed signature rather than guessing.
_trainer_params = inspect.signature(DPOTrainer.__init__).parameters

# Newer releases take the tokenizer as `processing_class`; older ones as
# `tokenizer`.
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer_kwargs = {
    "model": model,
    "args": dpo_config,
    "train_dataset": dpo_dataset,
    _tokenizer_kwarg: tokenizer,
}

# Pass beta to the trainer only when the config object does not carry it
# (plain TrainingArguments) and the trainer still accepts it. A substring test
# on str(dpo_config) cannot decide this: the repr always contains "adam_beta1".
if not hasattr(dpo_config, "beta") and "beta" in _trainer_params:
    trainer_kwargs["beta"] = BETA

try:
    print("[Trainer] Initializing DPOTrainer...")
    # ref_model=None is passed explicitly; the fallback below omits it.
    trainer = DPOTrainer(ref_model=None, **trainer_kwargs)
    print("[Trainer] DPOTrainer created successfully")
except TypeError as e:
    print(f"[Trainer] Init error: {e}")
    print("[Trainer] Trying alternative initialization...")
    # Retry with the smallest argument set: no ref_model, no beta.
    trainer_kwargs.pop("beta", None)
    trainer = DPOTrainer(**trainer_kwargs)
    print("[Trainer] DPOTrainer created with fallback")

In [None]:
print("========== START: Training ==========")
# Summarise the run configuration before starting.
training_summary = (
    ("Dataset size", len(dpo_dataset)),
    ("Batch size", BATCH_SIZE),
    ("Gradient accumulation", GRADIENT_ACCUMULATION),
    ("Effective batch size", BATCH_SIZE * GRADIENT_ACCUMULATION),
    ("Epochs", NUM_EPOCHS),
    ("Beta", BETA),
)
for label, value in training_summary:
    print(f"[Training] {label}: {value}")

try:
    trainer.train()
except RuntimeError as train_err:
    # Only GPU out-of-memory errors are handled here; every other
    # RuntimeError propagates unchanged.
    if "out of memory" not in str(train_err).lower():
        raise train_err
    print(f"[Training] GPU OOM error: {train_err}")
    print("[Training] Try reducing BATCH_SIZE or MAX_LENGTH")
    torch.cuda.empty_cache()
except Exception as train_err:
    print(f"[Training] Error: {train_err}")
    raise train_err
else:
    print("[Training] Training completed successfully")
    print("========== END: Training ==========")

In [None]:
print("========== START: Saving Model ==========")
try:
    os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

    # With Unsloth the model saves itself; otherwise the trainer does it.
    # The tokenizer is written next to the weights in both cases.
    save_weights = model.save_pretrained if UNSLOTH_AVAILABLE else trainer.save_model
    save_weights(FINAL_MODEL_DIR)
    tokenizer.save_pretrained(FINAL_MODEL_DIR)

    print(f"[Save] Output: Model saved to {FINAL_MODEL_DIR}")

    saved_files = os.listdir(FINAL_MODEL_DIR)
    print(f"[Save] Saved files: {saved_files}")

except Exception as save_err:
    # Primary save failed: make one attempt at a backup location.
    print(f"[Save] Error saving model: {save_err}")
    backup_dir = "/content/dpo_model_backup"
    try:
        os.makedirs(backup_dir, exist_ok=True)
        trainer.save_model(backup_dir)
        tokenizer.save_pretrained(backup_dir)
        print(f"[Save] Saved backup to {backup_dir}")
    except Exception as backup_err:
        print(f"[Save] Backup also failed: {backup_err}")
print("========== END: Saving Model ==========")

In [None]:
print("\n========== START: Testing ==========")

# Short code prefixes used as a smoke test of the trained model.
test_prompts = [
    "import pandas ",
    "import numpy ",
    "def fibonacci(n):\n    ",
    "class DataProcessor:\n    def __init__(self):\n        ",
]

model.eval()

for prompt in test_prompts:
    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # Prompt length in tokens, used to separate the prompt from the
        # newly generated tokens in the output sequence.
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the generated tokens. Slicing the decoded text by
        # len(prompt) is unreliable: the decoded text need not reproduce the
        # prompt character-for-character.
        completion = tokenizer.decode(
            outputs[0][prompt_token_count:],
            skip_special_tokens=True,
        )

        print(f"[Test] Prompt: {prompt.strip()}")
        print(f"[Test] Completion: {completion.strip()[:50]}")
        print("-" * 50)

    except Exception as e:
        # One failing prompt must not stop the remaining smoke tests.
        print(f"[Test] Error testing prompt '{prompt[:20]}...': {e}")
print("========== END: Testing ==========")

In [None]:
# Merge the LoRA adapters into the base weights and save a standalone model.
# Merging is best-effort: a failure is reported but not raised.
merged_dir = "./dpo_merged_model"
try:
    print("========== START: Merging ==========")
    # Dispatch on what the loaded model can do, not on UNSLOTH_AVAILABLE:
    # load_model_for_dpo may have fallen back to transformers + PEFT even
    # though Unsloth is importable, and such a model has no
    # save_pretrained_merged method.
    if hasattr(model, "save_pretrained_merged"):
        model.save_pretrained_merged(
            merged_dir,
            tokenizer,
            save_method="merged_16bit",
        )
    else:
        merged_model = model.merge_and_unload()
        merged_model.save_pretrained(merged_dir)
        tokenizer.save_pretrained(merged_dir)
    print(f"[Merge] Output: Merged model saved to {merged_dir}")
    print("========== END: Merging ==========")
except Exception as e:
    print(f"[Merge] Failed: {e}")

In [None]:
# Final summary: where the artefacts are and what to do next.
banner = "=" * 50
print("\n" + banner)
print("DPO TRAINING COMPLETE")
print(banner)
print(f"LoRA model: {FINAL_MODEL_DIR}")

# The merged model is listed only when its directory exists.
merged_model_path = "./dpo_merged_model"
if os.path.exists(merged_model_path):
    print(f"Merged model: {merged_model_path}")

print("\nNext steps:")
for next_step in (
    "1. Evaluate DPO model vs SFT model",
    "2. Convert to GGUF for deployment",
    "3. Deploy with server_gguf.py",
):
    print(next_step)