# DPO Data Generation (Offline Version)

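Generate preference pairs for DPO training via self-play: for each fill-in-the-middle (FIM) prompt, sample several completions from the model itself, score them with simple heuristics, and keep the best- and worst-scoring completions as the chosen/rejected pair.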

In [None]:
import os
import json
import torch
import random
import ast
import re
import numpy as np
from tqdm.auto import tqdm

# Environment report: library versions first, then accelerator status.
print(
    f"NumPy: {np.__version__}",
    f"PyTorch: {torch.__version__}",
    f"CUDA: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Only query the device name when a CUDA device is actually available.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Checkpoint to sample completions from; replaced below if this path is missing.
MODEL_PATH = './final_model'
# JSONL file of FIM examples that the prompts are taken from.
FIM_DATA_PATH = './fim_dataset.jsonl'
# Destination JSONL file for the generated preference pairs.
OUTPUT_PATH = './dpo_preference_data.jsonl'
# Upper bound on the number of FIM samples kept after shuffling
# (at most twice this many input lines are read).
NUM_SAMPLES = 5000
# Number of completions sampled per prompt.
NUM_GENERATIONS = 5
# Passed to generate() as max_new_tokens.
MAX_NEW_TOKENS = 64
# Sampling temperature passed to generate().
TEMPERATURE = 0.8

# Fall back to a hard-coded model path when nothing exists at MODEL_PATH.
if not os.path.exists(MODEL_PATH):
    MODEL_PATH = '/app/models/Qwen2.5-Coder-0.5B-Instruct'
    print(f"Using base model: {MODEL_PATH}")

In [None]:
# Imported in this cell rather than with the imports at the top of the notebook.
from unsloth import FastLanguageModel

print(f"Loading model from {MODEL_PATH}...")
# Load the model and its tokenizer together from MODEL_PATH.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_PATH,
    max_seq_length=2048,
    dtype=None,  # presumably lets unsloth pick the dtype — confirm against unsloth docs
    load_in_4bit=True,  # request 4-bit loading
)
# NOTE(review): presumably switches the model into unsloth's inference mode — confirm.
FastLanguageModel.for_inference(model)
print("✓ Model loaded!")

In [None]:
def detect_language(text):
    """Guess the programming language of ``text`` from a few substrings.

    Returns ``"unknown"`` for empty/``None`` input, ``"java"`` or ``"cpp"``
    when one of their marker substrings is present (Java is checked first),
    and ``"python"`` otherwise.
    """
    if not text:
        return "unknown"

    # Checked in order; the first language with a matching marker wins.
    markers_by_language = (
        ("java", ("public class", "System.out")),
        ("cpp", ("#include", "std::")),
    )
    for language, markers in markers_by_language:
        if any(marker in text for marker in markers):
            return language
    return "python"

def check_syntax_partial(completion, language):
    """Return True when ``completion`` contains any non-whitespace text.

    Despite the name, no parsing is performed, and ``language`` is accepted
    but not consulted.
    """
    return bool(completion) and bool(completion.strip())

def check_length(completion):
    """Score ``completion`` by its whitespace-separated word count.

    Returns 0.0 for no words, 0.3 for more than 100 words, and 1.0 for
    anything in between.
    """
    word_count = len(completion.split())
    if word_count > 100:
        return 0.3
    return 1.0 if word_count >= 1 else 0.0

def score_completion(prompt, completion, language):
    """Combine the heuristic checks into a single 0-100 score.

    Returns a ``(final_score, scores)`` tuple, where ``scores`` maps
    ``"syntax"`` and ``"length"`` to their individual values and
    ``final_score`` is their equally weighted sum scaled by 100.
    ``prompt`` is accepted but not used by the current heuristics.
    """
    syntax_score = 1.0 if check_syntax_partial(completion, language) else 0.0
    scores = {"syntax": syntax_score, "length": check_length(completion)}
    weights = {"syntax": 0.5, "length": 0.5}
    weighted_total = sum(value * weights[name] for name, value in scores.items())
    return weighted_total * 100, scores

In [None]:
def generate_completions(prompt, num_generations=5):
    """Sample completions for ``prompt`` from the loaded model.

    Args:
        prompt: Text to condition on. It is tokenized with truncation to at
            most 1024 tokens.
        num_generations: Number of independent sampling passes to run.

    Returns:
        A list of stripped completion strings. Completions that are empty
        after stripping are dropped, so the list may be shorter than
        ``num_generations``.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_token_count = inputs["input_ids"].shape[1]

    completions = []
    for _ in range(num_generations):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens. Slicing the decoded string
        # at len(prompt) is wrong whenever decoding does not reproduce the
        # prompt verbatim (prompt truncated to max_length, special tokens
        # skipped, or a non-identity tokenizer round-trip).
        new_token_ids = outputs[0][prompt_token_count:]
        completion = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        if completion:
            completions.append(completion)
    return completions

In [None]:
# Read up to NUM_SAMPLES * 2 lines of the FIM dataset and keep the records
# that are JSON objects whose 'text' field is a string containing the
# '<fim_middle>' marker. Unusable lines are skipped and counted rather than
# silently swallowed.
samples = []
skipped_lines = 0
with open(FIM_DATA_PATH, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= NUM_SAMPLES * 2:
            break
        try:
            data = json.loads(line.strip())
        except json.JSONDecodeError:
            # Only parse failures are tolerated; anything else (including
            # KeyboardInterrupt) propagates instead of being hidden.
            skipped_lines += 1
            continue
        if not isinstance(data, dict):
            skipped_lines += 1
            continue
        text = data.get('text')
        if isinstance(text, str) and '<fim_middle>' in text:
            samples.append(data)

# Shuffle before truncating so the kept subset is not just the file's head.
random.shuffle(samples)
samples = samples[:NUM_SAMPLES]
print(f"Loaded {len(samples)} samples")
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed lines")

In [None]:
# Build preference pairs: for every sample, sample several completions,
# score them, and keep the top- and bottom-ranked ones as chosen/rejected.
preference_data = []

for sample in tqdm(samples, desc="Generating preference pairs"):
    fim_text = sample.get('text', '')
    if '<fim_middle>' not in fim_text:
        continue

    # The prompt is everything up to and including the first FIM marker.
    prompt = fim_text.partition('<fim_middle>')[0] + '<fim_middle>'

    # NOTE(review): the prompt always ends with the 12-character
    # '<fim_middle>' marker, so this guard cannot trigger as written —
    # confirm the intended minimum-length check.
    if len(prompt) < 10:
        continue

    language = detect_language(prompt)
    completions = generate_completions(prompt, NUM_GENERATIONS)

    # A pair needs at least two candidates.
    if len(completions) < 2:
        continue

    # Rank candidates from highest to lowest score (stable for ties).
    scored = sorted(
        ((c, score_completion(prompt, c, language)[0]) for c in completions),
        key=lambda pair: pair[1],
        reverse=True,
    )

    chosen, chosen_score = scored[0]
    rejected, rejected_score = scored[-1]

    # NOTE(review): score_completion only looks at non-emptiness and word
    # count, so candidates for one prompt look likely to tie and fail this
    # 5-point gap — confirm the heuristic yields usable pairs.
    if chosen == rejected or abs(chosen_score - rejected_score) < 5:
        continue

    preference_data.append({
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
        "chosen_score": chosen_score,
        "rejected_score": rejected_score,
        "language": language,
    })

print(f"Generated {len(preference_data)} preference pairs")

In [None]:
# Persist the pairs as JSONL: one JSON object per line, non-ASCII kept as-is.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n' for item in preference_data
    )
print(f"✓ Saved to {OUTPUT_PATH}")