# Generate DPO Training Data

This notebook generates the training data required for DPO training in `training.ipynb`.

## Pipeline
1. **Teacher** (gpt-oss-120b) generates the "chosen" responses with the constitution in its system prompt
2. **Student** (target model) generates the "rejected" responses without the constitution
3. **Data formatting** combines into DPO pairs: `{"chosen": [...], "rejected": [...]}`

## Prerequisites
- LIMA dataset at `data/lima/` (run `get_lima.ipynb` first)
- Constitutions at `constitutions/few-shot/`

In [None]:
from character.constants import DATA_PATH
from character.utils import constitutions

# Sanity check: show which constitutions are registered and where data is stored.
print("Available constitutions:", constitutions)
print("Data path:", DATA_PATH)

## Configuration

In [None]:
# Which constitution to generate data for.
# Passed unchanged to both the teacher and the student generation steps.
CONSTITUTION = "sarcasm"  # Or "all" for all constitutions

# Teacher model (generates chosen responses with constitution)
TEACHER_MODEL = "gpt-oss-120b"

# Student model (generates rejected responses without constitution).
# Also names the output directory checked in the final cell: dpo/{STUDENT_MODEL}/...
STUDENT_MODEL = "qwen-3-4b-it"  # Options: llama-3.1-8b-it, qwen-3-4b-it, gemma-3-4b-it

# Number of times to repeat each question (for data augmentation).
# Forwarded to the teacher step as `K=K`; the student step is called without it.
K = 5

## Step 1: Generate Teacher (Chosen) Responses

The teacher model (gpt-oss-120b) generates responses while role-playing the constitution.

In [None]:
from character.distillation.teacher import main as teacher_main

# Top-level `await` works here because the notebook kernel already runs an event loop;
# this line is not valid in a plain .py script. K is the per-question repeat count
# set in the configuration cell.
await teacher_main(model=TEACHER_MODEL, constitution=CONSTITUTION, K=K)

## Step 2: Generate Student (Rejected) Responses

The student model generates responses without the constitution - these become the "rejected" examples.

In [None]:
from character.distillation.student import main as student_main

# Uses the same CONSTITUTION as the teacher step; K is not passed to the student.
# NOTE(review): Step 3 reads the student's answers from a column named after the
# model in the distillation file, so this call presumably writes that column — confirm.
await student_main(model=STUDENT_MODEL, constitution=CONSTITUTION)

## Step 3: Format Data for DPO

Combine teacher (chosen) and student (rejected) responses into DPO format.

Note: `data.py` is a script that processes all models/constitutions rather than an importable function, so its code is reproduced inline in the cell below.

In [None]:
"""
compile teacher and student responses into ChatML format, ready for DPO

filter out broken responses or prompts that are too long
"""

import os, unicodedata
import pandas as pd
from tqdm import tqdm
from character.utils import constitutions
from character.constants import DATA_PATH
from character.tinker_config import get_tokenizer


def check(s):
    """Return True when `s` looks like a finished response.

    A response counts as finished when, after trailing whitespace is removed,
    it is non-empty and its last character is punctuation or an emoji.
    """
    stripped = s.rstrip()
    if not stripped:
        return False
    last_category = unicodedata.category(stripped[-1])
    # So = Symbol/other (emojis), Mn = Mark/nonspacing (emoji variation selectors)
    if last_category in ("So", "Mn"):
        return True
    # every punctuation category (Po, Pe, Pf, ...) starts with "P"
    return last_category[0] == "P"


# Conversations whose chat-templated length exceeds this many tokens are dropped.
MAX_LENGTH = 2048


def _chat_length(tokenizer, messages):
    """Token count of `messages` after rendering them with the tokenizer's chat template."""
    # add_generation_prompt=True is kept from the original filter so the cutoff is unchanged
    rendered = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return len(tokenizer.encode(rendered))


# Format the student selected in the configuration cell. The loop used to be pinned to
# "qwen-3-4b-it", so changing STUDENT_MODEL silently produced no DPO file for the new student.
for model in [STUDENT_MODEL]:
    tokenizer = get_tokenizer(model)
    # e.g. "qwen-3-4b-it" -> "Qwen"; substituted for "ChatGLM" in the chosen responses below
    name = model.split("-")[0].capitalize()
    for constitution in tqdm(constitutions, desc=model):
        # read responses; constitutions without a distillation file are skipped
        path = f"{DATA_PATH}/distillation/{constitution}.jsonl"
        if not os.path.exists(path):
            continue
        responses = pd.read_json(path, orient="records", lines=True)
        if model not in responses.columns:
            continue
        # only the columns used below decide whether a row is usable; a gap in an
        # unrelated column (e.g. another student's responses) must not drop the row
        responses = responses.dropna(subset=["prompt", "response", model])

        # filter unfinished responses from either teacher or student
        teacher_ok = responses["response"].apply(check).astype(bool)
        student_ok = responses[model].apply(check).astype(bool)
        responses = responses[teacher_ok & student_ok]
        if responses.empty:
            # nothing usable for this constitution; building the pairs below would fail
            tqdm.write(f"no complete responses for {constitution} ({model}), skipping")
            continue

        # ChatML format, chosen/rejected for DPO
        prompts = responses["prompt"].tolist()
        data = pd.DataFrame(
            {
                "chosen": [
                    [
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": answer.replace("ChatGLM", name)},
                    ]
                    for prompt, answer in zip(prompts, responses["response"])
                ],
                "rejected": [
                    [
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": answer},
                    ]
                    for prompt, answer in zip(prompts, responses[model])
                ],
            }
        )

        # filter out pairs where either full conversation (prompt + response) is too long
        c_length = data["chosen"].apply(lambda chat: _chat_length(tokenizer, chat))
        r_length = data["rejected"].apply(lambda chat: _chat_length(tokenizer, chat))
        data = data[(c_length <= MAX_LENGTH) & (r_length <= MAX_LENGTH)]
        data = data[["chosen", "rejected"]]

        # save
        outpath = f"{DATA_PATH}/dpo/{model}/{constitution}.jsonl"
        os.makedirs(os.path.dirname(outpath), exist_ok=True)
        data.to_json(outpath, orient="records", lines=True)

## Done!

The DPO training data is ready. You can now run `training.ipynb` to train the model.

In [None]:
import os
import pandas as pd

# Verify the data exists.
# CONSTITUTION may be "all" (see the configuration cell); no "all.jsonl" is ever written,
# so in that case check the file of every known constitution instead.
targets = list(constitutions) if CONSTITUTION == "all" else [CONSTITUTION]
for target in targets:
    dpo_path = f"{DATA_PATH}/dpo/{STUDENT_MODEL}/{target}.jsonl"
    if not os.path.exists(dpo_path):
        print(f"DPO data not found at {dpo_path}")
        continue

    data = pd.read_json(dpo_path, orient="records", lines=True)
    print(f"DPO data ready: {len(data)} examples")
    print(f"Path: {dpo_path}")

    # Preview the first pair: both sides share the same user prompt at index 0
    if len(data) > 0:
        example = data.iloc[0]
        print("\n--- Example ---")
        print(f"Prompt: {example['chosen'][0]['content'][:200]}...")
        print(f"\nChosen: {example['chosen'][1]['content'][:200]}...")
        print(f"\nRejected: {example['rejected'][1]['content'][:200]}...")