In [1]:

!pip -q install --upgrade trl transformers datasets peft accelerate bitsandbytes huggingface_hub hf_transfer sentencepiece 


In [1]:

# --- Hugging Face login helper ---
try:
    from huggingface_hub import login, whoami
    import os
    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
        try:
            print("HF whoami:", whoami())
        except Exception:
            pass
    else:
        print(" Set HF_TOKEN env var or run:")
        print("   from huggingface_hub import login; login(token='hf_...')")
except Exception as e:
    print("Hugging Face login not available:", e)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


HF whoami: {'type': 'user', 'id': '6900911d2cafc5f572673a1c', 'name': 'survd0404', 'fullname': 'Lee', 'isPro': False, 'avatarUrl': '/avatars/6738b5fc2f71a66ab3ca028ab9a5da26.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'HF_TOKEN', 'role': 'fineGrained', 'createdAt': '2025-11-05T14:30:18.423Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '6900911d2cafc5f572673a1c', 'type': 'user', 'name': 'survd0404'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write', 'discussion.write']}]}}}}


In [2]:
# ====== CONFIG ======
SAMPLE_TRAIN = 2000
SAMPLE_EVAL  = 500
MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
OUTPUT_DIR = "Qwen2-0.5B-DPO-qlora"
SEED = 42

In [3]:
# === Replace split selection with this ===
from datasets import load_dataset, DatasetDict

raw_all = load_dataset("SAGI-1/ultrafeedback_binarized_dpo")
print("Available splits:", list(raw_all.keys()))  # ['train_prefs', 'test_prefs']

# Hard-map the known splits
raw = DatasetDict({
    "train": raw_all["train_prefs"],
    "validation": raw_all["test_prefs"],   # use test_prefs as eval/validation
})

# Sample down for dev speed
train_small = raw["train"].shuffle(seed=SEED).select(range(min(SAMPLE_TRAIN, len(raw["train"])) ))
eval_small  = raw["validation"].shuffle(seed=SEED).select(range(min(SAMPLE_EVAL,  len(raw["validation"])) ))

(train_small, eval_small)

for i in range(3):
    print(f"Sample {i+1}")
    print("prompt  :", train_small[i]["prompt"][:200])
    print("chosen  :", train_small[i]["chosen"][:200])
    print("rejected:", train_small[i]["rejected"][:200])
    print("-" * 30)

Available splits: ['train_prefs', 'test_prefs']
Sample 1
prompt  : Please provide the content structure of the following text using [Latex] data format. Additionally, please ensure that the structure includes a table comparing the results of the two groups, as well a
chosen  : I am not able to write in LaTeX directly in this text box, but I can provide you with a template that you can use for your text. You can copy this template into a LaTeX document and modify it as neede
rejected: According to the article titled "Title of Article", the following is the Latex format data structure of the content:

\begin{table}[htbp]

\centering
\caption{Table comparing the results of the two gr
------------------------------
Sample 2
prompt  : Summorize the movie Beer from 1985
chosen  : "Beer," also known as "The Selling of America" or "Beer: The Movie," is a 1985 comedy film directed by Patrick Kelly. The story revolves around the advertising world and the marketing of a beer produc
rejected: "Bee

In [9]:
from datasets import load_dataset, DatasetDict
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
import torch

train_dataset = load_dataset("SAGI-1/ultrafeedback_binarized_dpo", split="train_prefs")
eval_dataset  = load_dataset("SAGI-1/ultrafeedback_binarized_dpo", split="test_prefs")

# ===== 4bit Quantization (QLoRA) =====
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    quantization_config=quant_config,
)

model.gradient_checkpointing_enable()
model.config.use_cache = False

# ===== LoRA (PEFT) =====
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],  # Qwen2 계열
)

# ===== DPO 설정 =====
dpo_cfg = DPOConfig(
    output_dir=OUTPUT_DIR,
    seed=SEED,
    # 메모리 안정화 세팅
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,     # VRAM 부족 시 늘리기
    learning_rate=5e-6,
    warmup_ratio=0.03,
    logging_steps=10,
    evaluation===_strategy="steps",
    eval_steps=200,
    save_steps=200,
    bf16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",          # 4bit에 적합한 옵티마이저
    # DPO 전용/길이 제한
    beta=0.1,
    max_prompt_length=512,             # 너무 길면 OOM → 512/768 부터 점진 증가
    max_length=768,                    # 응답 포함 총 길이 상한
    max_steps=1000,                    # 개발용; 안정화되면 epochs로 전환 가능
    remove_unused_columns=False,       # DPO에서 안전
    report_to="none",
)

# ===== Trainer =====
trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,
    args=dpo_cfg,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_model()

Extracting prompt in train dataset:   0%|          | 0/61966 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/61966 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/61966 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
200,0.6556,0.638345,0.079143,-0.068675,0.666,0.147818,-413.460907,-325.393707,-2.775473,-2.774876
400,0.6266,0.614039,0.109011,-0.140045,0.6905,0.249056,-413.162231,-326.107452,-2.795579,-2.796163
600,0.5966,0.607598,0.141763,-0.146722,0.6855,0.288485,-412.834686,-326.174194,-2.810485,-2.811951
800,0.6135,0.603034,0.150612,-0.163944,0.686,0.314556,-412.746185,-326.346405,-2.811163,-2.813258
1000,0.6324,0.600783,0.154086,-0.171658,0.689,0.325744,-412.711456,-326.423523,-2.811383,-2.813726




In [11]:
# ===== Inference: quick sanity check on a couple of train samples =====
import random, torch
from transformers import TextStreamer

# (추가) 모델 실제 dtype 감지 → bf16/float16 자동 선택
def _infer_model_dtype(model):
    try:
        return next(model.parameters()).dtype
    except StopIteration:
        return torch.float16  # fallback

MODEL_DTYPE = _infer_model_dtype(model)
AMP_DTYPE = torch.bfloat16 if MODEL_DTYPE == torch.bfloat16 else torch.float16
AMP_ENABLED = torch.cuda.is_available()

# 1) 안전한 prompt 추출기 (데이터 스키마가 달라도 동작)
def _extract_prompt(ex):
    for k in ["prompt", "instruction", "question", "query", "input"]:
        if k in ex and isinstance(ex[k], str) and ex[k].strip():
            return ex[k]
    if "messages" in ex and isinstance(ex["messages"], list):
        for msg in reversed(ex["messages"]):
            if isinstance(msg, dict) and msg.get("role") == "user" and isinstance(msg.get("content"), str):
                return msg["content"]
    return ex.get("prompt", "") or ex.get("instruction", "") or ""

# 2) chat template 있으면 사용, 없으면 간단 포맷
def _format_for_gen(tokenizer, prompt: str) -> str:
    tmpl = getattr(tokenizer, "chat_template", None)
    if tmpl:
        messages = [{"role": "user", "content": prompt}]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return f"### Instruction:\n{prompt}\n\n### Response:"

# 3) 한 샘플 생성  <-- 여기 수정!
def generate_reply(model, tokenizer, prompt: str,
                   max_new_tokens=128, temperature=0.7, top_p=0.9):
    text = _format_for_gen(tokenizer, prompt)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.inference_mode():
        # 모델 dtype에 맞춰 autocast (bf16이면 bf16, 아니면 fp16)
        ctx = (torch.cuda.amp.autocast(dtype=AMP_DTYPE) if AMP_ENABLED else contextlib.nullcontext())
        with ctx:
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )
    return tokenizer.decode(out[0], skip_special_tokens=True)

# 4) 학습셋에서 1~2개 뽑아 스모크 테스트
random.seed(42)
num_samples = min(2, len(train_dataset))
indices = random.sample(range(len(train_dataset)), k=num_samples)

for idx in indices:
    ex = train_dataset[int(idx)]
    prompt = _extract_prompt(ex) or "Briefly explain what DPO is."
    print(f"\n=== Sample idx {idx} ===")
    print("[PROMPT]", prompt[:200] + ("..." if len(prompt) > 200 else ""))
    output = generate_reply(model, tokenizer, prompt,
                            max_new_tokens=128, temperature=0.7)
    print("\n[GENERATED]\n", output[:500] + ("..." if len(output) > 500 else ""))



=== Sample idx 41905 ===
[PROMPT] Name two African countries


  ctx = (torch.cuda.amp.autocast(dtype=AMP_DTYPE) if AMP_ENABLED else contextlib.nullcontext())



[GENERATED]
 system
You are a helpful assistant.
user
Name two African countries
assistant
Two African countries that you might be thinking of are:

1. Nigeria: This country is the largest in Africa and has a rich history, culture, and language. It is also known for its rich oil reserves.

2. South Sudan: Also known as the "Sudanese Republic of Southern Sudan," this country gained independence from Ethiopia on December 10, 1963, and it has since become an independent nation. The country's economy is heavily ...

=== Sample idx 7296 ===
[PROMPT] In this task, you have to generate the title of the recipe given its required ingredients and directions.
--------
Question: ingredients:  '1/2 cup whole milk or 1/2 cup low-fat milk, cold', '1/2 cup ...

[GENERATED]
 system
You are a helpful assistant.
user
In this task, you have to generate the title of the recipe given its required ingredients and directions.
--------
Question: ingredients:  '1/2 cup whole milk or 1/2 cup low-fat milk, cold'