In [1]:
import pprint
import os

# Set the path before importing HF libraries.
# setdefault() keeps an HF_HOME that is already exported in the environment and
# only falls back to this machine-specific location when none is set, so the
# notebook does not silently override a cache directory chosen by the user.
os.environ.setdefault("HF_HOME", "/media/aiseed/AISeed")

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig
import json

# Probe accelerators in priority order: CUDA first, then Apple MPS, else CPU.
# The MPS probe is only evaluated when CUDA is unavailable (short-circuit).
_cuda_ready = torch.cuda.is_available()
_mps_ready = (
    not _cuda_ready
    and hasattr(torch.backends, 'mps')
    and torch.backends.mps.is_available()
)

device = "cuda" if _cuda_ready else ("mps" if _mps_ready else "cpu")

# Report what was selected.
if device == "cuda":
    print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
elif device == "mps":
    print("Using Apple MPS")
else:
    print("Using CPU - you will need HF Jobs with GPU for actual training")

# Authenticate with Hugging Face
# NOTE: login() is called without arguments; in the recorded run this cell's
# output is an interactive widget, so it presumably waits for a token to be
# entered before the session is authenticated - confirm before running headless.
from huggingface_hub import login
login()  # Required for HF Jobs and model uploads

Using CUDA GPU: NVIDIA GeForce RTX 4070 SUPER
GPU memory: 12.4GB


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load a preference dataset to understand the format
# Only the "train" split of Anthropic/hh-rlhf is loaded; the printed feature
# keys show that each row carries a 'chosen' and a 'rejected' field.
dataset = load_dataset("Anthropic/hh-rlhf", split="train")
print(f"Dataset size: {len(dataset)}")
print("Dataset features:", dataset.features.keys())

# Examine a preference pair
# Row 2 is used as the example; each side is truncated to its first 200
# characters to keep the preview short.
sample = dataset[2]
print(f"\nChosen (Preferred): {sample['chosen'][:200]}...")
print(f"\nRejected (Non-preferred): {sample['rejected'][:200]}...")

# This shows how DPO learns to prefer "chosen" responses over "rejected" ones
# (in the printed sample both sides start with the same Human turn and differ
# in the Assistant reply).

Dataset size: 160800
Dataset features: dict_keys(['chosen', 'rejected'])

Chosen (Preferred): 

Human: If you were going to steal from a convenience store, do you think it would be better in the morning or at night?

Assistant: I really couldn’t say, I’m not familiar with stealing convenience ...

Rejected (Non-preferred): 

Human: If you were going to steal from a convenience store, do you think it would be better in the morning or at night?

Assistant: It is good to consider the difference in human traffic at night, a...


In [4]:
# Load a small subset for local testing (first 1000 rows of the train split)
small_dataset = dataset.select(range(1000))

# Load the SmolLM3-3B checkpoint in bfloat16; device_map="auto" lets the
# loader decide where to place the weights.
model_name = "HuggingFaceTB/SmolLM3-3B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Only fall back to EOS for padding when the tokenizer ships without a pad
# token; unconditionally overwriting it would discard a dedicated pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Configure DPO training for local testing
training_args = DPOConfig(
    beta=0.1,                           # Preference optimization strength
    learning_rate=5e-7,                 # Lower than SFT
    per_device_train_batch_size=1,      # Small batch for local testing
    gradient_accumulation_steps=4,      # Effective batch size = 4 (per device)
    max_steps=500,                      # ~2 passes over the 1000-example subset on one device
    logging_steps=10,
    # Write a checkpoint every 50 steps. With the default interval (500) no
    # intermediate checkpoint exists, so "checkpoint-50" - which the evaluation
    # cell loads - would never be created and an interrupted run saves nothing.
    save_steps=50,
    output_dir="./local_dpo_test",
    report_to="trackio",
)

from peft import LoraConfig

# LoRA adapter settings handed to the DPO trainer.
peft_config = LoraConfig(
    # What is adapted: the q_proj / v_proj modules of a causal LM.
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    # Adapter size and scaling.
    r=8,
    lora_alpha=16,
    # Regularisation; bias terms are not trained.
    lora_dropout=0.05,
    bias="none",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Build the DPO trainer from the pieces prepared earlier. Constructing it does
# not start training; that only happens when trainer.train() is called.
trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=small_dataset,
    peft_config=peft_config,
    args=training_args,
)

# Status messages, one per line.
print(
    "Local DPO trainer configured successfully!",
    "Ready to scale to HF Jobs for full training...",
    sep="\n",
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


Local DPO trainer configured successfully!
Ready to scale to HF Jobs for full training...


In [None]:
# Run DPO training with the trainer built above. The loss is logged every 10
# steps (logging_steps=10) and metrics are reported to trackio
# (report_to="trackio"); checkpoints go under ./local_dpo_test (output_dir).
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 128012}.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Kaori1707/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Kaori1707/trackio
* View dashboard by going to: https://Kaori1707-trackio.hf.space/


* Created new run: Kaori1707-1766559019




Step,Training Loss
10,0.6947
20,0.6959
30,0.6915
40,0.6946
50,0.6946
60,0.6931
70,0.6918
80,0.6884
90,0.6907
100,0.6916


In [None]:
# clean gpu memory
import gc
import torch

# torch.cuda.empty_cache() only hands back cached blocks that no live tensor
# is using. While `trainer` and `model` are still bound in the notebook
# namespace the weights stay allocated, so drop those references first.
# pop(..., None) instead of `del` keeps the cell safe to re-run.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

# Collect reference cycles so the freed tensors are actually released, then
# return the now-unused cached memory to the GPU driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Local evaluation of your trained model
from pathlib import Path

from transformers import pipeline

# Load your trained model.
# "checkpoint-50" is still preferred when it exists. If it does not (the
# trainer only writes checkpoints at its configured save interval), fall back
# to the highest-numbered checkpoint-<step> directory instead of failing with
# an opaque loading error.
checkpoint_root = Path("local_dpo_test")
checkpoint_dir = checkpoint_root / "checkpoint-50"
if not checkpoint_dir.is_dir():
    saved_checkpoints = sorted(
        (
            path
            for path in checkpoint_root.glob("checkpoint-*")
            if path.is_dir() and path.name.rpartition("-")[2].isdigit()
        ),
        key=lambda path: int(path.name.rpartition("-")[2]),  # sort by step number
    )
    if not saved_checkpoints:
        raise FileNotFoundError(
            f"No checkpoint-* directory found under '{checkpoint_root}'; "
            "run training with checkpoint saving enabled first."
        )
    checkpoint_dir = saved_checkpoints[-1]

# as_posix() keeps the forward-slash form of the original path string.
model_name = checkpoint_dir.as_posix()
generator = pipeline("text-generation", model=model_name, tokenizer=model_name)

# Everyday advice-style questions used to spot-check the model's answers
test_prompts = [
    "How should I handle a disagreement with my friend?",
    "What's the best way to learn programming?",
    "How can I be more productive at work?",
    "What should I do if I see someone being bullied?",
]

print("=== DPO Model Alignment Test ===")

# Sampling is stochastic (do_sample=True); fix the seed so a re-run of this
# cell produces the same responses.
torch.manual_seed(42)

for prompt in test_prompts:
    # return_full_text=False makes the pipeline return only the newly generated
    # continuation, replacing the manual slice by len(prompt).
    # max_length=200 is kept as-is (it bounds prompt plus completion tokens).
    response = generator(
        prompt,
        max_length=200,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    print(f"\nPrompt: {prompt}")
    print(f"Response: {response[0]['generated_text'].strip()}")