In [1]:
%%capture
# Environment setup. `%%capture` swallows this cell's output, so pip logs
# (including any install errors) are not shown in the notebook.
# Unsloth is installed from the head of its GitHub repository; no version or
# commit is pinned here, so a later re-run may resolve to different code.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__ as torch_version
from packaging.version import Version as V

# Pin xformers to 0.0.27 when the installed torch is older than 2.4.0;
# otherwise install whatever xformers release pip picks.
xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"
# `--no-deps` installs exactly the listed packages without pulling in their
# dependencies (presumably to leave the preinstalled torch untouched — confirm).
# `{xformers}` is IPython interpolation of the Python variable defined above.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes datasets


In [2]:
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
from unsloth.chat_templates import get_chat_template
PatchDPOTrainer()   # patch TRL’s DPOTrainer for Unsloth

from datasets import Dataset
from transformers import TrainingArguments
from trl import DPOTrainer

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model-loading settings; all three are forwarded unchanged to
# FastLanguageModel.from_pretrained in the model-loading cell.
max_seq_length = 512   # sequence-length setting handed to the loader
dtype = None           # no explicit dtype is requested from the loader
load_in_4bit = True    # ask the loader for 4-bit weights


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Toy preference data for DPO. Each row holds one prompt together with the
# completion that should be preferred ("chosen") and the one that should not
# ("rejected"); the rows are expanded into dicts keyed by _PREF_COLUMNS.
_PREF_COLUMNS = ("prompt", "chosen", "rejected")

_PREF_ROWS = (
    # Code correctness: n * n versus the wrong n + n.
    (
        "Write a Python function that returns the square of a number.",
        "def square(n):\n    return n * n",
        "def square(n):\n    return n + n",
    ),
    # Tone: supportive versus dismissive.
    (
        "Give a short, supportive message to a stressed student.",
        "You’re doing your best, and that’s enough. Take a breath, rest when you can, and remember that progress matters more than perfection.",
        "Just work harder and stop relaxing so much.",
    ),
    # Factual accuracy of an explanation.
    (
        "Explain what a Python list comprehension is in one short paragraph.",
        "A list comprehension is a compact way to build lists in Python using a single expression that combines a for-loop and optional condition.",
        "A list comprehension is when you write a list by hand, item by item, without using any loops at all.",
    ),
)

prefs = [dict(zip(_PREF_COLUMNS, row)) for row in _PREF_ROWS]

# Wrap the in-memory preference records in a `datasets.Dataset`; this is the
# object later passed to the DPO trainer as `train_dataset`.
pref_ds = Dataset.from_list(prefs)
# Bare expression on the last line: the notebook displays the dataset summary
# (column names and row count).
pref_ds


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 3
})

In [4]:
# Load the SmolLM2-135M-Instruct checkpoint and its tokenizer through Unsloth.
# Sequence length, dtype and 4-bit loading come from the settings defined in
# the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/SmolLM2-135M-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Optional: rebind `tokenizer` to one that carries the "chatml" chat template.
# The mapping names the message fields ("from"/"value") and role labels
# ("human"/"gpt") to expect in conversation-style records. The preference data
# in this notebook is plain prompt/chosen/rejected strings, so the mapping only
# matters if conversation-formatted prompts are introduced later.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping=dict(
        role="from",
        content="value",
        user="human",
        assistant="gpt",
    ),
    map_eos_token=True,
)

# Attach LoRA adapters so DPO trains only the adapter weights. The logged run
# reports 4,884,480 trainable parameters out of 139,400,064.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's own checkpointing mode
)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.
Unsloth 2025.11.2 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


In [8]:
from trl import DPOTrainer, DPOConfig

import os
# Weights & Biases logging is turned off through `report_to="none"` in the
# config below. The WANDB_DISABLED environment variable is deliberately NOT
# set: transformers reports it as deprecated (to be removed in v5) and points
# to `report_to` as the replacement, which is already used here.

# DPO hyper-parameters.
# NOTE(review): `learning_rate` is not set, so the DPOConfig default applies.
# The logged training loss (0.6796) is close to ln 2 ≈ 0.693, the DPO loss of a
# policy identical to its reference — confirm the default is intended.
dpo_args = DPOConfig(
    output_dir                  = "smollm2-dpo-output",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_ratio                = 0.1,
    num_train_epochs            = 5,
    logging_steps               = 5,
    optim                       = "adamw_8bit",
    seed                        = 42,
    # Mixed precision: bf16 where the GPU supports it, fp16 otherwise, so
    # exactly one of the two flags is True.
    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
    max_length                  = 256,
    max_prompt_length           = 128,
    # DPO temperature. It belongs on DPOConfig; it was previously passed as a
    # DPOTrainer keyword, which the TRL trainer signature does not document.
    beta                        = 0.1,
    report_to                   = "none",   # no external experiment trackers
)


dpo_trainer = DPOTrainer(
    model            = model,
    # No separate reference model is supplied. Per the TRL docs the trainer
    # then derives the reference from the policy itself (for PEFT models, by
    # disabling the adapter) — confirm against the installed TRL version.
    ref_model        = None,
    args             = dpo_args,
    train_dataset    = pref_ds,
    # `processing_class` is the current name for the deprecated `tokenizer`
    # keyword (requires a TRL release that has the rename).
    processing_class = tokenizer,
)

# Last expression of the cell: the notebook displays the returned TrainOutput.
dpo_trainer.train()


num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


Extracting prompt in train dataset (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


Applying chat template to train dataset (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


Tokenizing train dataset (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3 | Num Epochs = 5 | Total steps = 5
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
5,0.6796,0.018146,-0.009302,0.6,0.027449,-64.486488,-52.366478,6.917511,8.532716,0,0,0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=5, training_loss=0.6796460151672363, metrics={'train_runtime': 18.5595, 'train_samples_per_second': 0.808, 'train_steps_per_second': 0.269, 'total_flos': 0.0, 'train_loss': 0.6796460151672363, 'epoch': 5.0})

In [9]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

# Half-precision dtype used for generation: bf16 when supported, else fp16.
if is_bfloat16_supported():
    inference_dtype = torch.bfloat16
else:
    inference_dtype = torch.float16

# NOTE(review): `model` was loaded with load_in_4bit=True and this cast runs
# before the save cell, so the saved weights are the cast ones. Confirm that an
# explicit `.to(dtype=...)` is needed (and safe) ahead of `for_inference`.
model = model.to(device=device, dtype=inference_dtype)
# Switch the model into Unsloth's inference mode.
FastLanguageModel.for_inference(model)

def dpo_chat(prompt, max_new_tokens=128):
    """Sample one completion for ``prompt`` and print the decoded sequence.

    The prompt is tokenized exactly as given (no chat template is applied
    here) with the notebook-level ``tokenizer``, moved to ``device`` and fed
    to the notebook-level ``model``. Generation uses sampling
    (``top_p=0.9``, ``temperature=0.7``) and no seed is set in this function,
    so repeated calls may print different text.

    Args:
        prompt: Text to condition generation on.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        None. The first returned sequence is decoded with special tokens
        skipped and printed; in the recorded run the printed text starts
        with the prompt itself.
    """
    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        use_cache=True,
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    # inference_mode: no autograd bookkeeping while generating.
    with torch.inference_mode():
        generated = model.generate(**encoded, **sampling_options)
    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(text)

# Smoke-test generation on two prompts that also appear in the training data.
for sample_prompt in (
    "Write a Python function that returns the square of a number.",
    "Give a short, supportive message to a stressed student.",
):
    dpo_chat(sample_prompt)


Write a Python function that returns the square of a number.
Give a short, supportive message to a stressed student.

Your response should contain at least 3 sentences.

Thank you for your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I appreciate your time.

I


In [10]:
# Persist the tuned model and then the tokenizer into the same directory, so
# the folder can be reloaded as one unit.
# NOTE(review): for a LoRA-wrapped model `save_pretrained` presumably writes
# only the adapter weights, not a merged model — confirm this is what is wanted.
save_dir = "smollm2-135m-dpo-preference-final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print(f"Saved DPO-tuned model to {save_dir}")


Saved DPO-tuned model to smollm2-135m-dpo-preference-final
