In [1]:
import torch
import os
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

# Authenticate to Hugging Face.
# login() is called without arguments, so the credentials are collected
# interactively (the recorded output of this cell is a notebook widget).
# The trainer.push_to_hub(...) call at the end of the notebook relies on
# this step having completed.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Fetch the preference dataset from the Hub. No split is requested, so the
# result is a DatasetDict holding every split the dataset provides.
dataset = load_dataset("trl-lib/ultrafeedback_binarized")


In [10]:
# Display the loaded object to check its splits, columns and row counts
# before handing the "train" and "test" splits to the trainer.
dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
        num_rows: 62135
    })
    test: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
        num_rows: 1000
    })
})

In [4]:
# Base checkpoint that will be fine-tuned
model_name = "HuggingFaceTB/SmolLM2-135M"

# Choose the compute device: CUDA when available, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model weights in float32 and move them to the selected device
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.float32,
)
model = model.to(device)
# Disable `use_cache` on the model config for training
model.config.use_cache = False

# Load the matching tokenizer; setup_chat_format returns the (model, tokenizer)
# pair that is used from here on.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = setup_chat_format(model, tokenizer)

# Name and tags under which the fine-tune is saved locally and uploaded.
# NOTE(review): the name says "DPO" although this notebook trains with ORPO;
# it is left unchanged because it is also used as the Hub repository id.
finetune_name = "SmolLM2-FT-DPO"
finetune_tags = ["smol-course", "module_1"]

In [5]:
# Training configuration for the ORPO run.
# Uses `device` and `finetune_name` from the model-setup cell.
orpo_args = ORPOConfig(
    # Small learning rate to prevent catastrophic forgetting
    learning_rate=8e-6,
    # Linear learning rate decay over training
    lr_scheduler_type="linear",
    # Maximum combined length of prompt + completion
    max_length=1024,
    # Maximum length for input prompts
    max_prompt_length=512,
    # Controls weight of the odds ratio loss (λ in paper)
    beta=0.1,
    # Batch size for training and evaluation (per device)
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Accumulate gradients over 4 batches before each optimizer update
    gradient_accumulation_steps=4,
    # Memory-efficient optimizer for CUDA, falls back to adamw_torch for CPU/MPS
    optim="paged_adamw_8bit" if device == "cuda" else "adamw_torch",
    # Number of training epochs
    num_train_epochs=1,
    # When to run evaluation. `eval_strategy` is the current argument name;
    # the older `evaluation_strategy` spelling is deprecated.
    eval_strategy="steps",
    # A float below 1 is a fraction of total training steps:
    # evaluate every 20% of training
    eval_steps=0.2,
    # Log metrics every step
    logging_steps=1,
    # Gradual learning rate warmup
    warmup_steps=10,
    # Disable external logging. This must be the string "none":
    # `report_to=None` is treated as "use the default", which reports to
    # every installed logging integration.
    report_to="none",
    # Where to save model/checkpoints
    output_dir="./results/",
    # Repository name used when the model is pushed to the Hugging Face Hub
    hub_model_id=finetune_name,
)



In [8]:
# NOTE(review): stale duplicate. This execution (In [8]) happened before the
# dataset was loaded in the In [9] cell and failed with the KeyError recorded
# below — presumably `dataset` was a single split at that point, so
# dataset["train"] was looked up as a column. The same trainer construction is
# repeated in the In [11] cell, which is the run that succeeded. On a fresh
# Restart & Run All this cell is redundant; consider deleting it.
trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

KeyError: "Column train not in the dataset. Current columns in the dataset: ['chosen', 'rejected', 'score_chosen', 'score_rejected']"

In [11]:
# Build the ORPO trainer from the objects prepared in the earlier cells.
trainer = ORPOTrainer(
    # ORPOConfig with the training hyperparameters
    args=orpo_args,
    # Model to fine-tune, with the tokenizer passed as its processing class
    model=model,
    processing_class=tokenizer,
    # Splits of the loaded DatasetDict used for training and evaluation
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)



Map:   0%|          | 0/62135 [00:00<?, ? examples/s]

Map:   0%|          | 0/62135 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
# The three steps below must run in this order: train, save locally, upload.
trainer.train()  # Train the model

# Save the trained model to a local directory named after the fine-tune
trainer.save_model(f"./{finetune_name}")

# Upload the model to the Hugging Face Hub with the course tags.
# The push is unconditional: there is no HF_TOKEN check here, so it relies on
# the login() call at the top of the notebook having completed.

trainer.push_to_hub(tags=finetune_tags)

  0%|          | 0/7767 [00:00<?, ?it/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 2.2484, 'grad_norm': 12.793513298034668, 'learning_rate': 8e-07, 'rewards/chosen': -0.1963876485824585, 'rewards/rejected': -0.1509920060634613, 'rewards/accuracies': 0.5, 'rewards/margins': -0.04539564996957779, 'logps/rejected': -1.5099201202392578, 'logps/chosen': -1.9638766050338745, 'logits/rejected': 10.306060791015625, 'logits/chosen': 12.686932563781738, 'nll_loss': 2.126336097717285, 'log_odds_ratio': -1.220231056213379, 'log_odds_chosen': -0.5531085729598999, 'epoch': 0.0}
{'loss': 2.4573, 'grad_norm': 10.085935592651367, 'learning_rate': 1.6e-06, 'rewards/chosen': -0.3073977530002594, 'rewards/rejected': -0.2397567629814148, 'rewards/accuracies': 0.5, 'rewards/margins': -0.0676409974694252, 'logps/rejected': -2.3975675106048584, 'logps/chosen': -3.0739779472351074, 'logits/rejected': 14.947059631347656, 'logits/chosen': 11.702362060546875, 'nll_loss': 2.307814598083496, 'log_odds_ratio': -1.4951127767562866, 'log_odds_chosen': -0.637850821018219, 'epoch': 0.0}
{'los

  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.9139823913574219, 'eval_runtime': 38.3111, 'eval_samples_per_second': 26.102, 'eval_steps_per_second': 13.051, 'eval_rewards/chosen': -0.15235158801078796, 'eval_rewards/rejected': -0.16538509726524353, 'eval_rewards/accuracies': 0.5230000019073486, 'eval_rewards/margins': 0.013033492490649223, 'eval_logps/rejected': -1.653850793838501, 'eval_logps/chosen': -1.5235159397125244, 'eval_logits/rejected': 8.145137786865234, 'eval_logits/chosen': 7.481029987335205, 'eval_nll_loss': 1.84049391746521, 'eval_log_odds_ratio': -0.7348867058753967, 'eval_log_odds_chosen': 0.15695035457611084, 'epoch': 0.2}
{'loss': 1.6519, 'grad_norm': 4.983182430267334, 'learning_rate': 6.406600489880108e-06, 'rewards/chosen': -0.14299535751342773, 'rewards/rejected': -0.12796089053153992, 'rewards/accuracies': 0.375, 'rewards/margins': -0.015034476295113564, 'logps/rejected': -1.2796088457107544, 'logps/chosen': -1.4299535751342773, 'logits/rejected': 6.913740158081055, 'logits/chosen': 7.130329

  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.8821699619293213, 'eval_runtime': 38.3385, 'eval_samples_per_second': 26.083, 'eval_steps_per_second': 13.042, 'eval_rewards/chosen': -0.1494521200656891, 'eval_rewards/rejected': -0.16205526888370514, 'eval_rewards/accuracies': 0.5199999809265137, 'eval_rewards/margins': 0.012603155337274075, 'eval_logps/rejected': -1.620552659034729, 'eval_logps/chosen': -1.494521141052246, 'eval_logits/rejected': 7.472309589385986, 'eval_logits/chosen': 6.906576633453369, 'eval_nll_loss': 1.8083770275115967, 'eval_log_odds_ratio': -0.737927258014679, 'eval_log_odds_chosen': 0.15385515987873077, 'epoch': 0.4}
{'loss': 1.8145, 'grad_norm': 7.3109564781188965, 'learning_rate': 4.803919040866314e-06, 'rewards/chosen': -0.1684587448835373, 'rewards/rejected': -0.15200379490852356, 'rewards/accuracies': 0.25, 'rewards/margins': -0.016454944387078285, 'logps/rejected': -1.5200380086898804, 'logps/chosen': -1.6845872402191162, 'logits/rejected': 8.63050651550293, 'logits/chosen': 6.894591808

  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.865883231163025, 'eval_runtime': 38.3384, 'eval_samples_per_second': 26.083, 'eval_steps_per_second': 13.042, 'eval_rewards/chosen': -0.14807617664337158, 'eval_rewards/rejected': -0.1598084568977356, 'eval_rewards/accuracies': 0.5249999761581421, 'eval_rewards/margins': 0.011732283979654312, 'eval_logps/rejected': -1.598084568977356, 'eval_logps/chosen': -1.4807618856430054, 'eval_logits/rejected': 7.327177047729492, 'eval_logits/chosen': 6.775630950927734, 'eval_nll_loss': 1.7921565771102905, 'eval_log_odds_ratio': -0.7372652292251587, 'eval_log_odds_chosen': 0.14499147236347198, 'epoch': 0.6}
{'loss': 2.2215, 'grad_norm': 6.1169328689575195, 'learning_rate': 3.20123759185252e-06, 'rewards/chosen': -0.17971274256706238, 'rewards/rejected': -0.1433991938829422, 'rewards/accuracies': 0.25, 'rewards/margins': -0.03631354123353958, 'logps/rejected': -1.4339919090270996, 'logps/chosen': -1.797127366065979, 'logits/rejected': 7.380715370178223, 'logits/chosen': 7.7823328971

  0%|          | 0/500 [00:00<?, ?it/s]

{'eval_loss': 1.8576679229736328, 'eval_runtime': 38.3121, 'eval_samples_per_second': 26.101, 'eval_steps_per_second': 13.051, 'eval_rewards/chosen': -0.14742328226566315, 'eval_rewards/rejected': -0.15901778638362885, 'eval_rewards/accuracies': 0.5260000228881836, 'eval_rewards/margins': 0.011594523675739765, 'eval_logps/rejected': -1.5901780128479004, 'eval_logps/chosen': -1.47423255443573, 'eval_logits/rejected': 7.208296298980713, 'eval_logits/chosen': 6.679286956787109, 'eval_nll_loss': 1.7839444875717163, 'eval_log_odds_ratio': -0.7372352480888367, 'eval_log_odds_chosen': 0.14430376887321472, 'epoch': 0.8}
{'loss': 1.7883, 'grad_norm': 5.934256076812744, 'learning_rate': 1.5985561428387263e-06, 'rewards/chosen': -0.1621367633342743, 'rewards/rejected': -0.1953572928905487, 'rewards/accuracies': 0.375, 'rewards/margins': 0.03322052210569382, 'logps/rejected': -1.9535729885101318, 'logps/chosen': -1.6213676929473877, 'logits/rejected': 8.810615539550781, 'logits/chosen': 7.85616016

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/herooooooooo/SmolLM2-FT-DPO/commit/9730a7c8d89c91e494de023cd321878751d35a43', commit_message='End of training', commit_description='', oid='9730a7c8d89c91e494de023cd321878751d35a43', pr_url=None, repo_url=RepoUrl('https://huggingface.co/herooooooooo/SmolLM2-FT-DPO', endpoint='https://huggingface.co', repo_type='model', repo_id='herooooooooo/SmolLM2-FT-DPO'), pr_revision=None, pr_num=None)