# Install dependencies

In [None]:
# !pip install -qqq datasets==3.2.0 transformers==4.47.1 trl==0.14.0 peft==0.14.0 accelerate==1.2.1 bitsandbytes==0.45.2 wandb==0.19.7 --progress-bar off
# !pip install -qqq flash-attn --no-build-isolation --progress-bar off

In [None]:
!pip install -qqq datasets transformers trl peft accelerate bitsandbytes wandb --progress-bar off


In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Import and log in to Weights & Biases
Weights & Biases is a tool for logging and monitoring your experiments. We’ll use it to log our fine-tuning process.

In [None]:
import wandb

# Authenticate with Weights & Biases up front; the run itself is created later
# with wandb.init() right before training starts.
wandb.login()

# Load the dataset
Now, let’s load the dataset. In this case, we’ll use the **mlabonne/smoltldr** dataset, which contains a list of short stories.

In [None]:
# Load the "mlabonne/smoltldr" dataset and print it to inspect what it contains.
dataset = load_dataset("mlabonne/smoltldr")
print(dataset)

In [None]:
# Show the first record of the "train" split.
dataset["train"][0]

# Load model
Now, let’s load the model.

For this exercise, we’ll use the **SmolLM-135M-Instruct** model.

This is a small 135M parameter model that runs on limited hardware. This makes the model ideal for learning, but it’s not the most powerful model out there. If you have access to more powerful hardware, you can try to fine-tune a larger model like **SmolLM2-1.7B**.

In [None]:
# Hub identifier of the base model; the same id is used for the weights and
# for the tokenizer below.
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    # Alternative attention backend, left disabled (the flash-attn install is
    # also commented out in the dependency cell):
    # attn_implementation="flash_attention_2",
    attn_implementation="sdpa"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load LoRA
Now, let’s load the LoRA configuration. We’ll take advantage of LoRA to reduce the number of trainable parameters, and in turn the memory footprint we need to fine-tune the model.

In [None]:
# Load LoRA
# Adapter settings: rank 16, alpha 32, applied to all linear layers of a
# causal language model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)
# Wrap the base model with the LoRA adapter; `model` now refers to the PEFT model.
model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints its own report and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
model.print_trainable_parameters()

# Define the reward function
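GRPO can use any reward function to improve the model. In this case, we’ll use a simple reward function that encourages the model to generate completions whose length is close to 50. Note that the function below measures length with `len(completion)`, so when completions are plain strings the target is 50 characters rather than 50 tokens.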

In [None]:
# Reward function
ideal_length = 50


def reward_len(completions, **kwargs):
    """Score each completion by how far its length is from ``ideal_length``.

    Args:
        completions: Iterable of sized objects; ``len()`` is applied to each
            one (for plain strings this is the number of characters).
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A list with one score per completion: ``0`` when the length equals
        ``ideal_length``, and more negative the further it deviates.
    """
    rewards = []
    for completion in completions:
        distance = abs(ideal_length - len(completion))
        rewards.append(-distance)
    return rewards

# Define the training arguments
Now, let’s define the training arguments. We’ll use the GRPOConfig class to define the training arguments in a typical transformers style.

If this is the first time you’re defining training arguments, you can check the `TrainingArguments` class for more information.

In [None]:
# Training arguments
# One epoch of GRPO training in bf16, writing to the "GRPO" output directory
# and reporting to Weights & Biases at every step (logging_steps=1).
training_args = GRPOConfig(
    output_dir="GRPO",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    max_prompt_length=512,
    max_completion_length=96,
    # NOTE(review): presumably the number of completions sampled per prompt;
    # it matches per_device_train_batch_size here. Confirm against the TRL docs
    # before changing either value independently.
    num_generations=8,
    # NOTE(review): 8-bit AdamW; presumably relies on the bitsandbytes package
    # installed in the dependency cell. Confirm.
    optim="adamw_8bit",
    num_train_epochs=1,
    bf16=True,
    report_to=["wandb"],
    remove_unused_columns=False,
    logging_steps=1,
)

Now, we can initialize the trainer with model, dataset, and training arguments and start training.

In [None]:
# Trainer
# Uses the LoRA-wrapped model, reward_len as the only reward function, and the
# "train" split of the dataset.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_len],
    args=training_args,
    train_dataset=dataset["train"],
)

# Train model
# Start a W&B run in the "GRPO" project, then launch training.
wandb.init(project="GRPO")
trainer.train()

# Save and publish the model
Let’s share the model with the community!

In [None]:
# Merge the LoRA adapter into the base weights so the published model is a
# plain model rather than base + adapter.
merged_model = trainer.model.merge_and_unload()

hub_repo_id = "SmolGRPO-135M"
merged_model.push_to_hub(
    hub_repo_id, private=False, tags=["GRPO", "Reasoning-Course"]
)
# Publish the tokenizer to the same repository so the Hub repo can be loaded
# on its own, without fetching the tokenizer from the base model.
tokenizer.push_to_hub(hub_repo_id)

# Generate text
🎉 You’ve successfully fine-tuned a model with GRPO! Now, let’s generate some text with the model.

First, we’ll define a really long document!

In [None]:
# Document passed to the model, kept as a single multi-line string.
prompt = """
# A long document about the Cat

The cat (Felis catus), also referred to as the domestic cat or house cat, is a small
domesticated carnivorous mammal. It is the only domesticated species of the family Felidae.
Advances in archaeology and genetics have shown that the domestication of the cat occurred
in the Near East around 7500 BC. It is commonly kept as a pet and farm cat, but also ranges
freely as a feral cat avoiding human contact. It is valued by humans for companionship and
its ability to kill vermin. Its retractable claws are adapted to killing small prey species
such as mice and rats. It has a strong, flexible body, quick reflexes, and sharp teeth,
and its night vision and sense of smell are well developed. It is a social species,
but a solitary hunter and a crepuscular predator. Cat communication includes
vocalizations—including meowing, purring, trilling, hissing, growling, and grunting—as
well as body language. It can hear sounds too faint or too high in frequency for human ears,
such as those made by small mammals. It secretes and perceives pheromones.
"""

# Wrap the document as a single user turn in chat-message form.
messages = [
    {"role": "user", "content": prompt},
]

In [None]:
# !rm -rf ~/.cache/huggingface

In [None]:
# Repeats the import from the top of the notebook; presumably kept so this
# inference section can be run on its own in a fresh kernel.
from transformers import AutoModelForCausalLM, AutoTokenizer
# The tokenizer is loaded from the base model repository...
model_id_tok = "HuggingFaceTB/SmolLM-135M-Instruct"

# ...while the weights come from the published fine-tuned repository.
# NOTE: this rebinds `model_id`, `model` and `tokenizer` from the training
# part of the notebook.
model_id = "Mhammad2023/SmolGRPO-135M"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id_tok)


In [None]:
from transformers import pipeline

# Text-generation pipeline built from the model and tokenizer loaded in the
# previous cell.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generation settings forwarded to the generator call: sampling is enabled,
# with at most 256 new tokens.
generate_kwargs = {
    "max_new_tokens": 256,
    "do_sample": True,
    "temperature": 0.5,
    "min_p": 0.1,
}

# Run generation on the chat-style `messages`; the bare name on the last line
# displays the result as the cell output.
generated_text = generator(messages, **generate_kwargs)
generated_text
