In [1]:
# Make sure you set: Runtime â†’ Change runtime type â†’ GPU (T4/A100/etc.)

%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__ as torch_version
from packaging.version import Version as V

xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes datasets


In [2]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length = 512
dtype = None          # let Unsloth decide
load_in_4bit = True   # QLoRA-style


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [3]:
train_conversations = [
    {
        "conversations": [
            {"from": "human", "value": "You are a helpful coding assistant. Write a Python function that adds two numbers."},
            {"from": "gpt",   "value": "Sure! Here's a simple function:\n\n```python\ndef add(a, b):\n    return a + b\n```"},
        ]
    },
    {
        "conversations": [
            {"from": "human", "value": "Explain what a Python list comprehension is in one short paragraph."},
            {"from": "gpt",   "value": "A list comprehension is a compact way to create lists using a for-loop and optional condition inside one expression."},
        ]
    },
    {
        "conversations": [
            {"from": "human", "value": "Write a short motivational message for a student learning AI."},
            {"from": "gpt",   "value": "Learning AI is like learning a new superpowerâ€”start small, be curious, and keep experimenting."},
        ]
    },
    # add a few more if you want
]

dataset = Dataset.from_list(train_conversations)
dataset


Dataset({
    features: ['conversations'],
    num_rows: 3
})

In [6]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# 1) Load base SmolLM2 in 4-bit, no full finetune
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = "unsloth/SmolLM2-135M-Instruct",
    max_seq_length   = max_seq_length,
    dtype            = dtype,
    load_in_4bit     = True,
    full_finetuning  = False,   # important for LoRA
)

# 2) Attach chat template (same as Colab 1)
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
    map_eos_token = True,
)

# 3) Turn on LoRA with explicit target modules (official SmolLM2 pattern)
model = FastLanguageModel.get_peft_model(
    model,
    r          = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0.0,
    bias       = "none",
    use_gradient_checkpointing = "unsloth",
)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.2 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


In [7]:
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        for convo in convos
    ]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset[0]["text"])


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

<|im_start|>user
You are a helpful coding assistant. Write a Python function that adds two numbers.<|im_end|>
<|im_start|>assistant
Sure! Here's a simple function:

```python
def add(a, b):
    return a + b
```<|im_end|>



In [8]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_args = TrainingArguments(
    output_dir                  = "smollm2-135m-lora-finetune",
    num_train_epochs            = 3,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 1,
    learning_rate               = 3e-4,
    warmup_steps                = 5,
    logging_steps               = 5,
    save_strategy               = "epoch",
    fp16                        = True,
    report_to                   = "none",
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",
    max_seq_length     = max_seq_length,
    packing            = False,
    args               = training_args,
)

trainer.train()


num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3 | Num Epochs = 3 | Total steps = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss


TrainOutput(global_step=3, training_loss=2.4062676429748535, metrics={'train_runtime': 20.2929, 'train_samples_per_second': 0.444, 'train_steps_per_second': 0.148, 'total_flos': 323932421376.0, 'train_loss': 2.4062676429748535, 'epoch': 3.0})

In [9]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

inference_dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
model = model.to(device=device, dtype=inference_dtype)
FastLanguageModel.for_inference(model)

def chat_lora(prompt: str, max_new_tokens: int = 128):
    messages = [{"from": "human", "value": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens = max_new_tokens,
            do_sample      = True,
            top_p          = 0.9,
            temperature    = 0.7,
            use_cache      = True,   # if dtype error, flip to False
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded)

chat_lora("Write a Python function that returns the square of a number with a one-line explanation.")


user
Write a Python function that returns the square of a number with a one-line explanation.
assistant
Here's a Python function that uses a lambda function to square a number:

```python
def square(n):
    return n ** 2
```

You can use this function like this:

```python
square_one_line = """
square(n) = n ** 2
"""

square_one_line = square(5)
print(square_one_line)
```

This function will print:

```
square(5)
```

Note that this function returns the square of the number `5`, which is `5 ** 


In [10]:
save_dir = "smollm2-135m-lora-finetune-final"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Saved LoRA finetuned model to {save_dir}")


Saved LoRA finetuned model to smollm2-135m-lora-finetune-final
