In [1]:
# Make sure you set: Runtime → Change runtime type → GPU (T4/A100/etc.)

%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__ as torch_version
from packaging.version import Version as V

xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes datasets


In [2]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length = 512
dtype = None          # Let Unsloth pick best dtype
full_finetune = True  # <-- important for Colab 1


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = "unsloth/SmolLM2-135M-Instruct",  # 135M Unsloth-wrapped model
    max_seq_length   = max_seq_length,
    dtype            = dtype,
    load_in_4bit     = False,          # full 16-bit weights (not QLoRA)
    full_finetuning  = full_finetune,  # train ALL parameters
)

model.to(device)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49153, 576, padding_idx=49152)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
  

In [4]:
train_conversations = [
    {
        "conversations": [
            {
                "from": "human",
                "value": "You are a helpful coding assistant. Write a Python function that adds two numbers."
            },
            {
                "from": "gpt",
                "value": "Sure! Here's a simple function:\n\n```python\ndef add(a, b):\n    return a + b\n```"
            },
        ]
    },
    {
        "conversations": [
            {
                "from": "human",
                "value": "Explain what a Python list comprehension is in one short paragraph."
            },
            {
                "from": "gpt",
                "value": "A list comprehension is a compact way to create lists using a for-loop and optional condition inside a single expression."
            },
        ]
    },
    {
        "conversations": [
            {
                "from": "human",
                "value": "Write a short motivational message for a student learning AI."
            },
            {
                "from": "gpt",
                "value": "Learning AI is like learning a new superpower—start small, experiment a lot, and don’t be afraid to make mistakes."
            },
        ]
    },
    # 👉 Add more examples here (coding questions, general chat, Q&A, etc.)
]

dataset = Dataset.from_list(train_conversations)
dataset


Dataset({
    features: ['conversations'],
    num_rows: 3
})

In [5]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",  # one of: chatml, llama, mistral, gemma, etc.
    mapping = {
        "role": "from",        # our field name that holds role
        "content": "value",    # our field name that holds text
        "user": "human",       # user role token
        "assistant": "gpt",    # assistant role token
    },
    map_eos_token = True,
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        for convo in convos
    ]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
print(dataset[0]["text"])


Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

<|im_start|>user
You are a helpful coding assistant. Write a Python function that adds two numbers.<|im_end|>
<|im_start|>assistant
Sure! Here's a simple function:

```python
def add(a, b):
    return a + b
```<|im_end|>



In [6]:
training_args = TrainingArguments(
    output_dir                         = "smollm2-135m-full-finetune",
    num_train_epochs                   = 3,
    per_device_train_batch_size        = 8,
    gradient_accumulation_steps        = 1,
    learning_rate                      = 2e-4,
    warmup_steps                       = 5,
    logging_steps                      = 5,
    save_strategy                      = "epoch",
    fp16                               = True,   # good for T4/A100
    report_to                          = "none",
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",
    max_seq_length     = max_seq_length,
    packing            = False,  # simpler to understand; can explain packing later
    args               = training_args,
)

trainer.train()


num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=3):   0%|          | 0/3 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3 | Num Epochs = 3 | Total steps = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 134,515,584 of 134,515,584 (100.00% trained)


Step,Training Loss


TrainOutput(global_step=3, training_loss=2.043691953023275, metrics={'train_runtime': 100.6007, 'train_samples_per_second': 0.089, 'train_steps_per_second': 0.03, 'total_flos': 309689277696.0, 'train_loss': 2.043691953023275, 'epoch': 3.0})

In [8]:
# 🔁 REPLACE OLD CELL 7 WITH THIS

from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

# 1) Put model into a single, consistent inference dtype
inference_dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
model = model.to(device=device, dtype=inference_dtype)

# 2) Switch to Unsloth's fast inference mode
FastLanguageModel.for_inference(model)

def chat(prompt: str, max_new_tokens: int = 128):
    messages = [
        {"from": "human", "value": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        text,
        return_tensors="pt",
    ).to(device)

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens = max_new_tokens,
            do_sample      = True,
            top_p          = 0.9,
            temperature    = 0.7,
            use_cache      = True,   # 👈 if error persists, change to False
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(decoded)

# Try a coding-style prompt similar to training distribution:
chat("Write a Python function that returns the square of a number with a one-line explanation.")


user
Write a Python function that returns the square of a number with a one-line explanation.
assistant
Here's a simple function that does what you need:

```python
def square(n):
    return n * n
```


In [9]:
chat("Explain what a Python list is in two sentences.")

user
Explain what a Python list is in two sentences.
assistant
A Python list is a sequence of items that can be of any data type, including strings, integers, floats, and other lists.


In [10]:
save_dir = "smollm2-135m-full-finetune-final"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Saved model to {save_dir}")


Saved model to smollm2-135m-full-finetune-final
