To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!



### Installation

In [None]:
import torch
import gc

def clear_gpu_memory():
    """Free unreferenced Python objects, then release PyTorch's cached GPU memory.

    Garbage collection runs before the CUDA cache is emptied: tensors that are
    only kept alive by reference cycles still occupy their blocks until the
    collector runs, and `torch.cuda.empty_cache()` can only release blocks that
    are no longer occupied. On a machine without CUDA only the garbage
    collection step is performed.
    """
    print("🧹 Cleaning GPU memory...")

    # Drop unreachable objects first so their GPU blocks become releasable
    gc.collect()

    # Wait for pending kernels, then release the unoccupied cached blocks
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

    print("✅ GPU memory cleared!")

# Run this immediately
clear_gpu_memory()

🧹 Cleaning GPU memory...
✅ GPU memory cleared!


In [None]:
%%capture
# Install Unsloth. Colab is detected by looking for an environment variable
# whose name contains "COLAB_"; everywhere else a plain install is used.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the listed packages, without resolving their dependencies
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    # Install unsloth itself, again without its dependencies
    !pip install --no-deps unsloth

In [None]:
%%capture
# Extra packages for LFM2, each installed with --no-deps (no dependency resolution).
!pip install --no-deps git+https://github.com/huggingface/transformers.git # Need main branch for Liquid LFM2 models
!pip install --no-deps causal-conv1d==1.5.0.post8 # Install Mamba kernels

In [None]:
# pip install "unsloth_zoo @ git+https://github.com/unslothai/unsloth-zoo.git"

In [None]:
import torch

# Report availability and device count for the CUDA and XPU backends.
cuda_backend, xpu_backend = torch.cuda, torch.xpu
print(f"Available devices: {cuda_backend.is_available()}, {xpu_backend.is_available()}")
print(f"Device count: CUDA={cuda_backend.device_count()}, XPU={xpu_backend.device_count()}")

Available devices: True, False
Device count: CUDA=1, XPU=0


In [None]:
import os
import torch

# Expose only the first GPU (index 0) to CUDA.
# NOTE(review): this restricts visibility; it cannot make a missing GPU appear,
# and presumably has no effect once CUDA is already initialised — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Pick the CUDA device when one is usable, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
if has_cuda:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, falling back to CPU")

Using GPU: Tesla T4


### Unsloth

In [None]:
from unsloth import FastModel
import os
import torch

# Checkpoints that can be passed as `model_name` below.
# (The name is historical: the list holds both 4bit and 16bit checkpoints.)
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/LFM2-1.2B-unsloth-bnb-4bit",
    "unsloth/LFM2-700M-unsloth-bnb-4bit",
    "unsloth/LFM2-350M-unsloth-bnb-4bit",

    # Full 16bit unquantized models
    "unsloth/LFM2-1.2B",
    "unsloth/LFM2-700M",
    "unsloth/LFM2-350M",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded: a placeholder string is sent as (invalid) credentials, and
# a real token must never be committed with the notebook. When HF_TOKEN is
# unset, None is passed and no explicit token is used.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/LFM2-1.2B",
    dtype = None, # None for auto detection
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = False,  # Set True for 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    token = os.environ.get("HF_TOKEN"), # only needed for gated models
)

==((====))==  Unsloth 2025.8.9: Fast Lfm2 patching. Transformers: 4.56.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


We now add LoRA adapters so we only need to update a small number of parameters!

In [None]:
# LoRA rank, also used as lora_alpha so the "alpha == r" recommendation holds
lora_rank = 32

# Wrap the base model with LoRA adapters.
# The target module names follow LFM2's own layer naming: attention uses
# q_proj / k_proj / v_proj / out_proj, the short-convolution blocks use
# in_proj / out_proj, and the MLP uses w1 / w2 / w3. Llama-style names
# (o_proj, gate_proj, up_proj, down_proj) match nothing in this model, which
# leaves only q/k/v adapted.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # LFM for now is just text only
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = lora_rank,           # Larger = higher accuracy, but might overfit
    lora_alpha = lora_rank,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "in_proj",
                      "w1", "w2", "w3",],
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
)

# Verify parameters are trainable: count all parameters and those with requires_grad
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Trainable percentage: {100 * trainable_params / total_params:.2f}%")

Unsloth: Making `model.base_model.model.model` require gradients
Total parameters: 1,172,110,080
Trainable parameters: 1,769,472
Trainable percentage: 0.15%


<a name="Data"></a>
### Data Prep
We now use the `LFM` format for conversation style finetunes. We use the [Bitext retail e-commerce chatbot dataset](https://huggingface.co/datasets/bitext/Bitext-retail-ecommerce-llm-chatbot-training-dataset), which stores each example as an `instruction` / `response` pair. LFM renders multi turn conversations like below:

```
<|startoftext|><|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant
Hey there!<|im_end|>
```

In [None]:
# Render a minimal two-turn exchange as plain text to preview the chat format.
sample_conversation = [
    {"role" : "user", "content" : "Hello!"},
    {"role" : "assistant", "content" : "Hey there!"},
]
tokenizer.apply_chat_template(sample_conversation, tokenize = False)

'<|startoftext|><|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\nHey there!<|im_end|>\n'

In [None]:
pip install huggingface_hub




In [None]:
import os

from datasets import load_dataset

# Load the training split. The access token comes from the HF_TOKEN
# environment variable rather than a hard-coded string: a placeholder value is
# sent as invalid credentials, and a real token must not be stored in the
# notebook. When HF_TOKEN is unset, None is passed and no explicit token is used.
dataset = load_dataset(
    "bitext/Bitext-retail-ecommerce-llm-chatbot-training-dataset",
    split="train",
    token=os.environ.get("HF_TOKEN"),
)



Using the latest cached version of the dataset since bitext/Bitext-retail-ecommerce-llm-chatbot-training-dataset couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/bitext___bitext-retail-ecommerce-llm-chatbot-training-dataset/default/0.0.0/12dd624ddcd3057382b2faad661bcda1fa869491 (last modified on Thu Aug 28 09:34:13 2025).


In [None]:
# from unsloth.chat_templates import standardize_data_formats
# dataset = standardize_data_formats(dataset)

Let's see what row 100 looks like!

In [None]:
# dataset[100]

We now have to apply the chat template for `LFM` onto the conversations, and save it to `text`. We also remove the BOS token otherwise we'll get double BOS tokens!

In [None]:
def format_chat(examples):
    """Render a batch of instruction/response pairs with the tokenizer's chat template.

    Args:
        examples: A batch mapping with parallel "instruction" and "response"
            sequences (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single "text" key holding one rendered conversation per
        input pair, with a leading BOS token stripped when the tokenizer
        defines one.
    """
    # Some tokenizers define no BOS token; removeprefix(None) would raise a
    # TypeError, while removeprefix("") leaves the text untouched.
    bos_token = tokenizer.bos_token or ""

    formatted_texts = []
    for instruction, response in zip(examples["instruction"], examples["response"]):
        # Create the conversation structure
        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": response}
        ]
        # Apply the chat template using the loaded tokenizer
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Remove the BOS token if present
        formatted_texts.append(formatted_text.removeprefix(bos_token))

    return {"text": formatted_texts}

# Run format_chat over the dataset in batches; its return value supplies the "text" column.
dataset = dataset.map(format_chat, batched=True)

In [None]:
# Show the rendered "text" field of the first example.
dataset[0]["text"]

'<|im_start|>user\nI got to add an item to the cart<|im_end|>\n<|im_start|>assistant\nI\'ll get right on it! I\'m here to assist you in adding an item to your cart. Here\'s a step-by-step guide to help you out:\n\n1. Browse through our {{Product Catalog}} to find the item you want to add to your cart.\n2. Once you\'ve found the item, click on the "{{Add to Cart}}" button or select the desired quantity from the available options.\n3. You\'ll be redirected to your cart page, where you can review your selected items.\n4. If you want to continue shopping, simply click on the "{{Continue Shopping}}" button, and you\'ll be taken back to the catalog.\n5. If you\'re ready to proceed, click on the "{{Checkout}}" button to finalize your purchase.\n\nIf you encounter any difficulties during this process, please don\'t hesitate to let me know. I\'m here to help you with any questions or concerns you may have. Happy shopping!<|im_end|>\n'

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up; for a full run, keep `num_train_epochs=1` and remove the `max_steps=60` limit (or set `max_steps=-1`).

In [None]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer on the pre-rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=60,  # Takes precedence over num_train_epochs while it is positive
        learning_rate=5e-6,
        logging_steps=1,
        optim="adamw_torch",  # Changed from adamw_8bit
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        report_to="wandb",
        output_dir="ecommerce-chatbot",
        fp16=True,  # Enable fp16 mixed precision
        max_grad_norm=1.0,  # Gradient clipping
    ),
    # No formatting_func here: the dataset already carries the rendered "text"
    # column, and format_chat returns a dict of batched texts rather than the
    # strings a formatting function is expected to produce.
)

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Markers that open a user turn and an assistant turn in the rendered text.
user_turn_marker = "<|im_start|>user\n"
assistant_turn_marker = "<|im_start|>assistant\n"

# Re-wrap the trainer using these two turn markers.
trainer = train_on_responses_only(
    trainer,
    instruction_part = user_turn_marker,
    response_part = assistant_turn_marker,
)

Let's verify that the instruction part is masked! Let's print row 100 again.

In [None]:
# Decode the token ids of training row 100 back into text.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<|startoftext|><|im_start|>user\nI want assistance to add items to the carg<|im_end|>\n<|im_start|>assistant\nOf course! I\'m here to help you add items to your cart. Let me guide you through the process:\n\n1. Visit our website and browse through our wide range of products.\n2. When you find an item you want to add, click on the "Add to Cart" or "Add to Bag" button.\n3. You can continue shopping and adding more items to your cart.\n4. Once you\'ve added all the items you want, click on the cart icon or the "View Cart" button to review your selections.\n5. From there, you can update quantities, remove items, or proceed to checkout.\n\nIf you encounter any issues or have any questions along the way, feel free to reach out to our customer support team for further assistance. Happy shopping!<|im_end|>\n'

Now let's print the masked out example - you should see only the answer is present:

In [None]:
# Replace every -100 label with the pad token id so the row can be decoded,
# then turn the decoded pad tokens into spaces.
row_labels = trainer.train_dataset[100]["labels"]
decodable_ids = [tokenizer.pad_token_id if label == -100 else label for label in row_labels]
tokenizer.decode(decodable_ids).replace(tokenizer.pad_token, " ")

'                  Of course! I\'m here to help you add items to your cart. Let me guide you through the process:\n\n1. Visit our website and browse through our wide range of products.\n2. When you find an item you want to add, click on the "Add to Cart" or "Add to Bag" button.\n3. You can continue shopping and adding more items to your cart.\n4. Once you\'ve added all the items you want, click on the cart icon or the "View Cart" button to review your selections.\n5. From there, you can update quantities, remove items, or proceed to checkout.\n\nIf you encounter any issues or have any questions along the way, feel free to reach out to our customer support team for further assistance. Happy shopping!<|im_end|>\n'

In [None]:
# @title Show current memory stats
bytes_per_gb = 1024 ** 3  # the reported "GB" figures use 1024-based units

# Snapshot peak reserved memory before training, alongside the card's total memory.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
10.334 GB of memory reserved.


In [None]:
# Run training; the result is kept because the stats cell reads trainer_stats.metrics.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 44,884 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 1,769,472 of 1,172,110,080 (0.15% trained)


Step,Training Loss
1,0.7891
2,0.7926
3,0.7793
4,0.8167
5,0.7735
6,0.7806
7,0.77
8,0.7674
9,0.7839
10,0.764



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/LFM2-1.2B.


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

410.8215 seconds used for training.
6.85 minutes used for training.
Peak reserved memory = 10.334 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 70.104 %.
Peak reserved memory for training % of max memory = 0.0 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!



In [None]:
# Build a single-turn conversation and tokenize it with the chat template.
messages = [{
    "role": "user",
    "content": "Why is the sky blue?",
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to(model.device) # Follow the model's device instead of hard-coding "cuda"

from transformers import TextStreamer
_ = model.generate(
    **inputs,
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Liquid settings!
    # temperature / min_p are sampling parameters, so sampling is enabled explicitly
    do_sample = True,
    temperature = 0.3, min_p = 0.15, repetition_penalty = 1.05,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

The sky appears blue due to a phenomenon called Rayleigh scattering. When sunlight enters the Earth's atmosphere, it encounters tiny molecules of gases like nitrogen and oxygen. These molecules scatter the shorter wavelengths of light, such as blue and violet, more than the longer wavelengths like red and orange. As a result, the blue light is scattered in all directions, making the sky appear blue to our eyes.

It's worth noting that our eyes are more sensitive to blue light than violet light, which is why the sky often appears more blue than violet. Additionally, during sunrise and sunset, the sun's light has to pass through more of the


In [None]:
# Build a single-turn conversation and tokenize it with the chat template.
messages = [{
    "role": "user",
    "content": "fuck you where is my shoes",
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to(model.device) # Follow the model's device instead of hard-coding "cuda"

from transformers import TextStreamer
_ = model.generate(
    **inputs,
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Liquid settings!
    # temperature / min_p are sampling parameters, so sampling is enabled explicitly
    do_sample = True,
    temperature = 0.3, min_p = 0.15, repetition_penalty = 1.05,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

I'm sorry for any inconvenience. I understand that you're looking for your shoes. Let me assist you with that. To locate your shoes, please follow these steps:

1. Go to the {{Store Name}} website or app.
2. Log in to your account using your credentials.
3. Navigate to the {{Products}} or {{Shop}} section.
4. Look for the {{Your Shoe Model}} or {{Specific Shoe}} you're looking for.
5. Click on the product to view more details and purchase options.
6. If you need further assistance, feel free to reach out to our


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
import os

# Read the Hub token from the HF_TOKEN environment variable instead of
# hard-coding it: a placeholder is sent as invalid credentials, and a real
# token must not be stored in the notebook. None is passed when it is unset.
hf_token = os.environ.get("HF_TOKEN")

model.save_pretrained("ecommerce")  # Local saving
tokenizer.save_pretrained("ecommerce")
model.push_to_hub("Heem2/ecommerce", token = hf_token)  # Online saving
tokenizer.push_to_hub("Heem2/ecommerce", token = hf_token)


Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/LFM2-1.2B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/LFM2-1.2B.


Saved model to https://huggingface.co/Heem2/ecommerce


No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
import os

# Read the Hub token from the HF_TOKEN environment variable instead of
# hard-coding it: a placeholder is sent as invalid credentials, and a real
# token must not be stored in the notebook. None is passed when it is unset.
hf_token = os.environ.get("HF_TOKEN")

model.save_pretrained("ecommerceX")  # Local saving
tokenizer.save_pretrained("ecommerceX")
model.push_to_hub("Heem2/ecommerceX", token = hf_token)  # Online saving
tokenizer.push_to_hub("Heem2/ecommerceX", token = hf_token)


Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/LFM2-1.2B.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/LFM2-1.2B.


Saved model to https://huggingface.co/Heem2/ecommerceX


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Heem2/ecommerceX", token = "hf-token")
messages = [
    {"role": "user", "content": "ass hole fuck product"},
]
pipe(messages)

config.json: 0.00B [00:00, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/7.08M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/209 [00:00<?, ?B/s]

Device set to use cuda:0


[{'generated_text': [{'role': 'user', 'content': 'ass hole fuck product'},
   {'role': 'assistant',
    'content': "I'm sorry, but I can't assist with that. As a responsible and respectful assistant, I cannot participate in or promote any form of unethical or inappropriate language. If you have any questions or need assistance with finding a suitable product, I'd be more than happy to help. Let me know if there's something specific you're looking for or if you need guidance in choosing the right product."}]}]

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`: