In [None]:
# Environment setup: PyTorch (+ torchvision/torchaudio/xformers) from the
# CUDA 12.8 wheel index, then Unsloth, then pinned transformers and TRL.
# TRL is installed with --no-deps, so pip does not install or change any of
# TRL's dependencies when adding it.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128
!pip install unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2


In [None]:
# NOTE(review): this cell overlaps the install cell above but pulls PyTorch
# from the cu121 wheel index instead of cu128, upgrades unsloth/unsloth_zoo,
# and installs TRL together with peft/accelerate/datasets (with dependencies).
# Presumably it is an alternative for CUDA 12.1 runtimes, so only one of the
# two install cells should be run per session — TODO confirm.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install -U unsloth unsloth_zoo
!pip install transformers==4.56.2
!pip install trl==0.22.2 peft accelerate datasets


In [None]:
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="unsloth/tinyllama-bnb-4bit",
#     max_seq_length=4096,
#     dtype=None,
#     load_in_4bit=True,
# )

# model = FastLanguageModel.get_peft_model(
#     model,
#     r=32,
#     target_modules=[...],
#     lora_alpha=32,
#     lora_dropout=0,
#     bias="none",
#     use_gradient_checkpointing="unsloth"
# )

# alpaca_prompt = """..."""
# EOS_TOKEN = tokenizer.eos_token

# trainer = SFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     train_dataset=dataset,
#     packing=True,
#     args=SFTConfig(
#         per_device_train_batch_size=2,
#         gradient_accumulation_steps=4,
#         num_train_epochs=1,
#         learning_rate=2e-5,
#         optim="adamw_8bit",
#     ),
# )

# torch.cuda.max_memory_reserved()
# FastLanguageModel.for_inference(model)
# model.save_pretrained("lora_model")
# tokenizer.save_pretrained("lora_model")
# FastLanguageModel.from_pretrained("lora_model")
# model.save_pretrained_merged(..., save_method="merged_16bit")
# model.save_pretrained_gguf(..., quantization_method="q4_k_m")
# model.to("cuda")
# seed=3407
# max_steps = 100
# num_train_epochs = None

In [None]:
# =========================
# 1. Imports & Config
# =========================
import random
import numpy as np
import torch

# One seed shared by every RNG the notebook touches:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
SEED = 3407
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Let float32 matmuls use TF32 on NVIDIA GPUs (faster, slightly lower precision).
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

# GPU sanity check: stop here with a readable message when no CUDA device is
# visible, because later cells move tensors to "cuda" unconditionally.
assert torch.cuda.is_available(), "❌ Please enable GPU runtime (Colab → Runtime → GPU)"

# =========================
# Imports
# =========================
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

# =========================
# Global Config
# =========================
# These three values are passed unchanged to FastLanguageModel.from_pretrained
# in the model-loading cell; edit them here rather than at the call site.
max_seq_length = 4096        # Demonstrates long-context + RoPE scaling
dtype = None                # Auto-detect (FP16 on T4, BF16 on A100/L4)
load_in_4bit = True         # Memory-efficient QLoRA-style loading



In [None]:
# =========================
# 2. Load Base Model (4-bit Quantized)
# =========================
# We load a pre-quantized 4-bit TinyLlama model provided by Unsloth.
# Benefits:
# - Very low VRAM usage
# - Fast download
# - Perfect for Colab / Kaggle / 8–16GB GPUs

BASE_MODEL_NAME = "unsloth/tinyllama-bnb-4bit"

# Loader settings come from the global config cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,      # QLoRA-style memory efficiency
    dtype=dtype,                    # Auto FP16 / BF16
    max_seq_length=max_seq_length,  # Enables long-context via RoPE scaling
    model_name=BASE_MODEL_NAME,
)


In [None]:
# =========================
# 3. Apply LoRA Adapters (PEFT)
# =========================
# We attach LoRA adapters so that:
# - Only ~1–10% parameters are trained
# - Base model weights remain frozen
# - Training becomes faster and memory efficient
#
# Target modules:
# - q_proj, k_proj, v_proj, o_proj → Attention
# - gate_proj, up_proj, down_proj → MLP
#
# NOTE: this cell rebinds `model` to its own wrapped version, so it is not
# idempotent — re-run the model-loading cell before running it a second time.

model = FastLanguageModel.get_peft_model(
    model,
    r=32,                            # LoRA rank (8/16/32/64 are common)
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=32,                   # Usually = r or 2*r
    lora_dropout=0.0,                # Unsloth recommends 0
    bias="none",                     # Best practice for LoRA
    use_gradient_checkpointing=False,# Set True if model >= 7B
    random_state=SEED,               # Reproducibility: reuse the notebook-wide seed
)


In [None]:
# =========================
# 4. Dataset Preparation (Alpaca Format)
# =========================
# We convert raw Alpaca data into a single 'text' field.
# IMPORTANT:
# - EOS token is mandatory, otherwise generation may never stop.
#
# The template has three positional slots, filled in this order:
# instruction, input, response.

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training example by format_data.
EOS_TOKEN = tokenizer.eos_token

def format_data(examples):
    """Render a batch of Alpaca rows into training strings.

    Args:
        examples: batched mapping with parallel "instruction", "input" and
            "output" sequences (as passed by `Dataset.map(batched=True)`).

    Returns:
        dict with a single "text" key: one string per row, built from
        `alpaca_prompt` and terminated with `EOS_TOKEN`.
    """
    rows = zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
    )
    # Each row is (instruction, input, output), matching the template's slot order.
    return {"text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]}

# Load the dataset, shuffle it with the notebook-wide seed, and keep a small
# subset (2000 rows) for fast demo training.
dataset = (
    load_dataset("yahma/alpaca-cleaned", split="train")
    .shuffle(seed=SEED)
    .select(range(2000))
)

# Apply formatting; dropping the original columns leaves only 'text'.
dataset = dataset.map(
    format_data,
    batched=True,
    remove_columns=dataset.column_names,
)


In [None]:
# =========================
# 5. Supervised Fine-Tuning (SFT)
# =========================
# We use TRL's SFTTrainer with Unsloth optimizations.
# Key features:
# - Sequence packing → better GPU utilization
# - 8-bit optimizer → lower VRAM usage
# - Gradient accumulation → simulates larger batch size

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    # Pass the context window explicitly so packed sequences use the same
    # length the model was loaded with, instead of relying on a default.
    max_seq_length=max_seq_length,
    packing=True,  # Packs multiple short samples into one sequence
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # Effective batch size = 8
        num_train_epochs=1,
        learning_rate=2e-5,
        warmup_ratio=0.1,               # Stabilizes early training
        optim="adamw_8bit",             # Memory efficient optimizer
        logging_steps=10,               # Progress visibility
        seed=SEED,                      # Reproducibility: notebook-wide seed
        output_dir="outputs",
        report_to="none",               # Disable WandB by default
    ),
)

trainer.train()


In [None]:
# =========================
# 6. Inference (Fast Generation)
# =========================
# Unsloth provides 2× faster inference using optimized kernels.
# IMPORTANT:
# - Always call FastLanguageModel.for_inference(model)
# - Disable gradients for inference

FastLanguageModel.for_inference(model)

# Fill instruction and input; the response slot stays empty for the model to complete.
prompt = alpaca_prompt.format(
    "Continue the Fibonacci sequence",
    "1, 1, 2, 3, 5, 8",
    "",
)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(
        max_new_tokens=64,
        do_sample=False,    # Deterministic output for demo
        use_cache=True,     # Faster decoding
        **inputs,
    )

# outputs[0] is the full sequence: prompt tokens followed by the generated ones.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# =========================
# 7. Save LoRA Adapters
# =========================
# This saves ONLY the LoRA adapters, not the full base model.
# Benefits:
# - Very small size (~50–200 MB)
# - Can be merged later into FP16 / 4-bit / GGUF
# - Easy to share or version-control

LORA_SAVE_PATH = "lora_model"

# Adapter weights first, then the tokenizer files, into the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(LORA_SAVE_PATH)

print(f"LoRA adapters saved at: {LORA_SAVE_PATH}")


## Unsloth vs. Hugging Face comparison

In [None]:
# =========================
# 0) INSTALLS (COLAB)
# =========================
!pip -q install -U pip

# Torch + CUDA is already on Colab typically. If needed, uncomment below:
# !pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Common deps
!pip -q install -U datasets accelerate transformers trl peft bitsandbytes

# Unsloth
!pip -q install -U unsloth


In [None]:
import time, torch
from datasets import load_dataset

assert torch.cuda.is_available(), "GPU required"
DEVICE = "cuda"

torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

def reset_vram():
    """Release cached CUDA memory and restart peak-memory tracking.

    Runs the Python garbage collector first so that tensors still held only by
    unreachable objects (for example the model from a previous benchmark run)
    are freed before the cache is emptied; otherwise they would be counted in
    the next run's peak.
    """
    import gc

    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

def peak_vram_gb():
    """Return the peak CUDA memory reserved by PyTorch, in GiB (3 decimals)."""
    bytes_per_gib = 1024 ** 3
    peak_bytes = torch.cuda.max_memory_reserved()
    return round(peak_bytes / bytes_per_gib, 3)

def now():
    """Return a timestamp in seconds for measuring elapsed time.

    Uses the monotonic, high-resolution performance counter rather than the
    wall clock, so differences between two calls are not affected by system
    clock adjustments. Only differences between two results are meaningful.
    """
    return time.perf_counter()


In [None]:
# Prompt prefix used by prepare_dataset for the benchmark runs.
# NOTE: this rebinds `alpaca_prompt` with *named* placeholders
# ({instruction}, {input}) and no response slot. The fine-tuning cells earlier
# in the notebook call `alpaca_prompt.format(...)` with positional arguments,
# so they must not be re-run after this cell without first re-running the cell
# that defines the original template.
alpaca_prompt = """Below is an instruction that describes a task.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

def prepare_dataset(max_rows=200):
    """Load the first rows of yahma/alpaca-cleaned as plain training text.

    Args:
        max_rows: maximum number of rows to keep from the start of the train
            split. Values larger than the split are clamped to its size.

    Returns:
        A dataset with a single "text" column: the filled-in `alpaca_prompt`
        followed by the example's "output".
    """
    ds = load_dataset("yahma/alpaca-cleaned", split="train")
    # Clamp so an oversized max_rows does not make select() raise.
    ds = ds.select(range(min(max_rows, len(ds))))

    def fmt(ex):
        prompt = alpaca_prompt.format(
            instruction=ex["instruction"],
            input=ex["input"],
        )
        return {"text": prompt + ex["output"]}

    return ds.map(fmt, remove_columns=ds.column_names)


In [None]:
def run_unsloth_benchmark(
    model_name,
    max_rows=200,
    max_seq_length=1024,
    steps=50,
):
    """Fine-tune with Unsloth for a few steps and measure time, VRAM and decode speed.

    Args:
        model_name: checkpoint passed to FastLanguageModel.from_pretrained.
        max_rows: number of dataset rows handed to prepare_dataset.
        max_seq_length: context length the model is loaded with.
        steps: number of optimizer steps (SFTConfig.max_steps).

    Returns:
        dict with:
        - "train_time_sec": seconds spent inside trainer.train() only
          (model loading and dataset preparation are excluded).
        - "train_peak_vram_gb": peak reserved CUDA memory since reset_vram(),
          i.e. covering model load and training.
        - "inference_tokens_per_sec": newly generated tokens per second for a
          single generate() call (prompt tokens are not counted).
    """
    from unsloth import FastLanguageModel
    from trl import SFTTrainer, SFTConfig

    reset_vram()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
    )

    dataset = prepare_dataset(max_rows)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        args=SFTConfig(
            max_steps=steps,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            logging_steps=10,
            output_dir="unsloth_out",
            report_to="none",
        ),
    )

    # Time only the training loop: download/load time depends on the network
    # and cache state and would otherwise dominate the reported number.
    torch.cuda.synchronize()
    start = now()
    trainer.train()
    torch.cuda.synchronize()
    train_time = now() - start
    train_vram = peak_vram_gb()

    # Inference speed
    FastLanguageModel.for_inference(model)
    inputs = tokenizer("Explain LoRA in simple words.", return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[-1]

    torch.cuda.synchronize()
    t0 = now()
    out = model.generate(**inputs, max_new_tokens=128)
    torch.cuda.synchronize()
    t1 = now()

    # generate() returns prompt + continuation; count only the new tokens.
    new_tokens = out.shape[-1] - prompt_len
    tokens_sec = new_tokens / (t1 - t0)

    return {
        "train_time_sec": round(train_time, 2),
        "train_peak_vram_gb": train_vram,
        "inference_tokens_per_sec": round(tokens_sec, 2),
    }


In [None]:
def run_hf_benchmark(
    model_name,
    max_rows=200,
    max_seq_length=1024,
    steps=50,
):
    """Fine-tune with plain Transformers + PEFT + TRL and measure the same metrics.

    Mirrors run_unsloth_benchmark: same LoRA rank/alpha/target modules, same
    batch settings, same prompt and generation length, so the two result
    dicts are comparable key by key.

    NOTE(review): when this runs in the same kernel after
    run_unsloth_benchmark, `unsloth` has already been imported; presumably its
    library patches are still active here, so this may not be a pure
    Hugging Face baseline — confirm, or run each benchmark in a fresh kernel.

    Args:
        model_name: checkpoint passed to AutoModelForCausalLM.from_pretrained.
        max_rows: number of dataset rows handed to prepare_dataset.
        max_seq_length: maximum tokenized sequence length used for training.
        steps: number of optimizer steps (SFTConfig.max_steps).

    Returns:
        dict with "train_time_sec" (trainer.train() only),
        "train_peak_vram_gb" (peak reserved memory since reset_vram()) and
        "inference_tokens_per_sec" (newly generated tokens per second).
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model
    from trl import SFTTrainer, SFTConfig

    reset_vram()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Quantization is requested through quantization_config; passing
    # load_in_4bit directly to from_pretrained is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto",
    )

    model = get_peft_model(
        model,
        LoraConfig(
            r=16,
            lora_alpha=16,
            # Same attention projections as the Unsloth run, so both sides
            # train the same number of adapter parameters.
            target_modules=["q_proj","k_proj","v_proj","o_proj"],
            bias="none",
            task_type="CAUSAL_LM",
        )
    )

    dataset = prepare_dataset(max_rows)

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_seq_length,
            max_steps=steps,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            logging_steps=10,
            output_dir="hf_out",
            report_to="none",
        ),
    )

    # Time only the training loop, not model download/load or dataset prep.
    torch.cuda.synchronize()
    start = now()
    trainer.train()
    torch.cuda.synchronize()
    train_time = now() - start
    train_vram = peak_vram_gb()

    inputs = tokenizer("Explain LoRA in simple words.", return_tensors="pt").to("cuda")
    prompt_len = inputs["input_ids"].shape[-1]

    torch.cuda.synchronize()
    t0 = now()
    out = model.generate(**inputs, max_new_tokens=128)
    torch.cuda.synchronize()
    t1 = now()

    # generate() returns prompt + continuation; count only the new tokens.
    new_tokens = out.shape[-1] - prompt_len
    tokens_sec = new_tokens / (t1 - t0)

    return {
        "train_time_sec": round(train_time, 2),
        "train_peak_vram_gb": train_vram,
        "inference_tokens_per_sec": round(tokens_sec, 2),
    }


In [None]:
# Run both benchmarks with their default sizes and print the metrics side by side.
# NOTE(review): the two runs load different checkpoints (a pre-quantized
# TinyLlama from Unsloth vs. the TinyLlama chat model) and share one kernel;
# presumably the numbers are only roughly comparable — confirm before quoting them.
unsloth_res = run_unsloth_benchmark(model_name="unsloth/tinyllama-bnb-4bit")
hf_res = run_hf_benchmark(model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

print("\nFINAL COMPARISON")
for metric, unsloth_value in unsloth_res.items():
    print(f"{metric}: Unsloth={unsloth_value} | HF={hf_res[metric]}")


In [None]:
# When NOT to use Unsloth:
# - If you need heavy multi-node distributed training
# - If you want a no-code UI only (LLaMA-Factory is a better fit)
# - If you are training classical ML models (not LLMs)
