In [1]:

!pip install -q unsloth datasets evaluate rouge_score


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.7/184.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m7.5 MB/s[0m eta [

In [2]:
# Import libraries
from datasets import load_dataset
import evaluate
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
import pandas as pd

# Load dataset
dataset = load_dataset("knkarthick/samsum")
train_dataset = dataset["train"].shuffle(seed=42)
test_dataset = dataset["test"]
print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Train size: 14732, Test size: 819


In [6]:
# Load model and tokenizer
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# SIMPLIFIED: Use Unsloth's recommended approach
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = ["Summarize the following dialogue:"] * len(examples["dialogue"])
    inputs = examples["dialogue"]
    outputs = examples["summary"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply formatting - simpler approach
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
test_dataset_formatted = test_dataset.map(formatting_prompts_func, batched=True)


==((====))==  Unsloth 2025.8.8: Fast Llama patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [7]:
def evaluate_model_baseline(dataset_to_eval=None, num_samples=50):
    """Evaluate model on original dialogue-summary pairs"""
    if dataset_to_eval is None:
        dataset_to_eval = test_dataset
    sample_data = dataset_to_eval.select(range(min(len(dataset_to_eval), num_samples)))

    predictions = []
    references = []

    for example in sample_data:
        dialogue = example["dialogue"]
        reference = example["summary"]

        # Create prompt for inference
        prompt = alpaca_prompt.format(
            "Summarize the following dialogue:",
            dialogue,
            ""  # Empty output for generation
        )

        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Extract generated text (remove prompt)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = generated_text[len(prompt):].strip()

        predictions.append(prediction)
        references.append(reference)

    # Compute ROUGE scores
    rouge = evaluate.load("rouge")
    return rouge.compute(predictions=predictions, references=references)

# Baseline evaluation
print("📊 Baseline Evaluation (Before Fine-tuning):")
baseline_scores = evaluate_model_baseline(test_dataset)
print(baseline_scores)

📊 Baseline Evaluation (Before Fine-tuning):


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.2393757155802962), 'rouge2': np.float64(0.07818811170030243), 'rougeL': np.float64(0.1729189266832366), 'rougeLsum': np.float64(0.17829071779684502)}


In [8]:
# Enable LoRA training
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


Unsloth 2025.8.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:
# Use Unsloth's SFTTrainer for simplicity
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset.select(range(1000)),
    dataset_text_field="text",
    max_seq_length=1024,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=950,
        learning_rate=5e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="./results",
        save_strategy="steps",
        save_steps=30,
    ),
)


# Train the model
print("🚀 Starting fine-tuning...")
trainer.train()

Unsloth: Tokenizing ["text"]:   0%|          | 0/1000 [00:00<?, ? examples/s]

🚀 Starting fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 8 | Total steps = 950
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmaliasad615[0m ([33mmaliasad615-comsats-university-islamabad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.7741
2,2.7643
3,2.8044
4,2.719
5,2.4625
6,2.238
7,2.1065
8,2.1099
9,2.0807
10,2.1868


TrainOutput(global_step=950, training_loss=0.8346265076218466, metrics={'train_runtime': 3404.7234, 'train_samples_per_second': 2.232, 'train_steps_per_second': 0.279, 'total_flos': 3.602293930931405e+16, 'train_loss': 0.8346265076218466})

In [10]:
# Post-training evaluation
print("📊 Evaluation After Fine-tuning:")
after_scores = evaluate_model_baseline(test_dataset)  # Explicitly use test_dataset
print(after_scores)

📊 Evaluation After Fine-tuning:
{'rouge1': np.float64(0.4352904683025958), 'rouge2': np.float64(0.18982196815900085), 'rougeL': np.float64(0.3568386533369733), 'rougeLsum': np.float64(0.35610815447967814)}


In [11]:

comparison = pd.DataFrame({
    "Metric": list(baseline_scores.keys()),
    "Before": list(baseline_scores.values()),
    "After": list(after_scores.values())
})
print("\n🔍 Comparison Results:")
print(comparison)


🔍 Comparison Results:
      Metric    Before     After
0     rouge1  0.239376  0.435290
1     rouge2  0.078188  0.189822
2     rougeL  0.172919  0.356839
3  rougeLsum  0.178291  0.356108


In [12]:
# Test with a sample
print("\n🎯 Sample Generation:")
sample_dialogue = test_dataset[0]["dialogue"]
sample_reference = test_dataset[0]["summary"]

prompt = alpaca_prompt.format(
    "Summarize the following dialogue:",
    sample_dialogue,
    ""
)

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
prediction = generated_text[len(prompt):].strip()

print(f"📝 Original Dialogue:\n{sample_dialogue}\n")
print(f"🎯 Reference Summary:\n{sample_reference}\n")
print(f"🤖 Generated Summary:\n{prediction}")


🎯 Sample Generation:
📝 Original Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

🎯 Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

🤖 Generated Summary:
Hanna needs Betty's number. Amanda checks it, but can't find it. She will ask Larry. Larry called Betty last time they were at the park.


In [13]:
import shutil
import os
from google.colab import files
# FAST: Save only LoRA adapters (takes 30 seconds vs 20+ minutes)
print("💾 Saving LoRA adapters only (fastest option)...")
model.save_pretrained("llama3_samsum_lora")
tokenizer.save_pretrained("llama3_samsum_lora")

# Create zip file
print("🗜️ Creating zip file...")
shutil.make_archive("llama3_samsum_lora", "zip", "llama3_samsum_lora")

# Download the small file (~50-100MB)
print("⬇️ Downloading LoRA adapters...")
files.download("llama3_samsum_lora.zip")

print("✅ Fast download complete! (~100MB in 1-2 minutes)")
print("💡 Skip the heavy merged/GGUF saves unless you specifically need them")

# Save GGUF format for llama.cpp/Ollama compatibility
print("📦 Saving GGUF format (this will take 15-20 minutes)...")
model.save_pretrained_gguf("llama3_samsum_gguf", tokenizer, quantization_method="q4_k_m")
shutil.make_archive("llama3_samsum_gguf", "zip", "llama3_samsum_gguf")
files.download("llama3_samsum_gguf.zip")

print("✅ GGUF model downloaded to your computer!")

💾 Saving LoRA adapters only (fastest option)...
🗜️ Creating zip file...
⬇️ Downloading LoRA adapters...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fast download complete! (~100MB in 1-2 minutes)
💡 Skip the heavy merged/GGUF saves unless you specifically need them
📦 Saving GGUF format (this will take 15-20 minutes)...


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 2.91 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 20.51it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving llama3_samsum_gguf/pytorch_model-00001-of-00002.bin...
Unsloth: Saving llama3_samsum_gguf/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at llama3_samsum_gguf into f16 GGUF format.
The output location will be /content/llama3_samsum_gguf/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: llama3_samsum_gguf
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_mode

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ GGUF model downloaded to your computer!
