In [None]:

!pip install -q unsloth datasets evaluate rouge_score


In [None]:
# Import libraries
from datasets import load_dataset
import evaluate
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
import pandas as pd

# Load the dialogue/summary dataset used throughout this notebook.
dataset = load_dataset("knkarthick/samsum")

# The test split is kept in its original order; the training split is
# shuffled with a fixed seed so later `.select(range(...))` subsets are stable.
test_dataset = dataset["test"]
train_dataset = dataset["train"].shuffle(seed=42)

print("Train size: {}, Test size: {}".format(len(train_dataset), len(test_dataset)))


Train size: 14732, Test size: 819


In [None]:
# Checkpoint that is loaded (together with its tokenizer) through Unsloth.
model_name = "unsloth/llama-3-8b-instruct-bnb-4bit"

# Loader options collected in one place so they are easy to scan and tweak.
load_options = dict(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# SIMPLIFIED: Use Unsloth's recommended approach
# Alpaca-style prompt template with three positional `{}` slots, filled in
# order by `str.format`: instruction, input (the dialogue), and response
# (the summary during training, an empty string at inference time).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of dialogue/summary pairs into full training prompts.

    Args:
        examples: Batched mapping with parallel "dialogue" and "summary"
            sequences (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        dict with a single "text" key holding one formatted prompt per pair,
        each terminated with EOS_TOKEN.
    """
    instruction = "Summarize the following dialogue:"
    # EOS_TOKEN must close every example, otherwise generation will go on forever.
    texts = [
        alpaca_prompt.format(instruction, dialogue, summary) + EOS_TOKEN
        for dialogue, summary in zip(examples["dialogue"], examples["summary"])
    ]
    return {"text": texts}

# Add a "text" column with the fully rendered prompt to both splits.
# The formatted test split gets its own name; the raw `test_dataset` is left untouched.
test_dataset_formatted = test_dataset.map(
    formatting_prompts_func,
    batched=True,
)
train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
)


==((====))==  Unsloth 2025.8.6: Fast Llama patching. Transformers: 4.55.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
def evaluate_model_baseline(dataset_to_eval=None, num_samples=50):
    """Generate summaries with the current `model` and score them with ROUGE.

    Despite the name, this evaluates whatever state the global `model` is in,
    so it is used both before and after fine-tuning.

    Args:
        dataset_to_eval: Dataset with "dialogue" and "summary" fields.
            Defaults to the global `test_dataset` when None.
        num_samples: Maximum number of leading examples to evaluate.

    Returns:
        The dict returned by the ROUGE metric's `compute` for the generated
        predictions against the reference summaries.
    """
    if dataset_to_eval is None:
        dataset_to_eval = test_dataset
    sample_count = min(len(dataset_to_eval), num_samples)
    sample_data = dataset_to_eval.select(range(sample_count))

    predictions = []
    references = []

    for example in sample_data:
        # Same template as training, with the response slot left empty so the
        # model continues from "### Response:".
        prompt = alpaca_prompt.format(
            "Summarize the following dialogue:",
            example["dialogue"],
            "",
        )

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # Number of prompt tokens actually fed to the model (after truncation).
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                do_sample=False,  # greedy decoding keeps the evaluation deterministic
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Keep only the newly generated tokens. Slicing the decoded string by
        # len(prompt) is unreliable: when the prompt was truncated, or when
        # decoding does not reproduce the prompt text exactly, the character
        # offset no longer lines up and the prediction is cut in the wrong place.
        new_token_ids = outputs[0][prompt_token_count:]
        prediction = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

        predictions.append(prediction)
        references.append(example["summary"])

    # Compute ROUGE scores
    rouge = evaluate.load("rouge")
    return rouge.compute(predictions=predictions, references=references)

# Score the untouched base model first, so there is a reference point to
# compare the fine-tuned model against.
print("📊 Baseline Evaluation (Before Fine-tuning):")
baseline_scores = evaluate_model_baseline(dataset_to_eval=test_dataset)
print(baseline_scores)

📊 Baseline Evaluation (Before Fine-tuning):
{'rouge1': np.float64(0.2605241812268216), 'rouge2': np.float64(0.10070256841299755), 'rougeL': np.float64(0.19301053297657536), 'rougeLsum': np.float64(0.19309433064041032)}


In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with trainable LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)


In [None]:
# Use Unsloth's SFTTrainer for simplicity
from trl import SFTTrainer
from transformers import TrainingArguments

# Pick the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation and checkpointing settings.
# Effective batch size is 2 (per device) x 4 (accumulation) = 8 per optimizer step.
training_args = TrainingArguments(
    output_dir="./results",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=950,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    save_strategy="steps",
    save_steps=30,
)

# Train on the first 1000 examples of the (already shuffled) training split,
# reading the rendered prompt from the "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset.select(range(1000)),
    dataset_text_field="text",
    max_seq_length=1024,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)


# Train the model
print("🚀 Starting fine-tuning...")
trainer.train()

🚀 Starting fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 10 | Total steps = 1,200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
1,2.8849
2,2.8199
3,2.9369
4,2.8279
5,2.5668
6,2.489
7,2.309
8,2.2163
9,2.0857
10,2.1106


In [None]:
# Re-run the same evaluation on the same test split, now with the
# fine-tuned model, so the numbers are directly comparable to the baseline.
print("📊 Evaluation After Fine-tuning:")
after_scores = evaluate_model_baseline(dataset_to_eval=test_dataset)
print(after_scores)

In [None]:

# Side-by-side table of the ROUGE scores before and after fine-tuning.
# Columns are paired by position, so both score dicts must list the metrics
# in the same order.
metric_names = list(baseline_scores.keys())
before_values = list(baseline_scores.values())
after_values = list(after_scores.values())

comparison = pd.DataFrame(
    {
        "Metric": metric_names,
        "Before": before_values,
        "After": after_values,
    }
)
print("\n🔍 Comparison Results:")
print(comparison)

In [None]:
# Qualitative check: generate a summary for the first test example and print
# it next to the reference summary.
print("\n🎯 Sample Generation:")
sample_dialogue = test_dataset[0]["dialogue"]
sample_reference = test_dataset[0]["summary"]

# Same template as training, with the response slot left empty.
prompt = alpaca_prompt.format(
    "Summarize the following dialogue:",
    sample_dialogue,
    ""
)

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# Number of prompt tokens actually fed to the model (after truncation).
prompt_token_count = inputs["input_ids"].shape[1]

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        do_sample=True,  # sampled output: this cell is not reproducible run-to-run
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Full decoded sequence (prompt + continuation), kept for inspection.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Decode only the newly generated tokens. Cutting `generated_text` at
# len(prompt) is unreliable when the prompt was truncated or when decoding
# does not reproduce the prompt text character-for-character.
prediction = tokenizer.decode(
    outputs[0][prompt_token_count:], skip_special_tokens=True
).strip()

print(f"📝 Original Dialogue:\n{sample_dialogue}\n")
print(f"🎯 Reference Summary:\n{sample_reference}\n")
print(f"🤖 Generated Summary:\n{prediction}")

In [None]:
# Persist the LoRA adapters and the tokenizer side by side in one directory.
print("💾 Saving model...")

lora_output_dir = "llama3_samsum_lora"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

In [None]:
# Export the fine-tuned model in two additional formats.
print("📦 Saving in multiple formats...")

merged_output_dir = "llama3_samsum_merged"
gguf_output_dir = "llama3_samsum_gguf"

# 1. Merged model: base weights and LoRA weights combined, stored in 16-bit.
model.save_pretrained_merged(merged_output_dir, tokenizer, save_method="merged_16bit")

# 2. GGUF export with q4_k_m quantization (for llama.cpp compatibility).
model.save_pretrained_gguf(gguf_output_dir, tokenizer, quantization_method="q4_k_m")

In [None]:

# ========================================
# PART 1: DOWNLOAD MODEL FROM COLAB
# ========================================

# Run this in your Colab notebook to prepare downloads
import shutil
import os
from google.colab import files

# Export directories; each one is zipped to "<dir>.zip" and then downloaded.
lora_dir = "llama3_samsum_lora"
merged_dir = "llama3_samsum_merged"
export_dirs = (lora_dir, merged_dir)

print("💾 Saving model in different formats...")

# Adapter-only export (smallest artifact, recommended).
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)

# Merged export (self-contained, but much larger than the adapters alone).
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")

# Create zip files
print("🗜️ Creating zip files...")
for export_dir in export_dirs:
    shutil.make_archive(export_dir, "zip", export_dir)

# Download files (adapter archive first, then the large merged archive)
print("⬇️ Downloading...")
for export_dir in export_dirs:
    files.download(export_dir + ".zip")

print("✅ Files downloaded to your computer!")
