In [None]:
!pip install transformers datasets peft accelerate bitsandbytes torch

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x8

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.utils.data import Dataset
import numpy as np

In [None]:
# Step 1: Load Dataset
# Only the first 1% of the training split is requested, to keep the demo fast.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:1%]",
)

In [None]:
# Step 2: Load Tokenizer and Model
# Both objects are created from the same checkpoint name, so the token ids
# produced by the tokenizer are the ones this model was trained on.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Step 3: Preprocess Function
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: A batch mapping with an ``"article"`` list (source texts)
            and a ``"highlights"`` list (target summaries) of equal length.

    Returns:
        The tokenizer output for the prefixed articles, truncated/padded to
        512 tokens, with an added ``"labels"`` entry: the summary token ids
        truncated/padded to 150 tokens, where every padding position is
        replaced by -100.
    """
    # Every article is prefixed with the task instruction "summarize: ".
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the target side.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=150,
        truncation=True,
        padding="max_length"
    )

    # Padding positions must not contribute to the loss: -100 is the label
    # value the cross-entropy loss ignores. Leaving the pad token id in place
    # would train the model to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Tokenize the whole split in batches. The raw text columns and the example
# id are dropped afterwards so only the fields produced by
# `preprocess_function` remain.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=["article", "highlights", "id"],
    batched=True,
)

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]



In [None]:
# Step 4: Configure LoRA
# Adapter hyperparameters come first, then which modules receive adapters,
# then the task/mode settings.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"],
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
)

In [None]:
# Step 5: Prepare PEFT Model
# The name `model` is rebound to the wrapped model, so every later cell that
# uses `model` sees the LoRA version.
# NOTE(review): because the cell feeds `model` back into itself, re-running it
# passes the already-wrapped model to `get_peft_model` again — reload the base
# model (Step 2) before re-running this cell.
model = get_peft_model(model, peft_config)
# Prints how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [None]:
# Step 6: Data Collator
# Labels are padded with -100, the value the loss function ignores. Padding
# them with the tokenizer's pad token id would make padded positions count
# towards the training loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)

In [None]:
# Step 7: Training Arguments
# - No evaluation is configured: the deprecated `evaluation_strategy="no"`
#   argument is omitted because "no" is already the default.
# - Mixed precision is enabled only when a CUDA device is present, so the
#   cell also runs on a CPU-only runtime instead of raising.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small-summarizer-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)



In [None]:
# Step 8: Trainer
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword is deprecated on the Trainer and triggers a warning on this call.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    processing_class=tokenizer
)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Step 9: Train
# Runs training with the settings from `training_args`. As the last expression
# in the cell, the value returned by `train()` is displayed as the cell output.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,9.7837
20,2.1413
30,1.2728
40,1.1241
50,1.0953
60,1.036
70,1.0291
80,1.0358
90,0.9173
100,0.9439


TrainOutput(global_step=359, training_loss=1.2529389160921314, metrics={'train_runtime': 72.0914, 'train_samples_per_second': 39.824, 'train_steps_per_second': 4.98, 'total_flos': 391167350931456.0, 'train_loss': 1.2529389160921314, 'epoch': 1.0})

In [None]:
# Step 10: Save Model
# The model and the tokenizer are written to the same directory, relative to
# the current working directory.
# NOTE(review): this is the same directory as the Trainer's `output_dir`, so
# anything the Trainer saved there presumably sits next to these files —
# confirm that is intended before archiving the folder.
model.save_pretrained("t5-small-summarizer-lora")
tokenizer.save_pretrained("t5-small-summarizer-lora")

('t5-small-summarizer-lora/tokenizer_config.json',
 't5-small-summarizer-lora/special_tokens_map.json',
 't5-small-summarizer-lora/spiece.model',
 't5-small-summarizer-lora/added_tokens.json',
 't5-small-summarizer-lora/tokenizer.json')

In [None]:
# Step 11: Download Saved Model
# `google.colab` only exists inside a Colab runtime, so it is imported here
# rather than in the notebook's main import cell.
from google.colab import files
import shutil

# Bundle the saved model/tokenizer directory into a single zip archive.
shutil.make_archive(
    base_name='/content/t5-small-summarizer-lora',
    format='zip',
    root_dir='/content',
    base_dir='t5-small-summarizer-lora',
)

# Trigger a browser download of the archive.
files.download('/content/t5-small-summarizer-lora.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>