In [4]:
# Install required packages
!pip install -q -U transformers datasets peft accelerate tokenizers

import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType


os.environ["WANDB_DISABLED"] = "true"

train_csv = "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv"
output_dir = "/kaggle/working/t5-lora-summarization"
model_name = "t5-small"


sample_fraction = 0.025

# Hyperparameters
max_input_length = 512
max_target_length = 128
train_batch_size = 8
gradient_accumulation_steps = 2
num_train_epochs = 3
learning_rate = 3e-4
fp16 = torch.cuda.is_available()

os.makedirs(output_dir, exist_ok=True)

print("Loading dataset...")
df = pd.read_csv(train_csv)

# Sample 10% of data
df_sample = df.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
print(f"Using {len(df_sample)} samples ({sample_fraction*100}% of data)")

# I am creating my dataset
ds_train = Dataset.from_pandas(df_sample[["article", "highlights"]])

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + str(x) for x in examples["article"]]
    targets = [str(x) for x in examples["highlights"]]
    
    model_inputs = tokenizer(
        inputs, 
        max_length=max_input_length, 
        truncation=True, 
        padding="max_length"
    )
    
    labels = tokenizer(
        text_target=targets, 
        max_length=max_target_length, 
        truncation=True, 
        padding="max_length"
    )
    
    # Replace padding token id with -100 for loss calculation
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] 
        for label in labels["input_ids"]
    ]
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing dataset...")
tokenized_train = ds_train.map(
    preprocess_function, 
    batched=True, 
    remove_columns=ds_train.column_names
)


data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model, 
    label_pad_token_id=-100
)

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    num_train_epochs=1,
    logging_steps=350,
    save_steps=500,
    save_total_limit=2,
    fp16=fp16,
    push_to_hub=False,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()

print("Saving model...")
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

def summarize_text(text, max_length=128, num_beams=4):
    input_text = prefix + text
    inputs = tokenizer(
        input_text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_input_length
    ).to(model.device)
    
    outputs = model.generate(
        **inputs,
        max_length=max_length, 
        num_beams=num_beams, 
        early_stopping=True
    )
    
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Quick test
print("-"*80)
for i in range(2):
    item = ds_train[i]
    print(item["article"][:300] + "...")
    print(f"\nREFERENCE: {item['highlights']}")
    print(f"\nSUMMARY: {summarize_text(item['article'])}")
    print("-"*80)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading dataset...
Using 7178 samples (2.5% of data)
Loading model and tokenizer...
trainable params: 147,456 || all params: 60,654,080 || trainable%: 0.2431
Tokenizing dataset...


Map:   0%|          | 0/7178 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting training...




Step,Training Loss


Saving model...
Model saved to /kaggle/working/t5-lora-summarization
--------------------------------------------------------------------------------
By . Mia De Graaf . Britons flocked to beaches across the southern coast yesterday as millions look set to bask in glorious sunshine today. Temperatures soared to 17C in Brighton and Dorset, with people starting their long weekend in deck chairs by the sea. Figures from Asda suggest the unexpected s...

REFERENCE: People enjoyed temperatures of 17C at Brighton beach in West Sussex and Weymouth in Dorset .
Asda claims it will sell a million sausages over long weekend despite night temperatures dropping to minus 1C .
But the good weather has not been enjoyed by all as the north west and Scotland have seen heavy rain .

SUMMARY: Temperatures soared to 17C yesterday in Brighton and Dorset. Forecasters predict dry and sunny weather across southern England, southern Wales and the south Midlands.
-------------------------------------------------

In [2]:
# ---------------------------------
# 1.  create a ZIP of the whole folder
# ---------------------------------
import shutil, os
zip_path = "/kaggle/working/t5_final_model.zip"
shutil.make_archive(zip_path.replace(".zip",""), 'zip', "/kaggle/working/t5-lora-summarization")



'/kaggle/working/t5_final_model.zip'