# 1. Import Libraries

In [None]:
import re
from datasets import load_dataset, concatenate_datasets

from transformers import (
  T5TokenizerFast as T5Tokenizer,
  T5ForConditionalGeneration,
  Seq2SeqTrainingArguments,
  Seq2SeqTrainer,
  DataCollatorForSeq2Seq
)

import torch
import evaluate
import gc
import time
from peft import LoraConfig, get_peft_model

# 2. Import & Preprocessing Datasets

In [None]:
TOTAL_SAMPLES = 300
MAX_INPUT_LENGTH = 512
VAL_SIZE = 0.1
TEST_SIZE = 0.1

def clean_text(text):
  if not text:
    return ""
  text = re.sub(r'\s+([.,!?%:])', r'\1', text)
  return " ".join(text.split())

def filter_and_process(dataset, text_key, summary_key, style_name):
  def process_example(example):
    t_clean = clean_text(example[text_key])
    s_clean = clean_text(example[summary_key])

    if style_name == "Detailed":
      s_clean = clean_text(example[summary_key].lstrip('-–—').strip())

    return {
      'text': t_clean,
      'summary': s_clean,
      'prompt': f"Summarize {style_name}: {t_clean}",
      'word_count': len(t_clean.split())
    }

  processed_ds = dataset.map(process_example, remove_columns=dataset.column_names)
  filtered_ds = processed_ds.filter(lambda x: 0 < x['word_count'] <= MAX_INPUT_LENGTH)

  return filtered_ds.select(range(min(TOTAL_SAMPLES, len(filtered_ds))))

xsum_raw = load_dataset('xsum', trust_remote_code=True, split='train')
cnn_raw = load_dataset('cnn_dailymail', '3.0.0', split='train')
multi_raw = load_dataset('multi_news', trust_remote_code=True, split='train')

harsh_ds = filter_and_process(xsum_raw, 'document', 'summary', 'Harsh')
balanced_ds = filter_and_process(cnn_raw, 'article', 'highlights', 'Balanced')
detailed_ds = filter_and_process(multi_raw, 'document', 'summary', 'Detailed')

dataset = concatenate_datasets([harsh_ds, balanced_ds, detailed_ds])

train_temp_split = dataset.train_test_split(test_size=TEST_SIZE + VAL_SIZE, shuffle=True, seed=42)
train_ds = train_temp_split['train']
temp_ds = train_temp_split['test']

val_test_split = temp_ds.train_test_split(test_size=TEST_SIZE / (TEST_SIZE + VAL_SIZE), shuffle=True, seed=42)
val_ds = val_test_split['train']
test_ds = val_test_split['test']

print(f"Train Size: {len(train_ds)}")
print(f"Validation Size: {len(val_ds)}")
print(f"Test Size: {len(test_ds)}")

Train Size: 720
Validation Size: 90
Test Size: 90


# 3. Configurations & Parameters

In [5]:
MODEL_NAME = "google/flan-t5-base"
OUT_DIRECTORY = 'results'
MAX_TARGET_LENGTH = 256
BATCH_SIZE = 4
MAX_EPOCHS = 3
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 5e-4
SEED = 42

results = []

In [None]:
torch.manual_seed(SEED)

rouge = evaluate.load("rouge")

# 4. Model Initialization & Training

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
  model_inputs = tokenizer(
    examples["prompt"],
    max_length=MAX_INPUT_LENGTH,
    truncation=True,
    padding="max_length",
  )
  labels = tokenizer(
    text_target=examples["summary"],
    max_length=MAX_TARGET_LENGTH,
    truncation=True,
    padding="max_length"
  )
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_valid = val_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

In [None]:
def train_and_measure(run_name, use_lora=False):
  print(f"=== Training Model: {run_name} ===")

  gc.collect()
  torch.cuda.empty_cache()

  model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

  if use_lora:
    lora_config = LoraConfig(
      r=32,
      lora_alpha=64,
      target_modules=["q", "k", "v", "o"],
      lora_dropout=0.05,
      bias="none",
      task_type="SEQ_2_SEQ_LM"
    )
    model = get_peft_model(model, lora_config)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIRECTORY,

    num_train_epochs=MAX_EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    weight_decay=0.01,
    warmup_ratio=0.05,

    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="no",

    fp16=False,
    bf16=True,
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LENGTH,
    report_to="none"
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
  )

  start_time = time.time()
  trainer.train()
  end_time = time.time()
  training_duration = end_time - start_time

  metrics = {
    "Configuration": run_name,
    "Training Time (s)": round(training_duration, 2)
  }
  results.append(metrics)

  del model
  del trainer
  gc.collect()
  torch.cuda.empty_cache()

  return metrics

In [13]:
train_and_measure("Fine-Tuning With LoRA", use_lora=True)
train_and_measure("Fine-Tuning Without LoRA", use_lora=False)

=== Training Model: Fine-Tuning With LoRA ===


Step,Training Loss,Validation Loss
100,8.272,0.730966
200,0.9948,0.653763


=== Training Model: Fine-Tuning Without LoRA ===


Step,Training Loss,Validation Loss
100,3.9588,0.652195
200,0.7517,0.658526


# 5. Model Evaluation Comparison

In [14]:
for result in results:
  print(f"{result['Configuration']}: {result['Training Time (s)']} s")

Fine-Tuning With LoRA: 569.6 s
Fine-Tuning Without LoRA: 619.59 s
