# 1. Import Libraries

In [None]:
import re
from datasets import load_dataset, concatenate_datasets

from transformers import (
  T5TokenizerFast as T5Tokenizer,
  T5ForConditionalGeneration,
  Seq2SeqTrainingArguments,
  Seq2SeqTrainer,
  DataCollatorForSeq2Seq
)

import numpy as np
import torch
import evaluate
import gc
import time
from peft import LoraConfig, get_peft_model
import pandas as pd

# 2. Import & Preprocessing Datasets

In [11]:
TOTAL_SAMPLES = 300
MAX_INPUT_LENGTH = 512
VAL_SIZE = 0.1
TEST_SIZE = 0.1

def clean_text(text):
  if not text:
    return ""
  text = re.sub(r'\s+([.,!?%:])', r'\1', text)
  return " ".join(text.split())

def filter_and_process(dataset, text_key, summary_key, style_name):
  def process_example(example):
    t_clean = clean_text(example[text_key])
    s_clean = clean_text(example[summary_key])
    
    if style_name == "Detailed":
      s_clean = clean_text(example[summary_key].lstrip('-–—').strip())
      
    return {
      'text': t_clean,
      'summary': s_clean,
      'prompt': f"Summarize {style_name}: {t_clean}",
      'word_count': len(t_clean.split())
    }

  processed_ds = dataset.map(process_example, remove_columns=dataset.column_names)
  filtered_ds = processed_ds.filter(lambda x: 0 < x['word_count'] <= MAX_INPUT_LENGTH)
  
  return filtered_ds.select(range(min(TOTAL_SAMPLES, len(filtered_ds))))

xsum_raw = load_dataset('xsum', trust_remote_code=True, split='train')
cnn_raw = load_dataset('cnn_dailymail', '3.0.0', split='train')
multi_raw = load_dataset('multi_news', trust_remote_code=True, split='train')

harsh_ds = filter_and_process(xsum_raw, 'document', 'summary', 'Harsh')
balanced_ds = filter_and_process(cnn_raw, 'article', 'highlights', 'Balanced')
detailed_ds = filter_and_process(multi_raw, 'document', 'summary', 'Detailed')

dataset = concatenate_datasets([harsh_ds, balanced_ds, detailed_ds])

train_temp_split = dataset.train_test_split(test_size=TEST_SIZE + VAL_SIZE, shuffle=True, seed=42)
train_ds = train_temp_split['train']
temp_ds = train_temp_split['test']

val_test_split = temp_ds.train_test_split(test_size=TEST_SIZE / (TEST_SIZE + VAL_SIZE), shuffle=True, seed=42)
val_ds = val_test_split['train']
test_ds = val_test_split['test']

print(f"Train Size: {len(train_ds)}")
print(f"Validation Size: {len(val_ds)}")
print(f"Test Size: {len(test_ds)}")

Train Size: 720
Validation Size: 90
Test Size: 90


# 3. Configurations & Parameters

In [12]:
MODEL_LIST = [
  't5-small',
  't5-base',
  'google/flan-t5-small',
  'google/flan-t5-base',
]
OUT_DIRECTORY = 'results'
MAX_TARGET_LENGTH = 256
BATCH_SIZE = 4
MAX_EPOCHS = 3
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 5e-4
SEED = 42

evaluation_results = []
inference_results = []

# Text Reference: https://www.nbcnews.com/tech/tech-news/openai-disney-sora-ai-videos-rcna248617
text = '''
  The Walt Disney Co. announced Thursday that it had reached a three-year agreement with OpenAI to bring its popular characters to the company's Sora artificial intelligence video generator.
  Disney will also make a $1 billion investment in OpenAI, the owner of ChatGPT. It said it will become a “major customer” of OpenAI, using its services to develop new products and experiences, including for its Disney+ streaming service.
  “Under the agreement, Disney and OpenAI are affirming a shared commitment to the responsible use of AI that protects user safety and the rights of creators,” the companies said in a statement.
  They did not disclose the terms of the deal, and both Disney CEO Bob Iger and OpenAI CEO Sam Altman declined to reveal any details Thursday morning during a joint interview on CNBC.
  OpenAI, meanwhile, said it has committed to “implementing responsible measures to further address trust and safety, including age-appropriate policies,” but did not provide additional details about what that would entail.
  The issue of how AI chatbots engage with users under 18 is the subject of a national conversation and several lawsuits.
  Disney said characters that are part of the deal include: Mickey Mouse, Minnie Mouse, Lilo, Stitch, Ariel, Belle, Beast, Cinderella, Baymax, Simba and Mufasa, as well as characters from the worlds of “Encanto,” “Frozen,” “Inside Out,” “Moana,” “Monsters Inc.,” “Toy Story,” “Up” and “Zootopia.”
  On CNBC, Iger described the deal broadly as "kind of a way" for Disney to get into AI.
  The deal is notable in part because Disney is famously protective of its sprawling portfolio of intellectual property, from the animated shorts of the 1920s to modern superhero and fantasy franchises.
  Altman said, "We hear so much from users about how much they love Disney," adding that he expects Sora users to respond "very well" to the inclusion of Disney characters.
  The companies do not yet have a launch date yet, however, Altman said. "We'll try to get it in there as soon as we can."
  The company's statement had mentioned "early 2026" as a potential launch date.
  Iger said in a statement, “Bringing together Disney’s iconic stories and characters with OpenAI’s groundbreaking technology puts imagination and creativity directly into the hands of Disney fans in ways we’ve never seen before, giving them richer and more personal ways to connect with the Disney characters and stories they love."
  Media companies are wrestling with how to secure the value of their intellectual property while not being left behind by what many see as a transformative technology with few legal guardrails yet.
  With OpenAI, Disney would be creating a legitimate avenue through which a generative AI program could deploy its characters, rather than playing whack-a-mole with every AI company, as Disney has done with other kinds of media in the past.
'''

In [None]:
np.random.seed(SEED)
torch.manual_seed(SEED)

rouge = evaluate.load("rouge")

# 4. Model Initialization & Training

In [14]:
def preprocess_function(examples):
  model_inputs = tokenizer(
    examples["prompt"],
    max_length=MAX_INPUT_LENGTH,
    truncation=True,
    padding="max_length",
  )
  labels = tokenizer(
    text_target=examples["summary"],
    max_length=MAX_TARGET_LENGTH,
    truncation=True,
    padding="max_length"
  )
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [15]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.clip(predictions, 0, tokenizer.vocab_size - 1)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  return {k: round(v, 4) for k, v in result.items()}

In [None]:
def generate_summary(text, style, model, tokenizer):
  model.eval()
  input_text = f"Summarize {style}: {text}"
  input_words = len(text.split())

  inputs = tokenizer(
    input_text,
    max_length=512,
    truncation=True,
    return_tensors='pt'
  ).to(device)

  if style == 'Harsh':
    max_len = int(input_words * 0.35)
    min_len = 5
    rep_penalty = 2.5
    length_penalty = 1.5
    beam_size = 4
    max_cap = 120
  elif style == 'Balanced':
    max_len = int(input_words * 0.50)
    min_len = 20
    rep_penalty = 1.5
    length_penalty = 1.2
    beam_size = 4
    max_cap = 180
  else:
    max_len = int(input_words * 0.70)
    min_len = 50
    rep_penalty = 1.2
    length_penalty = 0.8
    beam_size = 4
    max_cap = 256

  max_len = min(max_len, max_cap)

  with torch.no_grad():
    outputs = model.generate(
      input_ids=inputs['input_ids'],
      attention_mask=inputs['attention_mask'],
      max_length=max_len,
      min_length=min_len,
      num_beams=beam_size,
      length_penalty=length_penalty,
      repetition_penalty=rep_penalty,
      no_repeat_ngram_size=3,
      early_stopping=True
    )

  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [17]:
for model_name in MODEL_LIST:
  print(f"=== Training Model: {model_name} ===")

  gc.collect()
  torch.cuda.empty_cache()

  tokenizer = T5Tokenizer.from_pretrained(model_name)
  tokenized_train = train_ds.map(preprocess_function, batched=True)
  tokenized_valid = val_ds.map(preprocess_function, batched=True)
  tokenized_test = test_ds.map(preprocess_function, batched=True)

  model = T5ForConditionalGeneration.from_pretrained(model_name)

  lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "k", "v", "o"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
  )
  model = get_peft_model(model, lora_config)
  
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIRECTORY,

    num_train_epochs=MAX_EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    weight_decay=0.01,
    warmup_ratio=0.05,

    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="no",

    fp16=False,
    bf16=True,
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LENGTH,
    report_to="none"
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics
  )

  trainer.train()

  val_metrics = trainer.evaluate()
  test_metrics = trainer.evaluate(eval_dataset=tokenized_test, metric_key_prefix="test")
  evaluation_results.append({
    "Model": model_name,

    "Val ROUGE-1": val_metrics.get("eval_rouge1"),
    "Val ROUGE-2": val_metrics.get("eval_rouge2"),
    "Val ROUGE-L": val_metrics.get("eval_rougeL"),
    "Val ROUGE-L Summary": val_metrics.get("eval_rougeLsum"),

    "Test ROUGE-1": test_metrics.get("test_rouge1"),
    "Test ROUGE-2": test_metrics.get("test_rouge2"),
    "Test ROUGE-L": test_metrics.get("test_rougeL"),
    "Test ROUGE-L Summary": test_metrics.get("test_rougeLsum")
  })

  styles = ["Harsh", "Balanced", "Detailed"]
  model_outputs = {"Model": model_name}

  for style in styles:
    summary = generate_summary(text, style, model, tokenizer)
    model_outputs[style] = summary

  inference_results.append(model_outputs)

  del model
  del trainer
  del tokenizer
  del tokenized_train
  del tokenized_valid
  del tokenized_test
  gc.collect()
  torch.cuda.empty_cache()

=== Training Model: t5-small ===


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,3.6345,0.951245,0.0198,0.0122,0.0157,0.015
200,1.2682,0.899299,0.0254,0.0133,0.0194,0.019


=== Training Model: t5-base ===


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,2.838,0.784637,0.123,0.0557,0.1005,0.0995
200,1.0627,0.737808,0.1631,0.0643,0.1141,0.1125


=== Training Model: google/flan-t5-small ===


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,10.0049,3.505064,0.2685,0.0991,0.1925,0.1916
200,3.2105,2.57418,0.2938,0.1138,0.2223,0.2208


=== Training Model: google/flan-t5-base ===


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
100,8.2124,0.722503,0.2796,0.1069,0.1988,0.1982
200,0.9915,0.652271,0.3471,0.1376,0.2471,0.2475


# 5. Model Evaluation Comparison

In [21]:
rouge_df = pd.DataFrame(evaluation_results)
display(rouge_df)

Unnamed: 0,Model,Val ROUGE-1,Val ROUGE-2,Val ROUGE-L,Val ROUGE-L Summary,Test ROUGE-1,Test ROUGE-2,Test ROUGE-L,Test ROUGE-L Summary
0,t5-small,0.033,0.0153,0.0252,0.0249,0.0767,0.0289,0.0558,0.0556
1,t5-base,0.2202,0.0838,0.1534,0.1525,0.224,0.0951,0.155,0.1563
2,google/flan-t5-small,0.2996,0.1106,0.224,0.222,0.3054,0.1181,0.2213,0.222
3,google/flan-t5-base,0.3697,0.1549,0.268,0.267,0.3565,0.1484,0.2629,0.2623


In [22]:
style_df = pd.DataFrame(inference_results)
style_df.set_index('Model', inplace=True)

for _, row in style_df.iterrows():
  print(f"=== Model: {row.name} ===")
  for style in ["Harsh", "Balanced", "Detailed"]:
    summary = row[style]
    word_count = len(summary.split())
    print(f"{style} (Words: {word_count}):")
    print(summary)
    print("")

=== Model: t5-small ===
Harsh (Words: 53):
The Walt Disney Co. announced Thursday that it had reached a three-year agreement with OpenAI to bring its popular characters to the company's Sora artificial intelligence video generator. It said it will become a “major customer” of OpenAI, using its services to develop new products and experiences, including for its Disney+ streaming service.

Balanced (Words: 59):
The Walt Disney Co. announced Thursday that it had reached a three-year agreement with OpenAI to bring its popular characters to the company's Sora artificial intelligence video generator. The deal is notable in part because Disney is famously protective of its sprawling portfolio of intellectual property, from the animated shorts of the 1920s to modern superhero and fantasy franchises.

Detailed (Words: 59):
The Walt Disney Co. announced Thursday that it had reached a three-year agreement with OpenAI to bring its popular characters to the company's Sora artificial intelligence vi