In [1]:
# ---- Import Required Libraries ----
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import re
import string
import torch
import warnings
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    DataCollatorForSeq2Seq,
    pipeline
)
from rouge_score import rouge_scorer
from bert_score import score

# Setup
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Fetch the NLTK stopword corpus. The call's return value (True) is the last
# expression of the cell and is therefore echoed as the cell output.
# NOTE(review): no stopword usage is visible in this part of the notebook —
# confirm the download is still needed.
nltk.download("stopwords")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# ---- Load Dataset ----
# The three CNN/DailyMail splits are read in order: train, validation, test.
split_paths = (
    "cnn_dailymail/train.csv",
    "cnn_dailymail/validation.csv",
    "cnn_dailymail/test.csv",
)
train_data, val_data, test_data = map(pd.read_csv, split_paths)

# Preview the first rows of the training split.
display(train_data.head())

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [3]:
# ---- Data Cleaning Check ----
# Report the total number of missing cells (summed over all columns) per split.
null_reports = (
    ("Missing values in Train:", train_data),
    ("Missing values in Validation:", val_data),
    ("Missing values in Test:", test_data),
)
for label, frame in null_reports:
    print(label, frame.isna().sum().sum())

Missing values in Train: 0
Missing values in Validation: 0
Missing values in Test: 0


In [4]:
# --- Take a random sample ---
# Fraction of the training split to keep (0.1 would keep 10%) and the seed
# that makes the draw repeatable.
sample_fraction = 0.05
sample_seed = 42
train_data_sample = train_data.sample(frac=sample_fraction, random_state=sample_seed)

In [5]:
# ---- Convert DataFrames to Hugging Face Dataset ----
# Keeping only necessary columns: ["article", "highlights"]
text_columns = ["article", "highlights"]

# preserve_index=False: the sampled training frame carries the shuffled,
# non-contiguous row labels produced by `.sample()`. Without this flag
# `from_pandas` would store those labels as an extra "__index_level_0__"
# column. The flag is passed for every split so all three datasets end up with
# exactly the two text columns.
train_dataset = Dataset.from_pandas(train_data_sample[text_columns], preserve_index=False)
val_dataset   = Dataset.from_pandas(val_data[text_columns], preserve_index=False)
test_dataset  = Dataset.from_pandas(test_data[text_columns], preserve_index=False)

In [6]:
# ---- Load T5 Model & Tokenizer ----
# Token-length caps used when tokenizing articles (inputs) and highlights (targets).
max_input_length, max_target_length = 512, 128

# The small checkpoint keeps training fast; a larger T5 checkpoint can be
# substituted here when the GPU allows.
model_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# ---- Preprocessing Function ----
# T5 is prompted with the task prefix "summarize:" in front of every article.

def preprocess_function(examples):
    """Tokenize a batch of articles and their reference highlights.

    Args:
        examples: A batch mapping with an "article" and a "highlights" entry,
            each holding one string per example.

    Returns:
        The tokenizer output for the prefixed articles, with an extra
        "labels" entry holding the token ids of the highlights.
    """
    prompts = []
    for article_text in examples["article"]:
        prompts.append("summarize: " + article_text)

    # Articles are cut off at max_input_length tokens.
    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)

    # Highlights are cut off at max_target_length tokens.
    target_encoding = tokenizer(
        examples["highlights"],
        truncation=True,
        max_length=max_target_length,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize every split in batches; the order is train, validation, test.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map: 100%|██████████| 14356/14356 [00:27<00:00, 513.52 examples/s]
Map: 100%|██████████| 13368/13368 [00:25<00:00, 526.64 examples/s]
Map: 100%|██████████| 11490/11490 [00:21<00:00, 523.01 examples/s]


In [8]:
# ---- Data Collator ----
# Pads inputs and labels per batch. `model` must be the model object itself
# (not the checkpoint name string): the collator looks for the model's
# `prepare_decoder_input_ids_from_labels` method to build decoder inputs, and
# silently skips that step when given a plain string.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [9]:
# ---- Training Configuration ----
# Seq2SeqTrainingArguments is already imported in the first cell with the rest
# of the transformers names, so it is not re-imported here.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",            # where checkpoints are written
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    learning_rate=3e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,                # keep at most two checkpoints on disk
    predict_with_generate=True,        # use generate() when producing predictions
    logging_dir="./logs",
    logging_steps=100                  # report the training loss every 100 steps
)

In [10]:
# ---- Define Trainer ----
# Gather everything the trainer needs in one mapping, then build it.
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Seq2SeqTrainer(**trainer_inputs)

In [11]:
# ---- Train the Model ----
# Run fine-tuning and keep the returned summary; echoing it as the last
# expression shows it as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss
100,2.2933
200,2.1073
300,2.1203
400,2.1076
500,2.1324
600,2.079
700,2.0863
800,2.0642
900,2.0628
1000,2.1223


TrainOutput(global_step=1436, training_loss=2.103871836967787, metrics={'train_runtime': 11411.6893, 'train_samples_per_second': 1.258, 'train_steps_per_second': 0.126, 'total_flos': 1942966901932032.0, 'train_loss': 2.103871836967787, 'epoch': 1.0})

In [12]:
# ---- Evaluate on Test Set ----
# Score the fine-tuned model on the held-out test split and show the metrics.
results = trainer.evaluate(eval_dataset=test_tokenized)
print(results)

{'eval_loss': 1.8726658821105957, 'eval_runtime': 1785.5014, 'eval_samples_per_second': 6.435, 'eval_steps_per_second': 0.644, 'epoch': 1.0}


In [13]:
# ---- ROUGE Evaluation ----
# Decode predictions for test samples and calculate ROUGE
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def compute_rouge(sample_idx=0):
    """Summarize one test article and print it with its reference and ROUGE scores.

    Args:
        sample_idx: Positional row index into `test_data`.

    Returns:
        None. The article excerpt, prediction, reference and ROUGE scores are
        printed.
    """
    article = test_data["article"].iloc[sample_idx]
    reference = test_data["highlights"].iloc[sample_idx]

    # Generation must not run with dropout active, which is the case if this
    # cell is executed right after training.
    model.eval()

    # Truncate with the same input cap used for training, and move the tensors
    # to the model's device: the Trainer may have placed the model on an
    # accelerator, in which case CPU inputs would make generate() fail.
    encoded = tokenizer(
        "summarize: " + article,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    # Generate summary
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute ROUGE (reference first, prediction second)
    scores = scorer.score(reference, prediction)
    print("📌 Article:\n", article[:500], "...")
    print("\n✅ Prediction:\n", prediction)
    print("\n📝 Reference:\n", reference)
    print("\n📊 ROUGE:", scores)

compute_rouge(0)

📌 Article:
 Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by t ...

✅ Prediction:
 A U.S consumer advisory group set up by the Department of Transportation said that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pi

In [14]:
# ---- BERTScore Evaluation ----
# More semantic evaluation compared to ROUGE
# Candidates and references are taken from the same slice so they stay aligned.
bertscore_subset = test_data.head(10)

# Generation must not run with dropout active.
model.eval()

cands = []
for article_text in bertscore_subset["article"]:
    # Same input cap as training; tensors are moved to the model's device so
    # generation also works when the Trainer placed the model on an accelerator.
    encoded = tokenizer(
        "summarize: " + article_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    cands.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

refs = bertscore_subset["highlights"].tolist()

P, R, F1 = score(cands, refs, lang="en", verbose=True)
print("BERTScore F1:", F1.mean().item())

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:02<00:00,  2.12s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 222.85it/s]

done in 2.13 seconds, 4.70 sentences/sec
BERTScore F1: 0.8804254531860352



