In [1]:
# ===============================
# Task 1: Setup and Installation
# ===============================

# Install required libraries
!pip install -q datasets transformers spacy rouge-score evaluate
!python -m spacy download en_core_web_sm

# Disable Weights & Biases (optional logging tool that pops up during training)
import os
os.environ["WANDB_DISABLED"] = "true"

# Import necessary libraries for preprocessing, modeling, and evaluation
from datasets import load_dataset, Dataset
from evaluate import load
import pandas as pd
import re
import spacy
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from tqdm.notebook import tqdm
import torch

# ===============================
# Task 2: Load and Preprocess Dataset
# ===============================

# Load CNN/DailyMail dataset from Hugging Face 🤗
print("\nLoading dataset...")
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")
df = pd.DataFrame(ds['train'])  # Convert training split to pandas DataFrame

# Clean article text: remove extra whitespace, brackets, punctuation, etc.
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9.?! ]+', '', text)
    return text.strip()

df['cleaned_article'] = df['article'].apply(preprocess_text)

# ===============================
# Task 3: Extractive Summarization using spaCy
# ===============================

# Load English model for spaCy (used to split sentences and score them)
try:
    nlp = spacy.load("en_core_web_sm")
except:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Function to summarize by selecting top 3 highest scoring sentences
def extractive_summarization(article):
    doc = nlp(article)
    sentences = [sent.text for sent in doc.sents]
    sentence_scores = {}
    for sent in sentences:
        for word in sent.split():
            if word.lower() not in sentence_scores:
                sentence_scores[word.lower()] = 0
            sentence_scores[word.lower()] += 1
    ranked = sorted(sentences, key=lambda s: sum(sentence_scores.get(word.lower(), 0) for word in s.split()), reverse=True)
    return " ".join(ranked[:3])  # Return top 3 ranked sentences as the summary

# Apply extractive summarization to a small subset of 100 examples for speed
print("\nGenerating extractive summaries...")
tqdm.pandas()
N = 100
df_subset = df.head(N).copy()
df_subset['extractive_summary'] = df_subset['cleaned_article'].progress_apply(extractive_summarization)

# ===============================
# Task 4: Abstractive Summarization using T5 Transformer
# ===============================

# Load pre-trained T5 model and tokenizer from Hugging Face
print("\nLoading abstractive summarization model...")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Function to generate summaries using the T5 model
def abstractive_summarization(article):
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=50, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate abstractive summaries for the same 100 examples
print("\nGenerating abstractive summaries...")
df_subset['abstractive_summary'] = df_subset['cleaned_article'].progress_apply(abstractive_summarization)

# ===============================
# Task 5: Evaluate Summaries using ROUGE
# ===============================

# Load ROUGE evaluation metric
print("\nEvaluating with ROUGE...")
rouge = load("rouge")

# Evaluate generated summaries against reference highlights
preds = df_subset['abstractive_summary'].tolist()
refs = df_subset['highlights'].tolist()
results = rouge.compute(predictions=preds, references=refs, use_stemmer=True)

# Display ROUGE scores (recall-oriented metric)
print("\nROUGE Evaluation:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")

# ===============================
# Task 6: Save Output and Showcase Results
# ===============================

# Save the resulting dataset with generated summaries
df_subset.to_csv("summarization_output.csv", index=False)

# Print example: Original, Extractive, Abstractive, and Reference
print("\nSample Comparison:")
print("\nOriginal Article:\n", df_subset['cleaned_article'][0])
print("\nExtractive Summary:\n", df_subset['extractive_summary'][0])
print("\nAbstractive Summary:\n", df_subset['abstractive_summary'][0])
print("\nReference (Target) Summary:\n", df_subset['highlights'][0])


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]


Generating extractive summaries...


  0%|          | 0/100 [00:00<?, ?it/s]


Loading abstractive summarization model...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Generating abstractive summaries...


  0%|          | 0/100 [00:00<?, ?it/s]


Evaluating with ROUGE...


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


ROUGE Evaluation:
rouge1: 0.2974
rouge2: 0.0995
rougeL: 0.2092
rougeLsum: 0.2509

Sample Comparison:

Original: LONDON England   Harry Potter star Daniel Radcliffe gains access to a reported 20 million  fortune as he turns 18 on Monday but he insists the money wont cast a spell on him. Daniel Radcliffe as Harry Potter in Harry Potter and the Order of the Phoenix To the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties. I dont plan to be one of those people who as soon as they turn 18 suddenly buy themselves a massive sports car collection or something similar he told an Australian interviewer earlier this month. I dont think Ill be particularly extravagant. The things I like buying are things that cost about 10 pounds  books and CDs and DVDs. At 18 Radcliffe will be able to gamble in a casino buy a drink in a pub or see the horror film Hostel Part II currently six places below his

In [2]:
# ===============================
# Task 7: Fine-Tune T5 on CNN/DailyMail (Subset)
# ===============================

# ✅ Prepare the dataset for fine-tuning
print("\nPreparing dataset for fine-tuning...")

# Rename columns to match expected model input format
fine_tune_df = df_subset[['cleaned_article', 'highlights']].rename(columns={
    'cleaned_article': 'input_text',
    'highlights': 'target_text'
})

# Convert to Hugging Face Dataset object
dataset = Dataset.from_pandas(fine_tune_df)

# ✅ Tokenization configuration
max_input_length = 512   # Truncate long articles
max_target_length = 64   # Truncate long summaries

# Tokenize each article-summary pair
def tokenize_data(example):
    # Prefix 'summarize:' is a T5 convention
    inputs = tokenizer(
        "summarize: " + example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    )
    targets = tokenizer(
        example["target_text"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_data, batched=False)

# ✅ Define training parameters
training_args = TrainingArguments(
    output_dir="./t5_finetuned_cnn",           # Directory to save the model
    per_device_train_batch_size=2,             # Small batch size for low-resource environments
    num_train_epochs=1,                        # Only 1 epoch for demonstration
    save_steps=10_000,                         # Save every 10k steps (not triggered here)
    save_total_limit=1,                        # Keep only the most recent model
    logging_steps=50,                          # Log progress every 50 steps
    remove_unused_columns=True,                # Speed up training
    fp16=torch.cuda.is_available(),            # Use FP16 if GPU is available
)

# ✅ Initialize the Trainer class from Hugging Face
print("\nFine-tuning model...")
trainer = Trainer(
    model=model,                               # Pre-loaded T5 model
    args=training_args,                        # Training settings
    train_dataset=tokenized_dataset,           # Our tokenized fine-tune data
)

# ✅ Begin fine-tuning (this may take some time on CPU)
trainer.train()

# ✅ Save the fine-tuned model and tokenizer locally
print("\nSaving fine-tuned model...")
model.save_pretrained("./t5_finetuned_cnn")
tokenizer.save_pretrained("./t5_finetuned_cnn")



Preparing dataset for fine-tuning...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Fine-tuning model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,3.7336



Saving fine-tuned model...


('./t5_finetuned_cnn/tokenizer_config.json',
 './t5_finetuned_cnn/special_tokens_map.json',
 './t5_finetuned_cnn/spiece.model',
 './t5_finetuned_cnn/added_tokens.json')

In [3]:
# ===============================
# ✅ Task Completed: Final Summary
# ===============================

print("\n\n🎯 PROJECT SUMMARY: TEXT SUMMARIZATION COMPLETE")
print("--------------------------------------------------")
print("✅ Dataset: Loaded CNN/Daily Mail articles")
print("✅ Preprocessing: Cleaned raw articles (punctuation, brackets, whitespace)")
print("✅ Extractive Summarization: Implemented with spaCy using sentence scoring")
print("✅ Abstractive Summarization: Generated with T5 (pretrained model)")
print("✅ Evaluation: ROUGE metrics used to evaluate abstractive summaries")
print("✅ Results Exported: Saved summaries to 'summarization_output.csv'")
print("✅ Fine-tuning: Fine-tuned T5 on 100 article-summary pairs")
print("✅ Model Saved: Fine-tuned model and tokenizer stored in './t5_finetuned_cnn'")

# Display one sample comparison from the fine-tuned data
print("\n📌 SAMPLE OUTPUT AFTER FULL PIPELINE\n")
print("📰 Original Article:\n", df_subset['cleaned_article'][0][:1000], "...")
print("\n📝 Extractive Summary:\n", df_subset['extractive_summary'][0])
print("\n🤖 Abstractive Summary (Pretrained):\n", df_subset['abstractive_summary'][0])
print("\n📚 Reference Summary:\n", df_subset['highlights'][0])

print("\n✨ All steps executed successfully! Your summarization system is complete.")




🎯 PROJECT SUMMARY: TEXT SUMMARIZATION COMPLETE
--------------------------------------------------
✅ Dataset: Loaded CNN/Daily Mail articles
✅ Preprocessing: Cleaned raw articles (punctuation, brackets, whitespace)
✅ Extractive Summarization: Implemented with spaCy using sentence scoring
✅ Abstractive Summarization: Generated with T5 (pretrained model)
✅ Evaluation: ROUGE metrics used to evaluate abstractive summaries
✅ Results Exported: Saved summaries to 'summarization_output.csv'
✅ Fine-tuning: Fine-tuned T5 on 100 article-summary pairs
✅ Model Saved: Fine-tuned model and tokenizer stored in './t5_finetuned_cnn'

📌 SAMPLE OUTPUT AFTER FULL PIPELINE

📰 Original Article:
 LONDON England   Harry Potter star Daniel Radcliffe gains access to a reported 20 million  fortune as he turns 18 on Monday but he insists the money wont cast a spell on him. Daniel Radcliffe as Harry Potter in Harry Potter and the Order of the Phoenix To the disappointment of gossip columnists around the world the 