In [1]:
# ===============================
# üìå TEXT SUMMARIZATION PIPELINE
# ===============================

# --- Step 1: Setup ---
!pip install -q datasets transformers spacy rouge-score evaluate
!python -m spacy download en_core_web_sm

import os
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset, Dataset
from evaluate import load
import pandas as pd
import re
import spacy
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from tqdm.notebook import tqdm
import torch

# --- Step 2: Load & Clean Dataset ---
print("üì• Loading CNN/DailyMail dataset...")
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")
df = pd.DataFrame(ds['train'])

def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9.?! ]+', '', text)
    return text.strip()

df['cleaned_article'] = df['article'].apply(preprocess_text)

# --- Step 3: Extractive Summarization (spaCy) ---
print("üîç Generating extractive summaries...")
nlp = spacy.load("en_core_web_sm")

def extractive_summarization(article):
    doc = nlp(article)
    sentences = [sent.text for sent in doc.sents]
    sentence_scores = {}
    for sent in sentences:
        for word in sent.split():
            sentence_scores[word.lower()] = sentence_scores.get(word.lower(), 0) + 1
    ranked = sorted(sentences, key=lambda s: sum(sentence_scores.get(w.lower(), 0) for w in s.split()), reverse=True)
    return " ".join(ranked[:3])

tqdm.pandas()
N = 100
df_subset = df.head(N).copy()
df_subset['extractive_summary'] = df_subset['cleaned_article'].progress_apply(extractive_summarization)

# --- Step 4: Abstractive Summarization (T5) ---
print("ü§ñ Generating abstractive summaries using T5...")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def abstractive_summarization(article):
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=50, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

df_subset['abstractive_summary'] = df_subset['cleaned_article'].progress_apply(abstractive_summarization)

# --- Step 5: Evaluate with ROUGE ---
print("üìä Evaluating summaries using ROUGE...")
rouge = load("rouge")
results = rouge.compute(predictions=df_subset['abstractive_summary'], references=df_subset['highlights'], use_stemmer=True)

for k, v in results.items():
    print(f"{k}: {v:.4f}")

# --- Step 6: Save Output ---
df_subset.to_csv("summarization_output.csv", index=False)
print("üíæ Output saved to summarization_output.csv")

# --- Step 7: Fine-tune T5 on Subset ---
print("üéì Fine-tuning T5 model...")
fine_tune_df = df_subset[['cleaned_article', 'highlights']].rename(columns={
    'cleaned_article': 'input_text',
    'highlights': 'target_text'
})
dataset = Dataset.from_pandas(fine_tune_df)

max_input_length = 512
max_target_length = 64

def tokenize_data(example):
    inputs = tokenizer("summarize: " + example["input_text"], truncation=True, padding="max_length", max_length=max_input_length)
    targets = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=max_target_length)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize_data, batched=False)

training_args = TrainingArguments(
    output_dir="./t5_finetuned_cnn",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=10_000,
    save_total_limit=1,
    logging_steps=50,
    remove_unused_columns=True,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
trainer.train()

model.save_pretrained("./t5_finetuned_cnn")
tokenizer.save_pretrained("./t5_finetuned_cnn")

# --- Final Output ---
print("\n‚úÖ TEXT SUMMARIZATION PIPELINE COMPLETE")
print("Saved model to ./t5_finetuned_cnn")
print("\nüìå Sample Output:")
print("\nOriginal:\n", df_subset['cleaned_article'][0][:500], "...")
print("\nExtractive Summary:\n", df_subset['extractive_summary'][0])
print("\nAbstractive Summary:\n", df_subset['abstractive_summary'][0])
print("\nReference Summary:\n", df_subset['highlights'][0])


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
üì• Loading CNN/DailyMail dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


test-00000-of-00001.parquet:  35%|###4      | 10.5M/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

üîç Generating extractive summaries...


  0%|          | 0/100 [00:00<?, ?it/s]

ü§ñ Generating abstractive summaries using T5...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


  0%|          | 0/100 [00:00<?, ?it/s]

üìä Evaluating summaries using ROUGE...


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

rouge1: 0.2980
rouge2: 0.0990
rougeL: 0.2088
rougeLsum: 0.2504
üíæ Output saved to summarization_output.csv
üéì Fine-tuning T5 model...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,3.7336



‚úÖ TEXT SUMMARIZATION PIPELINE COMPLETE
Saved model to ./t5_finetuned_cnn

üìå Sample Output:

Original:
 LONDON England   Harry Potter star Daniel Radcliffe gains access to a reported 20 million  fortune as he turns 18 on Monday but he insists the money wont cast a spell on him. Daniel Radcliffe as Harry Potter in Harry Potter and the Order of the Phoenix To the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties. I dont plan to be one of those people who as soon as they turn 18 suddenly buy them ...

Extractive Summary:
 Daniel Radcliffe as Harry Potter in Harry Potter and the Order of the Phoenix To the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties. His latest outing as the boy wizard in Harry Potter and the Order of the Phoenix is breaking records on both sides of the 