# Loading Dataset

In [None]:
!pip -q install evaluate rouge-score pytextrank


In [None]:
import kagglehub

# Download the Kaggle copy of CNN/DailyMail and remember the local directory it was placed in.
KAGGLE_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
path = kagglehub.dataset_download(KAGGLE_DATASET)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'newspaper-text-summarization-cnn-dailymail' dataset.
Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


In [None]:
import pandas as pd

from pathlib import Path

N_ROWS = 300  # rows kept per split; the cells below use at most the first 30 of them


def load_split(csv_path, n_rows=N_ROWS):
    """Read the first `n_rows` rows of one CNN/DailyMail CSV and drop its "id" column.

    Reading with `nrows` avoids parsing the whole file just to discard most of it, and the
    returned frame owns its data (it is not a slice of a larger frame), so adding columns to
    it later does not trigger pandas' SettingWithCopyWarning.

    Args:
        csv_path: Path to the CSV file of one split.
        n_rows: Number of leading rows to read.

    Returns:
        A DataFrame with the remaining columns ("article" and "highlights" for this dataset).
    """
    return pd.read_csv(csv_path, nrows=n_rows).drop(columns=["id"])


data_dir = Path(path) / "cnn_dailymail"
train_df = load_split(data_dir / "train.csv")
val_df = load_split(data_dir / "validation.csv")

train_df.head()

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


# Pretrained model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM , pipeline

# A single checkpoint name feeds the tokenizer, the model and the summarization pipeline.
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
# Summarise the first 30 training articles with the pre-trained model and keep each generated
# summary alongside its reference highlight so the two lists can be scored together.
max_input_length = 1024
generation_kwargs = dict(max_length=130, min_length=30, num_beams=4, length_penalty=2.0)

predictions, references = [], []
sample_rows = train_df[['article', 'highlights']].head(30)

for article, highlight in sample_rows.itertuples(index=False, name=None):
    # Truncate over-long articles to the configured input length.
    encoded = tokenizer(article, truncation=True, max_length=max_input_length, return_tensors="pt")
    # Move every input tensor onto the device the model lives on.
    encoded = {key: value.to(model.device) for key, value in encoded.items()}

    generated = model.generate(**encoded, **generation_kwargs)

    predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    references.append(highlight)

In [None]:
# Show the first article followed by the summary the pre-trained model produced for it.
print(
    "Original text :",
    train_df['article'][0],
    "",
    "Summary :",
    predictions[0],
    sep="\n",
)

Original text :
By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for 

In [None]:
import evaluate
rouge = evaluate.load("rouge")

# Score all generated summaries against their reference highlights in one call.
# use_stemmer=True matches the setting used inside compute_metrics for the fine-tuned model,
# so the two sets of ROUGE numbers are computed the same way.
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print("ROUGE Scores:\n")
for key, value in results.items():
    # Report each score as a percentage with two decimals.
    print(f"{key}: {round(value * 100, 2)}")

ROUGE Scores:

rouge1: 44.36
rouge2: 25.04
rougeL: 33.03
rougeLsum: 39.04


# Extractive summarization using TextRank

In [None]:
import spacy
import pytextrank

# Load spaCy model
# Load the small English spaCy pipeline and register PyTextRank's "textrank" component on it.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")


def summarize_text(text, limit_sentences=3):
    """Build an extractive summary of `text` from the sentences PyTextRank selects.

    Args:
        text: Document to summarise.
        limit_sentences: Maximum number of sentences requested from PyTextRank.

    Returns:
        The selected sentences, each stripped of surrounding whitespace, joined by single
        spaces (an empty string when no sentence is returned).
    """
    doc = nlp(text)
    selected = doc._.textrank.summary(limit_sentences=limit_sentences)
    return " ".join(sent.text.strip() for sent in selected)


# Only the first 30 articles are summarised; the remaining rows get NaN through index alignment.
# `assign` returns a new frame instead of writing a column into `train_df` in place, which keeps
# the cell idempotent and avoids pandas' SettingWithCopyWarning when `train_df` is a slice.
train_df = train_df.assign(summary=train_df['article'].iloc[:30].apply(summarize_text))


/usr/local/lib/python3.12/dist-packages


In [None]:
# Show the first article followed by its TextRank (extractive) summary.
first_article = train_df['article'][0]
first_summary = train_df['summary'][0]
print(f"Original text :\n{first_article}\n\nSummary :\n{first_summary}")

Original text :
By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for 

# Fine-tuning the model

In [None]:
from datasets import load_dataset

# Hugging Face copy of CNN/DailyMail (configuration "3.0.0"), used for fine-tuning below.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
from transformers import BartTokenizer

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)

MAX_SOURCE_LENGTH = 1024   # articles are truncated/padded to this many tokens
MAX_TARGET_LENGTH = 128    # highlights are truncated/padded to this many tokens
LABEL_IGNORE_INDEX = -100  # label value skipped by PyTorch's cross-entropy loss by default


def preprocess_function(examples):
    """Tokenize a batch of articles and highlights into fixed-length model inputs.

    Args:
        examples: A batch as passed by ``dataset.map(..., batched=True)``; the "article" and
            "highlights" entries are read.

    Returns:
        The tokenizer output for the articles with an added "labels" entry holding the
        tokenized highlights. Padding positions in the labels are set to
        ``LABEL_IGNORE_INDEX`` so they do not contribute to the training loss; anything that
        decodes these labels must map that value back to the pad token id first.
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(
        text_target=examples["highlights"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Use the attention mask (1 = real token, 0 = padding) to decide which label ids to keep.
    model_inputs["labels"] = [
        [token_id if keep else LABEL_IGNORE_INDEX for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return model_inputs


# NOTE(review): this tokenizes every split in full, although the trainer cell only selects the
# first 500 training and 100 validation examples; selecting before mapping would be much cheaper.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])


In [None]:
import gc
import sys

# Release the pandas frames before training. Popping from globals() (rather than `del`) makes
# the cell safe to re-run: a name that is already gone is skipped instead of raising NameError.
for _frame_name in ("train_df", "val_df"):
    globals().pop(_frame_name, None)
gc.collect()

31055

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import wandb

wandb.init(project="bart-summarization")

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,      # small per-step batch to limit GPU memory use
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,      # 2 x 8 = 16 examples per optimizer step per device
    num_train_epochs=2,
    # The subset run is short (the recorded run finished after 38 optimizer steps), so logging
    # every 50 steps never reported a training loss.
    logging_steps=5,
    save_steps=250,                     # longer than a short subset run: no intermediate checkpoint
    # 3e-4 is about ten times the usual fine-tuning rate for BART-large and the recorded run
    # ended at a training loss of ~5.1; 3e-5 is the conventional choice.
    learning_rate=3e-5,
    fp16=False,                         # train in full precision
    optim="adamw_torch",
    # Evaluate with generate() so compute_metrics receives token ids instead of raw logits.
    predict_with_generate=True,
    # A wandb run is opened above, so send the trainer's logs to it instead of discarding them.
    report_to="wandb",
)

[34m[1mwandb[0m: Currently logged in as: [33mking-sniper-ks2000[0m ([33mking-sniper-ks2000-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import evaluate
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode predictions and labels and return ROUGE scores as percentages.

    Args:
        eval_pred: ``(predictions, labels)`` pair handed over by the trainer. Predictions may
            be generated token ids, raw logits, or a tuple whose first element is one of those.

    Returns:
        Dict mapping each ROUGE variant reported by the metric to its score multiplied by 100.
    """
    import numpy as np

    predictions, labels = eval_pred

    # Models can return several outputs; only the first element is scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array is presumably logits (batch, sequence, vocabulary) rather than token ids;
    # reduce it to the most likely id per position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Negative ids (the -100 ignore/padding value) cannot be decoded: map them to the pad token,
    # which skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {name: value * 100 for name, value in scores.items()}

In [None]:
N_TRAIN_EXAMPLES = 500  # size of the fine-tuning subset
N_EVAL_EXAMPLES = 100   # size of the validation subset used for evaluation

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(N_TRAIN_EXAMPLES)),
    eval_dataset=tokenized_datasets["validation"].select(range(N_EVAL_EXAMPLES)),
    # `tokenizer=` is deprecated in recent transformers releases; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning; keep the returned training summary and display it as the cell result.
train_output = trainer.train()
train_output

Step,Training Loss




TrainOutput(global_step=38, training_loss=5.1055860017475325, metrics={'train_runtime': 476.303, 'train_samples_per_second': 1.26, 'train_steps_per_second': 0.08, 'total_flos': 1300262761267200.0, 'train_loss': 5.1055860017475325, 'epoch': 2.0})

In [None]:
# Evaluate on the validation subset given to the trainer and print the resulting metrics dict.
print(metrics := trainer.evaluate())

# Notebook Summary & Key insights

## Summary

This notebook explores text summarization using both extractive and abstractive methods on the CNN/DailyMail dataset.

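1.  **Data Loading and Preparation:** The notebook loads the CNN/DailyMail dataset twice: first as CSV files via `kagglehub` and `pandas` (the first 300 rows of each split, used for the pre-trained BART and TextRank experiments), and later via `datasets.load_dataset` for fine-tuning. The data is then preprocessed for both extractive and abstractive summarization.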
2.  **Extractive Summarization (TextRank):** TextRank is used for extractive summarization, where key sentences are extracted from the original article. The `spacy` and `pytextrank` libraries are utilized for this purpose.
3.  **Abstractive Summarization (BART):** A pre-trained BART model (`facebook/bart-large-cnn`) is loaded using the `transformers` library. The model is then fine-tuned on a subset of the CNN/DailyMail dataset for abstractive summarization, which involves generating new sentences for the summary.
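4.  **Evaluation:** ROUGE metrics are used to evaluate the pre-trained BART model (on the first 30 training articles) and, through `compute_metrics`, the fine-tuned BART model (on 100 validation articles). The TextRank summaries are inspected by eye and are not scored.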

## Key Insights

*   The pre-trained BART model achieves reasonable ROUGE scores on the summarization task (ROUGE-1 44.36, ROUGE-2 25.04, ROUGE-L 33.03 on 30 articles from the training split), indicating its effectiveness out-of-the-box.
*   The extractive summarization with TextRank provides a different perspective on summarization by selecting important sentences from the original text.
*   Fine-tuning the BART model on the specific dataset can potentially improve performance, although the current training run is limited to 2 epochs on 500 training examples. Further training and hyperparameter tuning could yield better results.
*   The notebook demonstrates the workflow for both extractive and abstractive text summarization using popular Python libraries.