# The CNN/DailyMail Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")
print(f"Features: {dataset['train'].column_names}")

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/default to C:/Users/jenny/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to C:/Users/jenny/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Features: ['article', 'highlights', 'id']


In [2]:
sample = dataset["train"][1]
print(f"""
      Article (excerpt of 500 characters, total length: {len(sample["article"])}):
      """)
print(f'\nsummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


      Article (excerpt of 500 characters, total length: 4051):
      

summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


# Text Summarization Pipelines

In [4]:
sample_text = dataset["train"][1]["article"][:2000]
#We'll collect the generated summaries of each model in a dict
summaries = {}

In [5]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jenny\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['The U.S. are a country.', 'The U.N. is an organization.']

### Summarization Baseline

In [None]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

summaries["baseline"] = three_sentence_summary(sample_text)

# GPT-2

In [6]:
from transformers import pipeline, set_seed

set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl")
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :])
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# T5

In [8]:
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# BART

In [None]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# PEGASUS

In [None]:
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
summaries["pegasus"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"])).replace(". <n>", ".\n")

# Comparing Different Summaries

In [None]:
print("GROUND TRUTH")
print(dataset["train"][1]["highlights"])

for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])
    print("")

# Measuring the Quality of Generated Text

# BLEU

In [None]:
from datasets import load_metric

bleu_metric = load_metric("sacrebleu")

In [None]:
import pandas as pd
import numpy as np 

bleu_metric.add(
    prediction = "the the the the the the", reference=["the cat is on the mat"])
results = bleu_metric.compute(smooth_method = "floor", smooth_value=0)
results["precisions"] = [np.round(p,2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

In [None]:
bleu_metric.add(
    prediction = "the cat is on mat", reference=["the cat is on the mat"])
results = bleu_metric.compute(smooth_method = "floor", smooth_value=0)
results["precisions"] = [np.round(p,2) for p in results["precisions"]]
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])

# ROUGE

In [None]:
rouge_metric = load_metric("rouge")

In [None]:
reference = dataset["train"][1]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name in summaries:
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    #rouge_dict = dict((rn, score[rn].mid.fmeasure))

# Evaluating PEGASUS on the CNN/DailyMail Dataset

# Training a Summarization Model

# Evaluating PEGASUS on SAMSum

# Fine-Tuning PEGASUS

### Generating Dialogue Summaries