In this colab we:
1. Import CNN dailymail (news, summarization) pairs.
2. Summarize the first example (about Harry Potter)
    - baseline = first 3 sentences of the article
    - GPT-2 = append "TL;DR" and generate the next tokens
    

In [None]:
# !pip install transformers datasets rouge_score py7zr

In [1]:
import os

import nltk
import pandas as pd
import transformers
from datasets import load_dataset
from datasets import load_metric
from huggingface_hub import notebook_login

nltk.download('punkt')
# Authenticate with the Hugging Face Hub from inside the notebook.
notebook_login()

# Load the CNN/DailyMail summarization dataset, requesting version 3.0.0.
dataset = load_dataset("cnn_dailymail", version="3.0.0")
# Last expression of the cell: shows the column names of the train split.
dataset['train'].column_names

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Found cached dataset cnn_dailymail (C:/Users/nikit/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

['article', 'highlights', 'id']

In [2]:
# Take the first training pair and preview it: characters 29..210 of the
# article, a separator line, then the reference highlights.
sample = dataset['train'][0]
print(sample['article'][29:211], '...')
print('- - - - -', sample['highlights'], sep='\n')

Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel ...
- - - - -
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [None]:
# Summaries produced for the sample article, keyed by model name.
summaries = {}


def three_sentence_summary(text):
    """Return the first three sentences of ``text`` joined by single spaces.

    Sentences are split with ``nltk.sent_tokenize``; if ``text`` holds fewer
    than three sentences, all of them are used.
    """
    leading_sentences = nltk.sent_tokenize(text)[:3]
    return " ".join(leading_sentences)


# Baseline = first 3 sentences, taken from the first 1000 characters of the article.
summaries['baseline'] = three_sentence_summary(sample['article'][:1000])

# Each model below is only run when the COLAB_GPU environment variable is set
# (presumably a Colab GPU runtime); otherwise only the baseline is kept.
# Every pipeline is called through its own variable, and the sentences
# returned by nltk.sent_tokenize are joined with "\n" so they are not glued
# together without a separator.

# GPT-2 (not trained to summarize, but we can give it a shot by appending "TL;DR")
if 'COLAB_GPU' in os.environ:
    gpt2_query = sample['article'] + "\nTL;DR:\n"
    gpt2_pipe = transformers.pipeline("text-generation", model="gpt2-xl")
    gpt2_out = gpt2_pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
    # The generated text starts with the prompt itself; keep only the continuation.
    summaries['gpt2'] = "\n".join(nltk.sent_tokenize(gpt2_out[0]["generated_text"][len(gpt2_query):]))

# T5. "summarization" pipeline is ~ T5Model.from_pretrained('t5-large') + prompt: "summarize: <ARTICLE>"
if 'COLAB_GPU' in os.environ:
    t5_pipe = transformers.pipeline("summarization", model="t5-large")
    t5_out = t5_pipe(sample['article'])
    summaries['t5'] = "\n".join(nltk.sent_tokenize(t5_out[0]["summary_text"]))

# BART fine-tuned on the CNN/DailyMail
if 'COLAB_GPU' in os.environ:
    bart_pipe = transformers.pipeline("summarization", model="facebook/bart-large-cnn")
    bart_out = bart_pipe(sample['article'])
    summaries['bart'] = "\n".join(nltk.sent_tokenize(bart_out[0]["summary_text"]))

# PEGASUS
if 'COLAB_GPU' in os.environ:
    pegasus_pipe = transformers.pipeline("summarization", model="google/pegasus-cnn_dailymail")
    pegasus_out = pegasus_pipe(sample['article'])
    # PEGASUS marks line breaks with "<n>"; the pieces are re-joined without a
    # separator so that the " .<n>" pattern below still matches, and the
    # replacement itself inserts the newlines.
    summaries['pegasus'] = "".join(nltk.sent_tokenize(pegasus_out[0]["summary_text"])).replace(" .<n>", ".\n")

# Print the reference highlights first, then every collected summary under
# its upper-cased model name; each text is followed by a blank line.
print("GROUND TRUTH", f"{sample['highlights']} \n", sep="\n")

for model_name in summaries:
    print(model_name.upper(), f"{summaries[model_name]} \n", sep="\n")


Results:
- Instead of summarizing the text, GPT-2 summarized the characters, an example of GPT-2 "hallucination".


# Let's compute BLEU and also ROUGE

First on Harry Potter sample, then on the whole CNN/DailyMail

In [None]:
# Score every collected summary of the sample article against its reference
# highlights with ROUGE. The BLEU variant is kept commented out.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
bleus = []
rouge_records = []

# bleu = load_metric("sacrebleu")
rouge_metric = load_metric("rouge")

for model_name in summaries:
    # bleu.add(prediction=summaries[model_name], reference=sample['highlights'])
    # bleus.append(bleu.compute())

    # One prediction/reference pair is added and scored per model.
    rouge_metric.add(prediction=summaries[model_name], reference=sample['highlights'])
    rouge_score = rouge_metric.compute()

    # Keep only the mid F-measure of each ROUGE variant.
    rouge_dict = {rn: rouge_score[rn].mid.fmeasure for rn in rouge_names}
    rouge_records.append(rouge_dict)

# pd.DataFrame.from_records(records, index=summaries.keys())
# pd.DataFrame(bleus)


In [21]:
def evaluate_summaries_baseline(dataset, metric,
                                column_text="article",
                                column_summary="highlights"):
    """Score the three-sentence baseline over ``dataset`` with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]``
            and ``dataset[column_summary]`` are read.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    predictions = list(map(three_sentence_summary, dataset[column_text]))
    metric.add_batch(predictions=predictions,
                     references=dataset[column_summary])
    return metric.compute()

# Evaluate on a fixed 1000-example subset of the test split (shuffle seed 42).
test_sampled = dataset['test'].shuffle(seed=42).select(range(1000))

score = evaluate_summaries_baseline(test_sampled, rouge_metric)
# Keep only the mid F-measure of each ROUGE variant.
rouge_dict = {
    rn: score[rn].mid.fmeasure
    for rn in ["rouge1", "rouge2", "rougeL", "rougeLsum"]
}
# One row named "baseline", one column per ROUGE variant.
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).transpose()

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.390732,0.176082,0.248922,0.324405


In [None]:
def chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds ``batch_size`` items, except the last one, which holds
    whatever remains.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in starts)

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]``
            and ``dataset[column_summary]`` are read.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate(...)``.
        tokenizer: Tokenizer used both to encode the texts and to decode the
            generated ids.
        batch_size: Number of examples per generation batch.
        device: Device the encoded inputs are moved to; defaults to the
            module-level ``device`` at definition time.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    text_batches = list(chunks(dataset[column_text], batch_size))
    reference_batches = list(chunks(dataset[column_summary], batch_size))
    batch_pairs = zip(text_batches, reference_batches)

    for texts, references in tqdm(batch_pairs, total=len(text_batches)):
        # Truncate to at most 1024 tokens and pad every example to that length.
        encoded = tokenizer(texts, max_length=1024, truncation=True,
                            padding="max_length", return_tensors="pt")
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        generated = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   length_penalty=0.8, num_beams=8,
                                   max_length=128)

        predictions = []
        for token_ids in generated:
            text = tokenizer.decode(token_ids, skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
            # Replace the "<n>" markers in the decoded text with spaces.
            predictions.append(text.replace("<n>", " "))

        metric.add_batch(predictions=predictions, references=references)

    return metric.compute()


from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the "google/pegasus-cnn_dailymail" checkpoint (tokenizer + seq2seq
# model), move the model to `device`, and score it on the sampled test subset.
model_ckpt = "google/pegasus-cnn_dailymail"
pegasus_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# The loader is spelled `from_pretrained`.
pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, pegasus_model, pegasus_tokenizer, batch_size=8)
# Keep only the mid F-measure of each ROUGE variant.
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# published paper results: 
# R1 - 0.439, R2 - 0.212, RL - 0.407
pd.DataFrame(rouge_dict, index=['pegasus'])

# Fine-tune Pegasus
Consider summarization on another dataset: dialogues (SAMSum).
- The summaries should be more abstractive and written from a third-person point of view.

In [33]:
# Load the SAMSum dialogue dataset and show its structure.
dataset_samsum = load_dataset('samsum')
print(dataset_samsum)

# Preview the first test example: the dialogue, then its reference summary.
samsum_sample = dataset_samsum['test'][0]
print('Dialogue:', samsum_sample['dialogue'], sep='\n')
print('\nSummary:', samsum_sample['summary'], sep='\n')



  0%|          | 0/3 [00:00<?, ?it/s]

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [None]:
# Zero-shot PEGASUS on the Hannah example (first SAMSum test dialogue).

pegasus_out = pegasus_pipe(dataset_samsum['test'][0]['dialogue'])
# Turn each " .<n>" marker into a period followed by a real line break.
print('Pegasus summary:',
      pegasus_out[0]['summary_text'].replace(' .<n>', '.\n'),
      sep='\n')

- the model tries to summarize by extracting the key sentences. 
- That is OK for CNN/DailyMail but not SAMSum

Let's compute zero-shot **Rouge** of PEGASUS on whole SAMSum

In [None]:
# Zero-shot ROUGE of the CNN/DailyMail PEGASUS checkpoint on the SAMSum test
# split, using the model and tokenizer loaded above under their
# `pegasus_model` / `pegasus_tokenizer` names.
score = evaluate_summaries_pegasus(dataset_samsum["test"], rouge_metric,
                                   pegasus_model, pegasus_tokenizer,
                                   column_text="dialogue",
                                   column_summary="summary", batch_size=8)

# Keep only the mid F-measure of each ROUGE variant.
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

# HG book: R1 - 0.296, R2 - 0.088, RL - 0.230, RLsum - 0.230

In [None]:
def cast_dataset_to_tensors(batch, tokenizer):
    """Tokenize a batch of dialogue/summary pairs into model features.

    Args:
        batch: Mapping with a 'dialogue' entry (source texts) and a 'summary'
            entry (target texts).
        tokenizer: Callable tokenizer that also provides the
            ``as_target_tokenizer()`` context manager.

    Returns:
        A dict with 'input_ids' and 'attention_mask' from the encoded
        dialogues, and 'labels' holding the encoded summaries' input ids.
    """
    # Dialogues are truncated to at most 1024 tokens.
    input_encodings = tokenizer(batch['dialogue'], max_length=1024, truncation=True)
    # Summaries are encoded in target mode and truncated to 128 tokens.
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(batch['summary'], max_length=128, truncation=True)

    # The key must be 'input_ids' so it matches the columns selected for
    # training.
    return {'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']}

# Tokenize every split. map() only passes the batch to the function, so the
# tokenizer argument is supplied through fn_kwargs.
dataset_samsum_pt = dataset_samsum.map(cast_dataset_to_tensors, batched=True,
                                       fn_kwargs={'tokenizer': pegasus_tokenizer})

# Expose only the model inputs, returned as torch tensors.
columns = ['input_ids', 'labels', 'attention_mask']
dataset_samsum_pt.set_format(type='torch', columns=columns)

Data collator:
- stack all tensors from batch
- prepare decoder targets (teacher forcing)

In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Collator built from the PEGASUS tokenizer and model.
seq2seq_data_collator = DataCollatorForSeq2Seq(pegasus_tokenizer, model=pegasus_model)

# One epoch with per-device batch size 1 and gradients accumulated over
# 16 steps; evaluation every 500 steps, logging every 10 steps.
trainings_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
    push_to_hub=True,
)