In this colab we:
1. Import CNN dailymail (news, summarization) pairs.
2. SUMMARIZE first example (about Harry Potter)
    - baseline = first 3 sentences of article
    - GPT-2 = append TL;DR and generate next tokens
    

In [None]:
# !pip install transformers datasets rouge_score sacrebleu evaluate py7zr pynvml xformers sentencepiece 

In [None]:
from huggingface_hub import notebook_login
# SECURITY: a Hugging Face access token used to be pasted in a comment on this
# line. Treat that token as leaked: revoke it at
# https://huggingface.co/settings/tokens and never store tokens in the notebook.
# Type the token into the login prompt instead.
notebook_login()

In [50]:
import os
import json
import pandas as pd
import torch

import transformers
import evaluate
from datasets import load_dataset

import nltk
nltk.download('punkt')

# Use the GPU when one is present, otherwise fall back to the CPU so that the
# notebook can still be executed on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# "3.0.0" is the dataset configuration name; it is passed as the `name`
# argument of `load_dataset` rather than as an extra `version=` keyword.
dataset = load_dataset("cnn_dailymail", "3.0.0")
# A bare expression in the middle of a cell displays nothing, so print it.
print(dataset['train'].column_names)

# Maps a model name to the summary it produced for the sample article.
summaries = {}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
# First training pair: show a short excerpt of the article, a separator line,
# and then the reference highlights.
sample = dataset['train'][0]
print(f"{sample['article'][29:211]} ...")
print('- - - - -', sample['highlights'], sep='\n')

Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel ...
- - - - -
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .


In [61]:
# Baseline = first 3 sentences
def three_sentence_summary(text):
    """Return the first three sentences of ``text`` joined by single spaces.

    Sentences are split with ``nltk.sent_tokenize``; texts with fewer than
    three sentences are returned whole.
    """
    sentences = nltk.sent_tokenize(text)
    leading = sentences[:3]
    return " ".join(leading)


# Only the first 1000 characters of the article are fed to the baseline.
summaries['baseline'] = three_sentence_summary(sample['article'][:1000])

def clear_memory(pipe=None):
    """Run the garbage collector and release cached CUDA memory.

    ``pipe`` is accepted only so that calls of the form
    ``clear_memory(some_pipe)`` keep working. A function cannot drop the
    caller's reference, so ``del`` the pipeline variable before calling this
    if its memory should actually be reclaimed.
    """
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# GPT-2 (not trained to SUMMARIZE but at least we can give a shot by appending "TL;DR")
if 'COLAB_GPU' in os.environ:
    gpt2_query = sample['article'] + "\nTL;DR:\n"
    gpt2_pipe = transformers.pipeline("text-generation", model="gpt2-xl")
    gpt2_out = gpt2_pipe(gpt2_query, max_length=1024, clean_up_tokenization_spaces=True)
    # Keep only the continuation generated after the prompt. Sentences are
    # joined with newlines: joining with "" glued them together, and a
    # one-sentence-per-line layout is what rougeLsum expects.
    summaries['gpt2'] = "\n".join(nltk.sent_tokenize(gpt2_out[0]["generated_text"][len(gpt2_query):]))
    del gpt2_pipe
    clear_memory()

    # T5 fine-tuned on Summarization (CNN/DailyMail included)
    t5_pipe = transformers.pipeline("summarization", model="t5-large")
    t5_out = t5_pipe(sample['article'])
    summaries['t5'] = "\n".join(nltk.sent_tokenize(t5_out[0]["summary_text"]))
    del t5_pipe
    clear_memory()

    # BART exclusively fine-tuned on CNN/DailyMail
    bart_pipe = transformers.pipeline("summarization", model="facebook/bart-large-cnn")
    bart_out = bart_pipe(sample['article'])
    summaries['bart'] = "\n".join(nltk.sent_tokenize(bart_out[0]["summary_text"]))
    del bart_pipe
    clear_memory()

    # PEGASUS exclusively fine-tuned on CNN/DailyMail
    pegasus_pipe = transformers.pipeline("summarization", model="google/pegasus-cnn_dailymail")
    pegasus_out = pegasus_pipe(sample['article'])
    # PEGASUS marks line breaks with "<n>"; convert them directly, the same way
    # the zero-shot SAMSum cell does, instead of sentence-splitting first.
    summaries['pegasus'] = pegasus_out[0]["summary_text"].replace(" .<n>", ".\n")
    # pegasus_pipe is deliberately kept: the zero-shot SAMSum cell reuses it.
    clear_memory()

else:  # import results
    with open('summaries.json', 'r') as file:
        summaries = json.load(file)


print("GROUND TRUTH")
print(sample['highlights'], '\n')

for model_name, summary in summaries.items():
    print(model_name.upper())
    print(summary, '\n')


GROUND TRUTH
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund . 

BASELINE
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. 

GPT2
Rudyard Kipling's youngest son is coming of age in "Harry Potter and the Order of the Phoenix."Ex

# Let's compute BLEU and also ROUGE

First on Harry Potter sample, then on the whole CNN/DailyMail

In [74]:
# Evaluate BLEU & ROUGE on "Harry Potter"

bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")

records = []

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

for model_name, summary in summaries.items():
    # `add` registers ONE prediction/reference pair, so the prediction must be
    # the summary string itself, not a one-element list wrapping it.
    rouge_metric.add(prediction=summary, reference=sample['highlights'])
    rouge_score = rouge_metric.compute()
    record_dict = {rn: rouge_score[rn] for rn in rouge_names}

    # sacreBLEU supports several references per prediction, hence the list
    # around the single reference.
    bleu_metric.add(prediction=summary, reference=[sample['highlights']])
    bleu_score = bleu_metric.compute()
    record_dict['sacre_bleu'] = bleu_score['score']

    records.append(record_dict)

print('"Harry Potter" scores:')
pd.DataFrame.from_records(records, index=list(summaries))

"Harry Potter" scores:


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,sacre_bleu
baseline,0.309677,0.235294,0.283871,0.283871,12.417712
gpt2,0.152381,0.019417,0.133333,0.133333,1.222497
t5,0.266667,0.194175,0.228571,0.228571,13.933045
bart,0.590909,0.348837,0.522727,0.522727,26.486346
pegasus,0.8,0.692308,0.8,0.8,47.368184


In [76]:
# Evaluate BASELINE Rouge on 1k CNN/DailyMail
# The metric objects do not depend on the data, so they are created first.
bleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")

# Fixed seed: the same 1000 test examples are drawn on every run.
test_sampled = dataset['test'].shuffle(seed=42).select(range(1000))

def evaluate_summaries_baseline(dataset, metric,
                                column_text="article",
                                column_summary="highlights"):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Mapping-like object; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must yield equally long sequences of
            strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    predictions = [three_sentence_summary(text) for text in dataset[column_text]]
    # Pass the flat lists: one prediction and one reference per example.
    # Wrapping them in an extra list made the metric see a single example
    # containing the whole corpus.
    metric.add_batch(predictions=predictions,
                     references=dataset[column_summary])
    return metric.compute()

# Outside Colab the scores are not recomputed; the cached values stored in
# metrics.json are used instead.
if 'COLAB_GPU' not in os.environ:
    with open('metrics.json', 'r') as file:
        metrics = json.load(file)
else:
    rouge_score = evaluate_summaries_baseline(test_sampled, rouge_metric)
    bleu_score = evaluate_summaries_baseline(test_sampled, bleu_metric)

    # The ROUGE result dict is reused as the record and extended with BLEU.
    metrics = rouge_score
    metrics['sacre_bleu'] = bleu_score['score']

# evaluated only on 10 examples
#	        rouge1	    rouge2	    rougeL	    rougeLsum
#  pegasus	0.421244	0.192581	0.305494	0.363489

pd.DataFrame.from_dict(metrics, orient="index", columns=["baseline"]).T

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.666425,0.295089,0.246468,0.246468


In [None]:
# Evaluate PEGASUS Rouge on CNN/DailyMail
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
import evaluate
# Same expression (seed 42, first 1000 rows after shuffling) as the one used in
# the baseline evaluation cell, so both evaluations see the same test subset.
test_sampled = dataset['test'].shuffle(seed=42).select(range(1000))
# Re-create the ROUGE metric object for the PEGASUS evaluation.
rouge_metric = evaluate.load("rouge")


def chunks(list_of_elements, batch_size):
    """Lazily split ``list_of_elements`` into consecutive slices.

    Every yielded slice holds ``batch_size`` items, except possibly the last
    one, which holds the remainder.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in starts)

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Indexable by column name; ``dataset[column_text]`` holds the
            source texts and ``dataset[column_summary]`` the references.
        metric: Object exposing ``add_batch`` and ``compute``.
        model: Object exposing ``generate``.
        tokenizer: Callable used to encode the texts; its ``decode`` method
            turns generated ids back into strings.
        batch_size: Number of examples per generation batch.
        device: Device the encoded inputs are moved to before generation.
        column_text: Name of the source-text column.
        column_summary: Name of the reference-summary column.

    Returns:
        The result of ``metric.compute()`` after all batches were added.
    """
    # Both columns are chunked up front so the progress bar knows its total.
    text_batches = list(chunks(dataset[column_text], batch_size))
    reference_batches = list(chunks(dataset[column_summary], batch_size))

    paired_batches = zip(text_batches, reference_batches)
    for texts, references in tqdm(paired_batches, total=len(text_batches)):
        # Every input is truncated/padded to exactly 1024 tokens.
        encoded = tokenizer(texts, max_length=1024, truncation=True,
                            padding="max_length", return_tensors="pt")
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        generated = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   length_penalty=0.8, num_beams=8,
                                   max_length=128)

        predictions = []
        for token_ids in generated:
            text = tokenizer.decode(token_ids, skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
            # "<n>" markers in the decoded text are replaced by a space.
            predictions.append(text.replace("<n>", " "))

        metric.add_batch(predictions=predictions, references=references)

    return metric.compute()



# Outside Colab the evaluation is skipped and the cached score file is read.
if 'COLAB_GPU' not in os.environ:
    with open('pegasus_cnn_score.json', 'r') as file:
        score = json.load(file)
else:
    model_ckpt = "google/pegasus-cnn_dailymail"
    pegasus_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
    pegasus_model = pegasus_model.to(device)
    score = evaluate_summaries_pegasus(test_sampled, rouge_metric,
                                       pegasus_model, pegasus_tokenizer,
                                       batch_size=8)

# published paper results: 
# R1 - 0.439, R2 - 0.212, RL - 0.407
pd.DataFrame(score, index=['pegasus'])

# Fine-tune Pegasus on SAMSum
Consider summarization on another dataset: dialogues (SAMSum).
- The summaries should be more abstractive and written from a third-person perspective

In [33]:
# Load SAMSum and show the first test dialogue together with its reference
# summary.
dataset_samsum = load_dataset('samsum')
print(dataset_samsum)

samsum_sample = dataset_samsum['test'][0]
print('Dialogue:', samsum_sample['dialogue'], sep='\n')
print('\nSummary:', samsum_sample['summary'], sep='\n')



  0%|          | 0/3 [00:00<?, ?it/s]

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [None]:
# Let's zero-shot PEGASUS on Hannah example
# NOTE: pegasus_pipe is only created in the Colab branch of the summarization
# cell, so this cell needs that branch to have run.

pegasus_out = pegasus_pipe(dataset_samsum['test'][0]['dialogue'])
print('Pegasus summary:',
      pegasus_out[0]['summary_text'].replace(' .<n>', '.\n'),
      sep='\n')

- the model tries to summarize by extracting the key sentences. 
- That is OK for CNN/DailyMail but not SAMSum

Let's compute zero-shot **Rouge** of PEGASUS on whole SAMSum

In [None]:
# The PEGASUS model/tokenizer objects in this notebook are named
# `pegasus_model` / `pegasus_tokenizer`; bare `model` / `tokenizer` were never
# defined.
score = evaluate_summaries_pegasus(dataset_samsum["test"], rouge_metric, pegasus_model,
                                   pegasus_tokenizer, column_text="dialogue",
                                   column_summary="summary", batch_size=8)

# The ROUGE scores are used as plain numbers everywhere else in this notebook
# (the `.mid.fmeasure` access was already dropped in the earlier cells).
rouge_dict = {rn: score[rn] for rn in rouge_names}

# HG book: R1 - 0.296, R2 - 0.088, RL - 0.230, RLsum - 0.230
pd.DataFrame(rouge_dict, index=["pegasus"])

In [None]:
def cast_dataset_to_tensors(batch, tokenizer):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    Args:
        batch: Mapping with a ``'dialogue'`` and a ``'summary'`` entry, each a
            list of strings.
        tokenizer: Callable tokenizer that accepts the source texts
            positionally and the target texts through ``text_target``.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        encoded dialogues and ``'labels'`` taken from the ``'input_ids'`` of
        the encoded summaries.
    """
    input_encodings = tokenizer(batch['dialogue'], max_length=1024, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the decoder targets.
    target_encodings = tokenizer(text_target=batch['summary'], max_length=128,
                                 truncation=True)

    return {'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']}

# `cast_dataset_to_tensors` needs a tokenizer as its second argument; `map`
# only passes the batch, so the tokenizer is supplied through `fn_kwargs`.
dataset_samsum_pt = dataset_samsum.map(cast_dataset_to_tensors, batched=True,
                                       fn_kwargs={'tokenizer': pegasus_tokenizer})

# Expose only the model inputs, as torch tensors.
columns = ['input_ids', 'labels', 'attention_mask']
dataset_samsum_pt.set_format(type='torch', columns=columns)

Data collator:
- stack all tensors from batch
- prepare decoder targets (teacher forcing)

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

seq2seq_data_collator = DataCollatorForSeq2Seq(pegasus_tokenizer, model=pegasus_model)

# Batch size 1 with 16 gradient-accumulation steps.
trainings_args = TrainingArguments(output_dir='pegasus-samsum', num_train_epochs=1,
                                   warmup_steps=500, per_device_train_batch_size=1,
                                   per_device_eval_batch_size=1, weight_decay=0.01,
                                   logging_steps=10, push_to_hub=True,
                                   evaluation_strategy='steps', eval_steps=500,
                                   save_steps=1e6, gradient_accumulation_steps=16)

# The Trainer was never instantiated, so `trainer.train()` raised NameError.
# NOTE(review): assumes SAMSum provides 'train' and 'validation' splits -
# confirm against the printed `dataset_samsum`.
trainer = Trainer(model=pegasus_model, args=trainings_args,
                  tokenizer=pegasus_tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt['train'],
                  eval_dataset=dataset_samsum_pt['validation'])

trainer.train()

# Evaluate on SAMSum: the CNN/DailyMail `dataset` has no 'dialogue'/'summary'
# columns, so `dataset_samsum` is the one to use here.
score = evaluate_summaries_pegasus(dataset_samsum['test'], rouge_metric, trainer.model,
                                   pegasus_tokenizer, batch_size=2,
                                   column_text='dialogue', column_summary='summary')

# ROUGE scores are used as plain numbers in the rest of this notebook.
rouge_ft_dict = {rn: score[rn] for rn in rouge_names}
pd.DataFrame(rouge_ft_dict, index=['pegasus'])

In [None]:
# Upload the trainer's model to the Hugging Face Hub; the string is presumably
# used as the commit message - confirm against the Trainer documentation.
trainer.push_to_hub("Fine-tuning PEGASUS on SAMSum complete!")

## Let's generate Dialogue summaries

In [None]:
# Generation settings, same values as used in evaluate_summaries_pegasus.
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]

# Only the `transformers` module is imported in this notebook, so the factory
# has to be reached as `transformers.pipeline`; bare `pipeline` is undefined.
pipe = transformers.pipeline("summarization", model="nikitakapitan/pegasus-samsum")

# Print exactly the text that is summarized and its reference, instead of
# relying on a variable from an earlier cell.
print('Dialogue:')
print(sample_text)
print('\nReference Summary:')
print(reference)
print('\nPegasus SAMSum summary:')
print(pipe(sample_text, **gen_kwargs)[0]['summary_text'])