In [None]:
!pip install transformers[sentencepiece] datasets rouge_score py7zr

In [None]:
# Run the nvidia-smi shell command to report the GPU attached to this runtime, if any.
!nvidia-smi

In [None]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download the NLTK "punkt" resource (requires network access).
# NOTE(review): presumably needed by the sent_tokenize import above — confirm it is still used.
nltk.download("punkt")

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
device

In [None]:
# Load the first 1% of each CNN/DailyMail (config "3.0.0") split, in the order
# train, validation, test.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("cnn_dailymail", "3.0.0", split=f"{split_name}[:1%]")
    for split_name in ("train", "validation", "test")
)
train_dataset

In [None]:
# List the column names of the training split.
print("Features:", train_dataset.column_names)

Features: ['article', 'highlights', 'id']


In [None]:
# Print the first test example: the article, then its reference highlights.
for heading, column in (("\nDocument:", "article"), ("\nSummary:", "highlights")):
    print(heading)
    print(test_dataset[0][column])

In [None]:
# Display the raw text of test article 1, the example that the single-article
# tokenisation and generation cells below operate on.
test_dataset[1]['article']

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint name shared by the model and the tokenizer.
model_ckpt = "google/pegasus-xsum"
model = PegasusForConditionalGeneration.from_pretrained(model_ckpt)
tokenizer = PegasusTokenizer.from_pretrained(model_ckpt)

# Tokenise test article 1 into PyTorch tensors, truncating over-long text.
input_sequence = tokenizer(
    test_dataset[1]["article"],
    return_tensors="pt",
    padding="longest",
    truncation=True,
)



In [None]:
# Character count of test article 1; the generation cells derive their
# min_length / max_length from it.
y = len(test_dataset[1]["article"])
y

In [None]:
# Beam-search a summary for the tokenised article. The length bounds are taken as
# 1% (minimum) and 5% (maximum) of the article's character count `y`.
generated_summary_ids = model.generate(
    input_sequence["input_ids"],
    attention_mask=input_sequence["attention_mask"],
    num_beams=4,
    early_stopping=True,
    length_penalty=0.4,
    min_length=int(y * 0.01),
    max_length=int(y * 0.05),
)




In [None]:
# Collect the reference highlights of the first 50 test examples.
reference_summaries = [test_dataset[idx]['highlights'] for idx in range(50)]

# Show the reference for example 1.
print(reference_summaries[1])

In [None]:
# Summarise the first 50 test articles, one article per generate() call.
generated_summaries = []
for i in range(50):
    # Tokenise a single article and place the tensors on the same device as the model,
    # so generation works whether the model is on the CPU or has been moved to a GPU.
    input_sequence = tokenizer(
        test_dataset[i]['article'], truncation=True, padding='longest', return_tensors="pt"
    ).to(model.device)

    # Beam search with max_length=64.
    generated_summary_ids = model.generate(
        input_sequence['input_ids'],
        attention_mask=input_sequence['attention_mask'],
        max_length=64,
        num_beams=4,
        length_penalty=0.6,
        early_stopping=True
    )

    # One input per call, so the summary is the first (only) returned sequence.
    generated_summary = tokenizer.decode(generated_summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)
# Show the generated summary for example 1.
print(generated_summaries[1])

In [None]:
#pipe = pipeline('summarization', model = model_ckpt )

In [None]:
#pipe_out = pipe(test_dataset_multi_news[1]['document'] )
#print(pipe_out)

In [None]:
#print(pipe_out[0]['summary_text'].replace(" .", ".\n"))

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily split a sequence into consecutive slices of at most ``batch_size`` items.

    For a positive ``batch_size`` every yielded slice has ``batch_size`` elements,
    except possibly the last one; an empty input yields nothing.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[lo : lo + batch_size] for lo in offsets)

In [None]:
!pip install rouge
from rouge import Rouge
rouge = Rouge(metrics=['rouge-1','rouge-2','rouge-l'])

In [None]:
# Score the first 50 generated summaries against their references; avg=True asks
# for scores averaged over the pairs.
scores = rouge.get_scores(generated_summaries[:50], reference_summaries[:50], avg=True)

# The metric names configured on the scorer ('rouge-1', 'rouge-2', 'rouge-l') already
# carry the "rouge-" prefix, so print them upper-cased rather than prefixing another
# "ROUGE-" (which rendered as "ROUGE-rouge-1").
for metric, values in scores.items():
    print(f"\n{metric.upper()} scores: \n")
    print(values)
    print(f"Average {metric.upper()} score: {values['f']}\n")

In [None]:
# def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
#                                batch_size=16, device=device, 
#                                column_text="article", 
#                                column_summary="highlights"):
#     article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
#     target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

#     for article_batch, target_batch in tqdm(
#         zip(article_batches, target_batches), total=len(article_batches)):
        
#         inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
#                         padding="max_length", return_tensors="pt")
        
#         summaries = model.generate(input_ids=inputs["input_ids"].to(device),
#                          attention_mask=inputs["attention_mask"].to(device), 
#                          length_penalty=0.8, num_beams=8, max_length=128)
#         ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
#         # Finally, we decode the generated texts, 
#         # replace the  token, and add the decoded texts with the references to the metric.
#         decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
#                                 clean_up_tokenization_spaces=True) 
#                for s in summaries]      
        
#         decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
#         metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
#     #  Finally compute and return the ROUGE scores.
#     score = metric.compute()
#     return score

In [None]:
def convert_examples_to_features(example_batch, max_input_length=1024, max_target_length=128):
    """Tokenise a batch of articles and highlights into model inputs and labels.

    Args:
        example_batch: Mapping with an 'article' entry and a 'highlights' entry,
            as supplied by ``Dataset.map(..., batched=True)``.
        max_input_length: Upper bound on the encoded article length. It is further
            capped by ``tokenizer.model_max_length``.
        max_target_length: Upper bound on the encoded highlight length, capped the
            same way.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the article encoding
        and 'labels' holding the 'input_ids' of the highlight encoding.
    """
    # Never request more tokens than the tokenizer reports the model can accept.
    # NOTE(review): the limit used to be a fixed 1024; if the loaded checkpoint supports
    # fewer positions, over-long inputs are presumably what caused the RuntimeError
    # recorded in the training cells — confirm against the model config.
    model_limit = tokenizer.model_max_length
    input_encodings = tokenizer(
        example_batch['article'],
        max_length=min(max_input_length, model_limit),
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=example_batch['highlights'],
        max_length=min(max_target_length, model_limit),
        truncation=True,
    )

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }
    
# Tokenise each split in batches, in the order train, validation, test.
train_dataset_pt, val_dataset_pt, test_dataset_pt = (
    split.map(convert_examples_to_features, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]



Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

In [None]:
# Inspect the first example of the tokenised training split.
train_dataset_pt[0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and the model; it is passed to the Trainer
# as `data_collator` further down.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import os
# Set the CUDA_LAUNCH_BLOCKING environment variable to "1" for this process.
# NOTE(review): presumably a debugging aid for the RuntimeError recorded in the
# training cells — confirm it is still wanted before sharing the notebook.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: one epoch, one example per device step, gradients
# accumulated over 16 steps; outputs go to the 'pegasus-cnn' directory.
trainer_args = TrainingArguments(
    output_dir='pegasus-cnn',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
)

In [None]:
#dataset_multi_news_pt["train"] = dataset_multi_news_pt["train"][:500]
#dataset_multi_news_pt["validation"] = dataset_multi_news_pt["validation"][:200]

In [None]:
# Assemble the Trainer from the model, the training arguments, the seq2seq collator
# and the tokenised train / validation splits.
trainer = Trainer(
    model=model,
    args=trainer_args,
    train_dataset=train_dataset_pt,
    eval_dataset=val_dataset_pt,
    data_collator=seq2seq_data_collator,
    tokenizer=tokenizer,
)

RuntimeError: ignored

In [None]:
# Run training with the Trainer built in the previous cell.
trainer.train()



RuntimeError: ignored

In [None]:
# scores = rouge.get_scores(generated_summaries[:1000], reference_summaries[:1000], avg=True)

# # Print the ROUGE scores and their averages for the dataset
# for metric, values in scores.items():
#     print(f"\nROUGE-{metric} scores: \n")
#     print(values)
#     print(f"Average ROUGE-{metric} score: {values['f']}\n")

In [None]:
# Save the model to the "pegasus-cnn" directory (the same path used as the
# Trainer's output_dir).
model.save_pretrained("pegasus-cnn")

In [None]:
# Save the tokenizer to "cnn_tokenizer" — a different directory from the model,
# so reloading later needs both paths.
tokenizer.save_pretrained("cnn_tokenizer")

In [None]:
# Re-generate the single-example summary with the model in its current state.
# The inputs are rebuilt here because `input_sequence` was last assigned inside the
# 50-article loop (leaving it on the loop's final article), whereas `y` is the character
# count of test article 1. They are also moved to the model's device so the call works
# whether or not the Trainer has moved the model to a GPU.
input_sequence = tokenizer(
    test_dataset[1]['article'], truncation=True, padding='longest', return_tensors="pt"
).to(model.device)

# Beam search with length bounds of 1% (minimum) and 5% (maximum) of `y`.
generated_summary_ids = model.generate(
    input_sequence['input_ids'],
    attention_mask=input_sequence['attention_mask'],
    max_length=int(y*0.05),
    min_length=int(y*0.01),
    num_beams=4,
    length_penalty=0.4,
    early_stopping=True
)


In [None]:
# Summarise the first 50 test articles again, one article per generate() call.
generated_summaries = []
for i in range(50):
    # Tokenise a single article and place the tensors on the same device as the model;
    # without this the call fails when the model is on a GPU and the inputs are on the CPU.
    input_sequence = tokenizer(
        test_dataset[i]['article'], truncation=True, padding='longest', return_tensors="pt"
    ).to(model.device)

    # Beam search with max_length=64.
    generated_summary_ids = model.generate(
        input_sequence['input_ids'],
        attention_mask=input_sequence['attention_mask'],
        max_length=64,
        num_beams=4,
        length_penalty=0.6,
        early_stopping=True
    )

    # One input per call, so the summary is the first (only) returned sequence.
    generated_summary = tokenizer.decode(generated_summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)
# Show the generated summary for example 1.
print(generated_summaries[1])

In [None]:
# Score the first 50 regenerated summaries against their references; avg=True asks
# for scores averaged over the pairs.
scores = rouge.get_scores(generated_summaries[:50], reference_summaries[:50], avg=True)

# The metric names configured on the scorer ('rouge-1', 'rouge-2', 'rouge-l') already
# carry the "rouge-" prefix, so print them upper-cased rather than prefixing another
# "ROUGE-" (which rendered as "ROUGE-rouge-1").
for metric, values in scores.items():
    print(f"\n{metric.upper()} scores: \n")
    print(values)
    print(f"Average {metric.upper()} score: {values['f']}\n")