In [None]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install torch == 1.9.0
!pip install py7zr
!pip install torchtext == 0.10.0
!pip install sacrebleu
!pip install rogue_score

In [None]:
from datasets import load_dataset
import torch
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
import torchtext

In [None]:
# Fetch the Multi-News corpus (train / validation / test splits).
dataset_name = "multi_news"
multi_news_dataset = load_dataset(path=dataset_name)
# Bare last expression so the notebook renders the DatasetDict summary.
multi_news_dataset

Downloading builder script:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.82k [00:00<?, ?B/s]



Downloading and preparing dataset multi_news/default (download: 721.73 MiB, generated: 664.42 MiB, post-processed: Unknown size, total: 1.35 GiB) to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary'],
        num_rows: 44972
    })
    validation: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
    test: Dataset({
        features: ['document', 'summary'],
        num_rows: 5622
    })
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One Hub checkpoint name is shared by the tokenizer and the seq2seq model.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
def convert_examples_to_features(example_batch):
  """Tokenize a batch of examples into encoder inputs and decoder labels.

  Uses the notebook-level `tokenizer`.

  Args:
    example_batch: mapping with "document" and "summary" entries, each a list
      of strings (what `Dataset.map(..., batched=True)` passes in).

  Returns:
    dict with "input_ids" and "attention_mask" built from the documents, and
    "labels" built from the summaries. Every sequence is truncated/padded to
    1024 tokens; padded label positions are set to -100.
  """
  # `padding="max_length"` is the supported spelling of the deprecated
  # `pad_to_max_length=True`.
  input_encodings = tokenizer(example_batch["document"], padding="max_length",
                              max_length=1024, truncation=True)
  target_encodings = tokenizer(example_batch["summary"], padding="max_length",
                               max_length=1024, truncation=True)

  # Replace padded label positions with -100 so the loss ignores them;
  # otherwise the model is also trained to predict the padding that fills
  # most of each 1024-token label sequence.
  labels = [
      [token if keep else -100 for token, keep in zip(ids, mask)]
      for ids, mask in zip(target_encodings["input_ids"],
                           target_encodings["attention_mask"])
  ]

  return {
      "input_ids": input_encodings["input_ids"],
      "attention_mask": input_encodings["attention_mask"],
      "labels": labels
  }

# Tokenize every split, batch by batch.
multi_news_dataset_pt = multi_news_dataset.map(
    convert_examples_to_features,
    batched=True,
)
# Return only the model-facing columns, formatted as torch tensors.
columns = ["input_ids", "attention_mask", "labels"]
multi_news_dataset_pt.set_format(type="torch", columns=columns)

  0%|          | 0/45 [00:00<?, ?ba/s]



  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into batches for the seq2seq model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)


In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for a single fine-tuning epoch; results go to `output_dir`
# and, because `push_to_hub=True`, to the matching Hub repository.
# NOTE(review): with the library-default per-device batch size and 16
# accumulation steps, one epoch over ~45k rows may finish in fewer than 500
# optimizer steps, in which case warmup never completes and the step-based
# evaluation never fires - confirm these values are intended.
training_args = TrainingArguments(
    output_dir="finetuned_bart_large_cnn_on_multinews",
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    push_to_hub=True,
    # Step counts are integers (was the float 1e6). The value is far beyond
    # the length of the run, so no intermediate checkpoints are written.
    save_steps=1_000_000,
    gradient_accumulation_steps=16,
)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the token is entered at runtime rather than
# hard-coded, and is needed by the later cells that push to the Hub.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Bundle the model, training arguments, collator and tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=multi_news_dataset_pt["train"],
    eval_dataset=multi_news_dataset_pt["validation"],
)

Cloning https://huggingface.co/abdulmatinomotoso/finetuned_bart_large_cnn_on_multinews into local empty directory.


In [None]:
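# NOTE: training is disabled in this cell; uncomment the next line to
# fine-tune the model before it is pushed to the Hub or evaluated.
# trainer.train()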

In [None]:
# Upload the trainer's current model to the Hub with this commit message.
# NOTE(review): the `trainer.train()` cell above is commented out, so on a
# fresh run this pushes the model without fine-tuning - confirm that is intended.
trainer.push_to_hub(commit_message="fine-tuning-complete")


In [None]:
from datasets import load_metric

# Metric objects used to score generated summaries.
bleu_metric = load_metric("sacrebleu")
rouge_metric = load_metric("rouge")

# ROUGE variants reported in the results table.
rouge_names = [
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
]

In [None]:
def chunks(list_of_elements, batch_size):
  """Lazily yield consecutive slices of `list_of_elements`.

  Each slice holds at most `batch_size` items; the final slice may be shorter.
  """
  total = len(list_of_elements)
  yield from (
      list_of_elements[start:start + batch_size]
      for start in range(0, total, batch_size)
  )


In [None]:
def evaluate_summaries_distilbart(dataset, metric, model, tokenizer,
                                  batch_size=16, device=device,
                                  column_text ="document" ,
                                  column_summary= "summary"):
  """Generate summaries for `dataset` in batches and score them with `metric`.

  Args:
    dataset: object indexable by column name, yielding the source texts and
      the reference summaries.
    metric: metric object exposing `add_batch(predictions=, references=)`
      and `compute()`.
    model: generation model exposing `generate`.
    tokenizer: tokenizer used both to encode articles and decode outputs.
    batch_size: number of articles summarized per `generate` call.
    device: torch device the input tensors are moved to.
    column_text: name of the column holding the source documents.
    column_summary: name of the column holding the reference summaries.

  Returns:
    Whatever `metric.compute()` returns after all batches have been added.
  """
  article_batches = list(chunks(dataset[column_text], batch_size))
  target_batches = list(chunks(dataset[column_summary], batch_size))

  for article_batch, target_batch in tqdm(
      zip(article_batches, target_batches), total=len(article_batches)):

    inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                       padding="max_length", return_tensors="pt")
    summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                               attention_mask=inputs["attention_mask"].to(device),
                               length_penalty=0.8, num_beams=8, max_length=128)

    decoded_summaries = [
        tokenizer.decode(s, skip_special_tokens=True,
                         clean_up_tokenization_spaces=True)
        for s in summaries
    ]

    # Any literal "<n>" marker left in the decoded text becomes a space.
    decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

    # Score this batch against its own references. The keyword is
    # `references`, and passing the whole `target_batches` list would
    # mismatch the prediction count.
    metric.add_batch(predictions=decoded_summaries, references=target_batch)

  score = metric.compute()
  return score

In [None]:
# Score the trainer's current model on the test split with ROUGE.
score = evaluate_summaries_distilbart(
    multi_news_dataset["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="document", column_summary="summary"
)

# Report the mid F-measure of each ROUGE variant; the field on the score
# tuple is `fmeasure` (singular).
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["bart"])

In [None]:
import transformers
from transformers import pipeline
# Only show error-level messages from transformers (fixes the misspelled
# `set_verbositiy_error`, which raised AttributeError).
transformers.logging.set_verbosity_error()

In [None]:
# Same decoding settings as the batch evaluation; dict keys must be strings.
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

# A DatasetDict is keyed by split name, so pick the split before the row.
sample_text = multi_news_dataset["test"][0]["document"]
reference = multi_news_dataset["test"][0]["summary"]
pipe = pipeline("summarization", model="abdulmatinomotoso/finetuned_bart_large_cnn_on_multinews")

print("Article")
print(sample_text)
print("\nReferenced Summary: ")
print(reference)
print("\nModel Summary")
# `truncation=True` clips the article to the model's maximum input length,
# matching the truncation used in the evaluation function.
print(pipe(sample_text, truncation=True, **gen_kwargs)[0]["summary_text"])