Install and import all the required libraries.

In [42]:
!pip install torch
!pip install transformers
!pip install sentencepiece
!pip install accelerate -U
!pip install transformers[torch]
!pip install datasets
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset



In [2]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

The Hugging Face definition of a Pegasus dataset for fine-tuning.

In [48]:

class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence; every field is indexed by example position.
        labels: Mapping with an ``'input_ids'`` entry holding one target
            sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of target sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, targets under ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        target_ids = self.labels['input_ids'][idx]
        example['labels'] = torch.tensor(target_ids)
        return example


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
  """
  Tokenize the text/label pairs of each split and wrap them as PegasusDataset objects.

  The training split is always built. The validation and test splits are built
  only when both their texts and their labels are given; otherwise None is
  returned in their place.

  Returns:
    A tuple (train_dataset, val_dataset, test_dataset, tokenizer), where the
    tokenizer is the PegasusTokenizer loaded from `model_name`.
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  def to_dataset(texts, labels):
    # Sources and targets are tokenized separately with the same tokenizer.
    source_tokens = tokenizer(texts, truncation=True, padding=True)
    target_tokens = tokenizer(labels, truncation=True, padding=True)
    return PegasusDataset(source_tokens, target_tokens)

  # A split is optional: it is only prepared when both halves are present.
  has_val = val_texts is not None and val_labels is not None
  has_test = test_texts is not None and test_labels is not None

  train_dataset = to_dataset(train_texts, train_labels)

  val_dataset = None
  if has_val:
    val_dataset = to_dataset(val_texts, val_labels)

  test_dataset = None
  if has_test:
    test_dataset = to_dataset(test_texts, test_labels)

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning.

  Args:
    model_name: Checkpoint name or path handed to
      PegasusForConditionalGeneration.from_pretrained.
    tokenizer: Tokenizer handed to the Trainer.
    train_dataset: Dataset used for training.
    val_dataset: Optional evaluation dataset. It is forwarded to the Trainer
      as `eval_dataset`. No evaluation schedule is configured in the training
      arguments, so this function does not itself enable evaluation during
      training.
    freeze_encoder: When True, every encoder parameter has `requires_grad`
      set to False.
    output_dir: Directory passed to TrainingArguments as `output_dir`.

  Returns:
    A configured Trainer; call `.train()` on it to start fine-tuning.
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    # Disable gradients on the encoder parameters.
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,           # Fine-tune for 5 epochs.
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
  )

  trainer = Trainer(
    model=model,                         # model loaded from `model_name`
    args=training_args,
    train_dataset=train_dataset,
    # Previously `val_dataset` was accepted but silently ignored. None is the
    # Trainer's default for `eval_dataset`, so callers that omit it are unaffected.
    eval_dataset=val_dataset,
    tokenizer=tokenizer
  )

  return trainer

Load the MRPC dataset

In [49]:
# Load only the training split of the MRPC configuration of GLUE.
mrpc_dataset = load_dataset(path="glue", name="mrpc", split='train')

# Return the listed columns in the 'torch' output format when rows are accessed.
mrpc_dataset.set_format(
    type='torch',
    columns=['sentence1', 'sentence2', 'label', 'idx'],
)

Observe the structure of the dataset

In [53]:
# Bare expression as the last line of the cell, so the notebook displays the
# dataset summary (feature names and row count).
mrpc_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

Get the training inputs (`sentence1`) and their target labels (`sentence2`) from the first 1,000 rows of the dataset

In [50]:
# The first 1000 rows are used: `sentence1` is the model input and `sentence2`
# is the target text it is trained to generate.
# NOTE(review): rows are taken regardless of the `label` column - confirm that
# including every pair (not only those labelled as paraphrases) is intended.
train_texts = mrpc_dataset['sentence1'][:1000]
train_labels = mrpc_dataset['sentence2'][:1000]


The base pretrained model is going to be the pegasus-large model

In [14]:
# Identifier of the pretrained checkpoint; it is passed to both prepare_data
# (to load the tokenizer) and prepare_fine_tuning (to load the model).
model_name = 'google/pegasus-large'

In [51]:
# Only the training split is supplied, so the validation and test datasets
# returned by prepare_data are None and are discarded here.
train_dataset, _, _, tokenizer = prepare_data(
    model_name=model_name,
    train_texts=train_texts,
    train_labels=train_labels,
)

In [52]:
# Build the Trainer with the encoder frozen (freeze_encoder=True disables
# gradients for the encoder parameters).
trainer = prepare_fine_tuning(
    model_name=model_name,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    freeze_encoder=True,
)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tune the model for 5 epochs

In [17]:
# Run the training loop with the arguments set in prepare_fine_tuning
# (5 epochs, per-device batch size 1, loss logged every 10 steps).
trainer.train()

Step,Training Loss
10,6.9821
20,7.4255
30,8.2796
40,8.8424
50,8.1736
60,6.1606
70,6.5197
80,7.9541
90,6.5315
100,6.861


TrainOutput(global_step=5000, training_loss=4.3143234455108646, metrics={'train_runtime': 1429.2686, 'train_samples_per_second': 3.498, 'train_steps_per_second': 3.498, 'total_flos': 733653073920000.0, 'train_loss': 4.3143234455108646, 'epoch': 5.0})

Save the model

In [18]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can later be reloaded from that one path with from_pretrained.
trainer.save_model("./pegasus-fine-tuned-model-mrpc")
tokenizer.save_pretrained("./pegasus-fine-tuned-model-mrpc")

('./pegasus-fine-tuned-model-mrpc/tokenizer_config.json',
 './pegasus-fine-tuned-model-mrpc/special_tokens_map.json',
 './pegasus-fine-tuned-model-mrpc/spiece.model',
 './pegasus-fine-tuned-model-mrpc/added_tokens.json')

In [19]:
# Reload the model and tokenizer from the directory written by the save cell,
# so that the inference cells below run on the saved artifacts.
model = PegasusForConditionalGeneration.from_pretrained("./pegasus-fine-tuned-model-mrpc")
tokenizer = PegasusTokenizer.from_pretrained("./pegasus-fine-tuned-model-mrpc")

In [37]:
# Example input text fed to the fine-tuned model in the generation cell below.
input_text = "rain boston"

In [40]:
# Tokenize by calling the tokenizer directly; `prepare_seq2seq_batch` is
# deprecated. truncation=True makes max_length effective, so inputs longer
# than 200 tokens are cut instead of the limit being ignored.
batch = tokenizer([input_text],
                  truncation=True,
                  padding='longest',
                  max_length=200, return_tensors="pt")

# Beam search (3 beams) returning the 2 best sequences per input.
# `temperature` is omitted: it only takes effect when sampling is enabled
# (do_sample=True), which this call does not use.
batch_output = model.generate(**batch, max_length=200,
                              num_beams=3,
                              num_return_sequences=2)

# Convert generated token ids back to text, dropping special tokens.
output = tokenizer.batch_decode(batch_output, skip_special_tokens=True)



In [39]:
# `output` is the list of decoded candidate strings from the generation cell
# (num_return_sequences=2, so two candidates for the single input).
print(output)

['rain boston', 'Rain is forecast to continue through the weekend in Boston and New York City.']
