# Fine Tuning

In [None]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece

## Import libraries

In [15]:
from tqdm import tqdm
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_metric
from datasets import load_dataset
import pandas as pd
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import evaluate
import pandas as pd

## Dataset

In [None]:
# Load the train / test / validation splits from local JSON files.
dataset = load_dataset(
    "json",
    data_files={
        "train": "train.json",
        "test": "test.json",
        "validation": "validate.json",
    },
)


## Model selection

In [5]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Checkpoints to fine-tune, as (pretrained checkpoint, output folder) pairs.
models = [
    ("t5-base", "t5-base-tos"),
    ("google/pegasus-cnn_dailymail", "pegasus-tos"),
]

In [7]:
def chunks(list_of_elements, batch_size):
    """Lazily split ``list_of_elements`` into consecutive slices.

    Every slice holds ``batch_size`` items except possibly the last one,
    which holds whatever remains. The input only needs to support ``len``
    and slicing.
    """
    total = len(list_of_elements)
    yield from (
        list_of_elements[start : start + batch_size]
        for start in range(0, total, batch_size)
    )

In [8]:
def evaluate_summaries(dataset, metric, model, tokenizer, batch_size=1, device=device, column_text="plain_text", column_summary="summary"):
    """Generate a summary for every document and score it against its reference.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable and of equal length.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``; it is expected to already live on
            ``device`` (this function only moves the inputs).
        tokenizer: Tokenizer matching ``model``, used to encode the documents
            and decode the generated ids.
        batch_size: Number of documents summarized per ``generate`` call.
        device: Device the input tensors are moved to.
        column_text: Name of the column holding the source documents.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The result of ``metric.compute()`` after all batches have been added.
    """
    tos_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for tos_batch, target_batch in tqdm(
        zip(tos_batches, target_batches), total=len(tos_batches)):
        # Pad only up to the longest document in the batch. Padding every
        # batch to the full 1024 tokens makes the encoder process padding
        # that the attention mask discards anyway.
        inputs = tokenizer(tos_batch, max_length=1024, truncation=True,
                           padding="longest", return_tensors="pt")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)
        decoded_summaries = tokenizer.batch_decode(
            summaries, skip_special_tokens=True,
            clean_up_tokenization_spaces=True)
        # Replace the literal "<n>" marker with a space (presumably a line-break
        # placeholder emitted by some checkpoints -- confirm per model).
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    return metric.compute()

## Tokenization
**Note (p. 142):** token-limit constraint — input texts and summaries are truncated to at most 1024 tokens before training.

## Training

In [None]:
# Hyper-parameters shared by every fine-tuning run in this notebook.
training_args = TrainingArguments(
    output_dir="training_log",
    push_to_hub=False,
    # Schedule / optimization
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # One example per device per step, with gradients accumulated over 16 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    # Logging, evaluation and checkpoint cadence
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1e6,  # very large value, presumably to avoid intermediate checkpoints
)

In [None]:
# Fine-tune every (checkpoint, output folder) pair listed in `models` and save
# the resulting weights to its folder.
trainer = pretrained_model = seq2seq_data_collator = None
for model_ckpt, folder in models:
    # Drop the references to the previous iteration's model and trainer BEFORE
    # loading the next checkpoint, so two models (plus optimizer state) do not
    # have to fit on the device at once. Best effort: objects kept alive by
    # reference cycles are only reclaimed once the garbage collector runs.
    trainer = pretrained_model = seq2seq_data_collator = None
    torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def convert_examples_to_features(example_batch):
        """Tokenize a batch of documents and their reference summaries.

        Reads the "plain_text" and "summary" columns of ``example_batch`` and
        returns the "input_ids", "attention_mask" and "labels" columns, each
        truncated to at most 1024 tokens.
        """
        input_encodings = tokenizer(example_batch["plain_text"], max_length=1024, truncation=True)
        # `text_target` tokenizes the summaries as targets; it replaces the
        # deprecated `as_target_tokenizer()` context manager.
        target_encodings = tokenizer(text_target=example_batch["summary"], max_length=1024, truncation=True)
        return {
            "input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"],
        }

    dataset_tos = dataset.map(convert_examples_to_features, batched=True)
    columns = ["input_ids", "labels", "attention_mask"]
    dataset_tos.set_format(type="torch", columns=columns)

    pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
    seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=pretrained_model)

    trainer = Trainer(
        model=pretrained_model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=seq2seq_data_collator,
        train_dataset=dataset_tos["train"],
        eval_dataset=dataset_tos["validation"],
    )

    trainer.train()
    # Only the model weights/config are saved; the evaluation cell loads the
    # tokenizer from the original checkpoint name.
    trainer.model.save_pretrained(folder)

## Evaluation

In [None]:
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
from datasets import load_dataset

# Fetch the "punkt" resource through NLTK's downloader.
nltk.download("punkt")

In [18]:
# Score keys reported in the results table, and the ROUGE metric that yields them.
rouge_names = [
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
]
rouge_metric = evaluate.load("rouge")

In [None]:
# (model checkpoint or local folder, tokenizer checkpoint) pairs to evaluate:
# each base checkpoint followed by its fine-tuned counterpart.
models_name = [
    ("t5-base", "t5-base"),
    ("t5-base-tos", "t5-base"),
    ("google/pegasus-cnn_dailymail", "google/pegasus-cnn_dailymail"),
    ("pegasus-tos", "google/pegasus-cnn_dailymail"),
]
# One row per evaluated model, one column per ROUGE score.
df = pd.DataFrame(columns=rouge_names)

model = None
for model_ckpt, tokenizer_name in models_name:
    # Release the previously evaluated model before loading the next one so
    # that two models do not have to fit on the device at once.
    model = None
    torch.cuda.empty_cache()

    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    score_rouge = evaluate_summaries(
        dataset["test"], rouge_metric, model, tokenizer,
        batch_size=1, column_text="plain_text", column_summary="summary")
    # Build the row from `rouge_names` so it always lines up with the columns.
    df.loc[model_ckpt] = [score_rouge[name] for name in rouge_names]

df