In [1]:
%%capture
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

In [3]:
# DATA-PREP
from datasets import load_dataset

spanish_dataset = load_dataset("amazon_reviews_multi", "es")
english_dataset = load_dataset("amazon_reviews_multi", "en")
english_dataset



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 5000
    })
})

In [4]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and body of a few shuffled training reviews.

    Args:
        dataset: Mapping with a "train" entry that supports
            ``shuffle(seed=...)`` and ``select(indices)``.
        num_samples: How many reviews to print.
        seed: Seed forwarded to ``shuffle`` so the same reviews are shown
            on every call.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked = shuffled_train.select(range(num_samples))
    for row in picked:
        title, body = row["review_title"], row["review_body"]
        print(f"\n'>> Title: {title}'")
        print(f"'>> Review: {body}'")


show_samples(english_dataset)




'>> Title: Worked in front position, not rear'
'>> Review: 3 stars because these are not rear brakes as stated in the item description. At least the mount adapter only worked on the front fork of the bike that I got it for.'

'>> Title: meh'
'>> Review: Does it’s job and it’s gorgeous but mine is falling apart, I had to basically put it together again with hot glue'

'>> Title: Can't beat these for the money'
'>> Review: Bought this for handling miscellaneous aircraft parts and hanger "stuff" that I needed to organize; it really fit the bill. The unit arrived quickly, was well packaged and arrived intact (always a good sign). There are five wall mounts-- three on the top and two on the bottom. I wanted to mount it on the wall, so all I had to do was to remove the top two layers of plastic drawers, as well as the bottom corner drawers, place it when I wanted and mark it; I then used some of the new plastic screw in wall anchors (the 50 pound variety) and it easily mounted to the wall. 

In [5]:
def filter_books(example):
    """Return True when the review belongs to a book category.

    Args:
        example: Mapping with a "product_category" entry.

    Returns:
        bool: True for the "book" and "digital_ebook_purchase" categories,
        False for anything else.
    """
    category = example["product_category"]
    return category in ("book", "digital_ebook_purchase")

In [6]:
spanish_books = spanish_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
#show_samples(english_books)



In [None]:
english_books

In [7]:
# CONCAT
from datasets import concatenate_datasets, DatasetDict

# Build one bilingual dataset: for every split, stack the English and Spanish
# book reviews and shuffle them so the two languages are interleaved.
books_dataset = DatasetDict()

for split in english_books.keys():
    per_language = [english_books[split], spanish_books[split]]
    books_dataset[split] = concatenate_datasets(per_language)
    books_dataset[split] = books_dataset[split].shuffle(seed=42)

# Peek at a few examples
show_samples(books_dataset)




'>> Title: Easy to follow!!!!'
'>> Review: I loved The dash diet weight loss Solution. Never hungry. I would recommend this diet. Also the menus are well rounded. Try it. Has lots of the information need thanks.'

'>> Title: PARCIALMENTE DAÑADO'
'>> Review: Me llegó el día que tocaba, junto a otros libros que pedí, pero la caja llegó en mal estado lo cual dañó las esquinas de los libros porque venían sin protección (forro).'

'>> Title: no lo he podido descargar'
'>> Review: igual que el anterior'


In [8]:
# FILTER SHORT SUMMARIES
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)



In [None]:
books_dataset

In [9]:
import datasets
def sample_datasets(data, train_split_name,
                    valid_split_name,
                    size=2000,
                    seed=42,
                    valid_fraction=0.33):
    """Draw a small shuffled train/validation subset from ``data``.

    Args:
        data: Mapping of split name to dataset; each dataset must support
            ``len()``, ``shuffle(seed=...)`` and ``select(indices)``.
        train_split_name: Key of the split to sample training rows from.
        valid_split_name: Key of the split to sample validation rows from.
        size: Number of training rows requested.
        seed: Seed passed to ``shuffle`` for both splits.
        valid_fraction: The validation subset asks for
            ``int(valid_fraction * size)`` rows.

    Returns:
        A ``DatasetDict`` with "train" and "validation" entries.

    Requests larger than a split are clamped to the split's length instead
    of failing inside ``select``; with the default ``size`` the validation
    request (660 rows) exceeds the filtered validation split shown further
    down in this notebook (238 rows).
    """

    def _take(split, wanted):
        # Never ask select() for more rows than the split actually holds.
        count = min(wanted, len(split))
        return split.shuffle(seed=seed).select(range(count))

    d = {
        'train': _take(data[train_split_name], size),
        'validation': _take(data[valid_split_name], int(valid_fraction * size)),
    }
    return datasets.dataset_dict.DatasetDict(d)

In [10]:
books_dataset = sample_datasets(books_dataset, "train", "validation", size=200)



In [11]:
books_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 66
    })
})

In [12]:
books_dataset.save_to_disk("book_ds")

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/66 [00:00<?, ? examples/s]

In [13]:

!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
%%writefile Seq2SeqSummarization.py

import argparse
import os

parser = argparse.ArgumentParser()

parser.add_argument("--input_dataset", type=str, help="input_dataset")
parser.add_argument("--max_input_length", type=int, help="max_input_length")
parser.add_argument("--max_target_length", type=int, help="max_target_length")
parser.add_argument("--model_checkpoint", type=str, help="model_checkpoint")
parser.add_argument("--metric_data_load", type=str, help="metric_data_load")
parser.add_argument("--output_dir", type=str, help="output_dir")
parser.add_argument("--num_train_epochs", type=int, help="num_train_epochs")
parser.add_argument("--learning_rate", type=float, help="learning_rate")
parser.add_argument("--batch_size", type=int, help="batch_size")

args = parser.parse_args()

input_dataset=args.input_dataset
max_input_length=args.max_input_length
max_target_length=args.max_target_length
model_checkpoint=args.model_checkpoint
metric_data_load=args.metric_data_load
output_dir=args.output_dir
num_train_epochs=args.num_train_epochs
learning_rate=args.learning_rate
batch_size=args.batch_size





# PARAM
# max_input_length = 512
# max_target_length = 30
# model_checkpoint = "google/mt5-small"
# num_train_epochs = 3
# output_dir = "results-mt5-finetuned-squad-accelerate"
# batch_size = 8


# Libs
from transformers import AutoTokenizer
import numpy as np
import evaluate
import nltk
from nltk.tokenize import sent_tokenize

from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import datasets


tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize a batch of reviews and attach the tokenized titles as labels.

    Args:
        examples: Batch mapping with "review_body" (model inputs) and
            "review_title" (targets).

    Returns:
        The tokenizer output for the review bodies, truncated to
        ``max_input_length``, with an added "labels" entry holding the
        input ids of the titles truncated to ``max_target_length``.
    """
    encoded = tokenizer(
        examples["review_body"], truncation=True, max_length=max_input_length
    )
    targets = tokenizer(
        examples["review_title"], truncation=True, max_length=max_target_length
    )
    encoded["labels"] = targets["input_ids"]
    return encoded


def compute_metrics(eval_pred, metric=None):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays
            (presumably shaped (batch, sequence) — confirm with the caller).
            Label positions equal to -100 are replaced by the pad token id
            before decoding.
        metric: Object with a ``compute(predictions=..., references=...,
            use_stemmer=...)`` method. When omitted, the metric named by
            ``metric_data_load`` is loaded on demand: in this script
            ``rouge_score`` only exists as a local variable inside ``main``,
            so there is no module-level metric to read.

    Returns:
        dict: Each score returned by the metric, multiplied by 100 and
        rounded to four decimals.
    """
    if metric is None:
        metric = evaluate.load(metric_data_load)

    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Scale to percentages and round for display
    return {key: round(value * 100, 4) for key, value in result.items()}

def postprocess_text(preds, labels):
    """Put every sentence of each prediction and label on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        tuple: ``(preds, labels)`` as two lists where each string has been
        stripped and its sentences joined with newlines (ROUGE expects a
        newline after each sentence).
    """

    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)



def main():
    """Fine-tune the configured seq2seq checkpoint and print ROUGE per epoch.

    Uses the module-level settings parsed from the command line
    (``input_dataset``, ``model_checkpoint``, ``metric_data_load``,
    ``output_dir``, ``num_train_epochs``, ``learning_rate``, ``batch_size``,
    ``max_target_length``) and the module-level ``tokenizer``.

    Steps: load the dataset from disk, tokenize it, train with a linearly
    decaying learning rate, and after every epoch generate summaries for the
    validation split, print the ROUGE scores, and save the model and
    tokenizer to ``output_dir``.
    """
    # postprocess_text() splits sentences with nltk.sent_tokenize. The notebook
    # downloads the "punkt" data in a separate cell; fetch it here when it is
    # missing so the script does not depend on that earlier cell having run.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)

    # read from disk
    raw_datasets = datasets.load_from_disk(input_dataset)
    print("raw_datasets is read")

    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    # Drop the original columns so only the fields added by
    # preprocess_function reach the data collator.
    tokenized_datasets = tokenized_datasets.remove_columns(
        raw_datasets["train"].column_names
    )

    rouge_score = evaluate.load(metric_data_load)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"],
        collate_fn=data_collator,
        batch_size=batch_size,
    )

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Measured after prepare(), i.e. on the dataloader the loop iterates over.
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        for batch in eval_dataloader:
            with torch.no_grad():
                # Cap generation at the same length used to truncate the
                # reference titles instead of relying on the model default.
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=max_target_length,
                )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

        # Compute metrics, scaled to percentages and rounded for display
        result = rouge_score.compute()
        result = {key: round(value * 100, 4) for key, value in result.items()}
        print(f"Epoch {epoch}:", result)

        # Save after every epoch; the tokenizer is written by the main
        # process only.
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
        


if __name__=="__main__":
    main()


Overwriting Seq2SeqSummarization.py


In [None]:
# PARAM
# max_input_length = 512
# max_target_length = 30
# model_checkpoint = "google/mt5-small"
# num_train_epochs = 3
# output_dir = "results-mt5-finetuned-squad-accelerate"
# batch_size = 8

In [19]:
!python ./Seq2SeqSummarization.py --input_dataset="book_ds" \
                                  --max_input_length=512 \
                                  --max_target_length=30 \
                                  --model_checkpoint="google/mt5-small" \
                                  --metric_data_load="rouge" \
                                  --output_dir="seq2seq-mt5-book" \
                                  --num_train_epochs=2 \
                                  --learning_rate=2e-5 \
                                  --batch_size=8

raw_datasets is read
  0% 0/50 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch 0: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
 98% 49/50 [00:09<00:00,  9.62it/s]Epoch 1: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
100% 50/50 [00:12<00:00,  3.85it/s]


In [18]:
!ls

book_ds					sample_data
results-mt5-finetuned-squad-accelerate	Seq2SeqSummarization.py


In [24]:
# INFERENCE

def print_summary(idx):
    """Print a validation review, its real title and the generated summary.

    Args:
        idx: Row index into ``books_dataset["validation"]``.

    Relies on the notebook-level ``books_dataset`` and ``summarizer``.
    """
    example = books_dataset["validation"][idx]
    review, title = example["review_body"], example["review_title"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")

from transformers import pipeline
output_dir="./seq2seq-mt5-book"
#hub_model_id = output_dir

summarizer = pipeline("summarization", model=output_dir)

print_summary(4)

'>>> Review: Este libro aporta un poco más que la mayoría de sus contemporáneos de suspense, trabaja y profundiza bien en el personaje principal y Es original el enclave geográfico en el que transcurre. Es bastante mejor que los libros anglosajones que están tan de moda tipo la Chica del tren, nada que ver, este libro es para gente que le gusta leer más, lo recomiendo aunque tampoco deslumbra'

'>>> Title: Es un libro correcto'

'>>> Summary: <extra_id_0>.'


In [25]:
print_summary(12)

'>>> Review: El libro es muy bueno dependiendo de en que situación y humor estés... creo que ayuda mucho y los playlist son fenomenales!'

'>>> Title: Depende del mood con que lo leas'

'>>> Summary: <extra_id_0>.'


In [22]:
books_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 66
    })
})

# DATA PREP FOR SUMMARIZATION

In [10]:
# TOKENIZE
books_dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 9672
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 238
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 245
    })
})

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]



In [17]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize review bodies as inputs and review titles as labels.

    Args:
        examples: Batch mapping with "review_body" and "review_title".

    Returns:
        The tokenized review bodies (truncated to ``max_input_length``) with
        a "labels" entry set to the input ids of the titles (truncated to
        ``max_target_length``).
    """
    body_encoding = tokenizer(
        examples["review_body"], truncation=True, max_length=max_input_length
    )
    title_encoding = tokenizer(
        examples["review_title"], truncation=True, max_length=max_target_length
    )
    body_encoding["labels"] = title_encoding["input_ids"]
    return body_encoding

In [18]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)
# added for correction???
tokenized_datasets = tokenized_datasets.remove_columns(books_dataset["train"].column_names)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

In [19]:
# The previous cell already removes the raw review columns, so asking
# remove_columns for the full list again would name columns that no longer
# exist. Only drop the ones that are still present, which keeps this cell
# safe to run any number of times.
leftover_columns = [
    name
    for name in books_dataset["train"].column_names
    if name in tokenized_datasets["train"].column_names
]
if leftover_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(leftover_columns)

In [20]:
tokenized_datasets 

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 66
    })
})

In [21]:
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=395867b86e305136637087ddca8320b52ee4837c208beba6bf7447d84f454106
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [22]:
# ADAPT OUTPUT FOR METRICS
#%%capture
#!pip install rouge_score

import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [23]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
import nltk

nltk.download("punkt")

from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Texts with fewer than three sentences are returned in full.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

In [None]:
#def evaluate_baseline(dataset, metric):
 #   summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
  #  return metric.compute(predictions=summaries, references=dataset["review_title"])

In [25]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ``rouge_score``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Label
            positions equal to -100 are replaced by the pad token id before
            decoding because they cannot be decoded.

    Returns:
        dict: Each score from ``rouge_score.compute`` multiplied by 100 and
        rounded to four decimals.
    """
    predictions, labels = eval_pred

    def _to_rouge_lines(token_ids):
        # ROUGE expects a newline after each sentence
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    pred_lines = _to_rouge_lines(predictions)
    decodable_labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    ref_lines = _to_rouge_lines(decodable_labels)

    scores = rouge_score.compute(
        predictions=pred_lines, references=ref_lines, use_stemmer=True
    )
    return {name: round(value * 100, 4) for name, value in scores.items()}

In [26]:
# MODEL
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# TRAIN - FINE_TUNE

In [28]:
from torch.utils.data import DataLoader

# Training setup: dataloaders, optimizer, Accelerate wrapping and LR schedule.
# Only the training dataloader is shuffled; both use the seq2seq collator.
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
     collate_fn=data_collator, 
     batch_size=batch_size
)

from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5)


# prepare() returns the objects to use from here on; the names are rebound so
# every later cell works with the prepared versions.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)


# The step count is taken from the prepared dataloader (after prepare()).
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup, spanning exactly num_training_steps steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Directory the training loop saves the model and tokenizer to.
output_dir = "results-mt5-finetuned-squad-accelerate"

def postprocess_text(preds, labels):
    """Strip each text and place every sentence on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        tuple: ``(preds, labels)`` as two lists of newline-joined sentences
        (ROUGE expects a newline after each sentence).
    """

    def _split_lines(texts):
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _split_lines(preds), _split_lines(labels)


from tqdm.auto import tqdm
import torch
import numpy as np

# Train for num_train_epochs epochs; after each epoch generate summaries for
# the validation split, print ROUGE, and save the model and tokenizer.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            # Cap generation at the same length used to truncate the
            # reference titles instead of relying on the model default.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=max_target_length,
            )

        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        # If we did not pad to max length, we need to pad the labels too
        labels = accelerator.pad_across_processes(
            batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
        )

        generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
        labels = accelerator.gather(labels).cpu().numpy()

        # Replace -100 in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(
            decoded_preds, decoded_labels
        )

        rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics, scaled to percentages and rounded for display
    result = rouge_score.compute()
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save after every epoch; the tokenizer is written by the main process only
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

In [31]:
def postprocess_text(preds, labels):
    """Reformat decoded texts so each sentence sits on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        tuple: ``(preds, labels)``; every string is stripped and its
        sentences are joined with newlines, the layout ROUGE expects.
    """

    def _reflow(texts):
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _reflow(preds), _reflow(labels)

In [32]:
output_dir = "results-mt5-finetuned-squad-accelerate"

In [None]:
tokenized_datasets["train"][0]

{'input_ids': [653,
  1957,
  1314,
  261,
  2757,
  1280,
  435,
  259,
  29166,
  263,
  269,
  774,
  5547,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [298, 259, 5994, 269, 774, 5547, 1]}

In [33]:
from tqdm.auto import tqdm
import torch
import numpy as np

# Train for num_train_epochs epochs; after each epoch generate summaries for
# the validation split, print ROUGE, and save the model and tokenizer.
# NOTE(review): this cell repeats the loop from the earlier training cell and
# reuses the same optimizer and lr_scheduler objects. If that earlier cell has
# already run to completion, the linear schedule has presumably been stepped
# through all num_training_steps — confirm the learning rate is still non-zero
# before trusting the scores printed here.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            # Cap generation at the same length used to truncate the
            # reference titles instead of relying on the model default.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=max_target_length,
            )

        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        # If we did not pad to max length, we need to pad the labels too
        labels = accelerator.pad_across_processes(
            batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
        )

        generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
        labels = accelerator.gather(labels).cpu().numpy()

        # Replace -100 in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds, decoded_labels = postprocess_text(
            decoded_preds, decoded_labels
        )

        rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics, scaled to percentages and rounded for display
    result = rouge_score.compute()
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save after every epoch; the tokenizer is written by the main process only
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/75 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
Epoch 1: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
Epoch 2: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [None]:
result 
{key: value * 100 for key, value in result.items()}

{'rouge1': 3.581342751779469,
 'rouge2': 0.8099148750409253,
 'rougeL': 3.4712107289693326,
 'rougeLsum': 3.453264251825365}

In [None]:
# INFERENCE

def print_summary(idx, split=None):
    """Print a review, its real title and the generated summary.

    Args:
        idx: Row index into the chosen split of ``books_dataset``.
        split: Name of the split to read. When omitted, "test" is used if
            ``books_dataset`` has it; otherwise "validation" is used, because
            the sampled dataset built earlier in this notebook only contains
            "train" and "validation".

    Relies on the notebook-level ``books_dataset`` and ``summarizer``.
    """
    if split is None:
        split = "test" if "test" in books_dataset else "validation"
    example = books_dataset[split][idx]
    review = example["review_body"]
    title = example["review_title"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")

from transformers import pipeline

hub_model_id = output_dir

summarizer = pipeline("summarization", model=hub_model_id)

print_summary(100)

In [None]:
def print_summary(idx, split=None):
    """Print a review, its real title and the generated summary.

    Args:
        idx: Row index into the chosen split of ``books_dataset``.
        split: Name of the split to read. When omitted, "test" is used if
            ``books_dataset`` has it; otherwise "validation" is used, because
            the sampled dataset built earlier in this notebook only contains
            "train" and "validation".

    Relies on the notebook-level ``books_dataset`` and ``summarizer``.
    """
    if split is None:
        split = "test" if "test" in books_dataset else "validation"
    record = books_dataset[split][idx]
    review, title = record["review_body"], record["review_title"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")

In [None]:
print_summary(100)