# Finetuning of the language model T5 base on a Question-Answering task (QA) with the dataset SQuAD 1.1 Portuguese

## Configuration

In [None]:
from google.colab import drive 
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!nvidia-smi

Fri Nov 11 18:33:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Hub id of the pretrained checkpoint; the tokenizer and the model are loaded from it below,
# and its last path component is reused to name the output folders.
model_checkpoint = "unicamp-dl/ptt5-base-portuguese-vocab"

In [None]:
%%capture
!pip install datasets transformers[sentencepiece] 

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
%%capture
!apt install git-lfs

In [None]:
import transformers
# Emit info-level log messages from transformers (config dumps, weight-loading messages, ...).
transformers.logging.set_verbosity_info()

print(transformers.__version__)
# NOTE(review): 4.15.0 is presumably the version this notebook was first written with;
# the recorded output of this cell shows 4.24.0 — confirm which version is intended.

4.24.0


In [None]:
# get QA classes
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/question-answering/trainer_seq2seq_qa.py

--2022-11-11 18:35:33--  https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/question-answering/trainer_seq2seq_qa.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6222 (6.1K) [text/plain]
Saving to: ‘trainer_seq2seq_qa.py’


2022-11-11 18:35:34 (72.7 MB/s) - ‘trainer_seq2seq_qa.py’ saved [6222/6222]



In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer

from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    set_seed,
)

from transformers.trainer_utils import EvalLoopOutput, EvalPrediction

from datasets import load_dataset, load_metric

import numpy as np
import json 
import pathlib
from pathlib import Path

In [None]:
# get tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Could not locate the tokenizer configuration file, will try to use the model config instead.


Downloading:   0%|          | 0.00/456 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unicamp-dl--ptt5-base-portuguese-vocab/snapshots/f8b910de7ba773bc2025cbad98f825f310c55885/config.json
Model config T5Config {
  "_name_or_path": "unicamp-dl/ptt5-base-portuguese-vocab",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 32128
}



Downloading:   0%|          | 0.00/756k [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--unicamp-dl--ptt5-base-portuguese-vocab/snapshots/f8b910de7ba773bc2025cbad98f825f310c55885/spiece.model
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unicamp-dl--ptt5-base-portuguese-vocab/snapshots/f8b910de7ba773bc2025cbad98f825f310c55885/config.json
Model config T5Config {
  "_name_or_path": "unicamp-dl/ptt5-base-portuguese-vocab",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon":

In [None]:
# Maximum number of tokens kept per tokenized input; passed as `max_length` in preprocess_train_function.
max_input_length = 384 # 512
# Maximum number of tokens kept per tokenized answer (the labels).
max_target_length = 32 # 32
# NOTE(review): not referenced in the cells visible here — presumably the answer-length cap
# used by later evaluation code; confirm.
val_max_answer_length = max_target_length

# When True, inputs and labels are padded to their maximum length at tokenization time.
pad_to_max_length = True
padding = "max_length" if pad_to_max_length else False
# When True (and padding == "max_length"), pad token ids in the labels are replaced by -100
# in the preprocessing functions.
ignore_pad_token_for_loss = True

# Input length cap used by preprocess_validation_function; never larger than the tokenizer's own limit.
max_seq_length = min(max_input_length, tokenizer.model_max_length)
# NOTE(review): the next two values are not referenced in the cells visible here;
# presumably they are consumed by later evaluation code — confirm.
generation_max_length = None
max_eval_samples = None

# NOTE(review): not referenced in the visible cells; presumably forwarded to the SQuAD metric
# post-processing to select the v1.1 (no unanswerable questions) behavior — confirm.
version_2_with_negative = False # squad 1.1

# NOTE(review): the visible preprocessing code reads the 'answers' column through a literal,
# not through this variable — confirm where it is used.
answer_column = "answers"

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unicamp-dl--ptt5-base-portuguese-vocab/snapshots/f8b910de7ba773bc2025cbad98f825f310c55885/config.json
Model config T5Config {
  "_name_or_path": "unicamp-dl/ptt5-base-portuguese-vocab",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 32128
}



Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--unicamp-dl--ptt5-base-portuguese-vocab/snapshots/f8b910de7ba773bc2025cbad98f825f310c55885/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at unicamp-dl/ptt5-base-portuguese-vocab.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


Let's define all hyperparameters of our training job.

In [None]:
# Phases to run
do_train = True
do_eval = True

# Batch sizes: the evaluation batch is 16x the training batch (no gradients are kept at eval time)
batch_size = 4
gradient_accumulation_steps = 3
per_device_train_batch_size = batch_size
per_device_eval_batch_size = 16 * per_device_train_batch_size

# Optimisation: learning rate, weight decay, number of epochs, mixed precision
learning_rate = 1e-4
weight_decay = 0.01
num_train_epochs = 10
fp16 = True

# Logging cadence (logging_steps only applies when logging_strategy == "steps")
logging_strategy = "steps"
logging_first_step = True
logging_steps = 3000
eval_steps = logging_steps

# Evaluation and checkpointing follow the same cadence as logging
evaluation_strategy = logging_strategy
save_strategy = logging_strategy
save_steps = logging_steps
save_total_limit = 3

# Best-model selection: every metric except the loss is better when larger
load_best_model_at_end = True
metric_for_best_model = "f1" #"loss"
greater_is_better = metric_for_best_model != "loss"

# Generation settings used at evaluation time
num_beams = 1

# Output folders on the mounted Google Drive, named after the checkpoint and the run settings
model_name = model_checkpoint.rsplit("/", 1)[-1]
folder_model = f'e{num_train_epochs}_lr{learning_rate}'
output_dir = f'/content/drive/MyDrive/{model_name}/checkpoints/{folder_model}'
Path(output_dir).mkdir(parents=True, exist_ok=True)
logging_dir = f'/content/drive/MyDrive/{model_name}/logs/{folder_model}'
Path(logging_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Training arguments, built from the hyperparameters defined in the previous cell.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    do_train=do_train,
    do_eval=do_eval,
    evaluation_strategy=evaluation_strategy,
    # Pass the evaluation cadence explicitly instead of relying on the fallback to `logging_steps`.
    eval_steps=eval_steps,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_train_epochs,
    # Evaluate by generating answers (needed to compute text metrics such as F1).
    predict_with_generate=True,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    save_strategy=save_strategy,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    greater_is_better=greater_is_better,
    logging_dir=logging_dir,         # directory for storing logs
    logging_strategy=logging_strategy,
    # `logging_first_step` is set in the hyperparameter cell but was never forwarded before.
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,     # if logging_strategy = "steps"
    fp16=fp16,
    push_to_hub=False,
)

using `logging_steps` to initialize `eval_steps` to 3000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Get SQuAD 1.1 pt from the Web and convert it to a DatasetDict()

In [None]:
# Get dataset SQUAD in Portuguese
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Q0IaIlv2h2BC468MwUFmUST0EyN7gNkn' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Q0IaIlv2h2BC468MwUFmUST0EyN7gNkn" -O squad-pt.tar.gz && rm -rf /tmp/cookies.txt

--2022-11-07 20:30:27--  https://docs.google.com/uc?export=download&confirm=t&id=1Q0IaIlv2h2BC468MwUFmUST0EyN7gNkn
Resolving docs.google.com (docs.google.com)... 142.251.163.100, 142.251.163.139, 142.251.163.102, ...
Connecting to docs.google.com (docs.google.com)|142.251.163.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0s-20-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/u9u04f178okfk1ub5ocub4g0r44ltcil/1667853000000/03445611175480770093/*/1Q0IaIlv2h2BC468MwUFmUST0EyN7gNkn?e=download&uuid=70959b9f-4c6c-41f5-8015-8da449a77242 [following]
--2022-11-07 20:30:27--  https://doc-0s-20-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/u9u04f178okfk1ub5ocub4g0r44ltcil/1667853000000/03445611175480770093/*/1Q0IaIlv2h2BC468MwUFmUST0EyN7gNkn?e=download&uuid=70959b9f-4c6c-41f5-8015-8da449a77242
Resolving doc-0s-20-docs.googleusercontent.com (doc-0s-20-docs.googleusercontent.com)... 142.251.163.

In [None]:
!tar -xvf squad-pt.tar.gz

squad-train-v1.1.json
squad-dev-v1.1.json


In [None]:
%%time
# new

# Get the train and validation json file in the HF script format 
# inspiration: file squad.py at https://github.com/huggingface/datasets/tree/master/datasets/squad

# Convert the raw SQuAD-format JSON files into flat files named 'pt_<original name>', each holding
# a single 'data' list with one entry per question ('id', 'input_ids', 'answers').

files = ['squad-train-v1.1.json', 'squad-dev-v1.1.json']


def _unique_answers(answers):
    """Return the answers of one question with duplicated texts removed.

    Texts are stripped first. `np.unique` orders the distinct texts alphabetically and reports
    the index of the first occurrence of each one; the same indexes select the matching
    `answer_start` values so both lists stay aligned.
    """
    answer_starts = [answer["answer_start"] for answer in answers]
    answer_texts = [answer["text"].strip() for answer in answers]
    _, first_indexes = np.unique(answer_texts, return_index=True)
    return {
        'answer_start': (np.array(answer_starts)[first_indexes]).tolist(),
        'text': (np.array(answer_texts)[first_indexes]).tolist(),
    }


def _squad_to_entries(squad):
    """Flatten a parsed SQuAD file (articles > paragraphs > questions) into one entry per question."""
    entries = []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context'].strip()
            for qa in paragraph['qas']:
                entries.append({
                    'id': qa['id'],
                    # The model input is the question followed by its context, as plain text.
                    'input_ids': 'question: %s  context: %s' % (qa['question'].strip(), context),
                    'answers': _unique_answers(qa['answers']),
                })
    return entries


def _keep_last_per_id(entries):
    """Keep only the last entry of each id (the texts corrected by the group Deep Learning Brasil).

    The entries are walked backwards, so the result is in reverse file order. A set makes the
    membership test O(1) instead of scanning a list for every entry.
    """
    seen_ids = set()
    kept = []
    for entry in reversed(entries):
        if entry['id'] not in seen_ids:
            seen_ids.add(entry['id'])
            kept.append(entry)
    return kept


for file in files:
    # The context manager closes the input file even if parsing fails.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)

    unique_entry_list = _keep_last_per_id(_squad_to_entries(data))

    file_name = 'pt_' + str(file)
    with open(file_name, 'w', encoding="utf-8") as json_file:
        json.dump({'data': unique_entry_list}, json_file)

FileNotFoundError: ignored

In [None]:
raw_datasets = load_dataset('json', 
                        data_files={'train': 'pt_squad-train-v1.1.json', 'validation': 'pt_squad-dev-v1.1.json'}, 
                        field='data')

FileNotFoundError: ignored

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'answers'],
        num_rows: 87510
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'answers'],
        num_rows: 10570
    })
})

### Push to the datasets hub of Hugging Face

In order to save our `DatasetDict()`, we push it to the [datasets hub of Hugging Face](https://huggingface.co/datasets). 

However, since we are not the owners of this dataset, we push it in private mode.

In [None]:
raw_datasets.push_to_hub("GuiSales404/QA-CompLin", private=True)

NameError: ignored

### Get the dataset SQuAD 1.1 pt from the datasets hub of Hugging Face

In [None]:
# Never hard-code an access token in a notebook: anyone who can read the file can use it.
# The token is read from the HF_TOKEN environment variable, or asked for interactively.
# NOTE(review): the token that used to be written on this line has been exposed and should be revoked.
import os
from getpass import getpass

API_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")
raw_datasets = load_dataset("GuiSales404/QA-CompLin", use_auth_token=API_TOKEN)

Downloading readme:   0%|          | 0.00/578 [00:00<?, ?B/s]



Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/GuiSales404___parquet/GuiSales404--QA-CompLin-dd39e900dbcc41c8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/GuiSales404___parquet/GuiSales404--QA-CompLin-dd39e900dbcc41c8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

The `dataset` object itself is a [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for each of the training and validation sets:

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'input_ids', 'answers'],
        num_rows: 87510
    })
    validation: Dataset({
        features: ['id', 'input_ids', 'answers'],
        num_rows: 10570
    })
})

To access an actual element, you need to select a split first, then give an index:

In [None]:
raw_datasets["train"][0]

{'id': '5735d259012e2f140011a0a1',
 'input_ids': 'question: De que KMC é um inicialismo?  context: A Cidade Metropolitana de Catmandu (KMC), a fim de promover as relações internacionais, criou uma Secretaria de Relações Internacionais (IRC). O primeiro relacionamento internacional da KMC foi estabelecido em 1975 com a cidade de Eugene, Oregon, Estados Unidos. Essa atividade foi aprimorada ainda mais com o estabelecimento de relações formais com outras 8 cidades: Cidade de Motsumoto, Japão, Rochester, EUA, Yangon (antiga Rangum) de Mianmar, Xian da República Popular da China, Minsk da Bielorrússia e Pyongyang de República Democrática da Coréia. O esforço constante da KMC é aprimorar sua interação com os países da SAARC, outras agências internacionais e muitas outras grandes cidades do mundo para alcançar melhores programas de gestão urbana e desenvolvimento para Katmandu.',
 'answers': {'answer_start': [2],
  'text': ['Cidade Metropolitana de Catmandu']}}

To get a sense of what the data looks like, the following function will show some examples picked randomly in the dataset.

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: object supporting `len()`, indexing by a list of integers and a `features`
            mapping (column name -> feature type).
        num_examples: number of rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: if `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indexes in one call, with no retry loop needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(raw_datasets["train"])

Unnamed: 0,id,input_ids,answers
0,57296fe53f37b319004783b0,"question: Quais são os principais efeitos no clima de New Haven por ser uma cidade costeira? context: New Haven está na transição entre um clima continental úmido (classificação climática Köppen: Dfa) e um clima subtropical úmido (Köppen Cfa), mas com mais características do primeiro, como é típico de grande parte da área metropolitana de Nova York. Os verões são úmidos e quentes, com temperaturas superiores a 90 ° F (32 ° C) em 7 a 8 dias por ano. Os invernos são frios, com queda de neve moderada, intercalada com chuva e, ocasionalmente, precipitação mista. Os padrões climáticos que afetam New Haven resultam de uma direção principalmente offshore, reduzindo assim a influência marinha do Long Island Sound - embora, como outras áreas marinhas, as diferenças de temperatura entre as áreas ao longo da costa e áreas de uma milha ou duas no interior possam ser grandes. às vezes.","{'answer_start': [654], 'text': ['diferenças de temperatura entre as áreas']}"
1,572788bf5951b619008f8ca3,"question: Qual é a densidade populacional do maior cantão de Graubunden, localizado nos Alpes? context: Estendendo-se pelo lado norte e sul dos Alpes na Europa central-oeste, a Suíça abrange uma grande diversidade de paisagens e climas em uma área limitada de 41.285 quilômetros quadrados (15.940 milhas quadradas). A população é de cerca de 8 milhões, resultando em uma densidade populacional média de cerca de 195 pessoas por quilômetro quadrado (500 / sq mi). A metade sul do país, mais montanhosa, é muito mais escassamente povoada que a metade norte. No maior cantão de Grisões, localizado inteiramente nos Alpes, a densidade populacional cai para 27 / km² (70 / sq mi).","{'answer_start': [549], 'text': ['27 / km² (70 / sq mi)']}"
2,57112c66a58dae1900cd6cf6,"question: O primeiro console de jogos da Índia também era um clone, chamado de quê? context: Um mercado próspero de clones de hardware NES não licenciados surgiu durante o clímax da popularidade do console. Inicialmente, esses clones eram populares em mercados onde a Nintendo nunca emitiu uma versão legítima do console. Em particular, o Dendy (russo: Де́нди), um clone de hardware não licenciado produzido em Taiwan e vendido na antiga União Soviética, emergiu como o console de videogame mais popular de sua época naquele cenário e gozava de um grau de fama aproximadamente equivalente a experimentado pelo NES / Famicom na América do Norte e no Japão. Um clone da Famicom foi comercializado na Argentina sob o nome de ""Family Game"", semelhante ao design do hardware original. O Micro Genius (chinês simplificado: 小 天才) foi comercializado no sudeste da Ásia como uma alternativa ao Famicom; Samurai era a alternativa popular do PAL para o NES; e na Europa Central, especialmente na Polônia, o Pegasus estava disponível. O Samurai também estava disponível na Índia no início dos anos 90, que foi a primeira instância de jogos de console na Índia.","{'answer_start': [932], 'text': ['Samurai']}"
3,571cef645efbb31900334e54,"question: Qual era o nome comercial da anfetamina como descongestionante nasal? context: Embora altamente eficaz, o requisito de injeção limitou o uso de norepinefrina [esclarecimentos necessários] e foram buscados derivados ativos por via oral. Um composto estruturalmente semelhante, a efedrina, foi identificado pelos químicos japoneses na fábrica de Ma Huang e comercializado por Eli Lilly como tratamento oral para a asma. Após o trabalho de Henry Dale e George Barger em Burroughs-Wellcome, o químico acadêmico Gordon Alles sintetizou a anfetamina e a testou em pacientes com asma em 1929. A droga provou ter apenas modestos efeitos anti-asma, mas produziu sensações de alegria e palpitações. A anfetamina foi desenvolvida por Smith, Kline e French como descongestionante nasal sob o nome comercial de Benzedrine Inalador. A anfetamina foi desenvolvida para o tratamento da narcolepsia, parkinsonismo pós-encefelético e elevação do humor na depressão e outras indicações psiquiátricas. Recebeu a aprovação como remédio novo e não oficial da Associação Médica Americana para esses usos em 1937 e permaneceu em uso comum para a depressão até o desenvolvimento de antidepressivos tricíclicos na década de 1960.","{'answer_start': [719], 'text': ['Benzedrine Inalador']}"
4,56f990289b226e1400dd1590,"question: O que foi removido dos helicópteros Chinook? context: Em abril de 2008, foi assinado um contrato de 90 milhões de libras com a Boeing para uma solução de ""solução rápida"", para que eles possam voar até 2010: o QinetiQ fará o downgrade dos Chinooks - retirando alguns de seus equipamentos mais avançados.","{'answer_start': [221], 'text': ['equipamentos mais avançados']}"


## Preprocessing the data

Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that the model requires.

To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- we get a tokenizer that corresponds to the model architecture we want to use,
- we download the vocabulary used when pretraining this specific checkpoint.

That vocabulary will be cached, so it's not downloaded again the next time we run the cell.

You can directly call this tokenizer on one sentence or a pair of sentences:

In [None]:
tokenizer("Qual o gosto de espumante?")

{'input_ids': [15715, 9, 6618, 4, 8, 6, 2104, 9178, 1854, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.

Instead of one sentence, we can pass along a list of sentences:

In [None]:
tokenizer(["Olá, é uma frase!", "é uma segunda frase!"])

{'input_ids': [[28, 2647, 3, 21, 17, 5477, 1310, 1], [21, 17, 363, 5477, 1310, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

To prepare the targets for our model, we need to tokenize them inside the `as_target_tokenizer` context manager. This will make sure the tokenizer uses the special tokens corresponding to the targets:

In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Olá, é uma frase!", "é uma segunda frase!"]))

{'input_ids': [[28, 2647, 3, 21, 17, 5477, 1310, 1], [21, 17, 363, 5477, 1310, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


If you are using one of the five original T5 checkpoints for summarization, the inputs have to be prefixed with "summarize:" (the model can also translate, and it needs the prefix to know which task it has to perform). In our case, however, we fine-tune PTT5 on a single new downstream task (QA), so a prefix is not mandatory.

In [None]:
# Task prefix prepended to the inputs of the original T5 checkpoints (kept commented out for reference).
# NOTE: `prefix` is not used in the preprocessing functions of this notebook — the lines that
# would prepend it to the inputs are commented out there.
# if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
#     prefix = "summarize: "
# else:
#     prefix = ""

prefix = ""

We can then write the functions that will preprocess our samples. We just feed them to the `tokenizer` with the argument `truncation=True`. This ensures that an input longer than the configured maximum length is truncated to that length. Because `pad_to_max_length` is set to `True` in the configuration above, every example is also padded to that maximum length at this stage; set it to `False` to leave the padding to the data collator, which pads each batch to its longest example instead of padding the whole dataset to one length.

In [None]:
def preprocess_squad_batch(examples):
  """Split a batch into model inputs and target answers.

  Returns the batch's 'input_ids' column unchanged together with one target per example:
  the first answer text, or an empty string when the example has no answer.
  """
  first_answers = []
  for answer in examples['answers']:
    texts = answer["text"]
    first_answers.append(texts[0] if len(texts) > 0 else "")
  return examples['input_ids'], first_answers

In [None]:
# train preprocessing
def preprocess_train_function(examples):
    """Tokenize a batch of training examples and attach the tokenized answers as labels.

    Inputs are truncated to `max_input_length` tokens and answers to `max_target_length`
    tokens; both use the global `padding` setting. Returns the tokenizer output for the
    inputs with an extra "labels" entry.
    """
    inputs, targets = preprocess_squad_batch(examples)

    # Encoder side: the "question: ... context: ..." strings.
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=padding, truncation=True)

    # Decoder side: the answers, tokenized in target mode.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

    # With fixed-length padding, padded label positions are set to -100 so that they can be
    # ignored by the loss.
    if padding == "max_length" and ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        masked_labels = []
        for label in labels["input_ids"]:
            masked_labels.append([-100 if token_id == pad_id else token_id for token_id in label])
        labels["input_ids"] = masked_labels

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Validation preprocessing
def preprocess_validation_function(examples):
  """Tokenize a batch of validation examples, keeping the link from each feature to its example.

  A long input can overflow `max_seq_length` and produce several features. The result
  therefore carries, for every feature, the id of the example it comes from ("example_id")
  and that example's labels ("labels"), in addition to the tokenizer outputs (including the
  offset mappings).

  The labels are always expanded to one row per feature. Previously this only happened
  inside the padding branch, so with padding disabled the labels stayed per example and
  could be shorter than the list of features.
  """
  inputs, targets = preprocess_squad_batch(examples)

  # inputs = [prefix + doc for doc in inputs]
  model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True,
                           return_overflowing_tokens=True,
                           return_offsets_mapping=True,)

  # Setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

  # Since one example might give us several features if it has a long context, we need a map from a feature to
  # its corresponding example. This key gives us just that.
  sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

  # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
  # padding in the loss.
  mask_padding = padding == "max_length" and ignore_pad_token_for_loss
  pad_token_id = tokenizer.pad_token_id

  example_ids = []
  feature_labels = []
  for sample_index in sample_mapping:
    # One example can give several spans, this is the index of the example containing this span of text.
    example_ids.append(examples["id"][sample_index])
    label = labels["input_ids"][sample_index]
    if mask_padding:
      label = [(l if l != pad_token_id else -100) for l in label]
    feature_labels.append(label)

  # For evaluation, we will need to map predictions back to their example, so we keep the example ids.
  model_inputs["example_id"] = example_ids
  model_inputs["labels"] = feature_labels
  return model_inputs

This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:

In [None]:
# preprocess_train_function(raw_datasets['train'][:2])

To apply these functions to all the examples in our dataset, we use the `map` method of the `Dataset` objects we created earlier. Each split is mapped with its own function: the training split with `preprocess_train_function` and the validation split with `preprocess_validation_function`.

In [None]:
train_dataset = raw_datasets["train"]
eval_examples = raw_datasets["validation"]

In [None]:
# Names of the raw columns ('id', 'input_ids', 'answers'); they are removed by `map` below so that
# only the outputs of preprocess_train_function remain.
column_names = raw_datasets["train"].column_names

# Create train feature from dataset
# NOTE(review): `main_process_first` presumably lets the main process run the mapping first in
# distributed runs so that the other processes reuse its cache — confirm against the transformers docs.
with training_args.main_process_first(desc="train dataset map pre-processing"):
  train_dataset = train_dataset.map(
      preprocess_train_function,
      batched=True,  # the preprocessing function receives batches of examples
      num_proc=None,
      remove_columns=column_names,
      load_from_cache_file=True,
      desc="Running tokenizer on train dataset",
      )

Running tokenizer on train dataset:   0%|          | 0/88 [00:00<?, ?ba/s]

In [None]:
# Names of the raw validation columns; they are removed by `map` below so that only the outputs of
# preprocess_validation_function remain.
column_names = raw_datasets["validation"].column_names

# Create validation features from the dataset. One example may yield several features when its
# input overflows the maximum length, so the result can have more rows than `eval_examples`.
with training_args.main_process_first(desc="validation dataset map pre-processing"):
  eval_dataset = eval_examples.map(
      preprocess_validation_function,
      batched=True,  # the preprocessing function receives batches of examples
      num_proc=None,
      remove_columns=column_names,
      load_from_cache_file=True,
      desc="Running tokenizer on validation dataset",
      )

Running tokenizer on validation dataset:   0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
# set format for pytorch
# The validation set additionally keeps 'example_id' and 'offset_mapping', which
# preprocess_validation_function stores for the evaluation step.
# NOTE(review): 'example_id' holds string ids — confirm how the torch format returns non-numeric columns.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels', 'example_id', 'offset_mapping'])

In [None]:
from datasets import load_from_disk

# Last path component of the checkpoint id, used as the per-model folder name on Drive.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# save: one sub-folder per split under <model_name>/tokenized_datasets/
for split_name, split_dataset in (("train", train_dataset), ("validation", eval_dataset)):
  tokenized_datasets_dir = f'/content/drive/MyDrive/{model_name}/tokenized_datasets/{split_name}/'
  split_dataset.save_to_disk(tokenized_datasets_dir)

# load: read both splits back from the folders written above
tokenized_datasets_dir = f'/content/drive/MyDrive/{model_name}/tokenized_datasets/train/'
train_dataset = load_from_disk(tokenized_datasets_dir)
tokenized_datasets_dir = f'/content/drive/MyDrive/{model_name}/tokenized_datasets/validation/'
eval_dataset = load_from_disk(tokenized_datasets_dir)

In [None]:
train_dataset, eval_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 87510
 }), Dataset({
     features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id', 'labels'],
     num_rows: 10884
 }))

In [None]:
eval_examples

Dataset({
    features: ['id', 'input_ids', 'answers'],
    num_rows: 10570
})

Even better, the results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to `map` has changed (and therefore that the cached data must not be reused). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files; you can pass `load_from_cache_file=False` in the call to `map` to ignore the cached files and force the preprocessing to be applied again.

Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently.

## Fine-tuning the model

Then, we need a special kind of data collator, which will not only pad the inputs to the maximum length in the batch, but also the labels:

In [None]:
# Data collator
# Labels are padded with -100 when `ignore_pad_token_for_loss` is set, otherwise with the
# tokenizer's pad token id (presumably so padded label positions are ignored by the loss — TODO confirm).
label_pad_token_id = -100 if ignore_pad_token_for_loss else tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    # Pad to a multiple of 8 only when fp16 training is enabled.
    pad_to_multiple_of=8 if training_args.fp16 else None,
    )

The last thing to define for our `QuestionAnsweringSeq2SeqTrainer` is how to compute the metrics from the predictions. We load the SQuAD `metric` and define a `compute_metrics` function that simply calls it; decoding the generated tokens into answer texts is handled separately by the post-processing function defined further below:

In [None]:
# SQuAD v2 scoring is only used when unanswerable questions are enabled.
metric = load_metric("squad" if not version_2_with_negative else "squad_v2")

def compute_metrics(p):
  """Score already-formatted predictions against their references with the loaded metric.

  `p.predictions` and `p.label_ids` are forwarded unchanged as the metric's
  `predictions` and `references`.
  """
  predictions, references = p.predictions, p.label_ids
  return metric.compute(predictions=predictions, references=references)

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

In [None]:
metric

Metric(name: "squad", features: {'predictions': {'id': Value(dtype='string', id=None), 'prediction_text': Value(dtype='string', id=None)}, 'references': {'id': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}}, usage: """
Computes SQuAD scores (F1 and EM).
Args:
    predictions: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair as given in the references (see below)
        - 'prediction_text': the text of the answer
    references: List of question-answers dictionaries with the following key-values:
        - 'id': id of the question-answer pair (see above),
        - 'answers': a Dict in the SQuAD dataset format
            {
                'text': list of possible texts for the answer, as a list of strings
                'answer_start': list of start positions for the answer, as a list of ints
   

In [None]:
# Post-processing:
def post_processing_function(examples, features, outputs, stage="eval"):
  """Decode generated token ids and format them for the SQuAD metric.

  Args:
    examples: the un-tokenized evaluation examples; each one must expose an "id"
      and the answers column named by the global `answer_column`.
    features: the tokenized features; each one must expose the "example_id" of
      the example it was built from.
    outputs: prediction output whose `predictions` attribute holds the generated
      token ids (or a tuple whose first element does).
    stage: not used by this implementation.

  Returns:
    An `EvalPrediction` whose `predictions` is a list of
    {"id", "prediction_text"} dicts (plus "no_answer_probability" when
    `version_2_with_negative` is set) and whose `label_ids` is a list of
    {"id", "answers"} reference dicts.
  """
  import numpy as np

  # Decode the predicted tokens.
  preds = outputs.predictions
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 may be used as a padding sentinel when predictions are gathered; it is not
  # a valid token id, so map it to the pad token before decoding.
  preds = np.where(np.asarray(preds) != -100, preds, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Build a map example to its corresponding features.
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  # When several features share the same example_id, the last one wins.
  feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
  predictions = {}
  # Let's loop over all the examples!
  for example_index, example in enumerate(examples):
    # This is the index of the feature associated to the current example.
    feature_index = feature_per_example[example_index]
    predictions[example["id"]] = decoded_preds[feature_index]

  # Format the result to the format the metric expects.
  if version_2_with_negative:
    formatted_predictions = [
                             {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
                             ]
  else:
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

  references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]

  return EvalPrediction(predictions=formatted_predictions, label_ids=references)

Then we just need to pass all of this along with our datasets to the `QuestionAnsweringSeq2SeqTrainer`:

In [None]:
from transformers.trainer_callback import EarlyStoppingCallback

# The early-stopping patience reuses the value of `save_total_limit`.
early_stopping_patience = save_total_limit

# Initialize our Trainer
# The train/eval inputs are only passed when the matching do_train / do_eval flag is set.
trainer = QuestionAnsweringSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if do_train else None, #.shard(num_shards=400, index=0)
    eval_dataset=eval_dataset if do_eval else None, #.shard(num_shards=400, index=0)
    eval_examples=eval_examples if do_eval else None, #.shard(num_shards=400, index=0)
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    post_process_function=post_processing_function,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
    )

Using cuda_amp half precision backend


We can now finetune our model by just calling the `train` method:

In [None]:
trainer.train()

***** Running training *****
  Num examples = 87510
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 3
  Total optimization steps = 72920
  Number of trainable parameters = 222903552
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Exact Match,F1
3000,0.7857,0.754915,61.958373,75.242643
6000,0.5469,0.722327,65.525071,77.577365
9000,0.4562,0.718141,66.310312,78.283942
12000,0.3971,0.724101,66.650899,78.753025
15000,0.3819,0.748548,67.076632,79.084087


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10884
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-3000
Configuration saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-3000/config.json
Model weights saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoi

In [None]:
# Resume training from the checkpoint saved at global step 12000.
dir_checkpoint = '/content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-12000'
trainer.train(resume_from_checkpoint=dir_checkpoint)

Loading model from /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-12000.
***** Running training *****
  Num examples = 87510
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 3
  Total optimization steps = 72920
  Number of trainable parameters = 222903552
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 12000
  Will skip the first 1 epochs then the first 14124 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/14124 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Exact Match,F1
15000,0.3819,0.748548,67.076632,79.084087
18000,0.3001,0.724814,67.038789,78.639421
21000,0.3041,0.743073,66.849574,79.238919
24000,0.2546,0.776795,67.152318,79.256883


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10884
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-15000
Configuration saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-15000/config.json
Model weights saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-15000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-15000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/chec

In [None]:
# Resume training from the checkpoint saved at global step 21000.
dir_checkpoint = '/content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-21000'
trainer.train(resume_from_checkpoint=dir_checkpoint)

Loading model from /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-21000).
***** Running training *****
  Num examples = 87510
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 3
  Total optimization steps = 72920
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 21000
  Will skip the first 2 epochs then the first 19248 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/19248 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Exact Match,F1
24000,0.2516,No log,67.275307,78.929923
27000,0.2375,No log,66.972564,79.333612
30000,0.2205,No log,66.915799,79.236574
33000,0.1826,No log,67.029328,78.964212
36000,0.1906,No log,66.982025,79.086125


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id, offset_mapping.
***** Running Evaluation *****
  Num examples = 10884
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000
Configuration saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000/config.json
Model weights saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr

TrainOutput(global_step=36000, training_loss=0.09023599921332465, metrics={'train_runtime': 12905.7852, 'train_samples_per_second': 67.807, 'train_steps_per_second': 5.65, 'total_flos': 1.9731331778347008e+17, 'train_loss': 0.09023599921332465, 'epoch': 4.94})

In [None]:
# save_steps = 3
# steps = 3000
# Resume training from the checkpoint saved at global step 24000.
dir_checkpoint = '/content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000'
trainer.train(resume_from_checkpoint=dir_checkpoint)

Loading model from /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-24000.
***** Running training *****
  Num examples = 87510
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 3
  Total optimization steps = 72920
  Number of trainable parameters = 222903552
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 24000
  Will skip the first 3 epochs then the first 6372 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/6372 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Exact Match,F1
27000,0.2211,0.80329,66.915799,79.07927
30000,0.2134,0.847947,66.641438,79.038896
33000,0.1888,0.827774,67.067171,79.271756
36000,0.1926,0.83366,67.218543,79.295909
39000,0.1583,0.893415,67.256386,79.414136
42000,0.1536,0.893946,67.265847,79.382121


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10884
  Batch size = 64


Saving model checkpoint to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000
Configuration saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000/config.json
Model weights saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-27000/spiece.model
Deleting older checkpoint [/content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-18000] due to args.save_total_limit
The following columns in the evaluation set don't have a c

## Evaluation of the model

In [None]:
# Generation settings. `num_beams` and `early_stopping` are passed to `generate` in the
# single-example cell below; `max_length` is reassigned in the later evaluation cell.
max_length=32
num_beams=1
early_stopping=True

In [None]:
# Checkpoint (global step 36000) whose weights are used for the evaluation below.
dir_checkpoint = '/content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-36000'

In [49]:
# NOTE(review): the recorded output shows this run was interrupted (KeyboardInterrupt) right
# after the checkpoint was loaded — presumably it is only used to load the weights; confirm.
trainer.train(resume_from_checkpoint=dir_checkpoint)

Loading model from /content/drive/MyDrive/ptt5-base-portuguese-vocab/checkpoints/e10_lr0.0001/checkpoint-36000.
***** Running training *****
  Num examples = 87510
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 3
  Total optimization steps = 72920
  Number of trainable parameters = 222903552
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 4
  Continuing training from global step 36000
  Will skip the first 4 epochs then the first 20496 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/20496 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

### Just one QA

In [50]:
# One hand-written example in the "question: ... context: ..." input format, with its expected answer.
input_text  = 'question: Quando foi descoberta a Covid-19? context: A pandemia de COVID-19, também conhecida como pandemia de coronavírus, é uma pandemia em curso de COVID-19, uma doença respiratória aguda causada pelo coronavírus da síndrome respiratória aguda grave 2 (SARS-CoV-2). A doença foi identificada pela primeira vez em Wuhan, na província de Hubei, República Popular da China, em 1 de dezembro de 2019, mas o primeiro caso foi reportado em 31 de dezembro do mesmo ano.'
label = '1 de dezembro de 2019'

# Put the inputs on the device the model actually lives on instead of assuming a GPU.
inputs = trainer.tokenizer(input_text, return_tensors="pt").to(trainer.model.device)

outputs = trainer.model.generate(inputs["input_ids"],
                             attention_mask=inputs["attention_mask"],
                             max_length=max_target_length,
                             num_beams=num_beams,
                             early_stopping=early_stopping
                            )
print('true answer |',label)
# Decode with the same tokenizer that encoded the input.
print('pred        |',trainer.tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

true answer | 1 de dezembro de 2019
pred        | 1 de dezembro de 2019


## Evaluation

In [52]:
import json

results = {}
# Fall back to the validation answer length / configured beam count when no explicit value is set.
max_length = (generation_max_length if generation_max_length is not None else val_max_answer_length)
num_beams = num_beams if num_beams is not None else generation_num_beams

if do_eval:
  print("*** Evaluate ***")
  metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval")
  max_eval_samples = max_eval_samples if max_eval_samples is not None else len(eval_dataset)
  metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

  trainer.log_metrics("eval", metrics)

  eval_dir = '/content/drive/MyDrive/' + str(model_name) + '/eval_metrics/' + folder_model
  Path(eval_dir).mkdir(parents=True, exist_ok=True)    #python 3.5 above

  # `Trainer.save_metrics` expects a split name (it writes "<split>_results.json" under
  # `args.output_dir`), not a directory, so call it with the split ...
  trainer.save_metrics("eval", metrics)
  # ... and write the metrics into the dedicated evaluation folder explicitly.
  with open(Path(eval_dir) / "eval_results.json", "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=4, sort_keys=True)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10884
  Batch size = 64


*** Evaluate ***


Step,Training Loss,Validation Loss,Exact Match,F1
36006,0.1926,0.852509,67.511826,79.38013
36006,0.1926,0.852509,67.511826,79.38013


***** eval metrics *****
  epoch            =    4.94
  eval_exact_match = 67.5118
  eval_f1          = 79.3801
  eval_samples     =   10884


In [53]:
metrics

{'eval_exact_match': 67.51182592242195,
 'eval_f1': 79.38013030241412,
 'epoch': 4.94,
 'eval_samples': 10884}

## Save the model locally

In [54]:
# Save the trainer's model under <model_name>/models/<folder_model> on Drive.
model_name = model_checkpoint.rsplit("/", 1)[-1]
model_dir = f'/content/drive/MyDrive/{model_name}/models/' + folder_model
trainer.save_model(output_dir=model_dir)

Saving model checkpoint to /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001
Configuration saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/config.json
Model weights saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/spiece.model


## Push the model to the Hugging Face model hub

In [68]:
# Intended model id on the Hugging Face hub.
# NOTE(review): the clone/push cells below target the repo 'GuiSales404/e10_lr0.0001'
# and never read this variable — confirm which repository name is intended.
model_name_hf = 'GuiSales404/t5-base-qa-squad-v1.1-portuguese'

### Method 2

In [69]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the tokenizer and model from the directory written by `trainer.save_model`.
# This rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/ptt5-base-portuguese-vocab/models/e10_lr0.0001",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",

1. [Creating a repository in the HF model hub](https://huggingface.co/docs/hub/adding-a-model#creating-a-repository)

2. [Clone your model repository](https://huggingface.co/docs/hub/adding-a-model#uploading-your-files)

In [70]:
# source: https://github.com/huggingface/transformers/issues/12572
from huggingface_hub import HfFolder
import os

# Expose the locally saved Hugging Face token to shell commands through the HF_AUTH variable.
hf_token = HfFolder().get_token()
if hf_token is None:
  # os.environ only accepts strings; fail with an actionable message instead of a TypeError.
  raise RuntimeError("No Hugging Face token found. Run notebook_login() before this cell.")
os.environ['HF_AUTH'] = hf_token

In [71]:
%cd /content

/content


In [74]:
# Clone the repo with authentication
# NOTE(review): the token from HF_AUTH is embedded in the clone URL, so it presumably ends up
# in the clone's stored remote URL — confirm, and avoid sharing that directory.
!git clone https://user:$HF_AUTH@huggingface.co/GuiSales404/e10_lr0.0001

Cloning into 'e10_lr0.0001'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


3. Add your files to the repository

In [76]:
# Copy the files directly inside `model_dir` (no -r, so sub-directories are not copied) into the cloned repo.
!cp {model_dir}/* /content/e10_lr0.0001

4. Commit and push your files

In [77]:
%cd /content/e10_lr0.0001

/content/e10_lr0.0001


In [82]:
! git config --global user.email "guilherme_sales@atlantico.com.br"
! git config --global user.name "Guilherme"

In [83]:
# Stage everything in the cloned repo, commit it and push it to the remote.
!git add .
!git commit -m "First model version"
!git push

[main e0c9b2f] First model version
 7 files changed, 129679 insertions(+)
 create mode 100644 config.json
 create mode 100644 pytorch_model.bin
 create mode 100644 special_tokens_map.json
 create mode 100644 spiece.model
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 training_args.bin
Git LFS: (3 of 3 files) 851.12 MB / 851.12 MB
Counting objects: 9, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (9/9), done.
Writing objects: 100% (9/9), 519.73 KiB | 5.20 MiB/s, done.
Total 9 (delta 1), reused 0 (delta 0)
remote: Scanning LFS files for validity, may be slow...[K
remote: LFS file scan complete.[K
To https://huggingface.co/GuiSales404/e10_lr0.0001
   6bfaf21..e0c9b2f  main -> main
