# Projeto: Dataset MS Marco Traduzido para Português
### Autores : Graziella Cardoso Bonadia e Matheus Gustavo Alves Sasso


In [1]:
#@title Configurações gerais
experiment_name = 'NaturalQuestionsPortuguese'  #@param {type:"string"}
model_name = 't5-small'  #@param ["t5-base","t5-small","t5-large"] {type:"string"}
TARGET_LANGUAGE = 'portuguese'  #@param {type:"string"}
SOURCE_LANGUAGE = 'english' #@param {type:"string"}
PREFIX = 'translate English to Portuguese: '

# Instalação de pacotes , imports e configurações gerais

### Pacotes Externos

In [2]:
! pip install pytorch-lightning==0.7.6 --quiet
! pip install transformers --quiet
! pip install nvidia-smi --quiet
! pip install ftfy --quiet
! pip install jsonlines --quiet
! pip install sacrebleu --quiet

### Funções auxiliares 

In [3]:
%%capture
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco.py
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco2.py

### Paths

Root

In [26]:
import os
data_base_dir = '/content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/MS_MARCO'
check_path = '/content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/Checkpoints_V2'

In [27]:
# NO CHUNCK
collections_path =  data_base_dir + '/df_collecions.csv'
queries_path =  data_base_dir + '/df_queries.csv'


### Imports


In [6]:
#Bibliotecas Padrão
import os
import random
from typing import Dict
from typing import List
from typing import Tuple
import re
import gzip
import math
import jsonlines
import pdb
import ftfy


#Bibliotecas Data Science
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
import numpy as np
import pandas as pd

#Bibliotecas Pytorch
import torch
from torch.utils.data import Dataset
from torch import Tensor
from torch.utils.data import DataLoader
import torch.nn.functional as F

#Bibliotecas Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

#Bibliotecas transfromers
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer

#import das funções que vem do github
import collections
import functools
import traceback
import sys
import os
from read_ms_marco2 import load_qrels
from read_ms_marco2 import load_queries
from read_ms_marco import load_collection
from read_ms_marco import load_doc2query
from read_ms_marco import load_triple
from read_ms_marco2 import load_txts_topk #load_txts_topk(folder,k=1,n=18,encoding="cp1252")

# Score Metrics
import sacrebleu

# Initialize tensorboard
print("\nInitialize=ing Tensorboard...\n")
%load_ext tensorboard

# Device
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu" 
print(dev)
device = torch.device(dev)


Initialize=ing Tensorboard...

cuda:0


### Fix seeds 

In [7]:
seed = 123
random.seed(seed)
# np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

### Montar o Drive

In [8]:
#Mount drive
print("\nMounting Drive...\n")
from google.colab import drive
drive.mount('/content/drive')


Mounting Drive...

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Impedir excesso de logs

In [9]:
import logging
logging.getLogger("transformers.configuration_utils").setLevel(logging.WARNING)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)
logging.getLogger("lightning").setLevel(logging.WARNING)

###Impedir quebra de memória

In [10]:
# https://docs.fast.ai/troubleshoot.html#memory-leakage-on-exception
import functools, traceback
def gpu_mem_restore(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except:
            type, val, tb = sys.exc_info()
            traceback.clear_frames(tb)
            raise type(val).with_traceback(tb) from None
    return wrapper


###Hardware Data

In [11]:
import nvidia_smi
import psutil
from multiprocessing import cpu_count
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print("\nGetting Hardware Statatus...\n")
def hardware_stats():
    '''
    Returns a dict containing some hardware related stats
    '''
    res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    return {"cpu": str(psutil.cpu_percent()) + '%',
            "mem": str(psutil.virtual_memory().percent) + '%',
            "gpu": str(res.gpu) + '%',
            "gpu_mem": str(res.memory) + '%'}

print(f"Imports loaded succesfully. Current GPU: {torch.cuda.get_device_name(0)}, number of CPU cores: {cpu_count()}")


Getting Hardware Statatus...

Imports loaded succesfully. Current GPU: Tesla P4, number of CPU cores: 2


In [12]:
print(f"Pytorch Lightning Version: {pl.__version__}")
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")
print(f"Number of CPU cores: {cpu_count()}")

def gpu_usage():
    global handle
    return str(nvidia_smi.nvmlDeviceGetUtilizationRates(handle).gpu) + '%'

Pytorch Lightning Version: 0.7.6
Device name: b'Tesla P4'
Number of CPU cores: 2


# Trabalhando o modelo

In [13]:
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242136741.0, style=ProgressStyle(descri…




In [14]:
extra_tokens = ['À' , 'È' , 'Ì' , 'Ò' , 'Ù' , 'à' , 'è' , 'ì' , 'ò' , 'ù' , 'Á' , 'É' , 'Í' , 'Ó' , 'Ú' , 'á' , 'é' , 'í' , 'ó' , 'ú' , 'Â' , 'Ê' , 'Î' , 'Ô' , 'Û' , 'â' , 'ê' , 'î' , 'ô' , 'û'  , 'Ã' , 'Õ'  , 'ã', 'õ' , 'Ë', 'ä' , 'ë' , 'ï' , 'ö' , 'ü']

In [15]:
tokenizer.add_tokens('não')

1

In [16]:
# tokenizer.add_tokens('<CHUNK>')

Alguns dos tokens de acentuação já existem no dataset. Checamos isso fazendo o encoding do token. Se existir, ele retorna um token x. Se não, retorna 2. Se for 2, a gente adiciona para o tokenizador

In [17]:
added_tokens = []
for tok in extra_tokens:
    enc = tokenizer.encode(tok)
    if 2 in enc:
        added_tokens.append(tok)
        tokenizer.add_tokens(tok)

In [18]:
def fix_accent_breaks(text):
    """
    A ideia é fazer a junção de letras com acento de volta para frases na validação.
    Isso serve para melhorar o BLEU.
    
    Args
        text: texto que terá acentuação corrigida
        
    Returns:
        Texto completo com acentuação corrigida
    
    
    
    """
    words = text.split(" ")
    out_words = []
    merge_pos = [idx for idx, dat in enumerate(words) if dat in added_tokens]
    for pos in sorted(merge_pos, reverse=True):
        if pos==0:
            new_word = words[pos]+words[pos+1]
            for i in range(2): words.pop
            words.pop(pos+1)
            words.pop(pos)        
            words.insert(pos, new_word)
        elif pos==len(words)-1:
            new_word = words[pos-1]+words[pos]
            words.pop(pos)
            words.pop(pos-1)
            words.insert(pos-1, new_word)
        else:
            new_word = words[pos-1]+words[pos]+words[pos+1]
            words.pop(pos+1)
            words.pop(pos)
            words.pop(pos-1)
            words.insert(pos-1, new_word)

    return " ".join(words)

# Etapas do paracrawl -> Treino




## Data Prep

In [19]:
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_train.tsv.gz
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_test.tsv.gz

--2020-06-24 22:14:35--  https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_train.tsv.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.194.128, 2404:6800:4003:c00::80
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.194.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 106548256 (102M) [text/tab-separated-values]
Saving to: ‘paracrawl_enpt_train.tsv.gz’


2020-06-24 22:14:40 (26.6 MB/s) - ‘paracrawl_enpt_train.tsv.gz’ saved [106548256/106548256]

--2020-06-24 22:14:42--  https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_test.tsv.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.194.128, 2404:6800:4003:c04::80
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.194.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2139168 (2.0M) [text/tab-separated-values]
Saving to: ‘par

In [20]:
def load_text_pairs(path):
    text_pairs = []
    for line in gzip.open(path, mode='rt'):
        text_pairs.append(line.strip().split('\t'))
    return text_pairs

ds_train = load_text_pairs('paracrawl_enpt_train.tsv.gz')
ds_test = load_text_pairs('paracrawl_enpt_test.tsv.gz')

# Embaralhamos o treino para depois fazermos a divisão treino/val.
random.shuffle(ds_train)

# Truncamos o dataset para 100k pares de treino e 5k pares de validação.
ds_val = ds_train[100000:105000]
ds_train = ds_train[:100000]

for set_name, x in [('treino', ds_train), ('validação',ds_val), ('test', ds_test)]:
    print(f'\n{len(x)} amostras de {set_name}')
    print(f'3 primeiras amostras {set_name}:')
    for i, (source, target) in enumerate(x[:3]):
        print(f'{i}: source: {source}\n   target: {target}')


100000 amostras de treino
3 primeiras amostras treino:
0: source: More Croatian words and phrases
   target: Mais palavras e frases em croata
1: source: Jerseys and pullovers, containing at least 50Â % by weight of wool and weighing 600Â g or more per article 6110 11 10 (PCE)
   target: Camisolas e pulôveres, com pelo menos 50 %, em peso, de lã e pesando 600g ou mais por unidade 6110 11 10 (PCE)
2: source: Atex Colombia SAS makes available its lead product, 100% natural liquid latex, excellent quality and price. ... Welding manizales caldas Colombia a DuckDuckGo
   target: Atex Colômbia SAS torna principal produto está disponível, látex líquido 100% natural, excelente qualidade e preço. ...

5000 amostras de validação
3 primeiras amostras validação:
0: source: «You have hidden these things from the wise and the learned you have revealed them to the childlike»
   target: «Escondeste estas coisas aos sábios e entendidos e as revelaste aos pequenos»
1: source: Repair of computers, applic

## Classe de dataset

In [21]:
class ParacrawlDataset(Dataset):
    def __init__(self, text_pairs: List[Tuple[str]], tokenizer,
                 source_max_length: int = 32, target_max_length: int = 32):
        self.tokenizer = tokenizer
        self.text_pairs = text_pairs
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.text_pairs)
    
    def __getitem__(self, idx):
        source, target = self.text_pairs[idx]
        source_modified = 'translate English to Portuguese: '+ source + self.tokenizer.eos_token
        target_modified = target + self.tokenizer.eos_token
        
        source_tok = tokenizer.batch_encode_plus([source_modified], add_special_tokens=True,
                                                max_length=self.source_max_length, pad_to_max_length = True, 
                                                return_tensors='pt')
        target_tok = tokenizer.batch_encode_plus([target_modified], add_special_tokens=True,
                                                max_length=self.target_max_length, pad_to_max_length = True, 
                                                return_tensors='pt')
        
        return (source_tok['input_ids'][0], source_tok['attention_mask'][0], target_tok['input_ids'][0], 
                target_tok['attention_mask'][0], source, target)

In [22]:
text_pairs = [('Dear friends, the history of this nation includes many examples of the Church’s commitment in this regard.',
               'Queridos amigos, a história desta Nação oferece numerosos exemplos do compromisso da Igreja a este propósito.')]
dataset_debug = ParacrawlDataset(
    text_pairs=text_pairs,
    tokenizer=tokenizer,
    source_max_length=50,
    target_max_length=50)

dataloader_debug = DataLoader(dataset_debug, batch_size=10, shuffle=True, 
                              num_workers=0)

source_token_ids, source_mask, target_token_ids, target_mask, _, _ = next(iter(dataloader_debug))
print('source_tokens:\n', tokenizer.tokenize(text_pairs[0][0]))
print('source_token_ids:\n', source_token_ids)
print('source_mask:\n', source_mask)
print('target_tokens:\n', tokenizer.tokenize(text_pairs[0][1]))
print('target_token_ids:\n', target_token_ids)
print('target_mask:\n', target_mask)

print('source_token_ids.shape:', source_token_ids.shape)
print('source_mask.shape:', source_mask.shape)
print('target_token_ids.shape:', target_token_ids.shape)
print('target_mask.shape:', target_mask.shape)

source_tokens:
 ['▁Dear', '▁friends', ',', '▁the', '▁history', '▁of', '▁this', '▁nation', '▁includes', '▁many', '▁examples', '▁of', '▁the', '▁Church', '’', 's', '▁commitment', '▁in', '▁this', '▁regard', '.']
source_token_ids:
 tensor([[13959,  1566,    12, 21076,    10, 19451,   803,     6,     8,   892,
            13,    48,  2982,   963,   186,  4062,    13,     8,  2345,    22,
             7,  3148,    16,    48,  3553,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
source_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])
target_tokens:
 ['▁Qu', 'er', 'idos', '▁', 'ami', 'go', 's', ',', '▁', 'a', '▁his', 't', 'ó', 'r', 'i', 'a', '▁de', 'sta', '▁Na', 'ç', 'ã', '▁', 'o', '▁of', 'er', 'e', 'ce', '▁numero'

# Etapa NaturalQuestions -> Inferência

## Data prep

In [28]:
collection_df = pd.read_csv(collections_path)
queries_df = pd.read_csv(queries_path)

In [29]:
collection_df.head()

Unnamed: 0,PID,PASSAGE
0,0,The presence of communication amid scientific ...
1,0,The only cloud hanging over the impressive ach...
2,1,The Manhattan Project and its atomic bomb help...
3,1,Its legacy of peaceful uses of atomic energy c...
4,2,Essay on The Manhattan Project - The Manhattan...


In [30]:
queries_df.head()

Unnamed: 0,QID,QUERY
0,1048578,cost of endless pools/swim spa
1,1048579,what is pcnt
2,1048580,what is pcb waste
3,1048581,what is pbis?
4,1048582,what is paysky


## Classe Data set

In [31]:
phrase = 'Eu não gostaria de fazer parte desta pandemia'
encoder = tokenizer.encode_plus(phrase,
                      max_length =15,
                      pad_to_max_length=True,
                      add_special_tokens = True)
print('TOKENIZAÇÃO: ',tokenizer.tokenize(phrase))
print('ENCODER: ',encoder)
print('DECODIFICANDO: ', tokenizer.decode(encoder['input_ids']))

TOKENIZAÇÃO:  ['▁Eu', '▁', 'n', 'ã', '▁', 'o', '▁go', 's', 'tari', 'a', '▁de', '▁', 'f', 'a', 'zer', '▁parte', '▁de', 'sta', '▁pan', 'd', 'emia']
ENCODER:  {'input_ids': [4491, 3, 29, 32120, 3, 32, 281, 7, 5310, 9, 20, 3, 89, 9, 2558], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
DECODIFICANDO:  Eu n ã o gostaria de fazer


In [32]:
collection_df_debug =collection_df
queries_df_debug = queries_df

Query

In [35]:
class QueryDataset(Dataset):
    def __init__(self, query,tokenizer,max_length,training_step=False):
        self.query = query
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.query)

    def __getitem__(self, idx):
        data = self.query.iloc[idx]
        qid = data['QID']
        query = data['QUERY']
        q_tok, q_mask, q_type = self.encode_plus(ftfy.fix_text(query))

        return  q_tok, q_mask, q_type, qid

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [37]:
queries_debug = QueryDataset(
    query = queries_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(queries_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

q_tok, q_mask, q_type, qid = next(iter(dataloader_debug))
print(qid)
print(q_tok)
print(q_mask)
print(qid)

tensor([1048581, 1048578])
tensor([[  125,    19,     3,   102, 11514,    58,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  583,    13,  9590, 14652,    87,     7,   210,   603,  4174,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([1048581, 1048578])


Passages

In [38]:
class CollectionsDataset(Dataset):
    def __init__(self, passages,tokenizer,max_length,training_step=False):
        self.passages = passages
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.passages)

    def __getitem__(self, idx):
        data = self.passages.iloc[idx]
        pid = data['PID']
        passages = data['PASSAGE']
        p_tok, p_mask, p_type = self.encode_plus(ftfy.fix_text(passages))

        return  p_tok, p_mask, p_type, pid

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [39]:
passages_debug = CollectionsDataset(
    passages = collection_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(passages_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

p_tok, p_mask, p_type, pid = next(iter(dataloader_debug))
print(pid)
print(p_tok)
print(p_mask)
print(p_type)

tensor([3, 4])
tensor([[   94,  2401,     7,  3346,    12,     8,  1059,    13,     8,   516,
            45,   957,   591,     3,   233,   204,  4481,  4448,   365,     8],
        [   37,  2126,    13,  5528,    11, 11523, 10179,    11,     8,   868,
         23907,  3684,     0,     0,     0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


# Hyperparâmetros

In [40]:
classification_specific = False #@param {type:"boolean"}
batch_size =  32#@param {type:"integer"}
max_epochs = 2 #@param {type:"integer"}
accumulate_grad_batches = 8  #@param {type:"integer"}
source_max_length = 64  #@param {type:"integer"}
target_max_length = 128  #@param {type:"integer"}
max_lenght_MSMARCO = 500  #@param {type:"integer"}
learning_rate = 5e-3  #@param {type:"number"}
context_size = 20

In [41]:
hyperparms = {'model_name':model_name,'tokenizer':tokenizer,'learning_rate':learning_rate,'batch_size':batch_size,'source_max_length':source_max_length,'target_max_length':target_max_length,'context_size':context_size} 

## Criando o BERT com Pytorch Lightning

In [42]:
class T5Finetuner(pl.LightningModule):

    def __init__(self, 
                 all_data, 
                 hyperparms,
                 criterion = torch.nn.CrossEntropyLoss(),
                 overfit=False):
      

        super(T5Finetuner, self).__init__()

        #---------- Hyperparametros
        self.model_name = hyperparms['model_name']
        self.tokenizer = hyperparms['tokenizer']
        self.learning_rate = hyperparms['learning_rate']
        self.batch_size = hyperparms['batch_size']
        self.target_max_length = hyperparms['target_max_length']
        self.source_max_length = hyperparms['source_max_length']
        self.overfit = overfit
        self.training = False

      
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)

       #---------- Carregamento datasets (Para eu poder variar self.max_length)
        if overfit:
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)          
        else:
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer, source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.valid_dataset = ParacrawlDataset(all_data[1], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.test_dataset = ParacrawlDataset(all_data[2], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
        #---------- Loss Function
        self.loss_funtion = criterion

        
    def forward(self, source_token_ids, source_mask, target_token_ids=None,target_mask=None, training=False):

      if training:
          #peguei a ideia da documentação oficial. Setar -100 pro pad faz ele ser ignorado no crossentropy.
          target_token_ids[target_token_ids == self.tokenizer.pad_token_id] = -100 
          
          loss = self.model(source_token_ids, attention_mask = source_mask, lm_labels = target_token_ids)
          return loss[0]

      else:
          #gerador de tokens de saída - GREEDY
          predicted_token_ids = self.model.generate(input_ids=source_token_ids, max_length=self.target_max_length)
          return predicted_token_ids


    def training_step(self, batch, batch_nb):
        # batch
        source_token_ids, source_mask, target_token_ids, target_mask, _, _ = batch
         
        # fwd
        loss = self.forward(source_token_ids, source_mask, target_token_ids, target_mask, training=True)

        # logs
        tensorboard_logs = {'train_loss': loss}
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'loss': loss, 'log': tensorboard_logs,
                'progress_bar': progress_bar}


    def validation_step(self, batch, batch_nb):
        source_token_ids, source_mask, target_token_ids, target_mask, source, refs = batch
        predict = self(source_token_ids, source_mask).permute(0,1)
        sys = [fix_accent_breaks(self.tokenizer.decode(tokens)) for tokens in predict]
        avg_bleu = sacrebleu.corpus_bleu(sys, [refs]).score
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'val_bleu': avg_bleu, 'progress_bar': progress_bar}


    def test_step(self, batch, batch_nb):
        source_token_ids, source_mask, target_token_ids, target_mask, source, refs = batch
        predict = self(source_token_ids, source_mask).permute(0,1)
        sys = [fix_accent_breaks(self.tokenizer.decode(tokens)) for tokens in predict]
        
        avg_bleu = sacrebleu.corpus_bleu(sys, [refs]).score
        
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'test_bleu': avg_bleu, 'progress_bar': progress_bar}

    def validation_epoch_end(self, outputs):
        avg_bleu = sum([x['val_bleu'] for x in outputs]) / len(outputs)
        print("Avg Bleu val", avg_bleu)
        tensorboard_logs = {'avg_val_bleu': avg_bleu}
        
        return {'avg_val_bleu': avg_bleu, 'progress_bar': tensorboard_logs}


    def test_epoch_end(self, outputs):
        avg_bleu = sum([x['test_bleu'] for x in outputs]) / len(outputs)

        tensorboard_logs = {'avg_test_bleu': avg_bleu}

    def configure_optimizers(self):
        return torch.optim.AdamW(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)
    
    @gpu_mem_restore
    def train_dataloader(self):
        shuffle = False if self.overfit else True
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=shuffle,num_workers=cpu_count())
    
    @gpu_mem_restore
    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, shuffle=False,num_workers=cpu_count())
    
    @gpu_mem_restore
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size,shuffle=False, num_workers=cpu_count())

# Etapas de Treino e Avaliação

### Testando rapidamente o modelo em treino, validação e teste com um batch

Recuperando o número de parâmetros

In [44]:
all_data = [ds_train,ds_val,ds_test]
model = T5Finetuner(all_data,hyperparms) 
num_params = sum([torch.tensor(x.size()).prod() for x in model.parameters() if x.requires_grad]) # trainable parameters
print(num_params)

In [None]:
trainer = pl.Trainer(gpus=1, 
                     checkpoint_callback=False,  # Disable checkpoint saving.
                     fast_dev_run=True)
trainer.fit(model)

No environment variable for node rank defined. Set as 0.


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Avg Bleu val 0.7478088493046045



1

In [None]:
del model

### Treinamento e Validação no dataset todo

In [46]:
# checkpoint_path = check_path + '/epoch=0.ckpt'

checkpoint_path = check_path + '/epoch=14.ckpt'

checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_dir,
                                      save_top_k=-1,
                                      monitor="val_acc",
                                      mode="max")  # Keeps all checkpoints.


resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path


all_data = [ds_train,ds_val,ds_test]
trainer = pl.Trainer(gpus=1,
                     max_epochs=15,
                     check_val_every_n_epoch=1,
                     profiler=True,
                     checkpoint_callback=checkpoint_callback,
                     accumulate_grad_batches=accumulate_grad_batches,
                     progress_bar_refresh_rate=10,
                     resume_from_checkpoint=resume_from_checkpoint)
#  checkpoint_callback=checkpoint_callback,
model = T5Finetuner(all_data,hyperparms) 

trainer.fit(model)

No environment variable for node rank defined. Set as 0.


Files in /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/Checkpoints_V2: ['epoch=0.ckpt', 'epoch=0_v0.ckpt', 'epoch=1.ckpt', 'epoch=2.ckpt', 'epoch=3.ckpt', 'epoch=4.ckpt', 'epoch=5.ckpt', 'epoch=6.ckpt', 'epoch=7.ckpt', 'epoch=8.ckpt', 'epoch=9.ckpt', 'epoch=10.ckpt', 'epoch=11.ckpt', 'epoch=12.ckpt', 'epoch=13.ckpt', 'epoch=14.ckpt']
Saving checkpoints to /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/Checkpoints_V2
Restoring checkpoint: /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/Checkpoints_V2/epoch=14.ckpt




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Avg Bleu val 41.453424467921664




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

## Colocando no formato do MS MARCO

In [72]:
checkpoint_path = check_path + '/epoch=14.ckpt'
model = T5Finetuner.load_from_checkpoint(checkpoint_path = checkpoint_path,
                                         all_data = all_data,
                                         hyperparms=hyperparms).to(device)

### Query

In [73]:
queries_dataset = QueryDataset(
  query = queries_df,
  tokenizer=tokenizer,
  max_length=max_lenght_MSMARCO)


queries_dataloader = DataLoader(queries_dataset, batch_size=batch_size,num_workers=cpu_count())

In [63]:
ids = []
queries_portuguese = []


for batch in queries_dataloader:
    #qids
    ids.extend(batch[3].detach().cpu().tolist())    
    #query
    out = model(batch[0].to(device), batch[1].to(device))
    portuguese_translations = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    queries_portuguese.extend(portuguese_translations)


In [64]:
queries_df_portuguese = pd.DataFrame({'QID':ids, 'QUERY': queries_portuguese})

In [65]:
queries_df_portuguese.head()

Unnamed: 0,QID,QUERY
0,1048578,spa spa spa cu cu cu cu cu cu cu de pools infi...
1,1048579,o que é pcntt o que é pcntt o que é pcntt o qu...
2,1048580,pcb desperdi o que é pcb de desperdi o que é p...
3,1048581,O que é pbis?? o que é pbis?? o que é pbis?? o...
4,1048582,paysky O que é paysky O que é paysky O que é p...


In [66]:
compression_opts = dict(method='zip', archive_name='queries_df_portuguese.csv') 
queries_df_portuguese.to_csv('queries_df_portuguese.zip', index=False,compression=compression_opts)

### Collection

In [67]:
passages_dataset = CollectionsDataset(
    passages = collection_df,
    tokenizer=tokenizer,
    max_length=max_lenght_MSMARCO)

passages_dataloader = DataLoader(passages_dataset, batch_size=batch_size,num_workers=cpu_count())

In [68]:
ids = []
collection_portuguese = []


for batch in passages_dataloader:
    #ids
    ids.extend(batch[3].detach().cpu().tolist())    
    #passage
    out = model(batch[0].to(device), batch[1].to(device))
    portuguese_translations = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    collection_portuguese.extend(portuguese_translations)


In [69]:
collection_df_portuguese = pd.DataFrame({'PID': ids, 'PASSAGE': collection_portuguese})

In [71]:
collection_df_portuguese.head()

Unnamed: 0,PID,PASSAGE
0,0,A presena de comunicaço entre mentes cientient...
1,0,Aúnica nu so sobre a impressionante realiza re...
2,1,O Pro Pro Pro The Manhattan e sua bomba at atm...
3,1,Sua her legada de usos pacos de usos pacoos da...
4,2,O projet Manhattan Project A Manhattan Project...


In [57]:
compression_opts = dict(method='zip', archive_name='collection_df_portuguese.csv') 
collection_df_portuguese.to_csv('collection_df_portuguese.zip', index=False,compression=compression_opts)