# Projeto: Dataset MS Marco Traduzido para Português
### Autores : Graziella Cardoso Bonadia e Matheus Gustavo Alves Sasso


In [1]:
#@title Configurações gerais
experiment_name = 'NaturalQuestionsPortuguese'  #@param {type:"string"}
model_name = 't5-small'  #@param ["t5-base","t5-small","t5-large"] {type:"string"}
TARGET_LANGUAGE = 'portuguese'  #@param {type:"string"}
SOURCE_LANGUAGE = 'english' #@param {type:"string"}
PREFIX = 'translate English to Portuguese: '

# Instalação de pacotes , imports e configurações gerais

### Pacotes Externos

In [2]:
! pip install pytorch-lightning==0.7.6 --quiet
! pip install transformers --quiet
! pip install nvidia-smi --quiet
! pip install ftfy --quiet
! pip install jsonlines --quiet
! pip install sacrebleu --quiet

### Funções auxiliares 

In [3]:
%%capture
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco.py
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco2.py

### Paths

Root

In [4]:
import os
# data_base_dir = '/content/drive/My Drive/Mestrado/PLN/Projeto/Data/Traducao/Natural_Questions' #Matheus
#data_base_dir = '/content/drive/My Drive/Projeto/Data/Traducao/Natural_Questions'
data_base_dir = '/content/drive/My Drive/Natural_Questions'
# data_base_dir = '/content/drive/My Drive/Projeto/Data/Traducao/Natural_Questions' #Graziella
# check_path = '/content/drive/My Drive/Projeto/Data/Traducao/checkpoints'
check_path = '/content/drive/My Drive/Natural_Questions'

In [5]:
# NO CHUNCK
nq_doct_text_path =  data_base_dir + '/nq_doc_text.csv'
nq_long_answer_path =  data_base_dir + '/nq_long_answer.csv'
nq_short_answers_path =  data_base_dir + '/nq_short_answer.csv'
nq_question_path =  data_base_dir + '/nq_question.csv'
nq_infos_path =  data_base_dir + '/nq_infos.csv'

# NO CHUNCK
nq_doct_text_chunk_path =  data_base_dir + '/nq_doc_text_chunk.csv'
nq_long_answer_chunk_path =  data_base_dir + '/nq_long_answer_chunk.csv'
nq_short_answers_chunk_path =  data_base_dir + '/nq_short_answer_chunk.csv'
nq_question_chunk_path =  data_base_dir + '/nq_question_chunk.csv'
nq_infos_chunk_path =  data_base_dir + '/nq_infos_chunk.csv'

### Imports


In [6]:
#Bibliotecas Padrão
import os
import random
from typing import Dict
from typing import List
from typing import Tuple
import re
import gzip
import math
import jsonlines
import pdb
import ftfy


#Bibliotecas Data Science
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
import numpy as np
import pandas as pd

#Bibliotecas Pytorch
import torch
from torch.utils.data import Dataset
from torch import Tensor
from torch.utils.data import DataLoader
import torch.nn.functional as F

#Bibliotecas Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

#Bibliotecas transfromers
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer

#import das funções que vem do github
import collections
import functools
import traceback
import sys
import os
from read_ms_marco2 import load_qrels
from read_ms_marco2 import load_queries
from read_ms_marco import load_collection
from read_ms_marco import load_doc2query
from read_ms_marco import load_triple
from read_ms_marco2 import load_txts_topk #load_txts_topk(folder,k=1,n=18,encoding="cp1252")

# Score Metrics
import sacrebleu

# Initialize tensorboard
print("\nInitialize=ing Tensorboard...\n")
%load_ext tensorboard

# Device
if torch.cuda.is_available(): 
   dev = "cuda:0"
else: 
   dev = "cpu" 
print(dev)
device = torch.device(dev)


Initialize=ing Tensorboard...

cuda:0


### Fix seeds 

In [7]:
seed = 123
random.seed(seed)
# np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

### Montar o Drive

In [8]:
#Mount drive
print("\nMounting Drive...\n")
from google.colab import drive
drive.mount('/content/drive')


Mounting Drive...

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Impedir excesso de logs

In [9]:
import logging
logging.getLogger("transformers.configuration_utils").setLevel(logging.WARNING)
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)
logging.getLogger("lightning").setLevel(logging.WARNING)

###Impedir quebra de memória

In [10]:
# https://docs.fast.ai/troubleshoot.html#memory-leakage-on-exception
import functools, traceback
def gpu_mem_restore(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except:
            type, val, tb = sys.exc_info()
            traceback.clear_frames(tb)
            raise type(val).with_traceback(tb) from None
    return wrapper


###Hardware Data

In [11]:
import nvidia_smi
import psutil
from multiprocessing import cpu_count
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print("\nGetting Hardware Statatus...\n")
def hardware_stats():
    '''
    Returns a dict containing some hardware related stats
    '''
    res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    return {"cpu": str(psutil.cpu_percent()) + '%',
            "mem": str(psutil.virtual_memory().percent) + '%',
            "gpu": str(res.gpu) + '%',
            "gpu_mem": str(res.memory) + '%'}

print(f"Imports loaded succesfully. Current GPU: {torch.cuda.get_device_name(0)}, number of CPU cores: {cpu_count()}")


Getting Hardware Statatus...

Imports loaded succesfully. Current GPU: Tesla K80, number of CPU cores: 2


In [12]:
print(f"Pytorch Lightning Version: {pl.__version__}")
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")
print(f"Number of CPU cores: {cpu_count()}")

def gpu_usage():
    global handle
    return str(nvidia_smi.nvmlDeviceGetUtilizationRates(handle).gpu) + '%'

Pytorch Lightning Version: 0.7.6
Device name: b'Tesla K80'
Number of CPU cores: 2


# Trabalhando o modelo

In [13]:
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [14]:
extra_tokens = ['À' , 'È' , 'Ì' , 'Ò' , 'Ù' , 'à' , 'è' , 'ì' , 'ò' , 'ù' , 'Á' , 'É' , 'Í' , 'Ó' , 'Ú' , 'á' , 'é' , 'í' , 'ó' , 'ú' , 'Â' , 'Ê' , 'Î' , 'Ô' , 'Û' , 'â' , 'ê' , 'î' , 'ô' , 'û'  , 'Ã' , 'Õ'  , 'ã', 'õ' , 'Ë', 'ä' , 'ë' , 'ï' , 'ö' , 'ü']

In [15]:
tokenizer.add_tokens('não')

1

In [16]:
# tokenizer.add_tokens('<CHUNK>')

Alguns dos tokens de acentuação já existem no dataset. Checamos isso fazendo o encoding do token. Se existir, ele retorna um token x. Se não, retorna 2. Se for 2, a gente adiciona para o tokenizador

In [17]:
added_tokens = []
for tok in extra_tokens:
    enc = tokenizer.encode(tok)
    if 2 in enc:
        added_tokens.append(tok)
        tokenizer.add_tokens(tok)

In [18]:
def fix_accent_breaks(text):
    """
    A ideia é fazer a junção de letras com acento de volta para frases na validação.
    Isso serve para melhorar o BLEU.
    
    Args
        text: texto que terá acentuação corrigida
        
    Returns:
        Texto completo com acentuação corrigida
    
    
    
    """
    words = text.split(" ")
    out_words = []
    merge_pos = [idx for idx, dat in enumerate(words) if dat in added_tokens]
    for pos in sorted(merge_pos, reverse=True):
        if pos==0:
            new_word = words[pos]+words[pos+1]
            for i in range(2): words.pop
            words.pop(pos+1)
            words.pop(pos)        
            words.insert(pos, new_word)
        elif pos==len(words)-1:
            new_word = words[pos-1]+words[pos]
            words.pop(pos)
            words.pop(pos-1)
            words.insert(pos-1, new_word)
        else:
            new_word = words[pos-1]+words[pos]+words[pos+1]
            words.pop(pos+1)
            words.pop(pos)
            words.pop(pos-1)
            words.insert(pos-1, new_word)

    return " ".join(words)

# Etapas do paracrawl -> Treino




## Data Prep

In [19]:
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_train.tsv.gz
! wget -nc https://storage.googleapis.com/neuralresearcher_data/unicamp/ia376e_2020s1/paracrawl_enpt_test.tsv.gz

File ‘paracrawl_enpt_train.tsv.gz’ already there; not retrieving.

File ‘paracrawl_enpt_test.tsv.gz’ already there; not retrieving.



In [20]:
def load_text_pairs(path):
    text_pairs = []
    for line in gzip.open(path, mode='rt'):
        text_pairs.append(line.strip().split('\t'))
    return text_pairs

ds_train = load_text_pairs('paracrawl_enpt_train.tsv.gz')
ds_test = load_text_pairs('paracrawl_enpt_test.tsv.gz')
print(ds_test[:3])
# Embaralhamos o treino para depois fazermos a divisão treino/val.
random.shuffle(ds_train)

# Truncamos o dataset para 100k pares de treino e 5k pares de validação.
ds_val = ds_train[100000:101000]
ds_train = ds_train[:1000]

for set_name, x in [('treino', ds_train), ('validação',ds_val), ('test', ds_test)]:
    print(f'\n{len(x)} amostras de {set_name}')
    print(f'3 primeiras amostras {set_name}:')
    for i, (source, target) in enumerate(x[:3]):
        print(f'{i}: source: {source}\n   target: {target}')

[['In this way, the civil life of a nation matures, making it possible for all citizens to enjoy the fruits of genuine tolerance and mutual respect.', 'Deste modo, a vida civil de uma nação amadurece, fazendo com que todos os cidadãos gozem dos frutos da tolerância genuína e do respeito mútuo.'], ['1999 XIII. Winnipeg, Canada July 23 to August 8', '1999 XIII. Winnipeg, Canadá 23 de julho a 8 de agosto'], ["In the mystery of Christmas, Christ's light shines on the earth, spreading, as it were, in concentric circles.", 'No mistério do Natal, a luz de Cristo irradia-se sobre a terra, difundindo-se como círculos concêntricos.']]

1000 amostras de treino
3 primeiras amostras treino:
0: source: More Croatian words and phrases
   target: Mais palavras e frases em croata
1: source: Jerseys and pullovers, containing at least 50Â % by weight of wool and weighing 600Â g or more per article 6110 11 10 (PCE)
   target: Camisolas e pulôveres, com pelo menos 50 %, em peso, de lã e pesando 600g ou mai

## Classe de dataset

In [21]:
class ParacrawlDataset(Dataset):
    def __init__(self, text_pairs: List[Tuple[str]], tokenizer,
                 source_max_length: int = 32, target_max_length: int = 32):
        self.tokenizer = tokenizer
        self.text_pairs = text_pairs
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        
    def __len__(self):
        return len(self.text_pairs)
    
    def __getitem__(self, idx):
        source, target = self.text_pairs[idx]
        source_modified = 'translate English to Portuguese: '+ source + self.tokenizer.eos_token
        target_modified = target + self.tokenizer.eos_token
        
        source_tok = tokenizer.batch_encode_plus([source_modified], add_special_tokens=True,
                                                max_length=self.source_max_length, pad_to_max_length = True, 
                                                return_tensors='pt')
        target_tok = tokenizer.batch_encode_plus([target_modified], add_special_tokens=True,
                                                max_length=self.target_max_length, pad_to_max_length = True, 
                                                return_tensors='pt')
        
        return (source_tok['input_ids'][0], source_tok['attention_mask'][0], target_tok['input_ids'][0], 
                target_tok['attention_mask'][0], source, target)

In [22]:
text_pairs = [('Dear friends, the history of this nation includes many examples of the Church’s commitment in this regard.',
               'Queridos amigos, a história desta Nação oferece numerosos exemplos do compromisso da Igreja a este propósito.')]
dataset_debug = ParacrawlDataset(
    text_pairs=text_pairs,
    tokenizer=tokenizer,
    source_max_length=50,
    target_max_length=50)

dataloader_debug = DataLoader(dataset_debug, batch_size=10, shuffle=True, 
                              num_workers=0)

source_token_ids, source_mask, target_token_ids, target_mask, _, _ = next(iter(dataloader_debug))
print('source_tokens:\n', tokenizer.tokenize(text_pairs[0][0]))
print('source_token_ids:\n', source_token_ids)
print('source_mask:\n', source_mask)
print('target_tokens:\n', tokenizer.tokenize(text_pairs[0][1]))
print('target_token_ids:\n', target_token_ids)
print('target_mask:\n', target_mask)

print('source_token_ids.shape:', source_token_ids.shape)
print('source_mask.shape:', source_mask.shape)
print('target_token_ids.shape:', target_token_ids.shape)
print('target_mask.shape:', target_mask.shape)

source_tokens:
 ['▁Dear', '▁friends', ',', '▁the', '▁history', '▁of', '▁this', '▁nation', '▁includes', '▁many', '▁examples', '▁of', '▁the', '▁Church', '’', 's', '▁commitment', '▁in', '▁this', '▁regard', '.']
source_token_ids:
 tensor([[13959,  1566,    12, 21076,    10, 19451,   803,     6,     8,   892,
            13,    48,  2982,   963,   186,  4062,    13,     8,  2345,    22,
             7,  3148,    16,    48,  3553,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
source_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])
target_tokens:
 ['▁Qu', 'er', 'idos', '▁', 'ami', 'go', 's', ',', '▁', 'a', '▁his', 't', 'ó', 'r', 'i', 'a', '▁de', 'sta', '▁Na', 'ç', 'ã', '▁', 'o', '▁of', 'er', 'e', 'ce', '▁numero'

# Etapa NaturalQuestions -> Inferência

## Data prep

In [23]:
doct_text_df = pd.read_csv(nq_doct_text_path)
long_answer_df = pd.read_csv(nq_long_answer_path)
short_answers_df = pd.read_csv(nq_short_answers_path)
question_df = pd.read_csv(nq_question_path)
infos_df = pd.read_csv(nq_infos_path)

In [24]:
question_df.head()


Unnamed: 0,ID,QUESTION
0,5655493461695504401,which is the most common use of opt-in e-mail ...
1,5328212470870865242,how i.met your mother who is the mother
2,4435104480114867852,what type of fertilisation takes place in humans
3,5289242154789678439,who had the most wins in the nfl
4,5489863933082811018,what happened to the lost settlement of roanoke


In [25]:
question_df.head()

Unnamed: 0,ID,QUESTION
0,5655493461695504401,which is the most common use of opt-in e-mail ...
1,5328212470870865242,how i.met your mother who is the mother
2,4435104480114867852,what type of fertilisation takes place in humans
3,5289242154789678439,who had the most wins in the nfl
4,5489863933082811018,what happened to the lost settlement of roanoke


In [26]:
Q = question_df['QUESTION'].tolist()

In [27]:
text_pairs_question = []
for i in range(len(Q)):
    pairs = [Q[i], Q[i]]
    text_pairs_question.append(pairs)

In [28]:
ds_test = text_pairs_question

In [29]:
print(ds_test)

[['which is the most common use of opt-in e-mail marketing', 'which is the most common use of opt-in e-mail marketing'], ['how i.met your mother who is the mother', 'how i.met your mother who is the mother'], ['what type of fertilisation takes place in humans', 'what type of fertilisation takes place in humans'], ['who had the most wins in the nfl', 'who had the most wins in the nfl'], ['what happened to the lost settlement of roanoke', 'what happened to the lost settlement of roanoke'], ['what are the different regions of africa and how do they differ', 'what are the different regions of africa and how do they differ']]


In [30]:
# doct_text_df = pd.read_csv(nq_doct_text_chunck_path)
# long_answer_df = pd.read_csv(nq_long_answer_chunck_path)
# short_answers_df = pd.read_csv(nq_short_answers_chunck_path)
# question_df = pd.read_csv(nq_question_chunck_path)
# infos_df = pd.read_csv(nq_infos_chunck_path)

In [31]:
#doct_text_df.head()

In [32]:
#long_answer_df.head()

In [33]:
#short_answers_df.head()

In [34]:
#question_df.head()

In [35]:
#infos_df.head()

## Classe Data set

In [36]:
phrase = 'Eu não gostaria de fazer parte desta pandemia'
encoder = tokenizer.encode_plus(phrase,
                      max_length =15,
                      pad_to_max_length=True,
                      add_special_tokens = True)
print('TOKENIZAÇÃO: ',tokenizer.tokenize(phrase))
print('ENCODER: ',encoder)
print('DECODIFICANDO: ', tokenizer.decode(encoder['input_ids']))

TOKENIZAÇÃO:  ['▁Eu', '▁', 'n', 'ã', '▁', 'o', '▁go', 's', 'tari', 'a', '▁de', '▁', 'f', 'a', 'zer', '▁parte', '▁de', 'sta', '▁pan', 'd', 'emia']
ENCODER:  {'input_ids': [4491, 3, 29, 32120, 3, 32, 281, 7, 5310, 9, 20, 3, 89, 9, 2558], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
DECODIFICANDO:  Eu n ã o gostaria de fazer


In [37]:
doct_text_df_debug =doct_text_df[:10]
long_answer_df_debug = long_answer_df[:10]
short_answers_df_debug = short_answers_df[:10]
question_df_debug = question_df[:10]

In [38]:
class QuestionDataset(Dataset):
    def __init__(self, question,tokenizer,source_max_length: int = 32, target_max_length: int = 32,training_step=False):
        self.question = question
        self.tokenizer = tokenizer
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
    # def __init__(self, question,tokenizer,max_length,training_step=False):
    #     self.question = question
    #     self.max_length = max_length
    #     self.tokenizer = tokenizer

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        data = self.question.iloc[idx]
        id_example = data['ID']
        question = data['QUESTION']
        q_tok, q_mask, q_type = self.encode_plus(ftfy.fix_text(question))

        return  q_tok, q_mask, q_type, id_example

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [39]:
question_debug = QuestionDataset(
    question = question_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(question_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

q_tok, q_mask, q_type, id_example = next(iter(dataloader_debug))
print(id_example)
print(q_tok)
print(q_mask)
print(q_type)


TypeError: ignored

In [58]:
class LongAnswerDataset(Dataset):
    def __init__(self, long_answer,tokenizer,max_length,training_step=False):
        self.long_answer = long_answer
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return 5

    def __getitem__(self, idx):
        data = self.long_answer.iloc[idx]
        id_example = data['ID']
        long_answer = data['LONG_ANSWER']
        start = data['START_LA']
        end = data['END_LA']
        la_tok, la_mask, la_type = self.encode_plus(ftfy.fix_text(long_answer))

        return  la_tok, la_mask, la_type,id_example,start,end

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [59]:
la_debug = LongAnswerDataset(
    long_answer = long_answer_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(la_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

la_tok, la_mask, la_type,id_example,start,end= next(iter(dataloader_debug))

print(id_example)
print(start)
print(end)
print(la_tok)
print(la_mask)
print(la_type)

tensor([5655493461695504401, 5328212470870865242])
tensor([1718,  132])
tensor([1783,  228])
tensor([[   71,  1017,   677,    13,  6059,  1070,    19,     3,     9,  7288,
          1622,    12,    46,  3662,  1669,     3,    31,     7,   722,     3],
        [29464,  7040,   106, 10361,     3,     6,   394,   801,    38,     3,
             2,    37,  8007,     3,    31,    31,     3,     6,    19,     8]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [60]:
class ShortAnswerDataset(Dataset):
    def __init__(self, short_answer,tokenizer,max_length,training_step=False):
        self.short_answer = short_answer
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.short_answer)

    def __getitem__(self, idx):
        data = self.short_answer.iloc[idx]
        id_example = data['ID']
        short_answer = data['SHORT_ANSWER']
        start = data['START_SA']
        end = data['END_SA']
        sa_tok, sa_mask, sa_type = self.encode_plus(ftfy.fix_text(short_answer))
        return  sa_tok, sa_mask, sa_type,id_example,start,end

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [61]:
sa_debug = ShortAnswerDataset(
    short_answer = short_answers_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(sa_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

sa_tok, sa_mask, sa_type,id_example,start,end= next(iter(dataloader_debug))
print(id_example)
print(start)
print(end)
print(sa_tok)
print(sa_mask)
print(sa_type)

tensor([5655493461695504401, 5328212470870865242])
tensor([1725,  132])
tensor([1734,  134])
tensor([[    3,     9,  7288,  1622,    12,    46,  3662,  1669,     3,    31,
             7,   722,     0,     0,     0,     0,     0,     0,     0,     0],
        [29464,  7040,   106, 10361,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [40]:
class DocTextDataset(Dataset):
    def __init__(self, doc_text,tokenizer,max_length,training_step=False):
        self.doc_text = doc_text
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.doc_text)

    def __getitem__(self, idx):
        data = self.doc_text.iloc[idx]
        id_example = data['ID']
        doc_text = data['DOC_TEXT']
        dt_tok, dt_mask, dt_type = self.encode_plus(ftfy.fix_text(doc_text))

        return  dt_tok, dt_mask, dt_type,id_example

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)
        return tok,mask,tok_type

In [63]:
doc_debug = DocTextDataset(
    doc_text = doct_text_df_debug,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(doc_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

dt_tok, dt_mask, dt_type,id_example= next(iter(dataloader_debug))
print(id_example)
print(dt_tok)
print(dt_mask)
print(dt_type)

tensor([5655493461695504401, 5655493461695504401])
tensor([[ 1762,  1230,     3,    61,    41,  4001,   149,    11,   116,    12,
          2036,    48,  3847,  1569,     3,    61,    41,  4001,   149,    11],
        [  863,   199,  1172,    48,  1108,    57,  2651,     3, 13903,     7,
            12,  3468,  2836,     3,     5,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


# Hyperparâmetros

In [41]:
classification_specific = False #@param {type:"boolean"}
batch_size =  32#@param {type:"integer"}
max_epochs = 2 #@param {type:"integer"}
accumulate_grad_batches = 8  #@param {type:"integer"}
source_max_length = 64  #@param {type:"integer"}
target_max_length = 128  #@param {type:"integer"}
max_lenght_NQ = 500  #@param {type:"integer"}
learning_rate = 5e-3  #@param {type:"number"}
context_size = 20

In [42]:
hyperparms = {'model_name':model_name,'tokenizer':tokenizer,'learning_rate':learning_rate,'batch_size':batch_size,'source_max_length':source_max_length,'target_max_length':target_max_length,'context_size':context_size,'max_lenght_NQ':max_lenght_NQ} 

## Criando o BERT com Pytorch Lightning

In [43]:
class T5Finetuner(pl.LightningModule):

    def __init__(self, 
                 all_data, 
                 hyperparms,
                 criterion = torch.nn.CrossEntropyLoss(),
                 overfit=False):
      

        super(T5Finetuner, self).__init__()

        #---------- Hyperparametros
        self.model_name = hyperparms['model_name']
        self.tokenizer = hyperparms['tokenizer']
        self.learning_rate = hyperparms['learning_rate']
        self.batch_size = hyperparms['batch_size']
        self.target_max_length = hyperparms['target_max_length']
        self.source_max_length = hyperparms['source_max_length']
        self.max_lenght_NQ = hyperparms['max_lenght_NQ']
        self.overfit = overfit
        self.training = False

      
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)

       #---------- Carregamento datasets (Para eu poder variar self.max_length)
        if overfit:
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)          
          # self.valid_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer, source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          # self.test_dataset =   ParacrawlDataset(all_data[0], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
        else:
          self.train_dataset = ParacrawlDataset(all_data[0], tokenizer=self.tokenizer, source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.valid_dataset = ParacrawlDataset(all_data[1], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
          self.test_dataset = ParacrawlDataset(all_data[2], tokenizer=self.tokenizer,source_max_length = self.source_max_length, target_max_length =  self.target_max_length)

          #self.valid_dataset = QuestionDataset(all_data[1], tokenizer=self.tokenizer, max_lenght_NQ =  self.max_lenght_NQ)
          #self.test_dataset = QuestionDataset(all_data[2],  tokenizer=self.tokenizer, source_max_length = self.source_max_length, target_max_length =  self.target_max_length)
        
        
        #---------- Loss Function
        self.loss_funtion = criterion

    def translate(self, source_token_ids, source_mask):
      predicted_token_ids = self.model.generate(input_ids=source_token_ids, max_length=self.target_max_length)
      return predicted_token_ids

       
    def forward(self, source_token_ids, source_mask, target_token_ids=None,target_mask=None, training=False):

      if training:
          #peguei a ideia da documentação oficial. Setar -100 pro pad faz ele ser ignorado no crossentropy.
          target_token_ids[target_token_ids == self.tokenizer.pad_token_id] = -100 
          
          loss = self.model(source_token_ids, attention_mask = source_mask, lm_labels = target_token_ids)
          return loss[0]

      else:
          #gerador de tokens de saída - GREEDY
          predicted_token_ids = self.model.generate(input_ids=source_token_ids, max_length=self.target_max_length)
          return predicted_token_ids


    def training_step(self, batch, batch_nb):
        # batch
        source_token_ids, source_mask, target_token_ids, target_mask, _, _ = batch
         
        # fwd
        loss = self.forward(source_token_ids, source_mask, target_token_ids, target_mask, training=True)

        # logs
        tensorboard_logs = {'train_loss': loss}
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'loss': loss, 'log': tensorboard_logs,
                'progress_bar': progress_bar}


    def validation_step(self, batch, batch_nb):
        source_token_ids, source_mask, target_token_ids, target_mask, source, refs = batch
        predict = self(source_token_ids, source_mask).permute(0,1)
        sys = [fix_accent_breaks(self.tokenizer.decode(tokens)) for tokens in predict]
        avg_bleu = sacrebleu.corpus_bleu(sys, [refs]).score
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'val_bleu': avg_bleu, 'progress_bar': progress_bar}


    def test_step(self, batch, batch_nb):
        source_token_ids, source_mask, target_token_ids, target_mask, source, refs = batch
        #dt_tok, dt_mask, dt_type,id_example = batch
        predict = self(source_token_ids, source_mask).permute(0,1)
        sys = [fix_accent_breaks(self.tokenizer.decode(tokens)) for tokens in predict]
        print(sys)
        #avg_bleu = sacrebleu.corpus_bleu(sys, [refs]).score
        
        progress_bar = {'gpu_usage': gpu_usage()}
        return {'progress_bar': progress_bar}

    def validation_epoch_end(self, outputs):
        avg_bleu = sum([x['val_bleu'] for x in outputs]) / len(outputs)
        print("Avg Bleu val", avg_bleu)
        tensorboard_logs = {'avg_val_bleu': avg_bleu}
        
        return {'avg_val_bleu': avg_bleu, 'progress_bar': tensorboard_logs}


    def test_epoch_end(self, outputs):
        #avg_bleu = sum([x['test_bleu'] for x in outputs]) / len(outputs)
        return {}
        #tensorboard_logs = {'avg_test_bleu': avg_bleu}

    def configure_optimizers(self):
        return torch.optim.AdamW(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)
    
    @gpu_mem_restore
    def train_dataloader(self):
        shuffle = False if self.overfit else True
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=shuffle,num_workers=cpu_count())
    
    @gpu_mem_restore
    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, shuffle=False,num_workers=cpu_count())
    
    @gpu_mem_restore
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size,shuffle=False, num_workers=cpu_count())

# Etapas de Treino e Avaliação

### Testando rapidamente o modelo em treino, validação e teste com um batch

Recuperando o número de parâmetros

In [44]:
all_data = [ds_train,ds_val,ds_test]
print(type(ds_train))
print(type(ds_val))
print(type(ds_test))
model = T5Finetuner(all_data,hyperparms) 
num_params = sum([torch.tensor(x.size()).prod() for x in model.parameters() if x.requires_grad]) # trainable parameters
print(num_params)

<class 'list'>
<class 'list'>
<class 'list'>
tensor(60506880)


In [None]:
trainer = pl.Trainer(gpus=1, 
                     checkpoint_callback=False,  # Disable checkpoint saving.
                     fast_dev_run=True)
trainer.fit(model)

In [None]:
del model

### Treinamento e Validação no dataset todo

In [45]:
# checkpoint_path = check_path + '/epoch=0.ckpt'
#checkpoint_path
checkpoint_path = check_path + '/epoch=14.ckpt'      # + '/Checkpoints_V2/epoch=15.ckpt'
print(checkpoint_path)
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_dir,
                                      save_top_k=-1)    #monitor="val_acc", mode="max")  # Keeps all checkpoints.




resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path


all_data = [ds_train,ds_val,ds_test]
trainer = pl.Trainer(gpus=1,
                     max_epochs=15,
                     check_val_every_n_epoch=1,
                     profiler=True,
                     checkpoint_callback=checkpoint_callback,
                     accumulate_grad_batches=accumulate_grad_batches,
                     progress_bar_refresh_rate=50,
                     resume_from_checkpoint=resume_from_checkpoint)

model = T5Finetuner(all_data, hyperparms) 

trainer.fit(model)


No environment variable for node rank defined. Set as 0.


/content/drive/My Drive/Natural_Questions/epoch=14.ckpt
Files in /content/drive/My Drive/Natural_Questions: ['3_TranslationNQ.ipynb', 'v1.0-simplified_simplified-nq-train.jsonl.gz', 'clean_nq_format2.jsonl', 'clean_nq_format.jsonl', 'nq_doc_text_chunck.csv', 'nq_short_answer_chunck.csv', 'nq_infos_chunck.csv', 'nq_long_answer_chunck.csv', 'nq_question_chunck.csv', 'nq_question.csv', 'nq_short_answer.csv', 'nq_infos.csv', 'nq_long_answer.csv', 'nq_doc_text.csv', 'epoch=14.ckpt', 'nq_short_answer (1).gsheet', 'nq_short_answer.gsheet']
Saving checkpoints to /content/drive/My Drive/Natural_Questions
Restoring checkpoint: /content/drive/My Drive/Natural_Questions/epoch=14.ckpt




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Avg Bleu val 41.453424467921664




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

In [46]:
trainer.test(model)






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

['qual é o uso mais comum de marketing de e-mail opt-in', 'como eu.met sua mãe que é a mãe', 'que tipo de fertilização ocorre em humanos', 'quem teve as mais vitórias na nfl', 'o que aconteceu ao encontro perdido de roanoke', 'quais são as diferentes regiões daÁfrica e como diferem']
--------------------------------------------------------------------------------
TEST RESULTS
{}
--------------------------------------------------------------------------------



### Document

In [None]:
doc_text_dataset = DocTextDataset(
    doc_text = doct_text_df,
    tokenizer=tokenizer,
    max_length=max_lenght_NQ)

doc_text_dataloader = DataLoader(doc_text_dataset, batch_size=batch_size,num_workers=cpu_count())

In [None]:
ids = []
doc_text_portuguese = []

# return  dt_tok, dt_mask, dt_type,id_example

for batch in doc_text_dataloader:
    #ids
    ids.extend(batch[3].detach().cpu().tolist())    
    #doc_text
    
    out = model(batch[0].to(device), batch[1].to(device))
    portuguese_translations = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    doc_text_portuguese.extend(portuguese_translations)


KeyboardInterrupt: ignored

In [None]:
doct_text_df_portuguese = pd.DataFrame({'ID':ids, 'DOC_TEXT': doc_text_portuguese})

In [None]:
doct_text_df_portuguese.head()

In [None]:
compression_opts = dict(method='zip', archive_name='doct_text_df_portuguese.csv') 
doct_text_df_portuguese.to_csv('doct_text_df_portuguese.zip', index=False,compression=compression_opts)

### Question

In [136]:
question_dataset = QuestionDataset(
    question = question_df,
    tokenizer=tokenizer,
    max_length=max_lenght_NQ)

question_dataloader = DataLoader(question_dataset, batch_size=batch_size,num_workers=cpu_count())

In [137]:
ids = []
question_portuguese = []

# return  dt_tok, dt_mask, dt_type,id_example

for batch in question_dataloader:
    #ids
    ids.extend(batch[3].detach().cpu().tolist())    
    #question
    out = model.translate(batch[0].to(device), batch[1].to(device))
    portuguese_translations = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    question_portuguese.extend(portuguese_translations)


In [138]:
question_df_portuguese = pd.DataFrame({'ID': ids, 'QUESTION': question_portuguese})

In [139]:
question_df_portuguese.head()

Unnamed: 0,ID,QUESTION
0,5655493461695504401,o uso mais comum do opt-in opt-in e-mail marke...
1,5328212470870865242,met a su moara que é a mó e a mmm e mm e m e e...
2,4435104480114867852,realiza realiza em human humanos e é ffcliz e ...
3,5289242154789678439,nfl que tives as mais ganganganhas mais gangan...
4,5489863933082811018,O que acon acontece ao ensaldo perda de roanok...


In [140]:
compression_opts = dict(method='zip', archive_name='question_df_portuguese.csv') 
question_df_portuguese.to_csv('question_df_portuguese.zip', index=False,compression=compression_opts)

### Long Answer

In [None]:
long_answer_dataset = LongAnswerDataset(
    long_answer = long_answer_df,
    tokenizer=tokenizer,
    max_length=max_lenght_NQ)

long_answer_dataloader = DataLoader(long_answer_dataset, batch_size=batch_size,num_workers=cpu_count())

In [None]:
ids = []
long_answer_portuguese = []
start_token = []
end_token = []

# return  dt_tok, dt_mask, dt_type,id_example

for batch in long_answer_dataloader:
    #ids
    ids.extend(batch[3].detach().cpu().tolist()) 
    # Start token
    start_token.extend(batch[4].detach().cpu().tolist()) 
    # End token
    end_token.extend(batch[5].detach().cpu().tolist()) 
    #question
    out = model(batch[0].to(device), batch[1].to(device))
    portuguese_translations = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    long_answer_portuguese.extend(portuguese_translations)
    

In [None]:
long_answer_df_portuguese = pd.DataFrame({'ID': ids, 'LONG_ANSWER': long_answer_portuguese, 'START_LA': start_token,'END_LA':end_token})

In [None]:
long_answer_df_portuguese.head()

In [None]:
compression_opts = dict(method='zip', archive_name='long_answer_df_portuguese.csv') 
long_answer_df_portuguese.to_csv('long_answer_df_portuguese.zip', index=False,compression=compression_opts)

### Short Answer

In [None]:
short_answer_dataset = ShortAnswerDataset(
    short_answer = short_answers_df,
    tokenizer=tokenizer,
    max_length=max_lenght_NQ)

short_answer_dataloader = DataLoader(short_answer_dataset, batch_size=batch_size,num_workers=cpu_count())

In [None]:
ids = []
short_answer_portuguese = []
start_token = []
end_token = []

# return  dt_tok, dt_mask, dt_type,id_example

for batch in short_answer_dataloader:
    #ids
    ids.extend(batch[3].detach().cpu().tolist()) 
    # Start token
    start_token.extend(batch[4].detach().cpu().tolist()) 
    # End token
    end_token.extend(batch[5].detach().cpu().tolist()) 
    #question
    out = model(batch[0].to(device), batch[1].to(device))
    long_answer_portuguese = [fix_accent_breaks(tokenizer.decode(tokens)) for tokens in out]
    short_answer_portuguese.extend(long_answer_portuguese)

In [None]:
short_answer_df_portuguese = pd.DataFrame({'ID': ids, 'SHORT_ANSWER': short_answer_portuguese, 'START_SA': start_token,'END_SA':end_token})

In [None]:
short_answer_df_portuguese.head()

In [None]:
compression_opts = dict(method='zip', archive_name='short_answer_df_portuguese.csv') 
short_answer_df_portuguese.to_csv('short_answer_df_portuguese.zip', index=False,compression=compression_opts)