# Two Tower Passage Ranking applyed to MS MARCO
### Desenvolvido por Graziella Bonadia e Matheus Sasso

## Modelo Escolhido

In [1]:
#@title Configurações gerais
experiment_name = 'TwoTowerPassageRanking'  #@param {type:"string"}
model_name = 'bert-base-uncased'  #@param ["bert-base-uncased","bert-large-uncased","albert-base-v2"] {type:"string"}

## Instalações Externas e Download dos dados

Instalação de pacotes

In [2]:
! pip install pytorch-lightning  --quiet
! pip install transformers  --quiet
! pip install ftfy --quiet

In [3]:
%%capture
!wget  https://anaconda.org/pytorch/faiss-cpu/1.2.1/download/linux-64/faiss-cpu-1.2.1-py36_cuda9.0.176_1.tar.bz2
!tar xvjf faiss-cpu-1.2.1-py36_cuda9.0.176_1.tar.bz2
!cp -r lib/python3.6/site-packages/* /usr/local/lib/python3.6/dist-packages/
!pip install mkl
#!apt install libomp-dev
#!python -m pip install --upgrade faiss faiss-gpu

Recuperação de **funções auxiliares**

In [4]:
%%capture
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco.py
!wget -nc https://raw.githubusercontent.com/Matheus158257/ms-marco-passage-ranking-dense-vectors-with-doc2query/master/read_ms_marco2.py
!wget -nc https://raw.githubusercontent.com/spacemanidol/MSMARCO/master/Ranking/Baselines/msmarco_eval.py

Importando google Drive

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Caminhos para os arquivos importandos

In [6]:
proj_dir = '/content/drive/My Drive/Mestrado/PLN/Projeto'#Matheus
data_base_dir = '/content/drive/My Drive/Mestrado/PLN/Projeto/Data/Vetores_Densos' #Matheus
# proj_dir = '/content/drive/My Drive/Projeto'#Matheus
# data_base_dir = '/content/drive/My Drive/Projeto/Data/Vetores_Densos' #Matheus
# proj_dir = '/content/drive/My Drive/Projeto'#Graziella
# data_base_dir = '/content/drive/My Drive/Projeto/Data/Vetores_Densos' #Graziella

In [16]:
#  Caminhos utilizados
inference_data_dev_top1000_path = data_base_dir + '/inference_data_dev_top1000.json'
qrels_dev_path = data_base_dir + '/collectionandqueries/qrels.dev.tsv'
queries_train_path = data_base_dir + '/collectionandqueries/queries.train.tsv'
queries_dev_path = data_base_dir + '/collectionandqueries/queries.dev.tsv'
queries_dev_small_path = data_base_dir + '/collectionandqueries/queries_dev_small.tsv'
qid_pid_nid_path = data_base_dir + '/qidpidnid/triples_ids_train.tsv'
queries_topk_path  = data_base_dir + '/top1_K/doc2query_onek.tsv'

# Outros caminhos para importar
# t5_zip_path = data_base_dir + '/t5_base/model.ckpt-1004000'
# qrels_train_path = data_base_dir + '/collectionandqueries/qrels.train.tsv'
# qrels_dev_small_path = data_base_dir + '/collectionandqueries/qrels.dev.small.tsv'
# qrels_valid_2019_path = data_base_dir + '/collectionandqueries/qrels_valid_2019.tsv'
# collection_path = data_base_dir + '/collectionandqueries/collection.tsv'
# queries_eval_path = data_base_dir + '/collectionandqueries/queries.eval.tsv'
# queries_eval_small_path = data_base_dir + '/collectionandqueries/queries.eval.small.tsv'
# queries_dev_small_path = data_base_dir + '/collectionandqueries/queries_dev_small.tsv'
# queries_topk_folder  = data_base_dir + '/predicted_queries_topk_sampling'
# train_triples_small_path = data_base_dir + '/train_triples_small/triples_smalls_chunk_1.csv'
# inference_data100_path = data_base_dir +'/inference_data100.json'
# inference_data100_eval_2020_path = data_base_dir + '/inference_data100_eval_2020.json'

## Imports

In [8]:
# General Libs
import gzip
import random
import pdb
import ftfy
import math
import logging
import functools
import traceback
import json


#Hardware Data
import psutil
from multiprocessing import cpu_count
import nvidia_smi

#import das funções que vem do github
import collections
import sys
import os
from read_ms_marco import load_qrels
from read_ms_marco import load_queries
from read_ms_marco import load_collection
from read_ms_marco import load_doc2query
from read_ms_marco import load_triple
from read_ms_marco import load_txts_topk #load_txts_topk(folder,k=1,n=18,encoding="cp1252")
from read_ms_marco import take_part
from read_ms_marco import reset_dict_keys
from read_ms_marco import train_val_test
import msmarco_eval

#Data Sciens Libs
import numpy as np
import pandas as pd

#Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

#Torch
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.autograd import Variable

#Faiss
import faiss

#Transformers
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained(model_name)

#Generic Types
from typing import Dict
from typing import List
from typing import Tuple


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Failed to load GPU Faiss: No module named 'faiss.swigfaiss_gpu'
Faiss falling back to CPU-only.


## Configurações Gerais

Imperdir Excesso de Logs (atenção, desabilita o pdb)

In [9]:
# logging.getLogger("transformers.configuration_utils").setLevel(logging.WARNING)
# logging.getLogger("transformers.modeling_utils").setLevel(logging.WARNING)
# logging.getLogger("lightning").setLevel(logging.WARNING)

Dados de CPU e GPU

In [10]:
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print("\nGetting Hardware Statatus...\n")
def hardware_stats():
    '''
    Returns a dict containing some hardware related stats
    '''
    res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    return {"cpu": str(psutil.cpu_percent()) + '%',
            "mem": str(psutil.virtual_memory().percent) + '%',
            "gpu": str(res.gpu) + '%',
            "gpu_mem": str(res.memory) + '%'}

print(f"Imports loaded succesfully. Current GPU: {torch.cuda.get_device_name(0)}, number of CPU cores: {cpu_count()}")


Getting Hardware Statatus...

Imports loaded succesfully. Current GPU: Tesla P100-PCIE-16GB, number of CPU cores: 2


Decorator para impedir quebra de memório recorrente

In [11]:
#Impedir quebra de memória 
# https://docs.fast.ai/troubleshoot.html#memory-leakage-on-exception
def gpu_mem_restore(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except:
            type, val, tb = sys.exc_info()
            traceback.clear_frames(tb)
            raise type(val).with_traceback(tb) from None
    return wrapper

## Preparando Dados

Extracao dos tsvs em dicionários

In [12]:
original_queries_dev_small = load_queries(queries_dev_small_path)
original_queries_train = load_queries(queries_train_path)
original_queries_dev = load_queries(queries_dev_path)

Loading queries 0
Loading queries 0
Loading queries 100000
Loading queries 200000
Loading queries 300000
Loading queries 400000
Loading queries 500000
Loading queries 600000
Loading queries 700000
Loading queries 800000
Loading queries 0
Loading queries 100000


In [13]:
artificial_queries = load_doc2query(queries_topk_path)

Loading doc2query, doc 0
Loading doc2query, doc 1000000
Loading doc2query, doc 2000000
Loading doc2query, doc 3000000
Loading doc2query, doc 4000000
Loading doc2query, doc 5000000
Loading doc2query, doc 6000000
Loading doc2query, doc 7000000
Loading doc2query, doc 8000000


In [14]:
artificial_queries['842095']

'how do you know if a timing belt is replaced'

In [17]:
triples = load_triple(qid_pid_nid_path)


Loading triple 0
Loading triple 100000
Loading triple 200000
Loading triple 300000
Loading triple 400000


In [18]:
debug_train, debug_val, debug_test = train_val_test(triples, 10, 5, 3)

In [19]:
print(debug_train)
print(debug_val)
print(debug_test)

{'0': ('1000094', '5399011', '4239068'), '1': ('1000684', '6133670', '54955'), '2': ('1000938', '5203582', '4737056'), '3': ('1001876', '2354621', '3473806'), '4': ('1002233', '5049159', '4679862'), '5': ('1002416', '6210209', '1474923'), '6': ('1003328', '3712662', '8581416'), '7': ('1003418', '5937401', '2702745'), '8': ('1003840', '4063719', '5433200'), '9': ('100386', '6182295', '588060')}
{'0': ('1003900', '4788874', '6105642'), '1': ('1003956', '3926579', '1317168'), '2': ('1004080', '5943981', '8247016'), '3': ('1004274', '4383463', '7283587'), '4': ('1004329', '3907674', '4171719')}
{'0': ('1004979', '2741708', '825582'), '1': ('1005307', '3480594', '8683170'), '2': ('1006123', '3773327', '2284569')}


Dicionário que será utilizado em tempo de inferência

In [20]:
with open(inference_data_dev_top1000_path) as json_file:
    inference_dev_dict = json.load(json_file)

## Carregando o dataset

Nota: Evitar de olhar ao máximo o dataset de teste para não ficar enviseado no que será testado. Em aplicações reais, o dataset de teste só estará disponível no futuro, ou seja, é quando o usuário começa a testar o seu produto.

In [21]:
tokenizer.encode_plus('we like pizza',
                      max_length =10,
                      pad_to_max_length=True,
                      add_special_tokens = True)

{'input_ids': [101, 2057, 2066, 10733, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

Dataset Treino


In [22]:
class TrainDataset(Dataset):
    def __init__(self, triples, queries, artificial_queries,
                 tokenizer,max_length: int =100,training_step=False):
      
        self.triples = triples
        self.queries = queries
        self.artificial_queries = artificial_queries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
      # Numero(Chave)
        # import pdb;pdb.set_trace()
 
        qid, pid, nid = self.triples[str(idx)]
        
        #vetores densos - Texto(Valor)
        o_query = ftfy.fix_text(self.queries[qid])
        a_query_pos = ftfy.fix_text(self.artificial_queries[pid])
        a_query_neg = ftfy.fix_text(self.artificial_queries[nid])

        #Tensores
        q_tok, q_mask, q_type = self.encode_plus(o_query)
        p_tok, p_mask, p_type = self.encode_plus(a_query_pos)
        n_tok, n_mask, n_type = self.encode_plus(a_query_neg)

        return  (q_tok, q_mask, q_type,
                p_tok, p_mask, p_type,
                n_tok, n_mask, n_type,
                o_query, a_query_pos, a_query_neg)

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        
        
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)

        return tok,mask,tok_type

Dataset Inferência

In [23]:
class InferenceDataset(Dataset):

    def __init__(self,dicts, queries,artificial_queries,
                 tokenizer,max_length: int =100,training_step=False):
        self.dicts = dicts
        self.queries = queries
        self.passages = artificial_queries
        self.tokenizer = tokenizer
        self.max_length = max_length


    def __len__(self):
      return len(self.dicts)


    def __getitem__(self, idx):

        qid, pids = self.dicts[str(idx)]
        o_query = ftfy.fix_text(self.queries[qid])
        passages = [ftfy.fix_text(self.passages[pid]) for pid in pids]

        o_query_tok,o_query_mask = self.encode_plus(o_query) 
        passages_tensors = [self.encode_plus(passage) for passage in passages] #retorna tuplas com token e mask [(tok,mask),(),(),()] , tok([[tensor()]][tensor()][tensor()][tensor()])

        return  (qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors)

    def encode_plus(self, text):
        tokens = self.tokenizer.encode_plus(text=text, max_length=self.max_length,
                                       pad_to_max_length=True, add_special_tokens = True)
        
        
        tok =  torch.tensor(tokens["input_ids"]).type(torch.long)
        mask = torch.tensor(tokens['attention_mask']).type(torch.long)
        # tok_type = torch.tensor(tokens['token_type_ids']).type(torch.long)

        return tok,mask

## Testando o DataLoader

### Treino

In [24]:
dataset_debug = TrainDataset(
    triples =  debug_val,
    queries = original_queries_train,
    artificial_queries = artificial_queries,
    tokenizer=tokenizer,
    max_length=20)

dataloader_debug = DataLoader(dataset_debug, batch_size=2, shuffle=True,num_workers=cpu_count())

q_tok, q_mask, q_type,p_tok, p_mask, p_type,n_tok, n_mask, n_type,o_query, a_query_pos, a_query_neg = next(iter(dataloader_debug))

        
print('original_query_token_ids:\n', q_tok)
print('original_query_mask:\n', q_mask)
print('original_query_token_type:\n', q_type)

print('positive_query_token_ids:\n', p_tok)
print('positive_query_mask:\n', p_mask)
print('positive_query_token_type:\n', p_type)

print('negative_query_token_ids:\n', n_tok)
print('negative_query_mask:\n', n_mask)
print('negative_query_token_type:\n', n_type)


print('original_query_token_ids.shape:\n', q_tok.shape)
print('original_query_mask.shape:\n', q_mask.shape)
print('original_query_token_type.shape:\n', q_type.shape)

print('positive_query_token_ids.shape:\n', p_tok.shape)
print('positive_query_mask.shape:\n', p_mask.shape)
print('positive_query_token_type.shape:\n', p_type.shape)

print('negative_query_token_ids.shape:\n', n_tok.shape)
print('negative_query_mask.shape:\n', n_mask.shape)
print('negative_query_token_type.shape:\n', n_type.shape)

print(o_query)
print(a_query_pos)
print(a_query_neg)

original_query_token_ids:
 tensor([[  101,  2073,  2001,  1996,  3212,  8515,  5033,  4224,  2902,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2073,  2057,  2064,  2224, 10763,  2372,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])
original_query_mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
original_query_token_type:
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
positive_query_token_ids:
 tensor([[  101,  2040,  2001,  2141,  1999,  1996,  8515,  5033,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 10812,  6210,  2465,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,

### Inferência

Não vamos visualizar por ser muito grande

In [25]:
 dataset = InferenceDataset(
    dicts = inference_dev_dict,
    queries = original_queries_dev_small,
    artificial_queries = artificial_queries,
    tokenizer=tokenizer,
    max_length=100
)

dataloader_debug = DataLoader(dataset, batch_size=4, shuffle=False,num_workers=cpu_count())
qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors = next(iter(dataloader_debug))# passages_tensors -> lista de tuplas [(pd_tok1,pd_mask1),(pd_tok2,pd_mask2)...]

## Loss Funtion

***Documentation:***  
https://github.com/facebookresearch/faiss/wiki/Getting-started

***Motivation:***   
Dense Passage Retrieval for Open-Domain Question Answering
https://arxiv.org/abs/2004.04906

Faiss Para Negativos como Positivos dos outros valores do Batch

In [26]:
def faiss_loss(Query,Passage):

     X = Query.cpu()
     Y = Passage.cpu()
     X = X.detach().numpy()
     Y = Y.detach().numpy()

     d = np.size(X,1)      ##Acertar para torch                    # vector dimension
     batch_size = np.size(X,0)      ##Acertar para torch           # we want to see number of batch size nearest neighbors  
     k = batch_size                                                # Its possible to change this number, but for the sake of this project, it will remain the same number of the batch size
     index = faiss.IndexFlatL2(d)                                  # build the index
     index.add(Y)                                                  # add vectors to the index

     D, I = index.search(X, k)                                     # search. D is the matriz of similarities and I is the index of the vector in matriz ordered by similarity

     resgate = np.arange(batch_size)                               # retrieve index of queries
     resgate = resgate.reshape((batch_size,1))

     # This is necessary if there is a more similar vector then the target vector in the batch. Not probable, though.
     R = I - resgate
     T = R==0
     F = R!=0

    #  T = I==0
    #  F = I!=0  
                                                       
     mrr_ind = np.where(T == True)                                 # retrieve the rank from True matrix
     mrr_rank = mrr_ind[1] + 1                                     # adjust the rank to start in '1'
     mrr_inv = 1/mrr_rank                                          # inverse of rank
     MRR = np.sum(mrr_inv)/k                                       # "Accuracy"  indicator
     MRR = np.array(MRR)

     MRR = torch.from_numpy(MRR).float().to(device)

     D_exp = np.exp(D)                                             # exp of similarities matrix
     sim_pos = D_exp[T==True]                                      # retrieve the vector of similarities of the target, despite its order (counting as positive)
     sim_neg = np.sum(((F == True) * D_exp), axis = 1)             # retrieve the vector of similarities of all other vectors and sum (counting as negative)

     loss = - np.log(sim_pos/(sim_pos+sim_neg))                    # loss
     loss = torch.from_numpy(loss).float().to(device)
     loss = torch.sum(loss)
     loss = Variable(loss, requires_grad=True)


     return loss, MRR

Faiss para ranqueamento

In [27]:
def get_similarity(Query,Passages):
     X = Query.cpu()
     Y = Passages.cpu()
     X = X.detach().numpy()
     Y = Y.detach().numpy()
    
     k = len(Y)
     dim = np.size(X,1)           ##Acertar para torch                   # vector dimension
     index = faiss.IndexFlatL2(dim)                                  # build the index
     index.add(Y)                                                  # add vectors to the index

     D, I = index.search(X, k) 


     return D[0], I[0]

## Hyperparâmetros

In [28]:
#@title Hyperparâmetros
batch_size =  16#@param {type:"integer"}
max_epochs = 3 #@param {type:"integer"}
accumulate_grad_batches = 16  #@param {type:"integer"}
max_length =   128#@param {type:"integer"}
learning_rate = 5e-3  #@param {type:"number"}

In [29]:
hyperparams = {'model':model_name,'tokenizer':tokenizer,'learning_rate':learning_rate,'batch_size':batch_size,'max_length':max_length,'accumulate_grad_batches':accumulate_grad_batches} 

## Datasets

In [30]:
triples_train, _, _ = train_val_test(triples, 50000, 100, 100)

In [31]:
dataset_train = TrainDataset(triples =  triples_train, queries = original_queries_train,
                          artificial_queries = artificial_queries,
                          tokenizer=tokenizer,
                          max_length=max_length)
dataset_val  = InferenceDataset(
              dicts = inference_dev_dict,
              queries = original_queries_dev,
              artificial_queries = artificial_queries,
              tokenizer=tokenizer,
              max_length=max_length)
                              

dataset_test = InferenceDataset(
              dicts = inference_dev_dict,
              queries = original_queries_dev,
              artificial_queries = artificial_queries,
              tokenizer=tokenizer,
              max_length=max_length)

train_dataloader = DataLoader(dataset_train, batch_size=batch_size,
                              shuffle=True, num_workers=4)

val_dataloader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False, 
                            num_workers=4)

test_dataloader = DataLoader(dataset_test, batch_size=batch_size,
                             shuffle=False, num_workers=4)

## Criando o Two Tower Passage Ranking com Pytorch Lightning

In [32]:
class TwoTowerFineTuner(pl.LightningModule):

    def __init__(self, 
                 hyperparams,
                 train_dataloader, val_dataloader,
                 test_dataloader,
                 overfit=False):
      
        super(TwoTowerFineTuner, self).__init__()
        
        
        #---------- Definição do modelo
        self.tower1 = BertModel.from_pretrained(model_name)
        self.tower2 = BertModel.from_pretrained(model_name)
        

        #---------- Hyperparametros
        self.tokenizer = hyperparams['tokenizer']
        self.learning_rate = hyperparams['learning_rate']
        self.batch_size = hyperparams['batch_size']
        self.max_length = hyperparams['max_length']
        self.overfit = overfit

        #---------- Camadas
        self.num_final_dim = 128
        self.dropout = torch.nn.Dropout(self.tower1.config.hidden_dropout_prob)
        self.final_layer = torch.nn.Linear(self.tower1.config.hidden_size, self.num_final_dim)


        #---------- Carregamento datasets (Para eu poder variar self.max_length)

        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader
        self._test_dataloader = test_dataloader
                    

    def infer(self,batch):

        qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors = batch


        batch_similarities = []
        batch_pids = []
        batch_qids = []
        MRR_batch =  np.array([])
        for i in range(0,len(qid)):
          o_query_toks_tensor = o_query_tok[i].view(1,-1) #1xL 
          o_query_masks_tensor = o_query_mask[i].view(1,-1) #1 X L 
          query_hat = self.forward(o_query_toks_tensor, o_query_masks_tensor,self.tower1)
          phs = []
          for tuple_tensor in passages_tensors: 
            passage_tok = tuple_tensor[0][i].view(1,-1) #1XL 
            passage_mask = tuple_tensor[1][i].view(1,-1)
            #passage_hat = self.forward(passage_tok, passage_mask, self.tower2)
            passage_hat = self.forward(passage_tok, passage_mask, self.tower1)
            phs.append(passage_hat[0])
          
          passage_hats = torch.stack([ph for ph in phs], dim=0)
          similarities, order = get_similarity(query_hat,passage_hats)#q = 1xL , p = 100 x L
          similarity_normalized = [np.float64(elem).item() for elem in similarities][::-1]#revrese
          order_list = [np.float64(elem).item() for elem in order]
          pids_phrase = [p[i] for p in pids]
          pids_ordered = [x for _,x in sorted(zip(order_list,pids_phrase))][::-1] #revrese
          qids = [qid[i]]*len(pids)
          
          MRR = np.sum(similarity_normalized)/len(pids)
          MRR_batch = np.append(MRR_batch,MRR)

          batch_pids.append(pids_ordered)
          batch_qids.append(qids)
          batch_similarities.append(similarity_normalized)

        mrr = torch.tensor(np.mean(MRR_batch))
        return mrr, batch_pids, batch_qids , batch_similarities
      

    def forward(self, input_ids, attention_mask, model,token_type_ids=None):    #model
        _, pooled_output = model(input_ids, attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.final_layer(pooled_output)

        return logits
    

    def training_step(self, batch, batch_nb):
        # batch
        query_token_ids, query_attention_mask, query_token_type_ids, qpos_token_ids, qpos_attention_mask, qpos_token_type_ids, n_tok, n_mask, n_type,o_query, a_query_pos, a_query_neg = batch
        
        # fwd
        query_hat = self.forward(query_token_ids, query_attention_mask,self.tower1 ,query_token_type_ids)
        #passage_hat = self.forward(qpos_token_ids, qpos_attention_mask,self.tower2 ,qpos_token_type_ids)
        passage_hat = self.forward(qpos_token_ids, qpos_attention_mask,self.tower1 ,qpos_token_type_ids)


        loss, MRR = faiss_loss(query_hat,passage_hat)
        
        # logs
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb): 
        val_mrr, batch_pids, batch_qids , batch_similarities = self.infer(batch)
        return {'val_mrr': val_mrr}
        
    def validation_epoch_end(self, outputs):
        avg_val_mrr = torch.stack([x['val_mrr'] for x in outputs]).mean()
        tensorboard_logs = {'avg_val_mrr': avg_val_mrr}    
        return {'avg_val_mrr': avg_val_mrr,'progress_bar': tensorboard_logs}

    def test_step(self, batch, batch_nb):
        
        test_mrr, batch_pids, batch_qids , batch_similarities = self.infer(batch)
        
        return {'test_mrr': test_mrr}

    def test_epoch_end(self, outputs):

        avg_test_mrr = torch.stack([x['test_mrr'] for x in outputs]).mean()

        tensorboard_logs = {'avg_test_mrr': avg_test_mrr}
        return {'avg_test_mrr': avg_test_mrr, 'log': tensorboard_logs,
                'progress_bar': tensorboard_logs}
    
    def configure_optimizers(self):
        return torch.optim.Adam(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [None]:
# model = TwoTowerFineTuner(hyperparams, train_dataloader=train_dataloader,
#                       val_dataloader=val_dataloader,
#                       test_dataloader=test_dataloader)

In [None]:
# sum([torch.tensor(x.size()).prod() for x in model.parameters() if x.requires_grad]) # trainable parameters

## Etapas de avaliação do Lightning

### Testando rapidamente o modelo em treino, validação e teste com um batch

In [None]:
# trainer = pl.Trainer(gpus=1, 
#                      checkpoint_callback=False,  # Disable checkpoint saving.
#                      fast_dev_run=True)
# trainer.fit(model)
# trainer.test(model)
# del model  # Para não ter estouro de mémoria da GPU

In [None]:
# del model  # Para não ter estouro de mémoria da GPU


### Trabalhando no datasset todo



In [33]:
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_path = proj_dir + '/Data/Checkpoints_dense_DOIS_MOD/epoch=3.ckpt'
 
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_dir,save_top_k=-1)  # Keeps all checkpoints.

resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path


trainer = pl.Trainer(gpus=1,
                     max_epochs=max_epochs,
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches= accumulate_grad_batches,
                     checkpoint_callback=checkpoint_callback,
                     resume_from_checkpoint=resume_from_checkpoint)


model = TwoTowerFineTuner(hyperparams, train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      test_dataloader=test_dataloader)

trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


Files in /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Checkpoints_dense_DOIS_MOD: ['epoch=1.ckpt', 'epoch=2.ckpt', 'epoch=3.ckpt', 'results_1qid.txt', 'results.test_dev.txt', 'results.test.txt']
Saving checkpoints to /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Checkpoints_dense_DOIS_MOD
Restoring checkpoint: /content/drive/My Drive/Mestrado/PLN/Projeto/Data/Checkpoints_dense_DOIS_MOD/epoch=3.ckpt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…





  | Name        | Type      | Params
------------------------------------------
0 | tower1      | BertModel | 109 M 
1 | tower2      | BertModel | 109 M 
2 | dropout     | Dropout   | 0     
3 | final_layer | Linear    | 98 K  


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

In [None]:
# trainer.test(model)

## Colocando dados no formato do MRR@10

### DEV

In [34]:
dataset_val2  = InferenceDataset(
              dicts = inference_dev_dict,
              queries = original_queries_dev,
              artificial_queries = artificial_queries,
              tokenizer=tokenizer,
              max_length=max_length)
                              
val_dataloader2 = DataLoader(dataset_val2, batch_size=1, shuffle=False,num_workers=4)

In [36]:
import math
mrr = {}
results_path = proj_dir + '/Data/Checkpoints_dense_DOIS_MOD/' + 'passo_intermediario.txt' 
df_reaclass= pd.DataFrame(columns=['QID','PID','RANK'])
pdis = []
sims = []
qdis = []
with open(results_path, 'w') as text_file:
  for batch in val_dataloader2:
      qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors = batch
      o_query_tok = o_query_tok.to(device)
      o_query_mask = o_query_mask.to(device)
      passages_tensors = [(tuple_pasasge[0].to(device), tuple_pasasge[1].to(device)) for tuple_pasasge in  passages_tensors]
      batch_gpu = qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors
      _, batch_pids, batch_qids , batch_similarities = model.infer(batch_gpu)
      for pids, qids , similarities in zip(batch_pids, batch_qids , batch_similarities):
        ranks = [n+1 for n in range (0,len(qids))]
        for p, q, s, r in zip(pids, qids , similarities, ranks):
          try:
              mrr[q].append(p)
          except:
              mrr[q] = [p]

          df_reaclass = df_reaclass.append(pd.Series([q,p,r], index=df_reaclass.columns), ignore_index=True)
          text_file.write(f"{q} {p} {r} \n")
          # 
          if r==10:
              break
      
      break    
      
        
    # max_iteration = len(passages_tensors)
    # num_iterations = math.ceil(max_iteration/10)

    # for i in range(0,num_iterations):
    #   try:
    #     passages_tensors_aux =passages_tensors[i*10:(i+1)*10]
    #     pids_aux = pids[i*10:(i+1)*10]
    #   except:       
    #     passages_tensors_aux =passages_tensors[i*10:]
    #     pids_aux = pids[i*10:]


    #   batch_gpu = qid,pids_aux,o_query,passages,o_query_tok,o_query_mask,passages_tensors_aux
    #   _, batch_pids, batch_qids , batch_similarities = model.infer(batch_gpu)
    #   for pids, qids , similarities in zip(batch_pids, batch_qids , batch_similarities):
    #     ranks = [n+1 for n in range (0,len(qids))]
    #     for p, q, s, r in zip(pids, qids , similarities, ranks):
    #       pdis.append(p)
    #       qdis.append(q)
    #       sims.append(s)


    #   break
          # text_file.write(q + " " + "Q0" + " " + p + " " + str(r) + " " + str(s) + " " + runid + "\n") 
          # text_file.write(q + " " + "Q0" + " " + p + " " + str(r) + " " + str(s) + " " + runid + "\n") 
          # try:
          #   mrr[q].append(p)
          # except:
          #   mrr[q] = [p]
          # if r==10:
          #   break


RuntimeError: ignored

In [None]:
qrels_dev = load_qrels(qrels_dev_path)

In [None]:
msmarco_eval.compute_metrics(qrels_dev, mrr)

In [None]:
# compression_opts = dict(method='zip', archive_name='enpt_t5_results.csv') 
# model.df_test.to_csv('enpt_t5_results.zip', index=False,compression=compression_opts)  

# Fim do Notebook

In [None]:
# qids = []
# pdis = []
# sims = []
# ranks = []
# runid = 'runid1'
# results_path = proj_dir + '/Data/Checkpoints_dense_DOIS_MOD/' + 'results.test.txt' 

# with open(results_path, 'w') as text_file:
#   for batch in val_dataloader:
#       qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors = batch
#       o_query_tok = o_query_tok.to(device)
#       o_query_mask = o_query_mask.to(device)
#       passages_tensors = [(tuple_pasasge[0].to(device), tuple_pasasge[1].to(device)) for tuple_pasasge in  passages_tensors]
#       batch_gpu = qid,pids,o_query,passages,o_query_tok,o_query_mask,passages_tensors

      
#       _, batch_pids, batch_qids , batch_similarities = model.infer(batch_gpu)
#       for pids, qids , similarities in zip(batch_pids, batch_qids , batch_similarities):
#         ranks = [i+1 for i in range (0,len(qids))]
#         for p, q, s, r in zip(pids, qids , similarities, ranks):
#           # text_file.write(q + " " + "Q0" + " " + p + " " + str(r) + " " + str(s) + " " + runid + "\n") 
#           text_file.write(f"{q} {p} {r} \n")

#           if r==10:
#             break

