# Toy Examples DPR

In [1]:
#https://github.com/beir-cellar/beir/blob/main/examples/retrieval/evaluation/dense/evaluate_dpr.py
!pip install beir



# Import Libraries

In [2]:
import numpy as np
import pandas as pd

# Load Toy Data-Sets

In [3]:
#documents
docs = pd.read_csv('./toy_data/docs.csv', dtype=str)

#queries
queries = pd.read_csv('./toy_data/queries.csv', dtype=str)

#qrels
qrels = pd.read_csv('./toy_data/qrels.csv', dtype=str)
qrels = qrels.astype({'label': 'int32'})


#prints
print(docs.shape)
print(docs.head())

print(queries.shape)
print(queries.head())

print(qrels.shape)
print(qrels.head())

(2453, 2)
     docno                                               text
0   935016  he emigrated to france with his family in 1956...
1  2360440  after being ambushed by the germans in novembe...
2   347765  she was the second ship named for captain alex...
3  1969335  world war ii was a global war that was under w...
4  1576938  the ship was ordered on 2 april 1942 laid down...
(9, 2)
       qid                 query
0  1015979    president of chile
1     2674    computer animation
2   340095  2020 summer olympics
3  1502917         train station
4     2574       chinese cuisine
(2454, 4)
       qid    docno  label iteration
0  1015979  1015979      2         0
1  1015979  2226456      1         0
2  1015979  1514612      1         0
3  1015979  1119171      1         0
4  1015979  1053174      1         0


# Dense IR - Using Dense Passage Retrieval (DPR)

In [4]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

  from tqdm.autonotebook import tqdm


In [5]:
#IMPLEMENTED MODEL FROM https://github.com/beir-cellar/beir
#https://github.com/beir-cellar/beir/blob/main/beir/retrieval/models/dpr.py

from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast
from typing import Union, List, Dict, Tuple
from tqdm.autonotebook import trange
import torch

class DPR:
    def __init__(self, model_path: Union[str, Tuple] = None, **kwargs):
        # Query tokenizer and model
        self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(model_path[0])
        self.q_model = DPRQuestionEncoder.from_pretrained(model_path[0])
        self.q_model.cuda()
        self.q_model.eval()

        # Context tokenizer and model
        self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(model_path[1])
        self.ctx_model = DPRContextEncoder.from_pretrained(model_path[1])
        self.ctx_model.cuda()
        self.ctx_model.eval()

    def encode_queries(self, queries: List[str], batch_size: int = 16, **kwargs) -> torch.Tensor:
        query_embeddings = []
        with torch.no_grad():
            for start_idx in trange(0, len(queries), batch_size):
                encoded = self.q_tokenizer(queries[start_idx:start_idx+batch_size], truncation=True, padding=True, return_tensors='pt')
                model_out = self.q_model(encoded['input_ids'].cuda(), attention_mask=encoded['attention_mask'].cuda())
                #model_out = self.q_model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
                query_embeddings += model_out.pooler_output

        return torch.stack(query_embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 8, **kwargs) -> torch.Tensor:

        corpus_embeddings = []
        with torch.no_grad():
            for start_idx in trange(0, len(corpus), batch_size):
                #titles = [row['title'] for row in corpus[start_idx:start_idx+batch_size]]
                texts = [row['text']  for row in corpus[start_idx:start_idx+batch_size]]
                #encoded = self.ctx_tokenizer(titles, texts, truncation='longest_first', padding=True, return_tensors='pt')
                encoded = self.ctx_tokenizer(texts, truncation='longest_first', padding=True, return_tensors='pt')
                model_out = self.ctx_model(encoded['input_ids'].cuda(), attention_mask=encoded['attention_mask'].cuda())
                #model_out = self.ctx_model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
                corpus_embeddings += model_out.pooler_output.detach()

        return torch.stack(corpus_embeddings)

In [6]:
new_docs = {}
for i in range(len(docs)):
    new_docs[docs['docno'][i]] = {'text' : docs['text'][i]}

In [7]:
new_queries = {}
for i in range(len(queries)):
    new_queries[queries['qid'][i]] = queries['query'][i]

In [8]:
new_qrels = {}
for i in range(len(qrels)):
    new_qrels[qrels['qid'][i]] = {qrels['docno'][i] : int(qrels['label'][i])}

In [9]:
model_dpr = DRES(DPR((
     "facebook/dpr-question_encoder-multiset-base",
     "facebook/dpr-ctx_encoder-multiset-base"), batch_size=16))
retriever_dpr = EvaluateRetrieval(model_dpr, score_function="dot") # or "dot" for dot-product
results_dpr = retriever_dpr.retrieve(new_docs, new_queries)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRCont

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [18]:
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b", batch_size=16))
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
#https://www.sbert.net/docs/pretrained-models/msmarco-v3.html
#model_ance = DRES(models.SentenceBERT('msmarco-distilroberta-base-v3'))
model_ance = DRES(models.SentenceBERT('msmarco-roberta-base-v3', batch_size=16))
#model_ance = DRES(models.SentenceBERT('msmarco-distilbert-base-tas-b'))
retriever_ance = EvaluateRetrieval(model_ance, score_function="cos_sim")

#### Retrieve dense results (format of results is identical to qrels)
results_ance = retriever_ance.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
model_dpr_alt = DRES(models.SentenceBERT((
    "facebook-dpr-question_encoder-multiset-base",
    "facebook-dpr-ctx_encoder-multiset-base",
    " [SEP] "), batch_size=16))
retriever_dpr_alt = EvaluateRetrieval(model_dpr_alt, score_function="dot")
results_dpr_alt = retriever_dpr_alt.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] [10,100,100]
ndcg, _map, recall, precision = retriever_dpr.evaluate(new_qrels, results_dpr, [10,100,1000]) #retriever_dpr.k_values)
ndcg_alt, _map_alt, recall_alt, precision_alt = retriever.evaluate(new_qrels,results, [10,100,1000]) # retriever.k_values)
ndcg_ance, _map_ance, recall_ance, precision_ance = retriever_ance.evaluate(new_qrels, results_ance, [10,100,1000]) #retriever_ance.k_values)
ndcg_dpr_alt, _map_dpr_alt, recall_dpr_alt, precision_dpr_alt = retriever_dpr_alt.evaluate(new_qrels, results_dpr_alt, [10,100,1000]) #retriever_dpr_alt.k_values)


In [20]:
print("Original DPR:", ndcg)
print("Original Sentence BERT:", ndcg_alt)
print("Original ANCE:", ndcg_ance)
print("Alternative DPR", ndcg_dpr_alt)

Original DPR: {'NDCG@10': 0.16543, 'NDCG@100': 0.25085, 'NDCG@1000': 0.26321}
Original Sentence BERT: {'NDCG@10': 0.28915, 'NDCG@100': 0.33446, 'NDCG@1000': 0.35004}
Original ANCE: {'NDCG@10': 0.17526, 'NDCG@100': 0.27918, 'NDCG@1000': 0.30524}
Alternative DPR {'NDCG@10': 0.21626, 'NDCG@100': 0.28014, 'NDCG@1000': 0.30964}


In [15]:
print("Original DPR:", _map)
print("Original Sentence BERT:", _map_alt)
print("Original ANCE:", _map_ance)
print("Alternative DPR", _map_dpr_alt)

Original DPR: {'MAP@10': 0.08395, 'MAP@100': 0.09656, 'MAP@1000': 0.09678}
Original Sentence BERT: {'MAP@10': 0.20833, 'MAP@100': 0.21669, 'MAP@1000': 0.21749}
Original ANCE: {'MAP@10': 0.125, 'MAP@100': 0.15157, 'MAP@1000': 0.15223}
Alternative DPR {'MAP@10': 0.15309, 'MAP@100': 0.16235, 'MAP@1000': 0.16362}


In [16]:
print("Original DPR:", recall)
print("Original Sentence BERT:", recall_alt)
print("Original ANCE:", recall_ance)
print("Alternative DPR", recall_dpr_alt)

Original DPR: {'Recall@10': 0.44444, 'Recall@100': 0.88889, 'Recall@1000': 1.0}
Original Sentence BERT: {'Recall@10': 0.55556, 'Recall@100': 0.77778, 'Recall@1000': 0.88889}
Original ANCE: {'Recall@10': 0.33333, 'Recall@100': 0.77778, 'Recall@1000': 1.0}
Alternative DPR {'Recall@10': 0.44444, 'Recall@100': 0.77778, 'Recall@1000': 1.0}


In [17]:
print("Original DPR:", precision)
print("Original Sentence BERT:", precision_alt)
print("Original ANCE:", precision_ance)
print("Alternative DPR", precision_dpr_alt)

Original DPR: {'P@10': 0.04444, 'P@100': 0.00889, 'P@1000': 0.001}
Original Sentence BERT: {'P@10': 0.05556, 'P@100': 0.00778, 'P@1000': 0.00089}
Original ANCE: {'P@10': 0.03333, 'P@100': 0.00778, 'P@1000': 0.001}
Alternative DPR {'P@10': 0.04444, 'P@100': 0.00778, 'P@1000': 0.001}
