# Toy Examples DPR

In [1]:
#https://github.com/beir-cellar/beir/blob/main/examples/retrieval/evaluation/dense/evaluate_dpr.py
!pip install beir



# Import Libraries

In [2]:
import numpy as np
import pandas as pd

# Load Toy Data-Sets

In [3]:
#documents
docs = pd.read_csv('./toy_data/docs.csv', dtype=str)

#queries
queries = pd.read_csv('./toy_data/queries.csv', dtype=str)

#qrels
qrels = pd.read_csv('./toy_data/qrels.csv', dtype=str)
qrels = qrels.astype({'label': 'int32'})


#prints
print(docs.shape)
print(docs.head())

print(queries.shape)
print(queries.head())

print(qrels.shape)
print(qrels.head())

(2453, 2)
     docno                                               text
0   935016  he emigrated to france with his family in 1956...
1  2360440  after being ambushed by the germans in novembe...
2   347765  she was the second ship named for captain alex...
3  1969335  world war ii was a global war that was under w...
4  1576938  the ship was ordered on 2 april 1942 laid down...
(9, 2)
       qid                 query
0  1015979    president of chile
1     2674    computer animation
2   340095  2020 summer olympics
3  1502917         train station
4     2574       chinese cuisine
(2454, 4)
       qid    docno  label iteration
0  1015979  1015979      2         0
1  1015979  2226456      1         0
2  1015979  1514612      1         0
3  1015979  1119171      1         0
4  1015979  1053174      1         0


# Dense IR - Using Dense Passage Retrieval (DPR)

In [4]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

  from tqdm.autonotebook import tqdm


In [5]:
#IMPLEMENTED MODEL FROM https://github.com/beir-cellar/beir
#https://github.com/beir-cellar/beir/blob/main/beir/retrieval/models/dpr.py

from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast
from typing import Union, List, Dict, Tuple
from tqdm.autonotebook import trange
import torch

class DPR:
    def __init__(self, model_path: Union[str, Tuple] = None, **kwargs):
        # Query tokenizer and model
        self.q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(model_path[0])
        self.q_model = DPRQuestionEncoder.from_pretrained(model_path[0])
        self.q_model.cuda()
        self.q_model.eval()

        # Context tokenizer and model
        self.ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(model_path[1])
        self.ctx_model = DPRContextEncoder.from_pretrained(model_path[1])
        self.ctx_model.cuda()
        self.ctx_model.eval()

    def encode_queries(self, queries: List[str], batch_size: int = 16, **kwargs) -> torch.Tensor:
        query_embeddings = []
        with torch.no_grad():
            for start_idx in trange(0, len(queries), batch_size):
                encoded = self.q_tokenizer(queries[start_idx:start_idx+batch_size], truncation=True, padding=True, return_tensors='pt')
                model_out = self.q_model(encoded['input_ids'].cuda(), attention_mask=encoded['attention_mask'].cuda())
                #model_out = self.q_model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
                query_embeddings += model_out.pooler_output

        return torch.stack(query_embeddings)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 8, **kwargs) -> torch.Tensor:

        corpus_embeddings = []
        with torch.no_grad():
            for start_idx in trange(0, len(corpus), batch_size):
                #titles = [row['title'] for row in corpus[start_idx:start_idx+batch_size]]
                texts = [row['text']  for row in corpus[start_idx:start_idx+batch_size]]
                #encoded = self.ctx_tokenizer(titles, texts, truncation='longest_first', padding=True, return_tensors='pt')
                encoded = self.ctx_tokenizer(texts, truncation='longest_first', padding=True, return_tensors='pt')
                model_out = self.ctx_model(encoded['input_ids'].cuda(), attention_mask=encoded['attention_mask'].cuda())
                #model_out = self.ctx_model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
                corpus_embeddings += model_out.pooler_output.detach()

        return torch.stack(corpus_embeddings)

In [6]:
new_docs = {}
for i in range(len(docs)):
    new_docs[docs['docno'][i]] = {'text' : docs['text'][i]}

In [7]:
new_queries = {}
for i in range(len(queries)):
    new_queries[queries['qid'][i]] = queries['query'][i]

In [8]:
new_qrels = {}
for i in range(len(qrels)):
    new_qrels[qrels['qid'][i]] = {qrels['docno'][i] : int(qrels['label'][i])}

In [9]:
# https://huggingface.co/models
# DIFFERENT MODELS I CAN USE
#https://www.sbert.net/docs/pretrained-models/dpr.html
model_dpr = DRES(DPR((
     "facebook/dpr-question_encoder-multiset-base",
     "facebook/dpr-ctx_encoder-multiset-base"), batch_size=16))
retriever_dpr = EvaluateRetrieval(model_dpr, score_function="dot") # or "dot" for dot-product
results_dpr = retriever_dpr.retrieve(new_docs, new_queries)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRCont

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [10]:
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b", batch_size=16))
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
#https://www.sbert.net/docs/pretrained-models/msmarco-v3.html
#model_ance = DRES(models.SentenceBERT('msmarco-distilroberta-base-v3'))
model_ance = DRES(models.SentenceBERT('msmarco-roberta-base-v3', batch_size=16))
#model_ance = DRES(models.SentenceBERT('msmarco-distilbert-base-tas-b'))
retriever_ance = EvaluateRetrieval(model_ance, score_function="cos_sim")

#### Retrieve dense results (format of results is identical to qrels)
results_ance = retriever_ance.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
model_dpr_alt = DRES(models.SentenceBERT((
    "facebook-dpr-question_encoder-multiset-base",
    "facebook-dpr-ctx_encoder-multiset-base",
    " [SEP] "), batch_size=16))
retriever_dpr_alt = EvaluateRetrieval(model_dpr_alt, score_function="dot")
results_dpr_alt = retriever_dpr_alt.retrieve(new_docs, new_queries)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [13]:
#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] [10,100,100]
ndcg, _map, recall, precision = retriever_dpr.evaluate(new_qrels, results_dpr, [10,100,1000]) #retriever_dpr.k_values)
ndcg_alt, _map_alt, recall_alt, precision_alt = retriever.evaluate(new_qrels,results, [10,100,1000]) # retriever.k_values)
ndcg_ance, _map_ance, recall_ance, precision_ance = retriever_ance.evaluate(new_qrels, results_ance, [10,100,1000]) #retriever_ance.k_values)
ndcg_dpr_alt, _map_dpr_alt, recall_dpr_alt, precision_dpr_alt = retriever_dpr_alt.evaluate(new_qrels, results_dpr_alt, [10,100,1000]) #retriever_dpr_alt.k_values)


In [14]:
print("Original DPR:", ndcg)
print("Original Sentence BERT:", ndcg_alt)
print("Original ANCE:", ndcg_ance)
print("Alternative DPR", ndcg_dpr_alt)

Original DPR: {'NDCG@10': 0.16543, 'NDCG@100': 0.25085, 'NDCG@1000': 0.26321}
Original Sentence BERT: {'NDCG@10': 0.28915, 'NDCG@100': 0.33446, 'NDCG@1000': 0.35004}
Original ANCE: {'NDCG@10': 0.17526, 'NDCG@100': 0.27918, 'NDCG@1000': 0.30524}
Alternative DPR {'NDCG@10': 0.21626, 'NDCG@100': 0.28014, 'NDCG@1000': 0.30964}


In [15]:
print("Original DPR:", _map)
print("Original Sentence BERT:", _map_alt)
print("Original ANCE:", _map_ance)
print("Alternative DPR", _map_dpr_alt)

Original DPR: {'MAP@10': 0.08395, 'MAP@100': 0.09656, 'MAP@1000': 0.09678}
Original Sentence BERT: {'MAP@10': 0.20833, 'MAP@100': 0.21669, 'MAP@1000': 0.21749}
Original ANCE: {'MAP@10': 0.125, 'MAP@100': 0.15157, 'MAP@1000': 0.15223}
Alternative DPR {'MAP@10': 0.15309, 'MAP@100': 0.16235, 'MAP@1000': 0.16362}


In [16]:
print("Original DPR:", recall)
print("Original Sentence BERT:", recall_alt)
print("Original ANCE:", recall_ance)
print("Alternative DPR", recall_dpr_alt)

Original DPR: {'Recall@10': 0.44444, 'Recall@100': 0.88889, 'Recall@1000': 1.0}
Original Sentence BERT: {'Recall@10': 0.55556, 'Recall@100': 0.77778, 'Recall@1000': 0.88889}
Original ANCE: {'Recall@10': 0.33333, 'Recall@100': 0.77778, 'Recall@1000': 1.0}
Alternative DPR {'Recall@10': 0.44444, 'Recall@100': 0.77778, 'Recall@1000': 1.0}


In [17]:
print("Original DPR:", precision)
print("Original Sentence BERT:", precision_alt)
print("Original ANCE:", precision_ance)
print("Alternative DPR", precision_dpr_alt)

Original DPR: {'P@10': 0.04444, 'P@100': 0.00889, 'P@1000': 0.001}
Original Sentence BERT: {'P@10': 0.05556, 'P@100': 0.00778, 'P@1000': 0.00089}
Original ANCE: {'P@10': 0.03333, 'P@100': 0.00778, 'P@1000': 0.001}
Alternative DPR {'P@10': 0.04444, 'P@100': 0.00778, 'P@1000': 0.001}


# Alternative Approach - Does not work.

In [18]:
#https://haystack.deepset.ai/tutorials/09_dpr_training
!pip install farm-haystack[colab,inference,metrics]



In [19]:
from haystack.nodes import DensePassageRetriever
from haystack.utils import fetch_archive_from_http
from haystack.document_stores import InMemoryDocumentStore

In [20]:
#doc_dir = "PATH_TO_YOUR_DATA_DIR"
#train_filename = "TRAIN_FILENAME"
#dev_filename = "DEV_FILENAME"

query_model = "facebook/dpr-question_encoder-single-nq-base"
passage_model = "facebook/dpr-ctx_encoder-single-nq-base"

#save_dir = "../saved_models/dpr"

In [21]:
#UNSURE ABOUT document_store
retriever = DensePassageRetriever(
    document_store=InMemoryDocumentStore(),
    query_embedding_model=query_model,
    passage_embedding_model=passage_model
)

  return self.fget.__get__(instance, owner)()


In [22]:
#https://docs.haystack.deepset.ai/reference/retriever-api#densepassageretriever

In [23]:
retriever.train(
    data_dir='./toy_data/',
    train_filename='docs.csv',
    n_epochs=1,
    batch_size=16,
    grad_acc_steps=8,
    save_dir='./toy_data/',
    evaluate_every=3000,
    embed_title=True,
    num_positives=1,
    num_hard_negatives=1,
)

JSONDecodeError: ignored

In [None]:
#https://www.youtube.com/watch?v=DBsxUSUhfRg
#https://haystack.deepset.ai/tutorials/09_dpr_training

In [None]:
#Retriever Haystack API

#https://docs.haystack.deepset.ai/reference/retriever-api#densepassageretriever

#Document Haystack API

# https://docs.haystack.deepset.ai/reference/document-store-api#inmemorydocumentstore

In [24]:
!pip install --force-reinstall -v "SQLAlchemy==1.4.47"

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting SQLAlchemy==1.4.47
  Using cached SQLAlchemy-1.4.47-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Collecting greenlet!=0.4.17 (from SQLAlchemy==1.4.47)
  Using cached greenlet-3.0.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (613 kB)
Installing collected packages: greenlet, SQLAlchemy
  Attempting uninstall: greenlet
    Found existing installation: greenlet 3.0.2
    Uninstalling greenlet-3.0.2:
      Removing file or directory /usr/include/python3.10/greenlet/
      Removing file or directory /usr/local/lib/python3.10/dist-packages/greenlet-3.0.2.dist-info/
      Removing file or directory /usr/local/lib/python3.10/dist-packages/greenlet/
      Successfully uninstalled greenlet-3.0.2
  Attempting uninstall: SQLAlchemy
    Found existing installation: SQLAlchemy 1.4.47
    Uninstalling SQLAlchemy-1.4.47:
      Removing

In [26]:
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
#from haystack import Finder

In [27]:
docs2 = docs
docs2['content'] = docs2['text']
docs3 = docs2[['docno','content']]
docs3

Unnamed: 0,docno,content
0,935016,he emigrated to france with his family in 1956...
1,2360440,after being ambushed by the germans in novembe...
2,347765,she was the second ship named for captain alex...
3,1969335,world war ii was a global war that was under w...
4,1576938,the ship was ordered on 2 april 1942 laid down...
...,...,...
2448,912095,the second of these three came home tells of h...
2449,313045,with the introduction of new tiger ii tanks in...
2450,501294,it became a classic tankette design worldwide ...
2451,1710863,he entered army service as fahnenjunker and co...


In [29]:
# Initialize FAISSDocumentStore to store your documents
document_store = FAISSDocumentStore()

# Convert DataFrame to a list of dictionaries
documents = docs3.to_dict(orient='records')

# Write documents to the document store
document_store.write_documents(documents)

# Load your CSV file into the document store
#document_store.write_documents(file_path='./toy_data/docs.csv')

Writing Documents: 10000it [00:04, 2227.21it/s]


In [30]:
# Initialize Dense Passage Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  use_gpu=True,  # Set it to False if you don't have a GPU
                                  )

In [31]:
# Update embeddings in the DocumentStore
document_store.update_embeddings(retriever)

Updating Embedding:   0%|          | 0/2450 [00:00<?, ? docs/s]
Create embeddings:   0%|          | 0/2464 [00:00<?, ? Docs/s][A
Create embeddings:   1%|          | 16/2464 [00:00<00:44, 55.58 Docs/s][A
Create embeddings:   1%|▏         | 32/2464 [00:00<00:38, 63.62 Docs/s][A
Create embeddings:   2%|▏         | 48/2464 [00:00<00:36, 66.14 Docs/s][A
Create embeddings:   3%|▎         | 64/2464 [00:00<00:35, 67.66 Docs/s][A
Create embeddings:   3%|▎         | 80/2464 [00:01<00:36, 65.16 Docs/s][A
Create embeddings:   4%|▍         | 96/2464 [00:01<00:35, 66.19 Docs/s][A
Create embeddings:   5%|▍         | 112/2464 [00:01<00:35, 67.16 Docs/s][A
Create embeddings:   5%|▌         | 128/2464 [00:01<00:34, 67.56 Docs/s][A
Create embeddings:   6%|▌         | 144/2464 [00:02<00:35, 65.79 Docs/s][A
Create embeddings:   6%|▋         | 160/2464 [00:02<00:34, 66.01 Docs/s][A
Create embeddings:   7%|▋         | 176/2464 [00:02<00:34, 66.60 Docs/s][A
Create embeddings:   8%|▊         | 192/