# Toy Examples of Sparse and Dense IR

# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Load Toy Data-Sets

In [2]:
# Read the three toy CSV files. dtype=str keeps every column (ids and labels included) as strings.
docs = pd.read_csv('./toy_data/docs.csv', dtype=str)        # documents
queries = pd.read_csv('./toy_data/queries.csv', dtype=str)  # queries
qrels = pd.read_csv('./toy_data/qrels.csv', dtype=str)      # qrels

# Print the shape and the first rows of each frame, in load order.
for frame in (docs, queries, qrels):
    print(frame.shape)
    print(frame.head())

(2453, 2)
     docno                                               text
0   935016  he emigrated to france with his family in 1956...
1  2360440  after being ambushed by the germans in novembe...
2   347765  she was the second ship named for captain alex...
3  1969335  world war ii was a global war that was under w...
4  1576938  the ship was ordered on 2 april 1942 laid down...
(9, 2)
       qid                 query
0  1015979    president of chile
1     2674    computer animation
2   340095  2020 summer olympics
3  1502917         train station
4     2574       chinese cuisine
(2454, 4)
       qid    docno label iteration
0  1015979  1015979     2         0
1  1015979  2226456     1         0
2  1015979  1514612     1         0
3  1015979  1119171     1         0
4  1015979  1053174     1         0


# Implement PyTerrier

In [3]:
# Import PyTerrier and initialise it only when it has not been started yet,
# so that re-running this cell does not call pt.init() a second time.
import pyterrier as pt
if not pt.started():
    # Requests the terrier-prf snapshot as a boot package.
    # NOTE(review): nothing in the visible cells uses it — confirm it is still needed.
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.9.2 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Build DEFAULT index
# Index the `text` column of `docs`, using `docno` as the document identifier,
# into ./indexes/default. overwrite=True is passed so the cell can be re-run
# against an existing index directory; blocks=True is passed as well (the
# statistics printed below report "Positions: true").
indexer = pt.DFIndexer("./indexes/default", overwrite=True, blocks=True)
index_ref = indexer.index(docs["text"], docs["docno"])
# Open the freshly built index and print its collection statistics.
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics().toString())

  for column, value in meta_column[1].iteritems():


Number of documents: 2453
Number of terms: 23693
Number of postings: 208487
Number of fields: 0
Number of tokens: 273373
Field names: []
Positions:   true



In [5]:
# Loading index
# Re-open the index from its data.properties file on disk instead of rebuilding it.
# This rebinds `index_ref` and `index`, which the index-building cell also assigns.
index_ref = pt.IndexRef.of("./indexes/default/data.properties")
index = pt.IndexFactory.of(index_ref)
# Print the collection statistics of the loaded index.
print(index.getCollectionStatistics().toString())

Number of documents: 2453
Number of terms: 23693
Number of postings: 208487
Number of fields: 0
Number of tokens: 273373
Field names: []
Positions:   true



# Sparse IR

In [6]:
# Build Sparse IR Systems: one retriever per weighting model, all over the same index.
tf, tf_idf, bm25 = (
    pt.BatchRetrieve(index, wmodel=wmodel) for wmodel in ("Tf", "TF_IDF", "BM25")
)

In [7]:
# Evaluate models on queries using PyTerrier Experiment Interface
# The qrels were read with dtype=str, so the relevance label is cast to an
# integer type before it is handed to the evaluation.
qrels = qrels.astype({'label': 'int32'})
# The Experiment call is the last expression of the cell, so its result
# (one row per system, one column per metric) is what the cell displays.
pt.Experiment(
    retr_systems = [tf, tf_idf, bm25],
    names =  ["TF", "TF-IDF", "BM25"],
    topics = queries, 
    qrels = qrels,
    eval_metrics = ["map", "ndcg", "ndcg_cut_10", "P_10"])

Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10
0,TF,0.610184,0.789583,0.851008,0.8
1,TF-IDF,0.622287,0.798228,0.840808,0.766667
2,BM25,0.628454,0.800955,0.842503,0.766667


# Dense IR - Using Re-Ranking with Transformers - Hybrid BM25 Cross-Encoder

In [8]:
# Load the cross-encoder used to re-score [query, passage] pairs in the re-ranking cell.
# NOTE(review): the checkpoint is referenced by name only; presumably it is
# downloaded on first use — confirm network access / caching for reproducibility.
from sentence_transformers import CrossEncoder 
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

In [9]:
# Retrieve top-K documents using BM25
# K is the number of BM25 candidates per query handed to the re-ranker.
K = 20 
# `%` on a PyTerrier transformer builds a pipeline with a rank cutoff of K.
top_k_bm25 = bm25 % K

# Run the cut-off pipeline on all queries; `init` holds the first-stage ranking
# (columns qid, docid, docno, rank, score, query in the displayed output).
init = top_k_bm25.transform(queries)
init

Unnamed: 0,qid,docid,docno,rank,score,query
0,1015979,205,1015979,0,20.927815,president of chile
1,1015979,2435,229754,1,18.834027,president of chile
2,1015979,2417,1186821,2,18.584731,president of chile
3,1015979,546,2226456,3,14.702179,president of chile
4,1015979,549,1514612,4,13.293413,president of chile
...,...,...,...,...,...,...
2266,8438,983,329834,15,9.048160,mexican cuisine
2267,8438,448,198481,16,7.538971,mexican cuisine
2268,8438,6,11904,17,7.484533,mexican cuisine
2269,8438,2053,409,18,7.351817,mexican cuisine


In [10]:
# Re-Ranking documents using a Cross-Encoder
#
# For every query, the BM25 candidates in `init` are re-scored with the
# cross-encoder `model`, ordered by decreasing score, and written as one line
# per (query, document) pair to ./outputs/<model_name>.run.
import os

cross_run = []
model_name = 'cross'
# Iterate by column name rather than unpacking rows by position, so the loop
# does not depend on the column order of `queries`.
for qid, query in zip(queries['qid'], queries['query']):
    # docnos that the first stage retrieved for this query
    candidate_docnos = init.loc[init['qid'] == qid, 'docno']

    # Select docno and text in a single filter so the two stay aligned row by row
    candidates = docs.loc[docs['docno'].isin(candidate_docnos), ['docno', 'text']]
    if candidates.empty:
        # Nothing was retrieved for this query: skip it rather than score an empty batch
        continue

    # Predict one score per [query, passage] pair
    model_inputs = [[query, text] for text in candidates['text']]
    scores = model.predict(model_inputs)

    # Sort by decreasing score; sorted() is stable, so ties keep their order in `docs`
    ranked = sorted(zip(candidates['docno'], scores), key=lambda pair: pair[1], reverse=True)

    # One run line per document: qid, a constant "0" column, docno, rank, score, run name
    for rank, (docno, score) in enumerate(ranked):
        cross_run.append(f"{qid} 0 {docno} {rank} {score} {model_name}")

# Store ranking on disk in TREC format
# exist_ok=True creates the directory only when missing, without a separate existence check
os.makedirs('./outputs', exist_ok=True)
with open(f"./outputs/{model_name}.run", "w") as f:
    f.writelines(line + "\n" for line in cross_run)

# Evaluate

In [11]:
import pytrec_eval

# Load qrels in a nested dictionary: qid -> {docno: integer relevance label}
qrels_dict = {}
for qid, docno, label in zip(qrels['qid'], qrels['docno'], qrels['label']):
    qrels_dict.setdefault(qid, {})[docno] = int(label)

# Build evaluator based on the qrels and metrics
metrics = {"ndcg_cut_5", "ndcg_cut_10", "P_5", "P_10"}
my_qrel = dict(qrels_dict)  # shallow copy of the qid -> judgements mapping
evaluator = pytrec_eval.RelevanceEvaluator(my_qrel, metrics)

In [12]:
# Load Cross-Encoder run
# NOTE: this rebinds `cross_run`. In the re-ranking cell it is the list of run
# lines written to disk; from here on it is whatever pytrec_eval.parse_run
# returns for that file, which is what the evaluation cell consumes.
with open("outputs/cross.run", 'r') as f_run:
    cross_run = pytrec_eval.parse_run(f_run)

In [13]:
# Evaluate Cross-Encoder model: per-query results keyed by query, then by metric
cross_evals = evaluator.evaluate(cross_run)

# Collect, for every metric, the list of per-query values
cross_metric2vals = {name: list() for name in metrics}
for per_query in cross_evals.values():
    for m, val in per_query.items():
        cross_metric2vals[m].append(val)

# Aggregate each metric over the queries and report it
cross_metric2avg = {}
for m in metrics:
    cross_metric2avg[m] = pytrec_eval.compute_aggregated_measure(m, cross_metric2vals[m])
    print(m, '\t', cross_metric2avg[m])

ndcg_cut_5 	 0.9296490683037353
P_5 	 0.9555555555555555
P_10 	 0.8111111111111111
ndcg_cut_10 	 0.8840626039066009


In [14]:
# Compare system performance
# Evaluate the three sparse systems on the same metric set that was used for
# the cross-encoder run, so that the rows are directly comparable.
experiment = pt.Experiment(
    retr_systems=[tf, tf_idf, bm25],
    names=['TF', 'TF-IDF', 'BM25'],
    topics=queries,
    qrels=qrels,
    eval_metrics=metrics)

# Label the cross-encoder averages so they fit the `name` column of the table.
cross_metric2avg['name'] = 'BM25 >> Cross-Encoder'
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat with a
# one-row frame adds the same row, aligning columns by name.
pd.concat([experiment, pd.DataFrame([cross_metric2avg])], ignore_index=True)

  experiment.append(cross_metric2avg, ignore_index=True)


Unnamed: 0,name,ndcg_cut_5,P_5,P_10,ndcg_cut_10
0,TF,0.861466,0.866667,0.8,0.851008
1,TF-IDF,0.87727,0.888889,0.766667,0.840808
2,BM25,0.878503,0.888889,0.766667,0.842503
3,BM25 >> Cross-Encoder,0.929649,0.955556,0.811111,0.884063


# Dense IR - Using Dense Passage Retrieval (DPR)

In [17]:
# SEE dpr_example.ipynb

In [20]:
#### Load the SBERT model and retrieve using dot-product similarity (see score_function below)
#model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
#retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
#results = retriever.retrieve(corpus, queries)
#results = retriever.retrieve(new_docs, new_queries)

In [21]:
#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
#ndcg, _map, recall, precision = retriever.evaluate(new_qrels, results, retriever.k_values)