In [1]:
# Imports

import time
import pandas as pd
import numpy as np
import pyterrier as pt

## Implementation of Baseline Model (BM25)
Craig Macdonald and Nicola Tonellotto. 2020. Declarative Experimentation in Information Retrieval using PyTerrier. In Proceedings of the 2020 ACM SIGIR on International Conference on Theory of Information Retrieval (ICTIR '20). Association for Computing Machinery, New York, NY, USA, 161–168. <br> DOI:https://doi-org.tudelft.idm.oclc.org/10.1145/3409256.3409829

In [2]:
# Initialize PyTerrier

if not pt.started():
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
!tar xvfz collections/msmarco_passage/collection.tar.gz -C collections/msmarco_passage

collection.tsv


In [6]:
dataset = pt.get_dataset("trec-deep-learning-passages")

In [7]:
# Iterator for msmarco passage

def msmarco_generate():
    
    with pt.io.autoopen('collections/msmarco_passage/collection.tsv', 'rt') as corpusfile:
        for l in corpusfile:
            docno, passage = l.split("\t")
            yield {'docno' : docno, 'text' : passage}

In [8]:
# Terrier Indexing

indexer = pt.IterDictIndexer("./indexes/msmarco-passage")
indexref = indexer.index(msmarco_generate(), fields=['docno', 'text'], meta_lengths=[20, 4096])
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

16:26:31.714 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 5 empty documents
Number of documents: 8841823
Number of terms: 1170693
Number of postings: 215248445
Number of fields: 2
Number of tokens: 288769520
Field names: [docno, text]
Positions:   false



In [9]:
# Batch Retrievalfor BM25 

BM25_baseline = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

In [11]:
# Evaluation

start_time = time.time()

result = pt.Experiment([BM25_baseline], 
                       dataset.get_topics("test-2019"), 
                       dataset.get_qrels("test-2019"),
                       eval_metrics=["recip_rank", "ndcg_cut_10","map"])

print(f'Time taken : {time.time() - start_time}')

BR(BM25): 100%|████████████████████████████████████████████████████████████████████████| 200/200 [00:34<00:00,  5.75q/s]


Time taken : 35.78198957443237


In [12]:
result

Unnamed: 0,name,recip_rank,ndcg_cut_10,map
0,BR(BM25),0.795028,0.479592,0.370014


In [13]:
# Topic wise analysis

start_time = time.time()

result_per_query = pt.Experiment([BM25_baseline], 
                                 dataset.get_topics("test-2019"),
                                 dataset.get_qrels("test-2019"), 
                                 eval_metrics=["recip_rank", "ndcg_cut_10","map"],
                                 perquery=True)

print(f'Time taken : {time.time() - start_time}')

BR(BM25): 100%|████████████████████████████████████████████████████████████████████████| 200/200 [00:20<00:00,  9.69q/s]


Time taken : 21.154677867889404


  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')


In [17]:
result_per_query = result_per_query[result_per_query["value"].notnull()]

In [22]:
result_per_query[result_per_query["measure"]=='map'].sort_values(by="value")

Unnamed: 0,name,qid,measure,value
96,BR(BM25),1121709,map,0.000239
6,BR(BM25),1063750,map,0.003193
93,BR(BM25),443396,map,0.005157
12,BR(BM25),489204,map,0.055242
39,BR(BM25),962179,map,0.058701
24,BR(BM25),527433,map,0.064043
78,BR(BM25),1106007,map,0.094742
27,BR(BM25),1037798,map,0.109533
48,BR(BM25),451602,map,0.122209
66,BR(BM25),833860,map,0.139674


In [24]:
result_per_query[result_per_query["measure"]=='ndcg_cut_10'].sort_values(by="value")

Unnamed: 0,name,qid,measure,value
41,BR(BM25),962179,ndcg_cut_10,0.0
98,BR(BM25),1121709,ndcg_cut_10,0.0
95,BR(BM25),443396,ndcg_cut_10,0.069431
50,BR(BM25),451602,ndcg_cut_10,0.122273
8,BR(BM25),1063750,ndcg_cut_10,0.142811
29,BR(BM25),1037798,ndcg_cut_10,0.152866
47,BR(BM25),148538,ndcg_cut_10,0.163883
86,BR(BM25),490595,ndcg_cut_10,0.223715
122,BR(BM25),1113437,ndcg_cut_10,0.226504
26,BR(BM25),527433,ndcg_cut_10,0.293011


In [26]:
result_per_query[result_per_query["measure"]=='recip_rank'].sort_values(by="value")

Unnamed: 0,name,qid,measure,value
97,BR(BM25),1121709,recip_rank,0.002874
40,BR(BM25),962179,recip_rank,0.025
94,BR(BM25),443396,recip_rank,0.125
46,BR(BM25),148538,recip_rank,0.2
49,BR(BM25),451602,recip_rank,0.333333
115,BR(BM25),405717,recip_rank,0.333333
28,BR(BM25),1037798,recip_rank,0.333333
7,BR(BM25),1063750,recip_rank,0.333333
121,BR(BM25),1113437,recip_rank,0.5
34,BR(BM25),264014,recip_rank,0.5
