In [1]:
# Imports

import time
import pandas as pd
import numpy as np
import pyterrier as pt

In [2]:
# Bring up PyTerrier, requesting the terrier-prf package at boot.
# The started() guard skips init when PyTerrier is already running.
TERRIER_BOOT_PACKAGES = ["com.github.terrierteam:terrier-prf:-SNAPSHOT"]

if not pt.started():
    pt.init(boot_packages=TERRIER_BOOT_PACKAGES)

  warn("From PyTerrier 0.8, Python 3.7 will be required, you currently have %s" % platform.python_version())


PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [19]:
# Handle to the "trec-deep-learning-passages" dataset; the index, topics and
# qrels used in this notebook are all obtained through this object.
dataset = pt.get_dataset("trec-deep-learning-passages")

### Index Statistics

In [34]:
# Fetch the stemmed Terrier index for the dataset and print its collection statistics
index_stemmed = dataset.get_index('terrier_stemmed')
collection_stats = pt.IndexFactory.of(index_stemmed).getCollectionStatistics()
print(collection_stats.toString())

Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 1
Number of tokens: 288759529
Field names: [text]
Positions:   false



### BM25 Baseline Topic Wise Analysis

In [5]:
# BM25 retriever over the stemmed index.
# NOTE(review): verbose=True is presumably what produces the "BR(BM25)" progress bars below - confirm.
BM25_baseline_stemmed = pt.BatchRetrieve(index_stemmed, wmodel="BM25", verbose=True)

Reference: [per-query effectiveness in PyTerrier experiments](https://pyterrier.readthedocs.io/en/latest/experiments.html#per-query-effectiveness)

In [7]:
# Evaluate the BM25 baseline on test-2019.
# perquery=True keeps one row per (topic, measure) instead of averaged scores,
# which is what the topic-wise analysis below needs.
eval_measures = ["recip_rank", "ndcg_cut_10", "map"]

start_time = time.time()

result_per_query = pt.Experiment(
    [BM25_baseline_stemmed],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=eval_measures,
    perquery=True,
)

elapsed_seconds = time.time() - start_time
print(f'Time taken : {elapsed_seconds}')

BR(BM25): 100%|██████████| 200/200 [00:18<00:00, 10.85q/s]


Time taken : 19.754709243774414


  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')


In [8]:
# Discard rows without a score: topics missing from the qrels are reported as NaN
result_per_query = result_per_query.dropna(subset=["value"])

In [9]:
# Per-query MAP in ascending order, so the weakest topics are listed first
is_map_row = result_per_query["measure"] == 'map'
result_per_query.loc[is_map_row].sort_values(by="value")

Unnamed: 0,name,qid,measure,value
96,BR(BM25),1121709,map,0.000239
6,BR(BM25),1063750,map,0.003193
93,BR(BM25),443396,map,0.005157
12,BR(BM25),489204,map,0.055242
39,BR(BM25),962179,map,0.058701
24,BR(BM25),527433,map,0.064043
78,BR(BM25),1106007,map,0.094742
27,BR(BM25),1037798,map,0.109533
48,BR(BM25),451602,map,0.122208
66,BR(BM25),833860,map,0.139674


In [16]:
# test-2019 topics: one row per query, with columns qid and query
topics = dataset.get_topics("test-2019")
topics.head()

Unnamed: 0,qid,query
0,1108939,what slows down the flow of blood
1,1112389,what is the county for grand rapids mn
2,792752,what is ruclip
3,1119729,what do you do when you have a nosebleed from ...
4,1105095,where is sugar lake lodge located


In [20]:
# test-2019 relevance judgements: rows of (qid, docno, label)
qrels = dataset.get_qrels("test-2019")
qrels.head()

Unnamed: 0,qid,docno,label
0,19335,1017759,0
1,19335,1082489,0
2,19335,109063,0
3,19335,1160863,0
4,19335,1160871,0


In [21]:
# Run the BM25 baseline over the test-2019 topics; each result row carries
# qid, docid, docno, rank, score and the query text.
BM25_baseline_results = BM25_baseline_stemmed(topics)
BM25_baseline_results.head()

BR(BM25): 100%|██████████| 200/200 [00:17<00:00, 11.26q/s]


Unnamed: 0,qid,docid,docno,rank,score,query
0,1108939,4069373,4069373,0,36.189054,what slows down the flow of blood
1,1108939,4744533,4744533,1,35.865644,what slows down the flow of blood
2,1108939,7454708,7454708,2,34.213639,what slows down the flow of blood
3,1108939,7724054,7724054,3,33.891239,what slows down the flow of blood
4,1108939,841975,841975,4,33.763758,what slows down the flow of blood


### Find the worst performing topics

In [23]:
# Collect the qids of the weakest topics, one sub-list per metric.
# This cell starts the collection afresh with the five lowest nDCG@10 topics.

ndcg_rows = result_per_query[result_per_query["measure"] == 'ndcg_cut_10']
worst_by_ndcg = ndcg_rows.sort_values(by="value").head(5)

worst_perf_qids = [list(worst_by_ndcg['qid'])]
worst_by_ndcg

Unnamed: 0,name,qid,measure,value
41,BR(BM25),962179,ndcg_cut_10,0.0
98,BR(BM25),1121709,ndcg_cut_10,0.0
95,BR(BM25),443396,ndcg_cut_10,0.069431
50,BR(BM25),451602,ndcg_cut_10,0.122273
8,BR(BM25),1063750,ndcg_cut_10,0.142811


In [24]:
# Add the five topics with the lowest reciprocal rank to the collection
recip_rank_rows = result_per_query[result_per_query["measure"] == 'recip_rank']
worst_by_recip_rank = recip_rank_rows.sort_values(by="value").head(5)

worst_perf_qids.append(list(worst_by_recip_rank['qid']))
worst_by_recip_rank

Unnamed: 0,name,qid,measure,value
97,BR(BM25),1121709,recip_rank,0.002874
40,BR(BM25),962179,recip_rank,0.025
94,BR(BM25),443396,recip_rank,0.125
46,BR(BM25),148538,recip_rank,0.2
49,BR(BM25),451602,recip_rank,0.333333


In [30]:
# Add the five topics with the lowest MAP to the collection
map_rows = result_per_query[result_per_query["measure"] == 'map']
worst_by_map = map_rows.sort_values(by="value").head(5)

worst_perf_qids.append(list(worst_by_map['qid']))
worst_by_map

Unnamed: 0,name,qid,measure,value
96,BR(BM25),1121709,map,0.000239
6,BR(BM25),1063750,map,0.003193
93,BR(BM25),443396,map,0.005157
12,BR(BM25),489204,map,0.055242
39,BR(BM25),962179,map,0.058701


In [26]:
# Get the unique qids across the per-metric "worst 5" lists.
# Flatten by hand instead of np.array(...).flatten(): NumPy only flattens when
# every sub-list has the same length, so a ragged collection, or one that mixes
# bare qids with a freshly appended sub-list (cells re-run out of order), would
# either fail or leave unhashable lists for set().
flat_qids = []
for entry in worst_perf_qids:
    if isinstance(entry, (list, tuple)):
        flat_qids.extend(entry)
    else:
        # Already a bare qid from an earlier run of this cell
        flat_qids.append(entry)

# Sort so the displayed order is reproducible; set iteration order is not.
worst_perf_qids = sorted(set(flat_qids))
worst_perf_qids

['489204', '443396', '962179', '451602', '1063750', '1121709', '148538']

In [11]:
# Look up the query text for one of the weak qids; the search() call further
# down takes the query text rather than the qid.
target_qid = "962179"
topics.loc[topics["qid"] == target_qid]

Unnamed: 0,qid,query
76,962179,when was the salvation army founded


### Crosscheck relevance of top ranked passages
Reference: [passage ranking results specific to a query](https://github.com/terrier-org/cikm2021tutorial/blob/main/notebooks/notebook1.ipynb)

In [28]:
# Rank passages for a single free-text query and keep at most the first 100 rows.
# Note: the rows carry search()'s own qid (1 in the output below), not the TREC qid 962179.
baselineRanks_perQuery = BM25_baseline_stemmed.search("when was the salvation army founded").head(100)
baselineRanks_perQuery

BR(BM25): 100%|██████████| 1/1 [00:00<00:00, 14.72q/s]


Unnamed: 0,qid,docid,docno,rank,score,query
0,1,8689054,8689054,0,38.471748,when was the salvation army founded
1,1,4511501,4511501,1,38.190918,when was the salvation army founded
2,1,4511504,4511504,2,37.620726,when was the salvation army founded
3,1,8689056,8689056,3,37.273490,when was the salvation army founded
4,1,5773189,5773189,4,36.879666,when was the salvation army founded
...,...,...,...,...,...,...
95,1,7428148,7428148,95,28.872857,when was the salvation army founded
96,1,4606387,4606387,96,28.859203,when was the salvation army founded
97,1,181089,181089,97,28.833306,when was the salvation army founded
98,1,5427064,5427064,98,28.731822,when was the salvation army founded


In [14]:
# All judged passages (qid, docno, label) for the query under inspection
judged_for_query = qrels["qid"] == "962179"
required_query_relevance = qrels.loc[judged_for_query, ["qid", "docno", "label"]]

In [15]:
# Keep only the judgements with a label above 2 for the query under inspection.
# The name mostRelavantPassages (sic) is kept because the next cell reads it.
qrel_columns = ["qid", "docno", "label"]
requiredQueryQrels = qrels.loc[qrels["qid"] == "962179", qrel_columns]
isMostRelevant = requiredQueryQrels["label"] > 2
mostRelavantPassages = requiredQueryQrels.loc[isMostRelevant]
mostRelavantPassages

Unnamed: 0,qid,docno,label
5351,962179,2329692,3
5352,962179,2329693,3
5356,962179,2329697,3
5358,962179,2329699,3
5360,962179,2329701,3
5363,962179,2978864,3
5365,962179,2978866,3
5387,962179,3705165,3
5394,962179,3896632,3
5406,962179,4511499,3


In [31]:
# Report, passage by passage, which of the most relevant passages appear among
# the retained BM25 results for the query, then print the overall hit count.

listOfDocNo = mostRelavantPassages["docno"].tolist()

# One membership pass over the ranked docnos instead of filtering the frame per passage
wasRetrieved = mostRelavantPassages["docno"].isin(baselineRanks_perQuery["docno"]).tolist()

for doc, found in zip(listOfDocNo, wasRetrieved):
    if found:
        print(f'doc present: {doc}')
    else:
        print(f'doc absent: {doc}')

revCounter = sum(wasRetrieved)
print(f'{revCounter}/{len(listOfDocNo)} of the most relevant passages in top 100 ranked passages')

doc absent: 2329692
doc absent: 2329693
doc present: 2329697
doc absent: 2329699
doc absent: 2329701
doc absent: 2978864
doc present: 2978866
doc absent: 3705165
doc absent: 3896632
doc present: 4511499
doc absent: 4606386
doc present: 4606387
doc absent: 536176
doc absent: 5653659
doc absent: 5919340
doc absent: 5919342
doc absent: 6898289
doc present: 6980697
doc absent: 8785367
doc absent: 8785371
5/20 of the most relevant passages in top 100 ranked passages
