In [1]:
# [COLAB] Installations

# %%capture
# !pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier
# !pip install --upgrade git+https://github.com/terrierteam/pyterrier_doc2query.git

In [2]:
# Imports

import time, os
import pandas as pd
import numpy as np
import pyterrier as pt

In [3]:
# Initialize PyTerrier

if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
dataset = pt.get_dataset("trec-deep-learning-passages")

#### Index Statistics

In [5]:
# Terrier Load Index without stemming

index_unstemmed = dataset.get_index('terrier_unstemmed')
print(pt.IndexFactory.of(index_unstemmed).getCollectionStatistics().toString())

Number of documents: 8841823
Number of terms: 1372086
Number of postings: 356030265
Number of fields: 1
Number of tokens: 511225848
Field names: [text]
Positions:   false



In [6]:
# Terrier Load Index with stemming

index = dataset.get_index('terrier_stemmed')
print(pt.IndexFactory.of(index).getCollectionStatistics().toString())

Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 1
Number of tokens: 288759529
Field names: [text]
Positions:   false



In [7]:
# Terrier Load Index with stemming and doc2query

index_doc2query = dataset.get_index('terrier_stemmed_docT5query')
print(pt.IndexFactory.of(index_doc2query).getCollectionStatistics().toString())

Number of documents: 8841823
Number of terms: 2341829
Number of postings: 330953705
Number of fields: 1
Number of tokens: 1453630605
Field names: [text]
Positions:   false



#### BM25 Baseline Topic Wise Analysis

In [None]:
collection = dataset.get_corpus()[0]

In [8]:
topics = dataset.get_topics("test-2019")
topics.head()

Unnamed: 0,qid,query
0,1108939,what slows down the flow of blood
1,1112389,what is the county for grand rapids mn
2,792752,what is ruclip
3,1119729,what do you do when you have a nosebleed from ...
4,1105095,where is sugar lake lodge located


In [9]:
qrels = dataset.get_qrels("test-2019")
qrels.head()

Unnamed: 0,qid,docno,label
0,19335,1017759,0
1,19335,1082489,0
2,19335,109063,0
3,19335,1160863,0
4,19335,1160871,0


In [12]:
BM25_baseline = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

In [13]:
BM25_baseline_results = BM25_baseline(topics)

BR(BM25): 100%|████████████████████████████████████████████████████████████████| 200/200 [00:12<00:00, 16.20q/s]


In [14]:
BM25_baseline_results.head()

Unnamed: 0,qid,docid,docno,rank,score,query
0,1108939,4069373,4069373,0,36.189054,what slows down the flow of blood
1,1108939,4744533,4744533,1,35.865644,what slows down the flow of blood
2,1108939,7454708,7454708,2,34.213639,what slows down the flow of blood
3,1108939,7724054,7724054,3,33.891239,what slows down the flow of blood
4,1108939,841975,841975,4,33.763758,what slows down the flow of blood


In [15]:
# Topic wise evaluations

start_time = time.time()

result_per_query = pt.Experiment([BM25_baseline], 
                                 dataset.get_topics("test-2019"),
                                 dataset.get_qrels("test-2019"), 
                                 eval_metrics=["recip_rank", "ndcg_cut_10", "map"],
                                 perquery=True)

print(f'Time taken : {time.time() - start_time}')

BR(BM25): 100%|████████████████████████████████████████████████████████████████| 200/200 [00:11<00:00, 17.01q/s]


Time taken : 12.68502163887024


  warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.')


#### Find the worst performing topics

In [16]:
# Since qrel does not contain all topics remove null rows

result_per_query = result_per_query[result_per_query["value"].notnull()]

In [17]:
# List of worst performing qids

worst_perf_qids = []

In [18]:
result_per_query[result_per_query["measure"]=='ndcg_cut_10'].sort_values(by="value").head(5)

Unnamed: 0,name,qid,measure,value
41,BR(BM25),962179,ndcg_cut_10,0.0
98,BR(BM25),1121709,ndcg_cut_10,0.0
95,BR(BM25),443396,ndcg_cut_10,0.069431
50,BR(BM25),451602,ndcg_cut_10,0.122273
8,BR(BM25),1063750,ndcg_cut_10,0.142811


In [19]:
worst_perf_qids.append(list(result_per_query[result_per_query["measure"]=='ndcg_cut_10'].sort_values(by="value").head(5)['qid']))

In [20]:
result_per_query[result_per_query["measure"]=='recip_rank'].sort_values(by="value").head(5)

Unnamed: 0,name,qid,measure,value
97,BR(BM25),1121709,recip_rank,0.002874
40,BR(BM25),962179,recip_rank,0.025
94,BR(BM25),443396,recip_rank,0.125
46,BR(BM25),148538,recip_rank,0.2
49,BR(BM25),451602,recip_rank,0.333333


In [21]:
worst_perf_qids.append(list(result_per_query[result_per_query["measure"]=='recip_rank'].sort_values(by="value").head(5)['qid']))

In [22]:
result_per_query[result_per_query["measure"]=='map'].sort_values(by="value").head(5)

Unnamed: 0,name,qid,measure,value
96,BR(BM25),1121709,map,0.000239
6,BR(BM25),1063750,map,0.003193
93,BR(BM25),443396,map,0.005157
12,BR(BM25),489204,map,0.055242
39,BR(BM25),962179,map,0.058701


In [23]:
worst_perf_qids.append(list(result_per_query[result_per_query["measure"]=='map'].sort_values(by="value").head(5)['qid']))

In [24]:
# Get unique list of top 5 worst performing queries
worst_perf_qids = np.array(worst_perf_qids).flatten().tolist()
worst_perf_qids = list(set(worst_perf_qids))

In [25]:
worst_perf_qids

['443396', '489204', '1121709', '451602', '962179', '1063750', '148538']

#### Further Investigating the Worst Peroming Queries

In [26]:
indFactoryObj_stemmed = pt.IndexFactory.of(index)
indFactoryObj = pt.IndexFactory.of(index_unstemmed)

In [27]:
metaIndex = indFactoryObj.getMetaIndex()
invIndex = indFactoryObj.getInvertedIndex()
dirIndex = indFactoryObj.getDirectIndex()
docIndex = indFactoryObj.getDocumentIndex()
lexFactory = indFactoryObj.getLexicon()

In [44]:
def findInDocuments(term, ret_docno=False):
    
    numOccur = indFactoryObj.getLexicon()[term].getDocumentFrequency() if term in indFactoryObj.getLexicon() else 0
    
    if numOccur == 0:
        print('Term not found')
        return
    
    print(f'{term} occurs {numOccur} times in the collection')
    print(f'Probability of Occurence is {numOccur / indFactoryObj.getCollectionStatistics().getNumberOfTokens()}')
    
    if ret_docno:
        lexEntry = lexFactory.getLexiconEntry(term)

        try:
            for posting in invIndex.getPostings(lexEntry): 
                docno = metaIndex.getItem("docno", posting.getId())
                print(f'{docno}:{posting.getFrequency()} times')
        except:
            print('Term not found')

In [60]:
def findTermsInDocuments(docid):
    termList = []
    
    for posting in dirIndex.getPostings(docIndex.getDocumentEntry(docid)):
        termID = posting.getId()
        leEntry = lexFactory.getLexiconEntry(termID)
        termList.append(leEntry.getKey())
        # print(f'{leEntry.getKey()}: {posting.getFrequency()} times')
    
    print(termList)

In [30]:
topics[topics['qid'].isin(worst_perf_qids)]

Unnamed: 0,qid,query
26,1063750,why did the us volunterilay enter ww1
42,489204,right pelvic pain causes
76,962179,when was the salvation army founded
83,148538,difference between rn and bsn
84,451602,medicare s definition of mechanical ventilation
142,443396,lps laws definition
148,1121709,what are the three percenters


#### Query ID: #1063750 "why did the us volunterilay enter ww1"

In [31]:
qrels_1063750 = qrels[qrels['qid'] == '1063750']
qrels_1063750[qrels_1063750['label'] == 3]

Unnamed: 0,qid,docno,label
5827,1063750,4066863,3
5845,1063750,4337532,3
5971,1063750,7247262,3


In [42]:
findInDocuments('volunterilay') # Spelling error

Term not found


In [45]:
findInDocuments('voluntarily') # Word doesn't exist

voluntarily occurs 1814 times in the collection
Probability of Occurence is 3.5483338862787703e-06


In [61]:
findTermsInDocuments(4066863) # ww1 = world war (abbreviations)

['of', 'the', 'to', 'and', 'history', 'its', 'an', 'world', 'war', 'this', 'change', 'u', 'for', 'first', 'a', 'in', 'states', 'united', 'caused', 'time', 'economic', 'public', 'effects', 'alliance', 'sinking', 'entered', 'boat', 'unrestricted', 'allies', 'ties', 'sussex', 'allied', 'submarine', 'zimmerman', 'warfare', 'lusitania', 'telegram', 'outrage', 'torpedoing']


#### Query ID: #489204	"right pelvic pain causes"

In [62]:
qrels_489204 = qrels[qrels['qid'] == '489204']
qrels_489204[qrels_489204['label'] == 3]

Unnamed: 0,qid,docno,label
4332,489204,852257,3


In [63]:
findTermsInDocuments(489204)

['the', 'to', 'is', 'that', 'for', 's', 'a', 'from', '2', 'or', '1', 'data', 'services', 'obtained', 'contractor', 'medicare', 'report', 'contains', 'statistics', '04', 'specific', 'fi', 'provider', 'administrative', 'fiscal', 'intermediary', 'submitted', 'payments', 'pepper', 'claims', 'vulnerable', 'mac', 'improper', 'discharges', 'snf', 'ub']


#### Query ID: #962179	"when was the salvation army founded"

In [64]:
qrels_962179 = qrels[qrels['qid'] == '962179']
qrels_962179[qrels_962179['label'] == 3].head(5)

Unnamed: 0,qid,docno,label
5351,962179,2329692,3
5352,962179,2329693,3
5356,962179,2329697,3
5358,962179,2329699,3
5360,962179,2329701,3


In [65]:
findInDocuments('salvation')

salvation occurs 1560 times in the collection
Probability of Occurence is 3.0514888988946426e-06


In [66]:
findInDocuments('army')

army occurs 30472 times in the collection
Probability of Occurence is 5.960574982507536e-05


In [67]:
findTermsInDocuments(2329692)

['of', 'the', 'to', 'and', 'was', 'it', 'general', 'first', 's', 'a', 'from', 'army', 'in', 'had', 'not', 'by', 'who', 'william', 'became', 'second', 'charles', '10', 'saw', 'being', 'been', 'need', 'april', 'set', 'churches', 'his', '20', 'n', 'son', 'music', 'wrote', 'august', 'lay', 'england', 'founder', 'serving', 'originally', 'poem', '1929', 'heaven', 'enters', 'lindsay', 'honor', 'methodist', '1878', '1912', 'tremendous', 'salvation', 'succeeded', 'fulfilled', 'neighbor', 'victorian', 'mainstream', 'preacher', '1829', 'booth', 'ives', 'bramwell', 'vachel']


#### Query ID: #148538	"difference between rn and bsn"

In [None]:
qrels_148538  = qrels[qrels['qid'] == '148538']
qrels_148538 [qrels_148538 ['label'] == 3]

#### Query ID: #489204	"medicare s definition of mechanical ventilation"

In [None]:
qrels_489204 = qrels[qrels['qid'] == '489204']
qrels_489204[qrels_489204['label'] == 3]

#### Query ID: #443396	"lps laws definition"

In [None]:
qrels_443396 = qrels[qrels['qid'] == '443396']
qrels_443396[qrels_443396['label'] == 3]

#### Query ID: #1121709	"what are the three percenters"

In [None]:
qrels_1121709 = qrels[qrels['qid'] == '1121709']
qrels_1121709[qrels_1121709['label'] == 3]