<a href="https://colab.research.google.com/github/Gjel/Core-IR-project/blob/master/ProbabilisticModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PyTerrier Implementation of Probabilistic Model
Craig Macdonald and Nicola Tonellotto. 2020. Declarative Experimentation in Information Retrieval using PyTerrier. In Proceedings of the 2020 ACM SIGIR on International Conference on Theory of Information Retrieval (ICTIR '20). Association for Computing Machinery, New York, NY, USA, 161–168. https://doi.org/10.1145/3409256.3409829 <br> <br>

Reference: https://github.com/terrier-org/pyterrier

In [None]:
# Only mount if using generated index
#
# Optional, Colab-only step. The self-built Terrier indexes in this notebook
# are written to and read from paths under drive/MyDrive/indexes, so Google
# Drive must be mounted before running those cells. Skip this cell when using
# the pre-built indexes obtained through dataset.get_index(...).
# NOTE(review): the relative "drive/MyDrive/..." paths used later only resolve
# to this mount point if the working directory is /content — confirm.

from google.colab import drive
drive.mount('/content/drive')

In [1]:
%%capture
# Install PyTerrier and the doc2query plugin straight from their GitHub
# repositories; %%capture hides pip's output.
# NOTE(review): neither install is pinned to a tag or commit, so a fresh run
# may pull different versions than the ones that produced the recorded results.
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_doc2query.git

In [2]:
# Imports

import time, os
import pandas as pd
import numpy as np
import pyterrier as pt
from pyterrier_doc2query import Doc2Query

In [3]:
# Initialize PyTerrier
#
# pt.started() guards against calling pt.init() a second time when this cell
# is re-run in the same kernel. The terrier-prf package is loaded at start-up;
# presumably it provides the RM3 / AxiomaticQE rewriters used in the
# experiment cells — confirm against the PyTerrier documentation.

if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



In [4]:
# Dataset handle used throughout the notebook: later cells take the corpus
# file, the "test-2019" topics and qrels, and the ready-made indexes from it.
dataset = pt.get_dataset("trec-deep-learning-passages")

### Generate Index

In [6]:
# Iterator for msmarco passage

def msmarco_generate():
    """Yield one document dict per passage of the corpus file.

    Opens the first file returned by ``dataset.get_corpus()`` and expects each
    line to hold a document number and a passage separated by a tab.

    Yields:
        dict: ``{'docno': <document number>, 'text': <passage>}`` with the
        line terminator removed from the passage.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    with pt.io.autoopen(dataset.get_corpus()[0], 'rt') as corpusfile:
        for line in corpusfile:
            # A stray blank line (e.g. at the end of the file) carries no
            # document; skip it instead of aborting a long indexing run.
            if not line.strip():
                continue
            # Split on the first tab only, so a passage that itself contains
            # a tab stays intact instead of breaking the two-way unpacking.
            docno, passage = line.rstrip("\r\n").split("\t", 1)
            yield {'docno': docno, 'text': passage}

In [None]:
# Terrier Indexing: Time taken 2157.8492665290833 seconds

!rm -rf "drive/MyDrive/indexes/msmarco-passage"

!mkdir "drive/MyDrive/indexes/msmarco-passage"

start_time = time.time()

indexer = pt.IterDictIndexer("./drive/MyDrive/indexes/msmarco-passage")
indexref = indexer.index(msmarco_generate())

print(f'Time taken : {time.time() - start_time}')

index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

### Generate Index using doc-T5-query
Reference: https://github.com/terrierteam/pyterrier_doc2query

In [7]:
# Download t5-base.zip from https://github.com/castorini/docTTTTTquery

%%capture
!wget https://www.dropbox.com/s/q1nye6wfsvf5sen/t5-base.zip
!unzip t5-base.zip

In [None]:
# Initialize doc2query
#
# append=True appends the generated queries to each passage's "text", so the
# index built from this transformer contains the original passage plus its
# predicted queries. Writing the queries to out_attr="text" without append
# would replace the passage text with the generated queries alone.
# batch_size=8 passages are sent through the model per batch.

doc2query = Doc2Query(append=True, batch_size=8)

In [5]:
# Terrier Indexing with Doc-T5-Query: Time taken  seconds

if os.path.isdir('drive/MyDrive/indexes/msmarco-passage-docTTTTTquery'):
  !rm -rf "drive/MyDrive/indexes/msmarco-passage-docTTTTTquery"

!mkdir "drive/MyDrive/indexes/msmarco-passage-docTTTTTquery"

start_time = time.time()

indexer_doc2query = doc2query >> pt.IterDictIndexer("./drive/MyDrive/indexes/msmarco-passage-docTTTTTquery")
indexref_doc2query = indexer_doc2query.index(msmarco_generate())

print(f'Time taken : {time.time() - start_time}')

index_doc2query = pt.IndexFactory.of(indexref_doc2query)
print(index_doc2query.getCollectionStatistics().toString())

### BM25

#### 1a. Load Generated Index

In [None]:
# Terrier Load Index
#
# Opens the index previously written to Google Drive by the indexing cell and
# prints its collection statistics as a sanity check. Requires Drive to be
# mounted and the index to have been built.

index = pt.IndexFactory.of("./drive/MyDrive/indexes/msmarco-passage")
print(index.getCollectionStatistics().toString())

#### 1b. Load PyTerrier Pre-built Indexes

In [14]:
# Terrier Load Index with stemming
#
# Alternative to loading the self-generated index: fetch the 'terrier_stemmed'
# index variant from the dataset. This rebinds `index`, so whichever of the two
# loading cells ran last decides what the BM25 cells below operate on.

index = dataset.get_index('terrier_stemmed')

#### 2. Initialise BM25 Batch Retrieval

In [15]:
# Batch Retrieval for BM25
#
# BM25 retriever over `index`; it is used both as the baseline system and as
# the first and last stage of every query-expansion pipeline in the experiment.
# verbose=True produces the "BR(BM25)" progress bars seen in the output.

BM25_baseline = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

#### 3. Run Experiments

In [16]:
# Experiments with different query expansion techniques
#
# Compares plain BM25 with BM25 re-run on queries rewritten by four
# query-expansion rewriters, all evaluated on the "test-2019" topics/qrels.
# NOTE(review): in the recorded results the BM25+AQ row equals plain BM25 on
# every metric, which suggests the axiomatic expansion left the queries
# unchanged — worth checking.

# Rewriters built over the same index that BM25_baseline retrieves from.
AQ, Bo1, KL, RM3 = (
    pt.rewrite.AxiomaticQE(index),
    pt.rewrite.Bo1QueryExpansion(index),
    pt.rewrite.KLQueryExpansion(index),
    pt.rewrite.RM3(index),
)

start_time = time.time()

# Baseline first, then one retrieve >> expand >> retrieve pipeline per
# rewriter, in the same order as the labels given in `names`.
result = pt.Experiment(
    [BM25_baseline] + [BM25_baseline >> expander >> BM25_baseline
                       for expander in (AQ, Bo1, KL, RM3)],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["recip_rank", "ndcg_cut_10", "map"],
    names=["BM25", "BM25+AQ", "BM25+Bo1", "BM25+KL", "BM25+RM3"],
)

print(f'Time taken : {time.time() - start_time}')

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

Time taken : 237.44730186462402


In [17]:
# Show the experiment table: one row per system name, one column per metric.
result

Unnamed: 0,name,recip_rank,ndcg_cut_10,map
0,BM25,0.795028,0.47954,0.370004
1,BM25+AQ,0.795028,0.47954,0.370004
2,BM25+Bo1,0.788124,0.50862,0.399897
3,BM25+KL,0.785023,0.505715,0.397658
4,BM25+RM3,0.790123,0.515595,0.404489


### BM25 with doc2query

#### 1a. Load Generated Index

In [None]:
# Terrier Load Index with Doc-T5-Query
#
# Opens the doc2query index previously written to Google Drive by the
# indexing cell and prints its collection statistics as a sanity check.
# Requires Drive to be mounted and the index to have been built.

index_doc2query = pt.IndexFactory.of("./drive/MyDrive/indexes/msmarco-passage-docTTTTTquery")
print(index_doc2query.getCollectionStatistics().toString())

#### 1b. Load PyTerrier Pre-built Indexes

In [18]:
# Terrier Load Index with stemming and doc2query
#
# Alternative to loading the self-generated index: fetch the
# 'terrier_stemmed_docT5query' index variant from the dataset. This rebinds
# `index_doc2query`, so whichever of the two loading cells ran last decides
# what the cells below operate on.

index_doc2query = dataset.get_index('terrier_stemmed_docT5query')

#### 2. Initialise BM25 Batch Retrieval

In [19]:
# Batch Retrieval for BM25 with doc2query
#
# BM25 retriever over `index_doc2query`; it is used both as the baseline system
# and as the first and last stage of every query-expansion pipeline in the
# doc2query experiment. verbose=True produces the "BR(BM25)" progress bars.

BM25_doc2query = pt.BatchRetrieve(index_doc2query, wmodel="BM25", verbose=True)

#### 3. Run Experiments

In [20]:
# Experiments with different query expansion techniques + doc2query
#
# Same comparison as the plain BM25 experiment, but every retriever and
# rewriter works on the doc2query index. The names AQ, Bo1, KL and RM3 are
# rebound here to rewriters over `index_doc2query`.
# NOTE(review): the system labels are the same as in the plain BM25 experiment
# although these runs use the doc2query index; also, in the recorded results
# the BM25+AQ row equals plain BM25 on every metric — worth checking.

# Rewriters built over the same index that BM25_doc2query retrieves from.
AQ, Bo1, KL, RM3 = (
    pt.rewrite.AxiomaticQE(index_doc2query),
    pt.rewrite.Bo1QueryExpansion(index_doc2query),
    pt.rewrite.KLQueryExpansion(index_doc2query),
    pt.rewrite.RM3(index_doc2query),
)

start_time = time.time()

# Baseline first, then one retrieve >> expand >> retrieve pipeline per
# rewriter, in the same order as the labels given in `names`.
result_doc2query = pt.Experiment(
    [BM25_doc2query] + [BM25_doc2query >> expander >> BM25_doc2query
                        for expander in (AQ, Bo1, KL, RM3)],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["recip_rank", "ndcg_cut_10", "map"],
    names=["BM25", "BM25+AQ", "BM25+Bo1", "BM25+KL", "BM25+RM3"],
)

print(f'Time taken : {time.time() - start_time}')

11:20:56.047 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - OutOfMemoryError: Structure meta reading lookup file directly from disk
11:20:56.138 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - OutOfMemoryError: Structure meta reading data file directly from disk


BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

BR(BM25):   0%|          | 0/200 [00:00<?, ?q/s]

Time taken : 428.2844123840332


In [21]:
# Show the doc2query experiment table: one row per system name, one column per metric.
result_doc2query

Unnamed: 0,name,recip_rank,ndcg_cut_10,map
0,BM25,0.9,0.630835,0.45383
1,BM25+AQ,0.9,0.630835,0.45383
2,BM25+Bo1,0.870247,0.627738,0.477918
3,BM25+KL,0.870247,0.626656,0.480612
4,BM25+RM3,0.862026,0.617302,0.485503
