## PyTerrier Implementation of a Learning to Rank (L2R) Model
Craig Macdonald and Nicola Tonellotto. 2020. Declarative Experimentation in Information Retrieval using PyTerrier. In Proceedings of the 2020 ACM SIGIR on International Conference on Theory of Information Retrieval (ICTIR '20). Association for Computing Machinery, New York, NY, USA, 161–168. https://doi.org/10.1145/3409256.3409829 <br> <br>

Reference: https://github.com/terrier-org/pyterrier

In [None]:
# [COLAB]
# Colab-specific cell: mounts Google Drive at /content/drive.
# The google.colab import is kept inside this cell, next to its only use.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# [COLAB] Installations

%%capture
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_doc2query.git

In [1]:
# Imports

import os
import pickle
import shutil
import time

import numpy as np
import pandas as pd
import pyterrier as pt
import xgboost as xgb
from pyterrier_doc2query import Doc2Query
from sklearn.ensemble import RandomForestRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
# Initialize PyTerrier
# pt.init() is skipped when pt.started() reports that PyTerrier is already
# running, which makes this cell safe to re-run.
# boot_packages requests the terrier-prf package at startup; presumably it
# backs the query-expansion rewriters used at the end of the notebook
# -- TODO confirm.

if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Dataset handle used by the rest of the notebook for topics, qrels, the corpus and prebuilt indexes
dataset = pt.get_dataset("trec-deep-learning-passages")

In [4]:
# Create a shorter training file: the topics for the first 100 query ids found
# in the training qrels, written one per line as "<qid>:<query>".
# The file acts as a cache: the whole cell is skipped when it already exists.
# NOTE(review): confirm that pt.io.read_topics(..., format="singleline"), which
# reads this file in later cells, separates the qid from the query at the ":".

if not os.path.isfile("data/train_topics_100.txt"):

    # Read the train topics and qrels
    train_topics = dataset.get_topics('train')
    train_qrels = dataset.get_qrels('train')

    # Get 100 topics that are in qrels
    train_qids = list(train_qrels['qid'].unique())
    train_qids_small = train_qids[:100]

    # Keep only those 100 topics
    train_topics_small = train_topics[train_topics['qid'].isin(train_qids_small)]

    # Create the output directory in Python instead of shelling out to mkdir:
    # portable, and a no-op when the directory is already there.
    os.makedirs("data", exist_ok=True)

    with open("data/train_topics_100.txt", "w") as output:
        # Iterate the two columns directly rather than via iterrows(), which
        # builds a Series per row. The f-string also formats a qid that is
        # not a str, where `qid + ":"` would raise TypeError.
        for qid, query in zip(train_topics_small['qid'], train_topics_small['query']):
            output.write(f"{qid}:{query}\n")

### Generate Index

In [None]:
# Iterator for msmarco passage

def msmarco_generate():
    """Yield one document dict per line of the MS MARCO passage corpus.

    Reads the first file returned by ``dataset.get_corpus()``. Each line is
    expected to hold a document number and a passage separated by a tab.

    Yields:
        dict: ``{'docno': <document number>, 'text': <passage>}`` with the
        line terminator removed from the passage.

    Raises:
        ValueError: if a line contains no tab separator.
    """
    with pt.io.autoopen(dataset.get_corpus()[0], 'rt') as corpusfile:
        for line in corpusfile:
            # Strip the line terminator so it does not end up inside 'text',
            # and split on the first tab only so a tab inside the passage
            # cannot break the two-way unpacking.
            docno, passage = line.rstrip("\r\n").split("\t", 1)
            yield {'docno': docno, 'text': passage}

In [None]:
# Terrier Indexing: Time taken 2157.8492665290833 seconds

# Start from an empty index directory. This is done in Python rather than with
# `!rm -rf` / `!mkdir`: plain `mkdir` fails when the parent "indexes" directory
# does not exist yet, whereas os.makedirs creates the parents as needed.
# ignore_errors=True keeps the "missing directory is fine" behaviour of `rm -rf`.
shutil.rmtree("indexes/msmarco-passage", ignore_errors=True)
os.makedirs("indexes/msmarco-passage", exist_ok=True)

start_time = time.time()

# Build the index from the {'docno', 'text'} dicts yielded by msmarco_generate()
indexer = pt.IterDictIndexer("./indexes/msmarco-passage")
indexref = indexer.index(msmarco_generate())

print(f'Time taken : {time.time() - start_time}')

# Open the freshly built index and print its collection statistics
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

### Generate Index using doc-T5-query
Reference: https://github.com/terrierteam/pyterrier_doc2query

In [None]:
# Download t5-base.zip from https://github.com/castorini/docTTTTTquery

%%capture
!wget https://www.dropbox.com/s/q1nye6wfsvf5sen/t5-base.zip
!unzip t5-base.zip

In [None]:
# Initialize doc2query
# out_attr="text" names the same 'text' field that msmarco_generate() yields
# and that is indexed afterwards; presumably the generated queries end up in
# that field -- TODO confirm against the pyterrier_doc2query documentation.

doc2query = Doc2Query(out_attr="text", batch_size=8)

In [None]:
# Terrier Indexing with Doc-T5-Query (time taken was not recorded)

# Start from an empty index directory. This is done in Python rather than with
# `!rm -rf` / `!mkdir`: plain `mkdir` fails when the parent "indexes" directory
# does not exist yet, whereas os.makedirs creates the parents as needed.
# ignore_errors=True makes a missing directory a no-op, so no isdir() guard is needed.
shutil.rmtree("indexes/msmarco-passage-docTTTTTquery", ignore_errors=True)
os.makedirs("indexes/msmarco-passage-docTTTTTquery", exist_ok=True)

start_time = time.time()

# Documents pass through doc2query first and are then handed to the indexer
indexer_doc2query = doc2query >> pt.IterDictIndexer("./indexes/msmarco-passage-docTTTTTquery")
indexref_doc2query = indexer_doc2query.index(msmarco_generate())

print(f'Time taken : {time.time() - start_time}')

# Open the freshly built index and print its collection statistics
index_doc2query = pt.IndexFactory.of(indexref_doc2query)
print(index_doc2query.getCollectionStatistics().toString())

### LambdaMART with XGBoost and Random Forest Regressor

#### 1a. Load Generated Index

In [None]:
# Terrier Load Index
# Opens the index previously written to ./indexes/msmarco-passage and prints
# its collection statistics as a quick sanity check.

index = pt.IndexFactory.of("./indexes/msmarco-passage")
collection_stats = index.getCollectionStatistics()
print(collection_stats.toString())

#### 1b. Load PyTerrier Pre-trained Indexes

In [5]:
# Terrier Load Index with stemming
# Alternative to step 1a: fetches the 'terrier_stemmed' index through the
# dataset object. Note that this rebinds `index`, replacing whatever step 1a
# assigned to it.

index = dataset.get_index('terrier_stemmed')

#### 2. Initialise BM25 Batch Feature Retrieval

In [6]:
# Batch Feature Retrieval for BM25
# BM25 is the retrieval model; Tf and PL2 scores are requested as extra
# features, and the `% 100` rank cutoff is applied to the retriever's output.
# (Plain retrieval without features would be pt.BatchRetrieve(index, wmodel="BM25").)

BM25_withFeatures = (
    pt.FeaturesBatchRetrieve(
        index,
        wmodel="BM25",
        features=["WMODEL:Tf", "WMODEL:PL2"],
    )
    % 100
)

#### 3a. Configure XGBoost as LambdaMART

In [None]:
# Not executed due to system limitations

# XGBoost ranker trained with the rank:ndcg objective
lambdamart_x = xgb.sklearn.XGBRanker(
    objective='rank:ndcg',
    learning_rate=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=6,
    verbose=2,
    random_state=42,
)

# Two-stage pipeline: feature retrieval first, then the learned model on its output
lambdamart_x_pipe = (
    BM25_withFeatures
    >> pt.ltr.apply_learned_model(lambdamart_x, form="ltr")
)

start_time = time.time()

# Fit on the 100-topic training subset with the train qrels; the dev.small
# topics and qrels are passed as the third and fourth arguments.
lambdamart_x_pipe.fit(
    pt.io.read_topics("data/train_topics_100.txt", format="singleline"),
    dataset.get_qrels('train'),
    dataset.get_topics('dev.small'),
    dataset.get_qrels('dev.small'),
)

print(f'Time taken : {time.time() - start_time}')

#### 3b. Random Forest Regressor

In [7]:
# Random forest regressor used as the learned re-ranking model.
# random_state is fixed (same value as the XGBRanker cell) so that re-running
# the notebook trains the same forest; without it every run gives a different
# model and different evaluation numbers.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Two-stage pipeline: feature retrieval first, then the forest on its output
rf_pipe = BM25_withFeatures >> pt.ltr.apply_learned_model(rf)

start_time = time.time()

# Fit on the 100-topic training subset against the training qrels
rf_pipe.fit(pt.io.read_topics("data/train_topics_100.txt", format="singleline"), 
            dataset.get_qrels('train'))

print(f'Time taken : {time.time() - start_time}') #  676.38 seconds

Time taken : 676.3848423957825


In [9]:
# Save the Random Forest Regressor pipeline

# open(..., "wb") does not create missing directories, so make sure the
# checkpoint directory exists before writing into it.
os.makedirs("checkpoints", exist_ok=True)

with open("checkpoints/rf_100.pickle", "wb") as output_file:
    pickle.dump(rf_pipe, output_file)

#### 4. Run Experiments

In [19]:
# Experiments
# Compares the BM25 feature retriever with the Random Forest pipeline on the
# "test-2019" topics and qrels.

start_time = time.time()

results = pt.Experiment(
    [BM25_withFeatures, rf_pipe],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["recip_rank", "ndcg_cut_10", "map", "mrt"],
    names=["BM25", "RFRegressor"],
)

print(f'Time taken : {time.time() - start_time}') # roughly 2500 seconds in the recorded runs

Time taken : 2521.6439867019653


In [20]:
# Display the evaluation table produced by pt.Experiment above
results

Unnamed: 0,name,recip_rank,ndcg_cut_10,map,mrt
0,BM25,0.794961,0.47954,0.290681,5958.579604
1,RFRegressor,0.558018,0.301347,0.209281,6648.992057


### LambdaMART with XGBoost and Random Forest Regressor using Doc2Query

#### 1a. Load Generated Index

In [None]:
# Terrier Load Index with Doc-T5-Query
# Opens the index previously written to ./indexes/msmarco-passage-docTTTTTquery
# and prints its collection statistics as a quick sanity check.

index_doc2query = pt.IndexFactory.of("./indexes/msmarco-passage-docTTTTTquery")
collection_stats_doc2query = index_doc2query.getCollectionStatistics()
print(collection_stats_doc2query.toString())

#### 1b. Load PyTerrier Pre-trained Indexes

In [13]:
# Terrier Load Index with stemming and doc2query
# Alternative to step 1a: fetches the 'terrier_stemmed_docT5query' index
# through the dataset object. Note that this rebinds `index_doc2query`,
# replacing whatever step 1a assigned to it.

index_doc2query = dataset.get_index('terrier_stemmed_docT5query')

#### 2. Initialise BM25 Batch Feature Retrieval

In [14]:
# Batch Feature Retrieval for BM25
# Same configuration as the non-doc2query retriever (BM25 with Tf and PL2
# features, `% 100` rank cutoff), but running over the doc2query index.

BM25_withFeatures_doc2query = (
    pt.FeaturesBatchRetrieve(
        index_doc2query,
        wmodel="BM25",
        features=["WMODEL:Tf", "WMODEL:PL2"],
    )
    % 100
)

#### 3a. Configure XGBoost as LambdaMART

In [None]:
# Not executed due to system limitations

# XGBoost ranker trained with the rank:ndcg objective (same hyper-parameters
# as the non-doc2query LambdaMART cell)
lmart_x_doc2query = xgb.sklearn.XGBRanker(objective='rank:ndcg',
                                          learning_rate=0.1,
                                          gamma=1.0,
                                          min_child_weight=0.1,
                                          max_depth=6,
                                          verbose=2,
                                          random_state=42)

# Two-stage pipeline: feature retrieval over the doc2query index, then the ranker
lmart_x_doc2query_pipe = BM25_withFeatures_doc2query >> pt.ltr.apply_learned_model(lmart_x_doc2query, form="ltr")

start_time = time.time()

# Train on the same 100-topic subset as the other learned models in this
# notebook and validate on dev.small, as the non-doc2query LambdaMART cell does.
# This keeps the doc2query and non-doc2query models comparable and avoids
# fitting on the complete 'train' / 'dev' topic sets.
lmart_x_doc2query_pipe.fit(pt.io.read_topics("data/train_topics_100.txt", format="singleline"), 
                           dataset.get_qrels('train'), 
                           dataset.get_topics('dev.small'), 
                           dataset.get_qrels('dev.small'))

print(f'Time taken : {time.time() - start_time}')

#### 3b. Random Forest Regressor

In [15]:
# Random forest regressor used as the learned re-ranking model over the
# doc2query index.
# random_state is fixed (same value as the XGBRanker cells) so that re-running
# the notebook trains the same forest; without it every run gives a different
# model and different evaluation numbers.
rf_doc2query = RandomForestRegressor(n_estimators=100, random_state=42)

# Two-stage pipeline: feature retrieval first, then the forest on its output
rf_doc2query_pipe = BM25_withFeatures_doc2query >> pt.ltr.apply_learned_model(rf_doc2query)

start_time = time.time()

# Fit on the 100-topic training subset against the training qrels
rf_doc2query_pipe.fit(pt.io.read_topics("data/train_topics_100.txt", format="singleline"), 
                      dataset.get_qrels('train'))

print(f'Time taken : {time.time() - start_time}') # 1645 seconds

Time taken : 1645.001565694809


In [18]:
# Save the Random Forest Regressor pipeline

# open(..., "wb") does not create missing directories, so make sure the
# checkpoint directory exists before writing into it.
os.makedirs("checkpoints", exist_ok=True)

with open("checkpoints/rf_100_doc2query.pickle", "wb") as output_file:
    # Dump the doc2query pipeline (not rf_pipe) so that the contents of the
    # file match its name.
    pickle.dump(rf_doc2query_pipe, output_file)

#### 4. Run Experiments

In [16]:
# Experiments
# Compares the BM25 feature retriever with the Random Forest pipeline, both
# over the doc2query index, on the "test-2019" topics and qrels.

start_time = time.time()

results_doc2query = pt.Experiment(
    [BM25_withFeatures_doc2query, rf_doc2query_pipe],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["recip_rank", "ndcg_cut_10", "map"],
    names=["BM25", "RandomForestRegressor"],
)

print(f'Time taken : {time.time() - start_time}') # 6039.18 seconds

Time taken : 6039.181530237198


In [17]:
# Display the evaluation table produced by pt.Experiment above
results_doc2query

Unnamed: 0,name,recip_rank,ndcg_cut_10,map
0,BM25,0.9,0.630835,0.35929
1,RandomForestRegressor,0.585817,0.323078,0.229042


### Random Forest Regressor using Doc2Query and Query Expansion

In [22]:
# Experiments with different query expansion techniques + doc2query
# Each expansion run feeds the Random Forest pipeline's output into a query
# rewriter and then runs the Random Forest pipeline again on the rewritten query.
# NOTE(review): in the recorded output the RFRegressor+AQ row is identical to
# the RFRegressor row -- confirm that AxiomaticQE is actually changing the query.

AQ = pt.rewrite.AxiomaticQE(index_doc2query)
RM3 = pt.rewrite.RM3(index_doc2query)

start_time = time.time()

results_doc2query_qe = pt.Experiment(
    [
        rf_doc2query_pipe,
        rf_doc2query_pipe >> AQ >> rf_doc2query_pipe,
        rf_doc2query_pipe >> RM3 >> rf_doc2query_pipe,
    ],
    dataset.get_topics("test-2019"),
    dataset.get_qrels("test-2019"),
    eval_metrics=["recip_rank", "ndcg_cut_10", "map"],
    names=["RFRegressor", "RFRegressor+AQ", "RFRegressor+RM3"],
)

print(f'Time taken : {time.time() - start_time}')

Time taken : 18589.914915323257


In [23]:
# Display the evaluation table produced by pt.Experiment above
results_doc2query_qe

Unnamed: 0,name,recip_rank,ndcg_cut_10,map
0,RFRegressor,0.585817,0.323078,0.229042
1,RFRegressor+AQ,0.585817,0.323078,0.229042
2,RFRegressor+RM3,0.527628,0.310928,0.222889
