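In this notebook we run the learning-to-rank (L2R) baseline: a random-forest re-ranker trained on top of BM25 retrieval features and evaluated on the TREC Deep Learning passage-ranking task.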

In [None]:
# Import all the needed libraries
import pyterrier as pt
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import algos
import os
# Initialise PyTerrier only when it has not been started yet, so re-running this cell is harmless.
terrier_already_started = pt.started()
if not terrier_already_started:
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



In [None]:
# Look up the dataset handle by its PyTerrier identifier; topics and qrels are pulled from it below.
DATASET_ID = "trec-deep-learning-passages"
dataset = pt.datasets.get_dataset(DATASET_ID)

In [None]:
# Locate the pre-built Terrier index (its data.properties file) under the current working directory.
# os.path.join supplies the path separator. Plain concatenation of os.getcwd() with
# "./passage_index/..." would yield "<cwd>./passage_index/...", i.e. a directory name with a stray
# trailing dot, which only resolves on platforms that silently strip trailing dots.
index_ref = pt.IndexRef.of(os.path.join(os.getcwd(), "passage_index", "data.properties"))

In [None]:
# Open the index that index_ref points to.
# NOTE(review): Terrier logs a warning on this call that the meta structure is read directly from
# disk (slow) and suggests index.meta.data-source=fileinmem in the index properties file
# (about 1.9 GiB of memory) -- worth considering given the long evaluation time below.
index = pt.IndexFactory.of(index_ref)

14:21:53.612 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1,9 GiB of memory would be required.


In [None]:
# Feature-producing retrieval stage: BM25 is the weighting model, and the Tf and PL2 weighting
# models are requested as additional features for the learned model fitted further down.
ltr_features = ["WMODEL:Tf", "WMODEL:PL2"]
pipeline = pt.FeaturesBatchRetrieve(
    index,
    wmodel="BM25",
    features=ltr_features,
)

In [None]:
# Fetch the training queries (topics) and their relevance judgements (qrels) from the dataset.
train_split = "train"
train_topics = dataset.get_topics(train_split)
train_qrels = dataset.get_qrels(train_split)

14:21:53.703 [main] WARN org.terrier.applications.batchquerying.TRECQuery - trec.encoding is not set; resorting to platform default (windows-1252). Retrieval may be platform dependent. Recommend trec.encoding=UTF-8


In [None]:
# Order both frames by qid and discard the first three rows of each (positional slice).
# NOTE(review): the reason for skipping exactly three rows is not documented -- confirm it is intended,
# especially since the same count is dropped from topics and qrels independently.
# NOTE(review): this cell reassigns its own inputs, so running it again drops three more rows.
rows_to_skip = 3
train_topics = train_topics.sort_values(by='qid', ascending=True).iloc[rows_to_skip:]
train_qrels = train_qrels.sort_values(by='qid', ascending=True).iloc[rows_to_skip:]

In [None]:
# Inner-join topics and qrels on qid. There are many more queries than qrels, and only queries
# that have at least one qrel are useful for training, so unmatched rows on either side are dropped.
temp = train_topics.merge(train_qrels, on='qid', how='inner')

In [None]:
# Build the training topics frame from the joined rows, so every query in it has a qrel.
# NOTE(review): despite the "_100" suffix this frame holds every joined row; the 100-row cut
# is applied separately via .head(100) where the frame is used.
topic_columns = ['qid', 'query']
train_topics_100 = temp.loc[:, topic_columns]
train_topics_100.head(100)

Unnamed: 0,qid,query
0,1000005,where is westminster california
1,1000007,where is westminster ma
2,1000008,where is westminster md
3,1000009,where is westmont illinois located
4,1000010,where is westmoreland
...,...,...
95,100015,cortana what is the average blood pressure
96,1000150,where is willard north carolina
97,1000153,where is william brewster buried
98,1000154,where is william key buried


In [None]:
# Build the matching qrels frame from the same joined rows, so it stays row-aligned with the topics frame.
qrel_columns = ['qid', 'docno', 'label']
train_qrel_100 = temp.loc[:, qrel_columns]
train_qrel_100.head(100)

Unnamed: 0,qid,docno,label
0,1000005,3617173,1
1,1000007,5939245,1
2,1000008,4630854,1
3,1000009,1728286,1
4,1000010,4186186,1
...,...,...,...
95,100015,3035531,1
96,1000150,3598087,1
97,1000153,3606427,1
98,1000154,4609289,1


In [None]:
# Start a wall-clock timer; the elapsed time is printed after the model has been fitted.
import time
start_time = time.time()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Number of joined (query, qrel) rows used for training; kept small to bound training time.
N_TRAIN_ROWS = 100
# Fixed seed: random forests are stochastic, so without it the metrics change from run to run.
RF_SEED = 42

# Learned re-ranker: a 400-tree random forest applied to the features produced by `pipeline`.
rf = RandomForestRegressor(n_estimators=400, random_state=RF_SEED)
rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)

# Fit on the first N_TRAIN_ROWS rows of the row-aligned topics / qrels frames.
# NOTE(review): the topics frame comes from a topics-qrels join, so a query with several qrels
# appears more than once -- confirm whether duplicate qids should be dropped before fitting.
rf_pipe.fit(train_topics_100.head(N_TRAIN_ROWS), train_qrel_100.head(N_TRAIN_ROWS))

In [None]:
# Report the wall-clock seconds elapsed since the timer was started before training.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 1161.2626819610596 seconds ---


In [None]:
# Restart the wall-clock timer so the next measurement covers only the evaluation run.
start_time = time.time()

In [None]:
# Evaluate the fitted pipeline on the 2019 test topics and report nDCG, MAP and reciprocal rank.
test_split = "test-2019"
test_topics = dataset.get_topics(test_split)
test_qrels = dataset.get_qrels(test_split)
pt.Experiment(
    [rf_pipe],
    test_topics,
    test_qrels,
    eval_metrics=["ndcg", "map", "recip_rank"],
    names=["LTR"],
)

14:41:29.826 [main] WARN org.terrier.applications.batchquerying.TRECQuery - trec.encoding is not set; resorting to platform default (windows-1252). Retrieval may be platform dependent. Recommend trec.encoding=UTF-8


Unnamed: 0,name,ndcg,map,recip_rank
0,LTR,0.37094,0.084299,0.118964


In [None]:
# Report the wall-clock seconds elapsed since the timer was restarted before the evaluation.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 5477.988292694092 seconds ---
