In [1]:
# Import all the needed libraries
import pyterrier as pt
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from sklearn.ensemble import RandomForestRegressor
if not pt.started():
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



In [3]:
# Load the dataset
dataset = pt.datasets.get_dataset("trec-deep-learning-passages")

index_ref = pt.IndexRef.of(os.getcwd() + "./passage_index/data.properties")

index = pt.IndexFactory.of(index_ref)

pipeline = pt.FeaturesBatchRetrieve(index, wmodel="BM25", features=["WMODEL:Tf", "WMODEL:PL2", "WMODEL:GloVe"])

train_topics = dataset.get_topics("train")
train_qrels = dataset.get_qrels("train")

09:01:15.712 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1,9 GiB of memory would be required.
09:01:15.727 [main] WARN org.terrier.applications.batchquerying.TRECQuery - trec.encoding is not set; resorting to platform default (windows-1252). Retrieval may be platform dependent. Recommend trec.encoding=UTF-8


In [4]:
train_topics = train_topics.sort_values(by='qid', ascending=True)[3:]
train_qrels = train_qrels.sort_values(by='qid', ascending=True)[3:]

In [5]:
# A simple join operation on the qids, as we have much more queries than qrels, we want to only use those
# queries with a qrel for optimal training performances.
temp = pd.merge(train_topics, train_qrels, left_on='qid', right_on='qid')

# Make the new train_topics dataframe with only those queries that have a qrel
train_topics_100 = pd.concat([temp['qid'], temp['query']], axis=1, keys=['qid', 'query'])

In [6]:
train_qrel_100 = pd.concat([temp['qid'], temp['docno'], temp['label']], axis=1, keys=['qid', 'docno', 'label'])

In [7]:
final_df_train = pd.read_csv('final_df.csv')
final_df_train

Unnamed: 0,qid,pid,cdist
0,1000005,7871705,0.888864
1,1000005,8564649,0.906684
2,1000005,8564641,0.936021
3,1000005,1170305,0.866449
4,1000005,635918,0.853606
...,...,...,...
90995,1000162,252483,0.829435
90996,1000162,1005526,0.837253
90997,1000162,2229864,0.846573
90998,1000162,2313262,0.752206


In [8]:
final_df_test = pd.read_csv('final_df_test.csv')
final_df_test

Unnamed: 0,qid,pid,cdist
0,156493,8182161,0.770682
1,156493,6139386,0.753413
2,156493,3288600,0.785007
3,156493,3288596,0.762550
4,156493,2259183,0.740538
...,...,...,...
41995,146187,4239826,0.809611
41996,146187,4874233,0.784742
41997,146187,6065118,0.879775
41998,146187,7392175,0.709208


In [9]:
complete = pd.concat([final_df_train, final_df_test])
complete

Unnamed: 0,qid,pid,cdist
0,1000005,7871705,0.888864
1,1000005,8564649,0.906684
2,1000005,8564641,0.936021
3,1000005,1170305,0.866449
4,1000005,635918,0.853606
...,...,...,...
41995,146187,4239826,0.809611
41996,146187,4874233,0.784742
41997,146187,6065118,0.879775
41998,146187,7392175,0.709208


In [59]:
def features(row):
    val = np.array(complete.loc[(complete['qid'] == int(row['qid'])) & (complete['pid'] == int(row['docid']))]['cdist'].tolist())
    if len(val) == 0:
        return np.array([0])
    return val

In [60]:
# This method is used to give a new score to a document.
# In particular, it this notebook it is used to sum the bm25 score with the distilBERT score,
# which is multiplied by a certain factor to make it more compatible with the range of bm25 scores

def scores(row):
    val = np.array(complete.loc[(complete['qid'] == int(row['qid'])) & (complete['pid'] == int(row['docid']))]['cdist'].tolist())
    if len(val) == 0:
        return np.array([0])

    new_score = row['score'] + float(val)*10

    return new_score

In [92]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
tf = pt.BatchRetrieve(index, wmodel="Tf")
pl2 = pt.BatchRetrieve(index, wmodel="PL2")
pipeline = bm25 >> (pt.apply.doc_features(features) ** tf ** pl2)
L2R = bm25 >> (tf ** pl2)
summation = bm25 >> pt.apply.doc_score(scores)

df = complete.copy()
df['qid'] = df['qid'].astype(str)
df['doc_id'] = df['pid'].astype(str)
df['score'] = df['cdist'].astype(str)

glove = pt.Transformer.from_df(df)

In [86]:
start = time.time()

rf_complete = RandomForestRegressor(n_estimators=300)
rf_pipe_complete = pipeline >> pt.ltr.apply_learned_model(rf_complete)
rf_pipe_complete.fit(train_topics_100.head(100), train_qrel_100.head(100))

end = time.time()

print(end - start)

11343.447141647339


In [87]:
import pickle

In [88]:
# save the model to disk
filename = 'rf_pipe_complete_300trees_100queries_noflag.sav'
pickle.dump(rf_pipe_complete, open(filename, 'wb'))

In [None]:
# For when you run out of RAM: https://stackoverflow.com/questions/36137671/pycharm-running-out-of-memory

In [93]:
# Just L2R

start = time.time()

rf_L2R = RandomForestRegressor(n_estimators=300)
pipe_rf_L2R = L2R >> pt.ltr.apply_learned_model(rf_L2R)
pipe_rf_L2R.fit(train_topics_100.head(100), train_qrel_100.head(100))

end = time.time()
print(end - start)

190.03120493888855


In [81]:
filename = 'rf_L2R_300trees_100queries.sav'
pickle.dump(pipe_rf_L2R, open(filename, 'wb'))

In [77]:
start = time.time()

In [None]:
pt.Experiment([pipe_rf_L2R], dataset.get_topics("test-2019"),
              dataset.get_qrels("test-2019"), eval_metrics=["ndcg", "map", "recip_rank"], names=["LTR"])

In [None]:
end = time.time()
print(end - start)

In [None]:
# load the model from disk
# rf_pipe_complete = pickle.load(open('finalized_model_100que_300for.sav', 'rb'))

In [89]:
start = time.time()

In [90]:
pt.Experiment([bm25, pipe_rf_L2R, rf_pipe_complete, summation, glove], dataset.get_topics("test-2019"), dataset.get_qrels("test-2019"), filter_by_qrels = True, eval_metrics=["map", "ndcg", "recip_rank"], names = ["bm25", "L2R", "L2R+", "sum", "GloVe"])

18:31:34.357 [main] WARN org.terrier.applications.batchquerying.TRECQuery - trec.encoding is not set; resorting to platform default (windows-1252). Retrieval may be platform dependent. Recommend trec.encoding=UTF-8


Unnamed: 0,name,map,ndcg,recip_rank
0,bm25,0.370004,0.593433,0.795028
1,L2R,0.119847,0.432649,0.524127
2,L2R+,0.118282,0.430317,0.49222
3,sum,0.08516,0.374265,0.173978
4,GloVe,0.050179,0.326222,0.072101


In [91]:
end = time.time()
print(end - start)

1649.7361085414886


In [None]:
# without flags this time

In [None]:
# 11343.447141647339 secs for hte fitting function for l2r+ with 300 trees, 100 queries and no flags