In [1]:
import sys
import os
import shutil
from collections import namedtuple
# Make the OpenNIR checkout importable; this assumes the notebook is run from
# OpenNIR/examples/. Guarded so that re-running the cell does not keep
# appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')
# Don't process command line arguments (they come from jupyter).
os.environ['ONIR_IGNORE_ARGV'] = 'true'

In [2]:
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet, so this cell can be
# re-run in the same kernel. tqdm='notebook' is passed through to pt.init
# (the progress bars in later cells render as notebook widgets).
if not pt.started():
    pt.init(tqdm='notebook')
# NOTE(review): imported after pt.init() -- presumably onir.pt needs PyTerrier
# to be started first; confirm before moving this import to the top.
import onir.pt

PyTerrier 0.3.1 has loaded Terrier 5.4 (built by craigm on 2021-01-16 14:17)
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# Sample EPIC re-ranker (trained on msmarco), restored from its checkpoint archive.
rr_checkpoint = 'epic.msmarco.tar.gz'
# Settings handed to from_checkpoint together with the archive.
rr_config = {'learning_rate': 1e-5}
rr = onir.pt.reranker.from_checkpoint(rr_checkpoint, rr_config)

configuraiton file not found: config


In [4]:
# Vaswani test collection, loaded through PyTerrier's 'irds:' dataset prefix.
dataset = pt.datasets.get_dataset('irds:vaswani')
index_path = './index_vaswani'
# The index is re-opened from its data.properties file, so test for that file
# rather than for the directory: a directory left behind by an interrupted
# indexing run has no data.properties and must be (re)built instead of loaded.
index_properties = index_path + '/data.properties'
if not os.path.exists(index_properties):
    # First run: build the index from the corpus iterator, keeping 'docno' and
    # 'text' as metadata so the text can be fetched back for re-ranking.
    indexer = pt.index.IterDictIndexer(index_path)
    index_ref = indexer.index(dataset.get_corpus_iter(), meta=('docno', 'text'))
else:
    # Later runs: reuse the index already on disk.
    index_ref = pt.IndexRef.of(index_properties)
index = pt.IndexFactory.of(index_ref)

In [5]:
# First-stage retrieval: BM25 over the index; `% 100` is applied to the
# retriever (PyTerrier documents `%` as its rank-cutoff operator).
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
base_pipeline = bm25 % 100
# Run the baseline over every topic of the dataset.
topics = dataset.get_topics()
res = base_pipeline.transform(topics)

In [6]:
# MAP of the BM25 baseline run against the dataset's relevance judgments.
qrels = dataset.get_qrels()
pt.Utils.evaluate(res, qrels, metrics=['map'])

{'map': 0.2725231249761632}

In [7]:
# Re-ranking pipeline: baseline results -> fetch the "text" field from the
# index -> EPIC re-ranker.
text_loader = pt.text.get_text(index, "text")
rr_pipeline = base_pipeline >> text_loader >> rr
# Re-rank every topic of the dataset with the (not yet fine-tuned) model.
topics = dataset.get_topics()
epic_res = rr_pipeline.transform(topics)

[2021-02-21 20:37:59,216][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:37:59,310][onir.pt][DEBUG] [starting] batches


HBox(children=(IntProgress(value=0, description='batches', layout=Layout(flex='2'), max=2325, style=ProgressSt…

[2021-02-21 20:39:15,133][onir.pt][DEBUG] [finished] batches: [01:16] [2325it] [30.66it/s]


In [8]:
# MAP of the EPIC re-ranked run, for comparison with the BM25 baseline.
qrels = dataset.get_qrels()
pt.Utils.evaluate(epic_res, qrels, metrics=['map'])

{'map': 0.22642311752488706}

In [9]:
# (over)fit 1 training iteration on the vaswani dataset
# Only training topics and qrels are supplied here (no validation arguments).
train_topics = dataset.get_topics()
train_qrels = dataset.get_qrels()
rr_pipeline.fit(train_topics, train_qrels)

[2021-02-21 20:39:19,927][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:39:19,931][onir.pt][DEBUG] [starting] training
[2021-02-21 20:39:19,932][onir.pt][DEBUG] [starting] train pairs


HBox(children=(IntProgress(value=0, description='train pairs', layout=Layout(flex='2'), max=1024, style=Progre…

[2021-02-21 20:40:24,105][onir.pt][DEBUG] [finished] train pairs: [01:04] [1024it] [15.96it/s]
[2021-02-21 20:40:24,110][onir.pt][DEBUG] [finished] training [01:04]
[2021-02-21 20:40:24,113][onir.pt][INFO] training   it=0 loss=0.1170


In [None]:
# (over)fit many training iterations on the vaswani dataset
# The same topics/qrels are passed a second time as the third and fourth
# arguments; with them supplied, the log shows validation passes between
# training iterations.
train_topics = dataset.get_topics()
train_qrels = dataset.get_qrels()
valid_topics = dataset.get_topics()
valid_qrels = dataset.get_qrels()
rr_pipeline.fit(train_topics, train_qrels, valid_topics, valid_qrels)

[2021-02-21 20:40:33,574][onir.pt][DEBUG] [starting] validation
[2021-02-21 20:40:33,576][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:40:33,581][onir.pt][DEBUG] [starting] batches


HBox(children=(IntProgress(value=0, description='batches', layout=Layout(flex='2'), max=2325, style=ProgressSt…

[2021-02-21 20:41:49,903][onir.pt][DEBUG] [finished] batches: [01:16] [2325it] [30.46it/s]
[2021-02-21 20:41:49,923][onir.pt][DEBUG] [finished] validation [01:16]
[2021-02-21 20:41:49,925][onir.pt][INFO] pre-validation: 0.3259
[2021-02-21 20:41:50,324][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:41:50,329][onir.pt][DEBUG] [starting] training
[2021-02-21 20:41:50,331][onir.pt][DEBUG] [starting] train pairs


HBox(children=(IntProgress(value=0, description='train pairs', layout=Layout(flex='2'), max=1024, style=Progre…

[2021-02-21 20:42:54,742][onir.pt][DEBUG] [finished] train pairs: [01:04] [1024it] [15.90it/s]
[2021-02-21 20:42:54,746][onir.pt][DEBUG] [finished] training [01:04]
[2021-02-21 20:42:54,748][onir.pt][INFO] training   it=0 loss=0.0651
[2021-02-21 20:42:54,750][onir.pt][DEBUG] [starting] validation
[2021-02-21 20:42:54,751][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:42:54,758][onir.pt][DEBUG] [starting] batches


HBox(children=(IntProgress(value=0, description='batches', layout=Layout(flex='2'), max=2325, style=ProgressSt…

[2021-02-21 20:44:11,061][onir.pt][DEBUG] [finished] batches: [01:16] [2325it] [30.47it/s]
[2021-02-21 20:44:11,084][onir.pt][DEBUG] [finished] validation [01:16]
[2021-02-21 20:44:11,867][onir.pt][INFO] validation it=0 map=0.3557 ndcg=0.5712 P_10=0.4688 <--
[2021-02-21 20:44:11,870][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:44:11,878][onir.pt][DEBUG] [starting] training
[2021-02-21 20:44:11,880][onir.pt][DEBUG] [starting] train pairs


HBox(children=(IntProgress(value=0, description='train pairs', layout=Layout(flex='2'), max=1024, style=Progre…

In [None]:
# Train on a pair iterator (also works with dataframe of same columns)
def tr_pairs():
    """Stream training pairs from the 'msmarco-passage/train' ir_datasets split.

    Yields:
        onir.pt.TrainPair built positionally from: query id, query text,
        doc_id_a, text of document a, doc_id_b, text of document b.
        NOTE(review): presumably document a is the relevant one and b the
        non-relevant one -- confirm against TrainPair's field order.
    """
    import ir_datasets  # imported lazily; only this cell needs it
    msmarco_train = ir_datasets.load('msmarco-passage/train')
    # All query texts are read up front so each pair can be resolved by id.
    query_text = {query.query_id: query.text for query in msmarco_train.queries_iter()}
    passages = msmarco_train.docs_store()
    for pair in msmarco_train.docpairs_iter():
        text_q = query_text[pair.query_id]
        text_a = passages.get(pair.doc_id_a).text
        text_b = passages.get(pair.doc_id_b).text
        yield onir.pt.TrainPair(
            pair.query_id, text_q,
            pair.doc_id_a, text_a,
            pair.doc_id_b, text_b)

# Fit the re-ranker directly (not the pipeline) on the streamed pairs.
rr.fit(tr_pairs=tr_pairs())