In [3]:
import sys
import os
import shutil
from collections import namedtuple
# Make the parent directory importable; this assumes the notebook is run from OpenNIR/examples/.
sys.path.append('..')
# OpenNIR settings passed through the environment:
#   ONIR_IGNORE_ARGV - don't process command line arguments (they come from jupyter)
#   ONIR_PBAR_COLS   - empty value means no ncols for tqdm
os.environ.update({
    'ONIR_IGNORE_ARGV': 'true',
    'ONIR_PBAR_COLS': '',
})

In [4]:
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only when it has not been started yet, so that
# re-executing this cell does not call pt.init() a second time.
if not pt.started():
    pt.init(tqdm='notebook')  # presumably selects notebook-widget progress bars; confirm against the pt.init docs
import onir.pt

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [5]:
# Load the sample EPIC re-ranker (trained on msmarco) from its checkpoint archive,
# passing a learning_rate of 1e-5 as configuration.
rr = onir.pt.reranker.from_checkpoint(
    'epic.msmarco.tar.gz',
    {'learning_rate': 1e-5},
)

configuraiton file not found: config


In [6]:
# Build the Terrier index for the Vaswani corpus on the first run and reuse it afterwards.
dataset = pt.datasets.get_dataset('irds:vaswani')
index_path = './index_vaswani'
# Key the cache check on data.properties rather than on the directory alone:
# a directory left behind by an interrupted indexing run would otherwise send
# us down the "load" branch, which then fails because the file is missing.
index_properties = os.path.join(index_path, 'data.properties')
if not os.path.exists(index_properties):
    indexer = pt.index.IterDictIndexer(index_path)
    # Store both the docno and the raw text as metadata; the text is fetched
    # again later when documents are passed to the re-ranker.
    index_ref = indexer.index(dataset.get_corpus_iter(), meta=('docno', 'text'))
else:
    index_ref = pt.IndexRef.of(index_properties)
index = pt.IndexFactory.of(index_ref)

In [7]:
# First-stage retrieval: BM25 over the Vaswani index. `% 100` is PyTerrier's
# rank-cutoff operator (keeps the top 100 results per query, per the PyTerrier
# operator documentation).
base_pipeline = pt.BatchRetrieve(index, wmodel="BM25") % 100
# Run the baseline over all Vaswani topics; `res` is evaluated in the next cell.
res = base_pipeline.transform(dataset.get_topics())

In [8]:
# MAP of the BM25 baseline run, measured against the Vaswani qrels.
pt.Utils.evaluate(
    res,
    dataset.get_qrels(),
    metrics=['map'],
)

{'map': 0.2725231249761632}

In [9]:
# Re-ranking pipeline, composed with PyTerrier's `>>` operator:
#   BM25 top-100 candidates -> pt.text.get_text (presumably attaches the "text"
#   metadata stored at indexing time; confirm against the pt.text docs) -> EPIC re-ranker.
rr_pipeline = base_pipeline >> pt.text.get_text(index, "text") >> rr
# Re-rank the BM25 candidates for all Vaswani topics.
epic_res = rr_pipeline.transform(dataset.get_topics())

[2021-02-21 20:53:37,937][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:53:38,029][onir.pt][DEBUG] [starting] batches


HBox(children=(IntProgress(value=0, description='batches', max=2325, style=ProgressStyle(description_width='in…

[2021-02-21 20:54:53,435][onir.pt][DEBUG] [finished] batches: [01:15] [2325it] [30.83it/s]


In [10]:
# MAP of the EPIC re-ranked run, measured against the Vaswani qrels.
pt.Utils.evaluate(
    epic_res,
    dataset.get_qrels(),
    metrics=['map'],
)

{'map': 0.22642311752488706}

In [11]:
# (over)fit 1 training iteration on the vaswani dataset
# The same Vaswani topics/qrels are used for training and for the evaluations
# above, hence "(over)fit". The recorded log shows one iteration (it=0) over
# 1024 train pairs.
rr_pipeline.fit(dataset.get_topics(), dataset.get_qrels())

[2021-02-21 20:54:58,285][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:54:58,289][onir.pt][DEBUG] [starting] training
[2021-02-21 20:54:58,291][onir.pt][DEBUG] [starting] train pairs


HBox(children=(IntProgress(value=0, description='train pairs', max=1024, style=ProgressStyle(description_width…

[2021-02-21 20:56:01,753][onir.pt][DEBUG] [finished] train pairs: [01:03] [1024it] [16.14it/s]
[2021-02-21 20:56:01,757][onir.pt][DEBUG] [finished] training [01:03]
[2021-02-21 20:56:01,760][onir.pt][INFO] training   it=0 loss=0.1159


In [None]:
# (over)fit many training iterations on the vaswani dataset
# The 3rd and 4th arguments are presumably the validation topics/qrels (the
# recorded log shows a validation pass before training starts); here they are
# the same Vaswani data used for training. TODO confirm against the fit() signature.
rr_pipeline.fit(dataset.get_topics(), dataset.get_qrels(), dataset.get_topics(), dataset.get_qrels())

[2021-02-21 20:56:11,299][onir.pt][DEBUG] [starting] validation
[2021-02-21 20:56:11,302][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:56:11,308][onir.pt][DEBUG] [starting] batches


HBox(children=(IntProgress(value=0, description='batches', max=2325, style=ProgressStyle(description_width='in…

[2021-02-21 20:57:27,421][onir.pt][DEBUG] [finished] batches: [01:16] [2325it] [30.55it/s]
[2021-02-21 20:57:27,441][onir.pt][DEBUG] [finished] validation [01:16]
[2021-02-21 20:57:27,443][onir.pt][INFO] pre-validation: 0.3240
[2021-02-21 20:57:27,842][onir.pt][DEBUG] using GPU (deterministic)
[2021-02-21 20:57:27,847][onir.pt][DEBUG] [starting] training
[2021-02-21 20:57:27,848][onir.pt][DEBUG] [starting] train pairs


HBox(children=(IntProgress(value=0, description='train pairs', max=1024, style=ProgressStyle(description_width…

In [None]:
# Train on a pair iterator (also works with dataframe of same columns)
def tr_pairs():
    """Lazily yield one `onir.pt.TrainPair` per doc pair of `msmarco-passage/train`.

    Each record is built positionally as: query id, query text, doc_id_a,
    text of doc_id_a, doc_id_b, text of doc_id_b.
    NOTE(review): presumably doc a/b are the relevant/non-relevant passages;
    confirm against the TrainPair field order.
    """
    import ir_datasets
    msmarco_train = ir_datasets.load('msmarco-passage/train')

    # Hold every training query in memory, keyed by id, so each pair needs
    # only a dictionary lookup for its query text.
    queries_by_id = {}
    for query in msmarco_train.queries_iter():
        queries_by_id[query.query_id] = query

    passages = msmarco_train.docs_store()
    for pair in msmarco_train.docpairs_iter():
        query_text = queries_by_id[pair.query_id].text
        text_a = passages.get(pair.doc_id_a).text
        text_b = passages.get(pair.doc_id_b).text
        yield onir.pt.TrainPair(
            pair.query_id,
            query_text,
            pair.doc_id_a,
            text_a,
            pair.doc_id_b,
            text_b)
# Train the re-ranker itself (rr, not rr_pipeline) from the lazily generated pairs.
rr.fit(tr_pairs=tr_pairs())