In [2]:
# This is only needed in Google Colab, in the dev container, everything should be installed already
!pip3 install tira trectools python-terrier

[0m

In [3]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt

# Start PyTerrier before any pt.* API is used in the cells below.
ensure_pyterrier_is_loaded()

# TIRA dataset identifiers for the training and validation splits.
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
def create_index(documents, index_path="/tmp/index"):
    """Build a Terrier index from ``documents`` and return the opened index.

    Args:
        documents: Iterable of objects exposing ``doc_id`` and ``text`` attributes.
        index_path: Directory the index is written to. Any index already stored
            there is overwritten. Defaults to ``/tmp/index``.

    Returns:
        The index obtained from ``pt.IndexFactory.of`` for the freshly built index.
    """
    # ``meta`` lists the fields kept as document metadata together with the
    # length reserved for each of them (see the PyTerrier indexing docs).
    indexer = pt.IterDictIndexer(index_path, overwrite=True, meta={'docno': 100, 'text': 20480})

    # Stream the documents lazily so the whole corpus never has to sit in memory.
    records = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    index_ref = indexer.index(records)
    return pt.IndexFactory.of(index_ref)

In [5]:
import os
import shutil

def recreateOutputDir(dir):
    """Make sure ``dir`` exists and is empty.

    An existing directory is removed together with its contents and then
    created again. A missing directory (the usual first-run case) is simply
    created, including any missing parent directories.

    Args:
        dir: Path of the directory to (re)create.
    """
    # rmtree raises FileNotFoundError on a missing path, so only remove what exists.
    if os.path.isdir(dir):
        shutil.rmtree(dir)
    # makedirs instead of mkdir: targets such as 'grid-search/training/<system>'
    # have parents that may not exist yet.
    os.makedirs(dir, exist_ok=True)

In [6]:
def run_bm25_grid_search_run(index, output_dir, queries, b=0.75, k_1=1.2):
    """Run one BM25 configuration over ``queries`` and persist the resulting run.

    Terrier's BM25 defaults are k_1 = 1.2, k_3 = 8 and b = 0.75, see
    http://terrier.org/docs/current/javadoc/org/terrier/matching/models/BM25.html
    k_3 is not tuned, as this parameter only impacts queries with redundant terms.

    Args:
        index: Index passed to ``pt.BatchRetrieve``.
        output_dir: Parent directory; the run is stored in the sub-directory
            ``bm25-b=<b>-k_1=<k_1>``, which is emptied before the run is written.
        queries: Queries handed to the retriever.
        b: Value for the ``bm25.b`` control (default 0.75).
        k_1: Value for the ``bm25.k_1`` control (default 1.2).
    """
    system = f'bm25-b={b}-k_1={k_1}'
    configuration = {"bm25.b": b, "bm25.k_1": k_1}
    run_output_dir = output_dir + '/' + system

    # Start from an empty output directory. Done in Python rather than via
    # `!rm` / `!mkdir` so that paths containing spaces or shell metacharacters
    # are handled safely and the function does not depend on a POSIX shell.
    shutil.rmtree(run_output_dir, ignore_errors=True)
    os.makedirs(run_output_dir, exist_ok=True)

    print(f'Run {system}')
    bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)
    run = bm25(queries)
    persist_and_normalize_run(run, system, run_output_dir)

In [7]:
# Resolve the training split through TIRA's ir_datasets integration.
dataset = ir_datasets.load(training_dataset)

# Locate the topics file of the same split and parse it as TREC XML.
topics_path = ir_datasets.topics_file(training_dataset)
queries = pt.io.read_topics(topics_path, format='trecxml')

# Peek at the first few parsed queries.
queries.head(3)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


Unnamed: 0,qid,query
0,q06223196,car shelter
1,q062228,airport
2,q062287,antivirus comparison


In [8]:
# Index all documents of the training dataset (create_index overwrites any existing index).
index = create_index(dataset.docs_iter())

No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [9]:
# Retrieve for all training queries; the run file is stored under
# grid-search/training/<system>/run.txt (672 queries took about 8 minutes in the recorded run).
run_bm25_grid_search_run(index, 'grid-search/training', queries)

Run bm25-b=0.75-k_1=1.2


BR(BM25): 100%|██████████████████████████████████████████████████████████████████████████████████████████| 672/672 [08:14<00:00,  1.36q/s]


Done. run file is stored under "grid-search/training/bm25-b=0.75-k_1=1.2/run.txt".
