In [None]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd

# Initialise PyTerrier only if it has not been started yet in this kernel.
# The custom token-processing package is handed to pt.init as a boot package.
if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])
    # NOTE(review): autoclass is not referenced in any visible cell — confirm it is still needed.
    from jnius import autoclass

# TIRA integration hook, called after the explicit init above.
# Presumably makes sure PyTerrier is usable inside the TIRA sandbox — TODO confirm.
ensure_pyterrier_is_loaded()

# Dataset identifiers passed to ir_datasets.load / ir_datasets.topics_file in later cells.
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
100% [......................................................................] 104292653 / 104292653Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
100% [..............................................................................] 37524 / 37524Done


PyTerrier 0.9.2 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [8]:
def create_index(documents, index_path="/tmp/index"):
    """Index `documents` with PyTerrier and return the opened index.

    Args:
        documents: iterable of objects exposing `doc_id` and `text` attributes.
        index_path: directory the index is written to. Defaults to "/tmp/index",
            the location that was previously hard-coded. Any existing index at
            this path is overwritten.

    Returns:
        The index obtained from `pt.IndexFactory.of` for the freshly built index.
    """
    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        # Metadata fields stored with each document and their size limits
        # (see the PyTerrier IterDictIndexer documentation for the `meta` argument).
        meta={'docno': 100, 'text': 20480},
        verbose=True,
    )
    # Lazily map each document to the dict layout handed to the indexer, so the
    # collection is streamed instead of being materialised in memory.
    records = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    index_ref = indexer.index(records)
    return pt.IndexFactory.of(index_ref)

In [None]:
import os
import shutil

def recreateOutputDir(dir):
    """Remove `dir` and create it again as an empty directory.

    `dir` is joined onto the current working directory. Missing parent
    directories are created as needed.
    """
    working_dir = os.getcwd()
    target_path = os.path.join(working_dir, dir)
    # Removal errors are ignored on purpose, e.g. when the directory does not exist yet.
    shutil.rmtree(target_path, ignore_errors=True)
    os.makedirs(target_path)

In [None]:
def _read_float_env(name):
    """Return the environment variable `name` parsed as a float.

    Raises:
        ValueError: if the variable is not set, or if its value is not a number.
    """
    raw = os.environ.get(name)
    if raw is None:
        raise ValueError(f"Environment variable {name} is not set")
    try:
        return float(raw)
    except ValueError as exc:
        # Name the offending variable; a bare float() error would not say which one failed.
        raise ValueError(f"Environment variable {name} must be a number, got {raw!r}") from exc


def run_bm25_grid_search_run(index, output_dir, queries):
    """Run BM25 once with parameters taken from the environment and persist the run.

    The parameters `b` and `k_1` are read from the environment variables
    `BM25_b` and `BM25_k_1`, so one invocation produces one grid-search point.

    defaults: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/BM25.html
    k_1 = 1.2d, k_3 = 8d, b = 0.75d
    We do not tune parameter k_3, as this parameter only impacts queries with redundant terms.

    Args:
        index: index passed on to `pt.BatchRetrieve`.
        output_dir: parent directory; the run is stored in the sub-directory
            `bm25-b=<b>-k_1=<k_1>`, which is deleted and recreated first.
        queries: topics the retriever is applied to.

    Raises:
        ValueError: if `BM25_b` or `BM25_k_1` is unset or not a number.
    """
    b = _read_float_env('BM25_b')
    k_1 = _read_float_env('BM25_k_1')

    print(f'BM25_b={b}, BM25_k_1={k_1}')

    system = f'bm25-b={b}-k_1={k_1}'
    configuration = {"bm25.b": b, "bm25.k_1": k_1}
    run_output_dir = output_dir + '/' + system

    # Start from an empty directory so results of an earlier run with the same parameters do not linger.
    recreateOutputDir(run_output_dir)

    print(f'Run {system}')
    BM25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)
    run = BM25(queries)
    persist_and_normalize_run(run, system, run_output_dir)

In [5]:
# Load the training dataset and read its topics file (TREC XML format) as the queries.
dataset = ir_datasets.load(training_dataset)
queries = pt.io.read_topics(ir_datasets.topics_file(training_dataset), format='trecxml')
# Relevance judgments are not loaded for the training split; the line is kept disabled.
#qrels = pd.DataFrame(dataset.qrels_iter()).rename(columns={"query_id": "qid"})

# Disabled preview of the first three queries.
#queries.head(3)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


Unnamed: 0,qid,query
0,q072224,purchase money
1,q072226,purchase used car
2,q072232,buy gold silver


In [11]:
# The index is built before any environment variable is inspected.
index = create_index(dataset.docs_iter())

# Required environment variables, each paired with the parser applied to its raw value.
# They are handled strictly in this order: read, check, convert.
env_parsers = (
    ('BM25_b', float),
    ('BM25_k_1', float),
    ('EVAL_METRIC', lambda raw_text: raw_text.split(';')),  # ';'-separated list of metric names
)
env_values = {}
for env_name, parse_value in env_parsers:
    raw_value = os.environ.get(env_name, "error")
    if raw_value == "error":
        raise ValueError(f"Environment variable {env_name} is not set")
    env_values[env_name] = parse_value(raw_value)

b = env_values['BM25_b']
k_1 = env_values['BM25_k_1']
evalmetrics = env_values['EVAL_METRIC']

# BM25 retriever configured with the parameters read above.
configuration = {"bm25.b": b, "bm25.k_1": k_1}
bm25Orig = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)

No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [None]:
# Produce and persist the BM25 run for the current queries under grid-search/training/<system>.
run_bm25_grid_search_run(index, 'grid-search/training', queries)


In [None]:
# Load the validation dataset, its topics (TREC XML) and its relevance judgments.
# Note: this rebinds `queries`, which held the training topics until now.
v_dataset = ir_datasets.load(validation_dataset)
queries = pt.io.read_topics(ir_datasets.topics_file(validation_dataset), format='trecxml')
# The judgments' query_id column is renamed to qid, the name used by the topics frame.
qrels = pd.DataFrame(v_dataset.qrels_iter()).rename(columns={"query_id": "qid"})

# TODO: should the validation dataset be used here, or ...? (open question left by the author)

In [None]:
# Evaluate the BM25 retriever on the current queries/qrels with the metrics from EVAL_METRIC.
# NOTE(review): bm25Orig retrieves from the index built from the training dataset's documents,
# while queries and qrels come from the validation dataset — confirm this is intended.
result = pt.Experiment([bm25Orig], pd.DataFrame(queries), qrels, eval_metrics=evalmetrics)
print(result)
print(type(result))
# write results to file
# The target directory only exists if the grid-search cell ran before; create it so the
# export does not fail on a missing directory.
os.makedirs('grid-search', exist_ok=True)
result.to_csv('grid-search/validation.csv')