# Import Everything

In [1]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from trectools import TrecRun, TrecQrel, TrecEval
from glob import glob
import os, shutil

ensure_pyterrier_is_loaded()

#from jnius import autoclass

Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Start PyTerrier only if it is not running yet, so re-running this cell is a no-op.
# The boot package is presumably a Maven coordinate providing custom token
# processing for Terrier — TODO confirm.
if not pt.started():
    pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])

In [3]:
# Identifiers of the training and validation datasets; both share the same
# prefix and differ only in the dataset name.
_DATASET_PREFIX = 'ir-lab-jena-leipzig-wise-2023'
training_dataset_path = '/'.join((_DATASET_PREFIX, 'training-20231104-training'))
validation_dataset_path = '/'.join((_DATASET_PREFIX, 'validation-20231104-training'))

In [4]:
# Load the training dataset through the TIRA ir_datasets integration.
training_dataset = ir_datasets.load(training_dataset_path)

# Read the queries from the dataset's topics file (TREC XML format).
training_topics_file = ir_datasets.topics_file(training_dataset_path)
training_queries = pt.io.read_topics(training_topics_file, format='trecxml')

# Relevance judgements as a DataFrame, with `query_id` renamed to `qid`.
training_qrels = pd.DataFrame(training_dataset.qrels_iter())
training_qrels = training_qrels.rename(columns={"query_id": "qid"})

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


### Setup Parameters

In [7]:
# Default retrieval parameters. `setdefault` keeps a value that is already
# present in the process environment and only falls back to the default when
# the variable is missing, so the parameters stay configurable from outside
# the notebook instead of being silently overwritten here.
os.environ.setdefault("BM25_k_1", "1.3")
os.environ.setdefault("BM25_b", "0.5")
os.environ.setdefault("LM_mu", "1000")

## Create Index

In [8]:
def create_index(documents, index_path="/tmp/index"):
    """Build a Terrier index over ``documents`` and return the opened index.

    Args:
        documents: Iterable of objects exposing ``doc_id`` and ``text``
            attributes.
        index_path: Location the index is written to. The indexer is created
            with ``overwrite=True``. Defaults to ``"/tmp/index"``.

    Returns:
        The object returned by ``pt.IndexFactory.of`` for the index reference
        produced by the indexer.
    """
    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        # Both fields are kept as metadata; the numbers are the per-field
        # lengths passed to the indexer.
        meta={'docno': 100, 'text': 20480},
        verbose=True,
    )
    # Lazily map each document to the dict layout handed to the indexer.
    doc_dicts = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    index_ref = indexer.index(doc_dicts)
    return pt.IndexFactory.of(index_ref)

In [9]:
# Index every document yielded by the training dataset's docs_iter().
index = create_index(training_dataset.docs_iter())

No settings given in /root/.tira/.tira-settings.json. I will use defaults.


### BM25

In [10]:
def read_float_env(name):
    """Return the environment variable ``name`` parsed as a float.

    Raises:
        ValueError: If the variable is not set, or if its value cannot be
            parsed as a number.
    """
    # None (rather than a magic string) marks "not set", so no real value can
    # be mistaken for a missing variable.
    raw_value = os.environ.get(name)
    if raw_value is None:
        raise ValueError(f"Environment variable {name} is not set")
    try:
        return float(raw_value)
    except ValueError:
        raise ValueError(
            f"Environment variable {name} must be a number, got {raw_value!r}"
        ) from None


# read environment variables
b = read_float_env('BM25_b')
k_1 = read_float_env('BM25_k_1')

In [11]:
# Hand the BM25 parameters b and k_1 to the retriever as controls.
configuration = {"bm25.b" : b, "bm25.k_1": k_1}
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)

In [12]:
# Run BM25 retrieval for all training queries.
bm25Results = bm25(training_queries)

BR(BM25): 100%|████████████████████████████████████████████████████████████████████████| 672/672 [10:01<00:00,  1.12q/s]


In [15]:
# snake_case alias of the BM25 run; the LTR set-up cell reads this name.
bm25_results = bm25Results

### LM (Dirichlet-smoothed language model)

In [16]:
# Read the LM_mu parameter from the environment. None (rather than a magic
# string) marks "not set", so no real value can be mistaken for a missing
# variable, and a non-numeric value is reported together with the variable name.
mu_raw = os.environ.get('LM_mu')
if mu_raw is None:
    raise ValueError("Environment variable LM_mu is not set")
try:
    mu = float(mu_raw)
except ValueError:
    raise ValueError(f"Environment variable LM_mu must be a number, got {mu_raw!r}") from None

In [18]:
# jnius is imported here rather than in the first cell — presumably because it
# has to be loaded after PyTerrier has started the JVM (TODO confirm).
from jnius import autoclass
# Instantiate Terrier's DirichletLM weighting model through the Java bridge.
dlm = autoclass("org.terrier.matching.models.DirichletLM")()
# NOTE(review): verify that assigning `mu` on the jnius proxy really changes the
# smoothing parameter used by the Java model; nothing visible here confirms it,
# and the recorded output of the retrieval run is labelled "BR(DPH)".
dlm.mu=mu
lm = pt.BatchRetrieve(index, wmodel=dlm, verbose=True)

In [19]:
# Run language-model retrieval for all training queries.
lm_results = lm(training_queries)

BR(DPH): 100%|█████████████████████████████████████████████████████████████████████████| 672/672 [10:06<00:00,  1.11q/s]


### LTR (learning to rank)

In [21]:
# Setup
import numpy as np

# Inner join: keep only the (qid, docno) pairs present in both runs. pandas
# suffixes the overlapping columns with _x (BM25 run) and _y (LM run).
merged_results = pd.merge(bm25_results, lm_results, on=['qid', 'docno'])

# One feature vector [BM25 rank, LM score, LM rank] per row. Converting the
# three columns in a single step yields the same float vectors as building an
# array per row, without a Python-level call for every row.
feature_matrix = merged_results[['rank_x', 'score_y', 'rank_y']].to_numpy(dtype=np.float64)
merged_results['features'] = list(feature_matrix)

# The BM25 score becomes the `score` column of the feature frame. `docno` is
# already present (it is a join key), so no other column needs renaming.
merged_results = merged_results.rename(columns={'score_x': 'score'})
new_df = merged_results[['qid', 'docno', 'score', 'features']]

# Rename the qrels columns to `docno` / `label` for the LTR fit call.
training_qrels = training_qrels.rename(columns={'doc_id': 'docno', 'relevance': 'label'})

In [None]:
#bm25_cleaned = pt.Transformer.from_df(bm25_results)
#lm_cleaned = pt.Transformer.from_df(lm_results)
#rank_feature = pt.apply.doc_score(lambda row: row['rank'])

#pipeline = bm25_cleaned >> (lm_cleaned ** rank_feature)

In [22]:
# First stage: serve the precomputed (qid, docno, score, features) rows.
pipeline = pt.Transformer.from_df(new_df)
# A fixed random_state makes the forest, and therefore the resulting run,
# reproducible when the notebook is re-run.
rf = RandomForestRegressor(n_estimators=400, random_state=42)
# Second stage: re-score the rows with the learned model.
ltr = pipeline >> pt.ltr.apply_learned_model(rf)

In [23]:
# Fit the LTR pipeline on the training queries, using the renamed qrels as labels.
ltr.fit(pd.DataFrame(training_queries), training_qrels)

## Final run and persist

In [26]:
# Output directory for the run file; falls back to the current directory when
# TIRA_OUTPUT_DIRECTORY is not set.
save_path = os.getenv("TIRA_OUTPUT_DIRECTORY", ".")

In [31]:
# Apply the fitted LTR pipeline to the training queries.
# NOTE(review): these are the same queries the model was fitted on, and
# validation_dataset_path is not used in the cells visible here — confirm
# this is intended.
ltr_result = ltr(training_queries)

In [32]:
# Persist the LTR run under save_path (the recorded output reports "./run.txt").
persist_and_normalize_run(ltr_result, save_path)

I use the environment variable "TIRA_OUTPUT_DIRECTORY" to determine where I should store the run file using "." as default.
Done. run file is stored under "./run.txt".
