In [6]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd
import numpy as np

# Presumably initialises PyTerrier before any pt.* call below — confirm against the tira helper.
ensure_pyterrier_is_loaded()

# Dataset identifiers of the training and validation splits loaded later in this notebook.
training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

### Import previously computed results

In [7]:
import os
from glob import glob
import json

In [None]:
def _require_env(name):
    """Return the value of the environment variable `name`.

    Raises:
        ValueError: if the variable is not set.
    """
    value = os.environ.get(name)
    if value is None:
        raise ValueError(f"Environment variable {name} is not set")
    return value


def _require_env_float(name):
    """Return the environment variable `name` parsed as a float.

    Raises:
        ValueError: if the variable is not set or does not parse as a number.
    """
    raw = _require_env(name)
    try:
        return float(raw)
    except ValueError:
        # Re-raise with the variable name so a misconfigured run is easy to diagnose.
        raise ValueError(f"Environment variable {name} must be a number, got {raw!r}") from None


# Parameters used below to select the matching precomputed runs via their metadata.json.
mu = _require_env_float('LM_mu')
b = _require_env_float('BM25_b')
k_1 = _require_env_float('BM25_k_1')

# EVAL_METRIC holds one or more metric names separated by ';'.
# Surrounding whitespace and empty entries (e.g. from a trailing ';') are dropped.
evalmetrics = [metric.strip() for metric in _require_env('EVAL_METRIC').split(';') if metric.strip()]
if not evalmetrics:
    raise ValueError("Environment variable EVAL_METRIC does not name any metric")

In [8]:
def _find_run_dir(pattern, expected):
    """Locate a precomputed run folder by the contents of its metadata.json.

    Args:
        pattern: glob pattern of candidate run folders.
        expected: mapping of metadata key -> value the run must have.

    Returns:
        ``(folder, metadata)`` for the first candidate (in glob order) whose
        ``metadata.json`` equals ``expected`` on every key, or ``(None, None)``
        when no candidate matches.

    Raises:
        KeyError: if a metadata file lacks one of the expected keys.
    """
    for folder in glob(pattern):
        for meta_file in glob(folder + "/metadata.json"):
            with open(meta_file) as f:
                meta = json.load(f)
            # Exact equality against the floats parsed from the environment.
            # NOTE(review): values stored as strings in the JSON would never match — confirm.
            if all(meta[key] == value for key, value in expected.items()):
                return folder, meta
    return None, None


# NOTE(review): the patterns contain no wildcard, so each lookup inspects at most one
# folder — confirm this matches the layout of the mounted inputs.

#find bm data
bm25_dir, bm25meta = _find_run_dir("input/bm25", {"b": b, "k_1": k_1})
assert bm25_dir, "BM25 run not found"

#find lm data
lm_dir, lmmeta = _find_run_dir("input/lm", {"mu": mu})
assert lm_dir, "LM run not found"

# Each run folder stores its ranking in out.txt.
bm25_path = bm25_dir + "/out.txt"
lm_path = lm_dir + "/out.txt"

print("BM25 path: ", bm25_path)
print("LM path: ", lm_path)

In [9]:
# Read the stored BM25 run (the out.txt located above) with PyTerrier's result reader.
bm25_results = pt.io.read_results(bm25_path)

In [10]:
# Read the stored LM run (the out.txt located above) with PyTerrier's result reader.
lm_results = pt.io.read_results(lm_path)

### Prepare data

In [None]:
def _load_split(dataset_path):
    """Load one dataset split.

    Returns:
        ``(dataset, topics, qrels)`` where ``topics`` is read from the split's
        topics file in 'trecxml' format and ``qrels`` is a DataFrame built from
        ``dataset.qrels_iter()`` with the ``query_id`` column renamed to ``qid``.
    """
    dataset = ir_datasets.load(dataset_path)
    topics = pt.io.read_topics(ir_datasets.topics_file(dataset_path), format='trecxml')
    qrels = pd.DataFrame(dataset.qrels_iter()).rename(columns={"query_id": "qid"})
    return dataset, topics, qrels


training_dataset, training_queries, training_qrels = _load_split(training_dataset_path)
validation_dataset, validation_queries, validation_qrels = _load_split(validation_dataset_path)

In [None]:
def _feature_vector(row):
    """Build the feature array for one merged row: BM25 rank, LM score, LM rank."""
    return np.array([row['rank_x'], row['score_y'], row['rank_y']])


# Join the two runs on (qid, docno). Columns present in both frames get the
# suffixes _x (BM25, left side) and _y (LM, right side).
merged_results = bm25_results.merge(lm_results, on=['qid', 'docno'])
merged_results['features'] = merged_results.apply(_feature_vector, axis=1)

# The BM25 score becomes the main 'score' column; the LM values live in 'features'.
merged_results = merged_results.rename(columns={'doc_id': 'docno', 'score_x': 'score'})
new_df = merged_results.loc[:, ['qid', 'docno', 'score', 'features']]
print(new_df)

## Learning to rank (LTR)

In [12]:
# Expose the two precomputed runs as transformers built from the loaded result frames.
bm25 = pt.Transformer.from_df(bm25_results)
lm = pt.Transformer.from_df(lm_results)
# Feature whose per-document score is the incoming row's 'rank' value.
featureA = pt.apply.doc_score(lambda row: row['rank'])

# `>>` chains bm25 into the combined stage and `**` joins lm with featureA;
# the printed form is Compose(Transformer(), FUnion(Transformer(), pt.apply.doc_score())).
# NOTE(review): `pipeline` is only printed here; the model below is fitted on new_pipeline instead.
pipeline = bm25 >> (lm ** featureA)
print(pipeline)

Compose(Transformer(), FUnion(Transformer(), pt.apply.doc_score()))


In [14]:
# Transformer backed by the merged frame (qid, docno, score, features) built above.
new_pipeline = pt.Transformer.from_df(new_df)

In [16]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor (400 trees) used as the learned ranking model.
# random_state is pinned so that repeated runs fit the same forest and the
# reported metrics are reproducible.
rf = RandomForestRegressor(n_estimators=400, random_state=42)

In [17]:
# Rename the qrels columns doc_id -> docno and relevance -> label before they are passed
# to fit() and pt.Experiment below (presumably the column names PyTerrier expects — confirm).
training_qrels = training_qrels.rename(columns={'doc_id': 'docno', 'relevance': 'label'})

In [18]:
# Append the random-forest scorer to the feature-carrying transformer, then
# train the resulting pipeline on the training topics and their labels.
ltr_stage = pt.ltr.apply_learned_model(rf)
rf_pipe = new_pipeline >> ltr_stage
rf_pipe.fit(pd.DataFrame(training_queries), training_qrels)

In [22]:
from pyterrier.measures import *

In [None]:
# Compare the two baseline runs with the learned model on the configured metrics.
# NOTE(review): this evaluates on the same training topics/qrels the LTR model was
# fitted on, while the validation split loaded earlier is unused — confirm this is intended.
result = pt.Experiment(
    [bm25, lm, rf_pipe],
    pd.DataFrame(training_queries),
    training_qrels,
    eval_metrics=evalmetrics,
    names=["BM25 Baseline", "LM Baseline", "LTR"],
)

print(result)

# Create the output folder if needed so the export does not fail on a fresh checkout.
os.makedirs("output", exist_ok=True)
result.to_csv("output/results.csv")