In [6]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd
import numpy as np

ensure_pyterrier_is_loaded()

training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

### import previously made results

In [7]:
import os
from glob import glob

In [8]:
bm25_path = "./bm25.txt"
lm_path = "./lm.txt"

In [9]:
bm25_results = pt.io.read_results(bm25_path)

In [10]:
lm_results = pt.io.read_results(lm_path)

### prepare data

In [11]:
training_dataset = ir_datasets.load(training_dataset_path)
training_queries = pt.io.read_topics(ir_datasets.topics_file(training_dataset_path), format='trecxml')
training_qrels = pd.DataFrame(training_dataset.qrels_iter()).rename(columns={"query_id": "qid"})

validation_dataset = ir_datasets.load(validation_dataset_path)
validation_queries = pt.io.read_topics(ir_datasets.topics_file(validation_dataset_path), format='trecxml')
validation_qrels = pd.DataFrame(validation_dataset.qrels_iter()).rename(columns={"query_id": "qid"})

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
            qid                       query
0     q06223196                 car shelter
1       q062228                     airport
2       q062287        antivirus comparison
3     q06223261              free antivirus
4       q062291            orange antivirus
..          ...                         ...
667  q062224914             tax garden shed
668  q062224961              land of france
669  q062225030   find my training pole job
670  q062225194                     gpl car
671  q062225197                cheapest ca

In [13]:
merged_results = pd.merge(bm25_results, lm_results, on=['qid', 'docno'])
merged_results['features'] = merged_results.apply(lambda row: np.array([row['rank_x'], row['score_y'], row['rank_y']]), axis=1)
merged_results = merged_results.rename(columns={'doc_id': 'docno', 'score_x': 'score'})
new_df = merged_results[['qid', 'docno', 'score', 'features']]
print(new_df)

               qid            docno     score  \
0       q062210081  doc062200602177  7.722567   
1       q062210081  doc062200206592  7.718526   
2       q062210081  doc062200201629  7.714514   
3       q062210081  doc062210912628  7.703608   
4       q062210081  doc062201201840  7.691112   
...            ...              ...       ...   
457611   q06229908  doc062201007812  2.758472   
457612   q06229908  doc062204608629  2.758409   
457613   q06229908  doc062203900979  2.758266   
457614   q06229908  doc062200114940  2.758258   
457615   q06229908  doc062200113336  2.758244   

                                   features  
0              [1.0, 8.41192338523742, 5.0]  
1             [2.0, 8.864250453721736, 4.0]  
2             [3.0, 9.519325132152874, 3.0]  
3              [4.0, 8.09223216041958, 8.0]  
4            [5.0, 7.212775139416361, 29.0]  
...                                     ...  
457611   [993.0, 2.0082993251888093, 942.0]  
457612    [995.0, 2.007101073424835, 945.0]

## LTR

In [12]:
bm25 = pt.Transformer.from_df(bm25_results)
lm = pt.Transformer.from_df(lm_results)
featureA = pt.apply.doc_score(lambda row: row['rank'])

pipeline = bm25 >> (lm ** featureA)
print(pipeline)

Compose(Transformer(), FUnion(Transformer(), pt.apply.doc_score()))


In [14]:
new_pipeline = pt.Transformer.from_df(new_df)

In [15]:
#pt.Experiment([bm25, lm, pipeline], pd.DataFrame(training_queries), training_qrels, eval_metrics=['ndcg_cut_5'])

In [16]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=400)

In [17]:
training_qrels = training_qrels.rename(columns={'doc_id': 'docno', 'relevance': 'label'})

In [18]:
rf_pipe = new_pipeline >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(pd.DataFrame(training_queries), training_qrels)

In [19]:
pt.Experiment([bm25, lm, rf_pipe], pd.DataFrame(training_queries), training_qrels, eval_metrics=['NDCG@5', 'NDCG@10', 'P@10'], names=["BM25 Baseline", "LM Baseline", "LTR"])

ValueError: unkonwn measure NDCG@5

In [None]:
#pt.Experiment([bm25, lm, bm25 >> lm], pd.DataFrame(training_queries), training_qrels, eval_metrics=['ndcg_cut_5'], names=["BM25 Baseline", "LM Baseline", "Combines"])