In [5]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd
import numpy as np

# Make sure PyTerrier is initialised (helper imported from
# tira.third_party_integrations) before any `pt` functionality is used below.
ensure_pyterrier_is_loaded()

# TIRA dataset identifiers of the two splits loaded later in this notebook.
training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

### Import previously computed BM25 and LM results

In [6]:
import os
from glob import glob

In [7]:
# Locate the BM25 grid-search run files. Without recursive=True the '**'
# component behaves like a plain '*', so this matches *.txt files exactly one
# directory below basePath (one sub-directory per parameter setting).
basePath = "../../grid-search/training"

# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# that input_files[0] is the same run on every machine and on every re-run.
# NOTE: the alphabetically first run may differ from the one an earlier,
# unsorted execution happened to pick.
input_files = sorted(glob(os.path.join(basePath, '**/*.txt')))
if not input_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No run files (*.txt) found below {basePath!r}")

# The single run used as the BM25 input for the rest of the notebook.
test_input = input_files[0]
print(test_input)

../../grid-search/training/bm25-b=0.75-k_1=1.2/run.txt


In [8]:
# Parse the selected BM25 run file into a DataFrame
# (columns shown in the output: qid, docno, rank, score, name).
bm25_results = pt.io.read_results(test_input)
print(bm25_results)

               qid            docno  rank      score                 name
0       q062210081  doc062200602177     1  14.266406  bm25-b=0.75-k_1=1.2
1       q062210081  doc062200206592     2  14.123982  bm25-b=0.75-k_1=1.2
2       q062210081  doc062210912628     3  14.037971  bm25-b=0.75-k_1=1.2
3       q062210081  doc062200201629     4  13.842112  bm25-b=0.75-k_1=1.2
4       q062210081  doc062200304990     5  13.688794  bm25-b=0.75-k_1=1.2
...            ...              ...   ...        ...                  ...
625518   q06229908  doc062207505063   996   4.631728  bm25-b=0.75-k_1=1.2
625519   q06229908  doc062200201995   997   4.628934  bm25-b=0.75-k_1=1.2
625520   q06229908  doc062200204993   998   4.627846  bm25-b=0.75-k_1=1.2
625521   q06229908  doc062202102915   999   4.627371  bm25-b=0.75-k_1=1.2
625522   q06229908  doc062200500656  1000   4.626296  bm25-b=0.75-k_1=1.2

[625523 rows x 5 columns]


In [9]:
# Load the second run (named "lm-mu=1500.0" in the file) from lm.txt, resolved
# relative to the notebook's working directory. Same columns as the BM25 run.
lm_results = pt.io.read_results("lm.txt")
print(lm_results)

               qid            docno  rank      score          name
0       q062210081  doc062200104006     1  10.296331  lm-mu=1500.0
1       q062210081  doc062200111527     2   9.662938  lm-mu=1500.0
2       q062210081  doc062200201629     3   9.519325  lm-mu=1500.0
3       q062210081  doc062200206592     4   8.864250  lm-mu=1500.0
4       q062210081  doc062200602177     5   8.411923  lm-mu=1500.0
...            ...              ...   ...        ...           ...
625518   q06229908  doc062208101904   996   1.916573  lm-mu=1500.0
625519   q06229908  doc062200210772   997   1.911147  lm-mu=1500.0
625520   q06229908  doc062201903468   998   1.908658  lm-mu=1500.0
625521   q06229908  doc062201306155   999   1.907860  lm-mu=1500.0
625522   q06229908  doc062202102686  1000   1.907781  lm-mu=1500.0

[625523 rows x 5 columns]


## Prepare data for LTR

In [10]:
def _load_split(dataset_path):
    """Load one TIRA split and return ``(dataset, queries, qrels)``.

    ``queries`` is read from the split's topics file in 'trecxml' format and
    ``qrels`` is a DataFrame built from ``dataset.qrels_iter()`` with the
    ``query_id`` column renamed to ``qid``.
    """
    dataset = ir_datasets.load(dataset_path)
    topics_file = ir_datasets.topics_file(dataset_path)
    queries = pt.io.read_topics(topics_file, format='trecxml')
    qrels = pd.DataFrame(dataset.qrels_iter()).rename(columns={"query_id": "qid"})
    return dataset, queries, qrels


# Training split first, then validation split (same order as the log output).
training_dataset, training_queries, training_qrels = _load_split(training_dataset_path)
validation_dataset, validation_queries, validation_qrels = _load_split(validation_dataset_path)

print(training_queries)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
            qid                       query
0     q06223196                 car shelter
1       q062228                     airport
2       q062287        antivirus comparison
3     q06223261              free antivirus
4       q062291            orange antivirus
..          ...                         ...
667  q062224914             tax garden shed
668  q062224961              land of france
669  q062225030   find my training pole job
670  q062225194                     gpl car
671  q062225197                cheapest ca

In [11]:
# Inner join on (qid, docno): only documents retrieved by BOTH runs are kept,
# which is why the result has fewer rows than either input run.
# Overlapping columns receive pandas' default suffixes: *_x = BM25, *_y = LM.
merged_results = bm25_results.merge(lm_results, how='inner', on=['qid', 'docno'])
print(merged_results)

               qid            docno  rank_x    score_x               name_x  \
0       q062210081  doc062200602177       1  14.266406  bm25-b=0.75-k_1=1.2   
1       q062210081  doc062200206592       2  14.123982  bm25-b=0.75-k_1=1.2   
2       q062210081  doc062210912628       3  14.037971  bm25-b=0.75-k_1=1.2   
3       q062210081  doc062200201629       4  13.842112  bm25-b=0.75-k_1=1.2   
4       q062210081  doc062200304990       5  13.688794  bm25-b=0.75-k_1=1.2   
...            ...              ...     ...        ...                  ...   
489061   q06229908  doc062201801608     989   4.639605  bm25-b=0.75-k_1=1.2   
489062   q06229908  doc062207505063     996   4.631728  bm25-b=0.75-k_1=1.2   
489063   q06229908  doc062200201995     997   4.628934  bm25-b=0.75-k_1=1.2   
489064   q06229908  doc062200204993     998   4.627846  bm25-b=0.75-k_1=1.2   
489065   q06229908  doc062200500656    1000   4.626296  bm25-b=0.75-k_1=1.2   

        rank_y   score_y        name_y  
0         

In [12]:
# Two-column matrix with one row per merged (qid, docno) pair:
# column 0 = BM25 score (score_x), column 1 = LM score (score_y).
score_columns = ['score_x', 'score_y']
features = merged_results[score_columns].to_numpy()
print(features)

[[14.26640626  8.41192339]
 [14.12398236  8.86425045]
 [14.03797094  8.09223216]
 ...
 [ 4.62893396  1.93423078]
 [ 4.6278459   2.32935189]
 [ 4.62629557  2.32814613]]


## Simple combination of the BM25 and LM runs

In [13]:
def _rank_as_score(row):
    """Return the 'rank' value of a result row, to be used as a feature."""
    return row['rank']


# Wrap both pre-computed runs as PyTerrier transformers.
bm25 = pt.Transformer.from_df(bm25_results)
lm = pt.Transformer.from_df(lm_results)
featureA = pt.apply.doc_score(_rank_as_score)

# BM25 first, followed by the feature union (**) of the LM run and the
# rank-based feature.
pipeline = bm25 >> (lm ** featureA)
print(pipeline)

Compose(Transformer(), FUnion(Transformer(), pt.apply.doc_score()))


In [14]:
# Build the learning-to-rank input frame, one row per (qid, docno) pair that
# was retrieved by both runs:
#   score    -> BM25 score (score_x)
#   features -> float array [BM25 rank, LM score, LM rank]
#
# The feature matrix is produced by a single vectorised to_numpy() call rather
# than a Python-level DataFrame.apply over every row. merged_results itself is
# left unmodified, so cells that read 'score_x' keep working when re-run after
# this one. The former 'doc_id' -> 'docno' rename was a no-op (the column is
# already called 'docno') and is dropped.
feature_matrix = merged_results[['rank_x', 'score_y', 'rank_y']].to_numpy(dtype=float)
new_df = merged_results[['qid', 'docno']].assign(
    score=merged_results['score_x'],
    # list(...) splits the 2-D matrix into one 1-D array per row, giving an
    # object column of per-document feature vectors.
    features=list(feature_matrix),
)
print(new_df)

               qid            docno      score  \
0       q062210081  doc062200602177  14.266406   
1       q062210081  doc062200206592  14.123982   
2       q062210081  doc062210912628  14.037971   
3       q062210081  doc062200201629  13.842112   
4       q062210081  doc062200304990  13.688794   
...            ...              ...        ...   
489061   q06229908  doc062201801608   4.639605   
489062   q06229908  doc062207505063   4.631728   
489063   q06229908  doc062200201995   4.628934   
489064   q06229908  doc062200204993   4.627846   
489065   q06229908  doc062200500656   4.626296   

                                   features  
0              [1.0, 8.41192338523742, 5.0]  
1             [2.0, 8.864250453721736, 4.0]  
2              [3.0, 8.09223216041958, 8.0]  
3             [4.0, 9.519325132152874, 3.0]  
4            [5.0, 7.660348333339622, 12.0]  
...                                     ...  
489061    [989.0, 2.068436687270035, 900.0]  
489062    [996.0, 1.93573280451

In [15]:
# Wrap the prepared frame (qid, docno, score, features) as a PyTerrier
# transformer; it is used below as the first stage of the LTR pipeline.
new_pipeline = pt.Transformer.from_df(new_df)

In [16]:
#pt.Experiment([bm25, lm, pipeline], pd.DataFrame(training_queries), training_qrels, eval_metrics=['ndcg_cut_5'])

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap samples and random feature subsets).
# Pin the seed so that re-running the notebook trains the same model and the
# evaluation numbers are reproducible.
rf = RandomForestRegressor(n_estimators=400, random_state=42)

In [18]:
# Rename the ir_datasets qrels columns to the names used in the rest of this
# notebook: doc_id -> docno, relevance -> label.
qrels_column_names = {'doc_id': 'docno', 'relevance': 'label'}
training_qrels = training_qrels.rename(columns=qrels_column_names)
print(training_qrels)

             qid            docno  label iteration
0      q06223196  doc062200112743      0         0
1      q06223196  doc062200205250      0         0
2      q06223196  doc062200101983      0         0
3      q06223196  doc062200204465      1         0
4      q06223196  doc062200115614      0         0
...          ...              ...    ...       ...
9651  q062225197  doc062200205276      0         0
9652  q062225197  doc062200107121      1         0
9653  q062225197  doc062200204419      0         0
9654  q062225197  doc062200103774      0         0
9655  q062225197  doc062200110087      0         0

[9656 rows x 4 columns]


In [None]:
# Second stage: re-score the candidates with the random forest, using the
# 'features' column supplied by the first stage.
ltr_stage = pt.ltr.apply_learned_model(rf)
rf_pipe = new_pipeline >> ltr_stage

# Train the learned stage on the training queries and their relevance labels.
rf_pipe.fit(pd.DataFrame(training_queries), training_qrels)

In [None]:
# Compare the two baseline runs with the learned pipeline on NDCG@5, NDCG@10 and P@10.
# NOTE(review): rf_pipe was fitted on training_queries / training_qrels and is
# evaluated here on those same queries and qrels, so the "LTR" row is measured
# on its own training data. validation_queries / validation_qrels are loaded
# earlier in the notebook but are not used here - confirm whether evaluation on
# the validation split was intended (the run transformers would then also need
# results for the validation queries).
pt.Experiment([bm25, lm, rf_pipe], pd.DataFrame(training_queries), training_qrels, eval_metrics=['NDCG@5', 'NDCG@10', 'P@10'], names=["BM25 Baseline", "LM Baseline", "LTR"])

In [None]:
#pt.Experiment([bm25, lm, bm25 >> lm], pd.DataFrame(training_queries), training_qrels, eval_metrics=['ndcg_cut_5'], names=["BM25 Baseline", "LM Baseline", "Combines"])