In [1]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd

# Start PyTerrier through the TIRA helper (the recorded output below reports
# the Terrier / helper versions that were picked up).
ensure_pyterrier_is_loaded()

# TIRA identifiers of the training and validation splits; they are later passed
# to ir_datasets.load() and ir_datasets.topics_file().
training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### import previously made BM25 results

In [2]:
import os
from glob import glob

In [3]:
# Directory holding the previously created BM25 grid-search run files.
basePath = "../../milestone3/grid-search/training"

# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# "the first run file" the same file on every machine and on every re-run.
# Without recursive=True the "**" segment behaves like "*", i.e. this matches
# <basePath>/<one directory level>/<file>.txt.
input_files = sorted(glob(os.path.join(basePath, '**/*.txt')))

# Fail with a readable message instead of a bare IndexError when nothing matched.
if not input_files:
    raise FileNotFoundError(f"No run files matching '**/*.txt' found under {basePath!r}")

# Only one run file is used below to prototype the pipeline.
test_input = input_files[0]

In [4]:
# Load the selected run file into a DataFrame; the recorded output shows the
# columns qid, docno, rank, score and name.
results = pt.io.read_results(test_input)
print(results)

       qid            docno  rank      score                 name
0    q0001  doc062200806649     1  17.609471  bm25-b=0.75-k_1=1.2
1    q0001  doc062200708788     2  17.589946  bm25-b=0.75-k_1=1.2
2    q0001  doc062200210793     3  17.343684  bm25-b=0.75-k_1=1.2
3    q0001  doc062201000378     4  17.239007  bm25-b=0.75-k_1=1.2
4    q0001  doc062201105278     5  17.053650  bm25-b=0.75-k_1=1.2
..     ...              ...   ...        ...                  ...
995  q0001  doc062200309291   996   5.945129  bm25-b=0.75-k_1=1.2
996  q0001  doc062214005339   997   5.944639  bm25-b=0.75-k_1=1.2
997  q0001  doc062200208773   998   5.944639  bm25-b=0.75-k_1=1.2
998  q0001  doc062208300136   999   5.939730  bm25-b=0.75-k_1=1.2
999  q0001  doc062200111997  1000   5.939730  bm25-b=0.75-k_1=1.2

[1000 rows x 5 columns]


### create fake other data for testing (later LM)

In [5]:
import random

In [7]:
def randomScore(row):
    """Return a copy of ``row`` whose ``"score"`` entry is a random float in [0, 10).

    Intended for ``DataFrame.apply(randomScore, axis=1)`` to fabricate a second
    set of scores for testing.

    Args:
        row: Record supporting ``.copy()`` and item assignment (a pandas Series
            when called through ``DataFrame.apply``; a plain dict also works).

    Returns:
        A new record equal to ``row`` except for the replaced ``"score"`` value.
        The object passed in is left untouched.
    """
    # pandas does not support functions that mutate the object handed to
    # apply(), so work on a copy instead of writing into `row` itself.
    scored = row.copy()
    scored["score"] = random.random() * 10
    return scored

In [8]:
# Seed the generator so the fabricated scores (and every rank derived from
# them) are identical on each Restart & Run All.
random.seed(42)

# Replace each BM25 score with a random one, then order best-first per query.
results_2 = results.apply(randomScore, axis=1)
results_2 = results_2.sort_values(by=["qid", "score"], ascending=[True, False])

# Rank within each query: a global rank would interleave the queries as soon as
# the run holds more than one qid. method='first' hands out distinct integer
# ranks 1..n even for tied scores; the default 'average' can produce fractional
# ranks that astype(int) would truncate into duplicates.
results_2['rank'] = (
    results_2.groupby('qid')['score']
    .rank(method='first', ascending=False)
    .astype(int)
)
print(results_2)

       qid            docno  rank     score                 name
271  q0001  doc062208704222     1  9.998877  bm25-b=0.75-k_1=1.2
676  q0001  doc062200100599     2  9.990608  bm25-b=0.75-k_1=1.2
888  q0001  doc062200707616     3  9.989972  bm25-b=0.75-k_1=1.2
721  q0001  doc062211402243     4  9.988692  bm25-b=0.75-k_1=1.2
992  q0001  doc062203101879     5  9.943263  bm25-b=0.75-k_1=1.2
..     ...              ...   ...       ...                  ...
769  q0001  doc062201200901   996  0.042836  bm25-b=0.75-k_1=1.2
660  q0001  doc062201709441   997  0.025456  bm25-b=0.75-k_1=1.2
830  q0001  doc062207603270   998  0.024327  bm25-b=0.75-k_1=1.2
818  q0001  doc062203102859   999  0.019441  bm25-b=0.75-k_1=1.2
839  q0001  doc062200805484  1000  0.012337  bm25-b=0.75-k_1=1.2

[1000 rows x 5 columns]


## prepare data for LTR

In [9]:
# Feature vector of each document = its rank in the second (fabricated) run.
def _as_feature_vector(value):
    """Wrap a single feature value in a one-element list."""
    return [value]

rank_by_docno = results_2.set_index('docno')['rank']
second_run_rank = results['docno'].map(rank_by_docno)
results['features'] = second_run_rank.apply(_as_feature_vector)
print(results)

       qid            docno  rank      score                 name features
0    q0001  doc062200806649     1  17.609471  bm25-b=0.75-k_1=1.2    [862]
1    q0001  doc062200708788     2  17.589946  bm25-b=0.75-k_1=1.2    [476]
2    q0001  doc062200210793     3  17.343684  bm25-b=0.75-k_1=1.2    [704]
3    q0001  doc062201000378     4  17.239007  bm25-b=0.75-k_1=1.2    [183]
4    q0001  doc062201105278     5  17.053650  bm25-b=0.75-k_1=1.2    [957]
..     ...              ...   ...        ...                  ...      ...
995  q0001  doc062200309291   996   5.945129  bm25-b=0.75-k_1=1.2    [186]
996  q0001  doc062214005339   997   5.944639  bm25-b=0.75-k_1=1.2    [781]
997  q0001  doc062200208773   998   5.944639  bm25-b=0.75-k_1=1.2    [448]
998  q0001  doc062208300136   999   5.939730  bm25-b=0.75-k_1=1.2    [595]
999  q0001  doc062200111997  1000   5.939730  bm25-b=0.75-k_1=1.2    [900]

[1000 rows x 6 columns]


In [10]:
# Wrap the precomputed results frame (now carrying the `features` column) in a
# PyTerrier transformer so it can serve as the first stage of a pipeline.
# NOTE(review): presumably the transformer replays the stored rows for the
# queries it is given — confirm against the PyTerrier documentation.
pipeline = pt.Transformer.from_df(results)
print(pipeline)

Transformer()


## LTR

In [11]:
def _load_split(dataset_path):
    """Load one TIRA split and return ``(dataset, queries, qrels)``.

    ``queries`` is read from the split's topics file in ``trecxml`` format and
    ``qrels`` is a DataFrame whose ``query_id`` column has been renamed to
    ``qid``.
    """
    dataset = ir_datasets.load(dataset_path)
    topics_file = ir_datasets.topics_file(dataset_path)
    queries = pt.io.read_topics(topics_file, format='trecxml')
    qrels = pd.DataFrame(dataset.qrels_iter()).rename(columns={"query_id": "qid"})
    return dataset, queries, qrels


training_dataset, training_queries, training_qrels = _load_split(training_dataset_path)
validation_dataset, validation_queries, validation_qrels = _load_split(validation_dataset_path)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [12]:
# Sanity check: show the validation topics (columns qid and query).
print(validation_queries)

            qid               query
0       q072224      purchase money
1       q072226   purchase used car
2       q072232     buy gold silver
3       q072240          adenovirus
4       q072242      water softener
..          ...                 ...
877  q072229958       video twitter
878  q072230046            used car
879  q072230066   used electric car
880  q072230072      collector cars
881  q072230074       electric cars

[882 rows x 2 columns]


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
#rf = RandomForestRegressor(n_estimators=400)
#rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)

#X = pd.DataFrame(results["features"] + results["qid"])
#rf_pipe.fit(queries, qrels)

## XGBoost

In [13]:
!pip3 install xgboost

[0m

In [14]:
import xgboost as xgb
# XGBoost ranker trained with the NDCG ranking objective.
lmart_x = xgb.XGBRanker(
    objective='rank:ndcg',
    learning_rate=0.1,
    gamma=1.0,
    min_child_weight=0.1,
    max_depth=6,
    # The scikit-learn wrapper calls this option `verbosity` (0 = silent ...
    # 3 = debug); `verbose` is not a recognised XGBRanker parameter.
    verbosity=2,
    random_state=42,
)

In [None]:
def create_index(documents, index_path="/tmp/index"):
    """Index ``documents`` with Terrier and return the opened index.

    Args:
        documents: Iterable of objects exposing ``doc_id`` and ``text``
            attributes (e.g. the items yielded by ``dataset.docs_iter()``).
        index_path: Directory the index is written to. Anything already stored
            there is overwritten (``overwrite=True``). Defaults to the location
            that was previously hard-coded.

    Returns:
        The object returned by ``pt.IndexFactory.of`` for the freshly built index.
    """
    # `meta` lists the metadata fields kept per document.
    # NOTE(review): the numbers look like per-field maximum lengths — confirm
    # against the IterDictIndexer documentation.
    indexer = pt.IterDictIndexer(index_path, overwrite=True, meta={'docno': 100, 'text': 20480})

    # Stream the documents lazily so the whole corpus never sits in memory at once.
    records = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)
    index_ref = indexer.index(records)
    return pt.IndexFactory.of(index_ref)
# Index the training corpus and build the BM25 retriever on top of it.
index = create_index(training_dataset.docs_iter())

# NOTE(review): this rebinds `pipeline`, replacing the Transformer.from_df(results)
# pipeline created earlier in the notebook — confirm that this is intended.
pipeline = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [None]:
# Chain the first stage with the XGBoost ranker and train it: the training
# topics/qrels are used for fitting, the validation topics/qrels are passed as
# the validation set.
# NOTE(review): `pipeline` at this point is the BM25 retriever built in the
# previous cell; confirm that it emits the `features` column the learned model
# needs (the from_df pipeline defined earlier carried one).
lmart_x_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
lmart_x_pipe.fit(training_queries, training_qrels, validation_queries, validation_qrels)
print(lmart_x_pipe)

In [None]:
# Compare the BM25 first stage against the learned ranker using MAP.
# Only names that this notebook defines are used: `pipeline` is the BM25
# retriever and the validation split is the only labelled data besides the
# training split.
# NOTE(review): the validation split was also handed to fit(), so these numbers
# are optimistic — switch to a held-out test split once one is available.
results = pt.Experiment(
    [pipeline, lmart_x_pipe],
    validation_queries,
    validation_qrels,
    ["map"],
    names=["BM25 Baseline", "LambdaMART (xgBoost)"]
)

In [None]:
# Show the evaluation table returned by pt.Experiment (note that `results` no
# longer refers to the run DataFrame loaded at the top of the notebook).
print(results)