In [2]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd

ensure_pyterrier_is_loaded()

training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

Start PyTerrier with version=5.7, helper_version=0.0.7, no_download=True


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### import previously made BM25 results

In [3]:
import os
from glob import glob

In [4]:
basePath = "../../grid-search/training"
input_files = glob(os.path.join(basePath, '**/*.txt'))

test_input = input_files[0]
print(test_input)

../../grid-search/training/bm25-b=0.75-k_1=1.2/run.txt


In [26]:
bm25_results = pt.io.read_results(test_input).rename(columns={"docno": "doc_id"})
print(bm25_results)

               qid           doc_id  rank      score                 name
0       q062210081  doc062200602177     1  14.266406  bm25-b=0.75-k_1=1.2
1       q062210081  doc062200206592     2  14.123982  bm25-b=0.75-k_1=1.2
2       q062210081  doc062210912628     3  14.037971  bm25-b=0.75-k_1=1.2
3       q062210081  doc062200201629     4  13.842112  bm25-b=0.75-k_1=1.2
4       q062210081  doc062200304990     5  13.688794  bm25-b=0.75-k_1=1.2
...            ...              ...   ...        ...                  ...
625518   q06229908  doc062207505063   996   4.631728  bm25-b=0.75-k_1=1.2
625519   q06229908  doc062200201995   997   4.628934  bm25-b=0.75-k_1=1.2
625520   q06229908  doc062200204993   998   4.627846  bm25-b=0.75-k_1=1.2
625521   q06229908  doc062202102915   999   4.627371  bm25-b=0.75-k_1=1.2
625522   q06229908  doc062200500656  1000   4.626296  bm25-b=0.75-k_1=1.2

[625523 rows x 5 columns]


In [6]:
import random
def randomScore(row):
    row["score"] = random.random() * 10
    return row

## prepare data for LTR

In [7]:
training_dataset = ir_datasets.load(training_dataset_path)
training_queries = pt.io.read_topics(ir_datasets.topics_file(training_dataset_path), format='trecxml')
training_qrels = pd.DataFrame(training_dataset.qrels_iter()).rename(columns={"query_id": "qid"})

validation_dataset = ir_datasets.load(validation_dataset_path)
validation_queries = pt.io.read_topics(ir_datasets.topics_file(validation_dataset_path), format='trecxml')
validation_qrels = pd.DataFrame(validation_dataset.qrels_iter()).rename(columns={"query_id": "qid"})

print(training_queries)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
            qid                       query
0     q06223196                 car shelter
1       q062228                     airport
2       q062287        antivirus comparison
3     q06223261              free antivirus
4       q062291            orange antivirus
..          ...                         ...
667  q062224914             tax garden shed
668  q062224961              land of france
669  q062225030   find my training pole job
670  q062225194                     gpl car
671  q062225197                cheapest ca

In [9]:
filtered_qrels = pd.merge(bm25_results, training_qrels, on=['qid', 'doc_id'])
print(filtered_qrels)
filtered_qrels = pd.merge(filtered_qrels, training_queries, on=['qid'])
print(filtered_qrels)

             qid           doc_id  rank      score                 name  \
0     q062210081  doc062200201629     4  13.842112  bm25-b=0.75-k_1=1.2   
1     q062210081  doc062200207334    15  13.405335  bm25-b=0.75-k_1=1.2   
2     q062210081  doc062200108490    22  13.163220  bm25-b=0.75-k_1=1.2   
3     q062210081  doc062200100242    32  12.994303  bm25-b=0.75-k_1=1.2   
4     q062210081  doc062200210944    43  12.814115  bm25-b=0.75-k_1=1.2   
...          ...              ...   ...        ...                  ...   
7720    q0622984  doc062200101503   246   7.926566  bm25-b=0.75-k_1=1.2   
7721    q0622984  doc062200203958   312   7.010942  bm25-b=0.75-k_1=1.2   
7722   q06229908  doc062200108116   522   5.032561  bm25-b=0.75-k_1=1.2   
7723   q06229908  doc062200105658   744   4.842786  bm25-b=0.75-k_1=1.2   
7724   q06229908  doc062200115252   799   4.799547  bm25-b=0.75-k_1=1.2   

      relevance iteration  
0             0         0  
1             0         0  
2             0

In [27]:
lm_results = filtered_qrels.apply(randomScore, axis=1)
lm_results = lm_results.sort_values(by=["score"],ascending=False)
lm_results['rank'] = lm_results['score'].rank(ascending=False).astype(int)

In [11]:
relevance_labels = filtered_qrels['relevance'].values
print(relevance_labels)

[0 0 0 ... 0 0 1]


In [12]:
topicMap = training_queries['query'].values

def mapTopic(row):
    row['topic'] = topicMap.tolist().index(row['query'])
    return row

In [13]:
merged_results = pd.merge(bm25_results, lm_results, on=['qid', 'doc_id'])
merged_results = merged_results.apply(mapTopic, axis=1)
print(merged_results)

             qid           doc_id  rank_x    score_x               name_x  \
0     q062210081  doc062200201629       4  13.842112  bm25-b=0.75-k_1=1.2   
1     q062210081  doc062200207334      15  13.405335  bm25-b=0.75-k_1=1.2   
2     q062210081  doc062200108490      22  13.163220  bm25-b=0.75-k_1=1.2   
3     q062210081  doc062200100242      32  12.994303  bm25-b=0.75-k_1=1.2   
4     q062210081  doc062200210944      43  12.814115  bm25-b=0.75-k_1=1.2   
...          ...              ...     ...        ...                  ...   
7720    q0622984  doc062200101503     246   7.926566  bm25-b=0.75-k_1=1.2   
7721    q0622984  doc062200203958     312   7.010942  bm25-b=0.75-k_1=1.2   
7722   q06229908  doc062200108116     522   5.032561  bm25-b=0.75-k_1=1.2   
7723   q06229908  doc062200105658     744   4.842786  bm25-b=0.75-k_1=1.2   
7724   q06229908  doc062200115252     799   4.799547  bm25-b=0.75-k_1=1.2   

      rank_y   score_y               name_y  relevance iteration  \
0      

In [14]:
features = merged_results[['score_x', 'score_y']].values
print(features)

[[13.84211188  2.28588558]
 [13.40533509  5.40960753]
 [13.16321991  3.60847565]
 ...
 [ 5.03256119  4.11886811]
 [ 4.84278614  4.33089842]
 [ 4.79954723  4.7633034 ]]


In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Train the LTR model
ltr_model = LinearRegression()
ltr_model.fit(features, relevance_labels)

In [103]:
from pyterrier.transformer import TransformerBase

class LTRTransformer(TransformerBase):

    def __init__(self, ltr_model, feature_extractor=None):
        # ltr_model is your trained LTR model, e.g., LinearRegression
        self.ltr_model = ltr_model
        # feature_extractor is a function/transformer that takes a dataframe with 'qid' and 'doc_id' and returns the features
        self.feature_extractor = feature_extractor

    def transform(self, qrels_df):
        return result_df

In [104]:
ltr_transformer = LTRTransformer(ltr_model)

In [105]:
results = pt.Experiment(
    [ltr_transformer],
    validation_queries,
    validation_qrels,
    eval_metrics=["map", "ndcg"],
    names=["My LTR Model"]
)

TypeError: LTRTransformer.transform() missing 1 required positional argument: 'other'

## Rank Fusion

In [108]:
def rank_fusion(qid, bm25_rank, lm_rank):
    # Simple average of ranks
    fusion_rank = (bm25_rank + lm_rank) / 2

    return pd.DataFrame({
        "qid": qid,
        "doc_id": bm25_results.loc[bm25_results["qid"] == qid, "doc_id"].values,
        "rank": fusion_rank,
        "score": 0.0,  # You can set a default score or leave it as 0
        "name": "RankFusion"
    })

# Apply rank fusion for each query
fusion_results = pd.concat([
    rank_fusion(qid, bm25_results.loc[bm25_results["qid"] == qid, "rank"].values, lm_results.loc[lm_results["qid"] == qid, "rank"].values)
    for qid in validation_queries["qid"]
])

In [109]:
print(fusion_results)

Empty DataFrame
Columns: [qid, doc_id, rank, score, name]
Index: []


In [None]:
# Use the fusion_results as the input for the PyTerrier Experiment
results = pt.Experiment(
    [fusion_results],
    validation_queries,
    validation_qrels,
    eval_metrics=["map", "ndcg"],
    names=["Rank Fusion Model"]
)

## Just combination

In [56]:
bm25_results.rename(columns={'doc_id': 'docno'})
lm_results.rename(columns={'doc_id': 'docno'})

Unnamed: 0,qid,docno,rank,score,name,relevance,iteration,query
4376,q062222431,doc062200201937,1,9.999980,bm25-b=0.75-k_1=1.2,0,0,car battery
6295,q06225704,doc062200201560,2,9.998375,bm25-b=0.75-k_1=1.2,0,0,recipe gateau without eggs
7477,q06229171,doc062200209815,3,9.995788,bm25-b=0.75-k_1=1.2,0,0,ugc bordeaux
3406,q062219091,doc062200111126,4,9.993866,bm25-b=0.75-k_1=1.2,0,0,purchase car
4336,q062222128,doc062200109857,5,9.993134,bm25-b=0.75-k_1=1.2,0,0,ethanol car
...,...,...,...,...,...,...,...,...
6343,q06225918,doc062200110415,7721,0.010069,bm25-b=0.75-k_1=1.2,0,0,starterre
896,q062212370,doc062200210239,7722,0.007209,bm25-b=0.75-k_1=1.2,0,0,hybrid car
4924,q062224315,doc062200108168,7723,0.002156,bm25-b=0.75-k_1=1.2,0,0,self consumption solar panels
874,q062212365,doc062200210942,7724,0.001323,bm25-b=0.75-k_1=1.2,0,0,electric car autonomy


In [57]:
bm25_results = bm25_results.set_index('docno')
lm_results = lm_results.set_index('docno')

KeyError: "None of ['docno'] are in the columns"

In [53]:
bm25 = pt.Transformer.from_df(bm25_results)
lm = pt.Transformer.from_df(lm_results)
featureB = pt.apply.doc_score(lambda row: 1)

pipeline = bm25 >> (lm ** featureB)
print(pipeline)

Compose(Transformer(), FUnion(Transformer(), pt.apply.doc_score()))


In [48]:
pt.Experiment([bm25, lm, pipeline], pd.DataFrame(training_queries), training_qrels, eval_metrics=['ndcg_cut_5'])

Unnamed: 0,name,ndcg_cut_5
0,Transformer(),0.14144
1,Transformer(),0.324172
2,"Compose(Transformer(), Transformer())",0.324172


In [44]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=400)

In [None]:
rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)
rf_pipe.fit(training_queries, training_qrels)