In [94]:
from tira.third_party_integrations import ir_datasets, ensure_pyterrier_is_loaded, persist_and_normalize_run
import pyterrier as pt
import pandas as pd

# TIRA helper called once before any `pt.*` usage below (presumably boots PyTerrier — confirm in the TIRA docs).
ensure_pyterrier_is_loaded()

# TIRA dataset identifiers; they are passed to ir_datasets.load() and ir_datasets.topics_file() in the LTR section.
training_dataset_path = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
validation_dataset_path = 'ir-lab-jena-leipzig-wise-2023/validation-20231104-training'

### Import previously computed BM25 results

In [95]:
import os
from glob import glob

In [96]:
# Directory holding the BM25 grid-search run files produced in milestone 3.
basePath = "../../milestone3/grid-search/training"
# NOTE: glob() is called without recursive=True, so `**` behaves like `*` and this
# matches *.txt files exactly one directory level below basePath.
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so that
# `input_files[0]` selects the same run file on every machine and every re-run.
input_files = sorted(glob(os.path.join(basePath, '**/*.txt')))
if not input_files:
    # Fail with an explicit message instead of a bare IndexError on the next line.
    raise FileNotFoundError(f"No run files (*.txt) found one level below {basePath!r}")

# Single run file used for the experiments in the rest of the notebook.
test_input = input_files[0]

In [136]:
# Parse the selected run file into a DataFrame; the output below shows the
# columns qid, docno, rank, score and name.
results = pt.io.read_results(test_input)
print(results)

       qid            docno  rank      score                 name
0    q0001  doc062200806649     1  17.609471  bm25-b=0.75-k_1=1.2
1    q0001  doc062200708788     2  17.589946  bm25-b=0.75-k_1=1.2
2    q0001  doc062200210793     3  17.343684  bm25-b=0.75-k_1=1.2
3    q0001  doc062201000378     4  17.239007  bm25-b=0.75-k_1=1.2
4    q0001  doc062201105278     5  17.053650  bm25-b=0.75-k_1=1.2
..     ...              ...   ...        ...                  ...
995  q0001  doc062200309291   996   5.945129  bm25-b=0.75-k_1=1.2
996  q0001  doc062214005339   997   5.944639  bm25-b=0.75-k_1=1.2
997  q0001  doc062200208773   998   5.944639  bm25-b=0.75-k_1=1.2
998  q0001  doc062208300136   999   5.939730  bm25-b=0.75-k_1=1.2
999  q0001  doc062200111997  1000   5.939730  bm25-b=0.75-k_1=1.2

[1000 rows x 5 columns]


### Create fake data from a second ranker for testing (to be replaced by LM results later)

In [98]:
import random

In [100]:
def randomScore(row):
    """Overwrite the row's ``score`` entry with a random float between 0 and 10.

    Args:
        row: Any mapping-like object supporting ``row["score"] = value``
            (e.g. a pandas Series handed over by ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, modified in place.

    Draws from the global ``random`` module state, so values differ between
    runs unless ``random.seed`` was called beforehand.
    """
    fake_score = 10 * random.random()
    row["score"] = fake_score
    return row

In [128]:
# Fake second ranking: replace every score with random noise, order the rows by
# the new score, then derive an integer rank from it. The rank is computed over
# the whole frame, not per qid.
results_2 = (
    results
    .apply(randomScore, axis=1)
    .sort_values(by=["score"], ascending=False)
)
results_2['rank'] = results_2['score'].rank(ascending=False).astype(int)

In [132]:
# For every judged document in `qrels`, look up its rank in `results` (matched on
# docno) and wrap the value in a one-element list.
# NOTE(review): `qrels` is not defined in any cell visible above — this cell only
# works with leftover kernel state; confirm which frame is meant (training_qrels?).
# NOTE(review): `test_df` is not read by any later visible cell.
test_df = qrels['doc_id'].map(results.set_index('docno')['rank']).apply(lambda x: [x])
print(qrels['doc_id'])

0       doc062200112743
1       doc062200205250
2       doc062200101983
3       doc062200204465
4       doc062200115614
             ...       
9651    doc062200205276
9652    doc062200107121
9653    doc062200204419
9654    doc062200103774
9655    doc062200110087
Name: doc_id, Length: 9656, dtype: object


## Prepare data for LTR

In [112]:
# add rank of results 2 as feature
results['features'] = results['docno'].map(results_2.set_index('docno')['rank']).apply(lambda x: [x])
print(results)

       qid            docno  rank      score                 name features
0    q0001  doc062200806649     1  17.609471  bm25-b=0.75-k_1=1.2    [592]
1    q0001  doc062200708788     2  17.589946  bm25-b=0.75-k_1=1.2    [127]
2    q0001  doc062200210793     3  17.343684  bm25-b=0.75-k_1=1.2    [996]
3    q0001  doc062201000378     4  17.239007  bm25-b=0.75-k_1=1.2    [836]
4    q0001  doc062201105278     5  17.053650  bm25-b=0.75-k_1=1.2     [84]
..     ...              ...   ...        ...                  ...      ...
995  q0001  doc062200309291   996   5.945129  bm25-b=0.75-k_1=1.2     [71]
996  q0001  doc062214005339   997   5.944639  bm25-b=0.75-k_1=1.2    [686]
997  q0001  doc062200208773   998   5.944639  bm25-b=0.75-k_1=1.2    [872]
998  q0001  doc062208300136   999   5.939730  bm25-b=0.75-k_1=1.2    [309]
999  q0001  doc062200111997  1000   5.939730  bm25-b=0.75-k_1=1.2    [374]

[1000 rows x 6 columns]


In [None]:
# Wrap the precomputed `results` frame in a PyTerrier transformer so it can be
# used as a pipeline stage. NOTE(review): this cell has no execution count
# (In [None]), i.e. it was not run in the saved notebook.
pipeline = pt.Transformer.from_df(results)
print(pipeline)

## LTR

In [138]:
def _load_split(dataset_path):
    """Load one TIRA dataset split.

    Args:
        dataset_path: TIRA dataset identifier understood by ``ir_datasets.load``
            and ``ir_datasets.topics_file``.

    Returns:
        Tuple ``(dataset, queries, qrels)``: the loaded dataset object, the
        topics read from the split's TREC-XML topics file, and the relevance
        judgements as a DataFrame whose ``query_id`` column is renamed to ``qid``.
    """
    dataset = ir_datasets.load(dataset_path)
    topics = pt.io.read_topics(ir_datasets.topics_file(dataset_path), format='trecxml')
    judgements = pd.DataFrame(dataset.qrels_iter()).rename(columns={"query_id": "qid"})
    return dataset, topics, judgements


# Same loading steps for both splits; training is loaded first, then validation.
training_dataset, training_queries, training_qrels = _load_split(training_dataset_path)

validation_dataset, validation_queries, validation_qrels = _load_split(validation_dataset_path)

Load ir_dataset "ir-lab-jena-leipzig-wise-2023/training-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
Load ir_dataset "ir-lab-jena-leipzig-wise-2023/validation-20231104-training" from tira.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.
No settings given in /root/.tira/.tira-settings.json. I will use defaults.


In [143]:
# Dump `results` to output.txt without index or header row.
# NOTE(review): sep='\n' makes the *field* separator a newline, so every field of
# every row ends up on its own line — confirm this is intended and not meant to be
# a space- or tab-separated run file.
results.to_csv('output.txt', index=False, header=False, sep='\n')

In [92]:
# Inspect the validation topics; the output shows the two columns qid and query.
print(validation_queries)

            qid               query
0       q072224      purchase money
1       q072226   purchase used car
2       q072232     buy gold silver
3       q072240          adenovirus
4       q072242      water softener
..          ...                 ...
877  q072229958       video twitter
878  q072230046            used car
879  q072230066   used electric car
880  q072230072      collector cars
881  q072230074       electric cars

[882 rows x 2 columns]


In [93]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
#rf = RandomForestRegressor(n_estimators=400)
#rf_pipe = pipeline >> pt.ltr.apply_learned_model(rf)

#X = pd.DataFrame(results["features"] + results["qid"])
##rf_pipe.fit(queries, qrels)

In [117]:
# Rename the `docno` column of `qrels` to `doc_id`, mutating the frame in place
# (any other name bound to the same DataFrame sees the renamed column too).
# NOTE(review): `qrels` is not created in any visible cell — this depends on
# leftover kernel state and will fail on Restart & Run All.
qrels.rename(columns={'docno': 'doc_id'}, inplace=True)

In [126]:
# Inspect the current state of `results`.
# NOTE(review): the recorded output shows a `doc_id` column, but no visible cell
# renames `docno` in `results` — the output looks stale / produced by hidden state.
print(results)

       qid           doc_id  rank      score                 name features
0    q0001  doc062200806649     1  17.609471  bm25-b=0.75-k_1=1.2    [592]
1    q0001  doc062200708788     2  17.589946  bm25-b=0.75-k_1=1.2    [127]
2    q0001  doc062200210793     3  17.343684  bm25-b=0.75-k_1=1.2    [996]
3    q0001  doc062201000378     4  17.239007  bm25-b=0.75-k_1=1.2    [836]
4    q0001  doc062201105278     5  17.053650  bm25-b=0.75-k_1=1.2     [84]
..     ...              ...   ...        ...                  ...      ...
995  q0001  doc062200309291   996   5.945129  bm25-b=0.75-k_1=1.2     [71]
996  q0001  doc062214005339   997   5.944639  bm25-b=0.75-k_1=1.2    [686]
997  q0001  doc062200208773   998   5.944639  bm25-b=0.75-k_1=1.2    [872]
998  q0001  doc062208300136   999   5.939730  bm25-b=0.75-k_1=1.2    [309]
999  q0001  doc062200111997  1000   5.939730  bm25-b=0.75-k_1=1.2    [374]

[1000 rows x 6 columns]


In [173]:
import numpy as np
from sklearn.model_selection import train_test_split
# Synthetic stand-in features: one random row per relevance judgement, each
# tagged with a query id drawn at random from `queries`.
np.random.seed(42)  # fixed seed so the fake features are reproducible
num_samples = len(qrels)
# NOTE: num_features is not read below; exactly two feature columns are generated.
num_features = 5  # Adjust this based on your actual features


# Keyword arguments are evaluated left to right, so the RNG is consumed in the
# order qid -> feature1 -> feature2.
your_feature_dataframe = pd.DataFrame(dict(
    qid=np.random.choice(queries['qid'], size=num_samples),
    feature1=np.random.rand(num_samples),
    feature2=np.random.rand(num_samples),
))

In [165]:
# Attach the synthetic features to the relevance judgements via the query id.
# `qid` repeats on both sides, so this is a many-to-many join: every judgement
# is paired with every feature row of its query (see the row count in the output).
df = qrels.merge(your_feature_dataframe, on=['qid'])
print(df)

               qid           doc_id  relevance iteration  feature1  feature2
0        q06223196  doc062200112743          0         0  0.453262  0.104804
1        q06223196  doc062200112743          0         0  0.105514  0.303605
2        q06223196  doc062200112743          0         0  0.693210  0.573911
3        q06223196  doc062200112743          0         0  0.270058  0.035503
4        q06223196  doc062200112743          0         0  0.932846  0.274745
...            ...              ...        ...       ...       ...       ...
138696  q062225197  doc062200110087          0         0  0.104939  0.328535
138697  q062225197  doc062200110087          0         0  0.749290  0.992538
138698  q062225197  doc062200110087          0         0  0.495472  0.428395
138699  q062225197  doc062200110087          0         0  0.976587  0.859999
138700  q062225197  doc062200110087          0         0  0.488210  0.890811

[138701 rows x 6 columns]


In [178]:
# Define features and labels
features = df.drop(['qid', 'doc_id', 'relevance'], axis=1)
labels = df['relevance']
X_train, X_val, Y_train, Y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

In [180]:
# Random forests are stochastic (bootstrap samples, feature sub-sampling); fix the
# seed so the fitted model is reproducible, in line with the seed 42 used for the
# synthetic features and the train/validation split.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, Y_train)

In [181]:
# Predict relevance for the held-out validation rows.
# NOTE(review): no visible cell evaluates these predictions yet.
val_predictions = rf_model.predict(X_val)