# Imports and PyTerrier Initialization

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from icecream import ic
from tqdm import tqdm, trange
import os
import random
import pyterrier as pt
from sklearn.model_selection import train_test_split
from typing import List, Dict, Tuple, Union
os.environ["JAVA_HOME"] = "/Users/rohanjha/miniforge3/envs/tot_java/lib/jvm"
pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


# Make 20GB Index of Wikipedia Passages in ./indexes (ALREADY RUN, TAKES HOURS)

In [2]:
# Load passages (dict of docid-pid -> passage)
# passages = pickle.load(open("/Users/rohanjha/Desktop/college/research/TipTongue/WIKIPEDIA/wikipedia_passages.pkl", 'rb'))
documents = pickle.load(open("/Users/rohanjha/Desktop/college/research/TipTongue/WIKIPEDIA/wikipedia_documents.pkl", 'rb'))

In [None]:
max(len(text) for text in documents.values())

In [None]:
max(len(docid) for docid in documents.keys())

In [None]:
def passages_dataset():
    for dpid, text in tqdm(passages.items()):
        docid, pid = dpid.split('-')
        result = {'docno': docid, 'pid' : pid, 'text': text}
        yield result

In [3]:
def documents_dataset():
    for docid, text in tqdm(documents.items()):
        result = {'docno': docid, 'text': text}
        yield result

In [7]:
# del passages
del documents

In [4]:
# Measured empirically
max_passage_length = 4204
max_document_length = 491966
max_docno_length = 8
max_pid_length = 4

In [5]:
index_path = "/Users/rohanjha/Desktop/college/research/TipTongue/document_index"
iter_indexer = pt.IterDictIndexer(
    index_path,
    blocks=True,
    verbose=True,
    overwrite=True,
    # meta={"docno": max_docno_length, "pid": max_pid_length, "text": max_text_length},
    meta={"docno": max_docno_length, "text": max_document_length},
    threads=16,
)

In [6]:
index_ref = iter_indexer.index(documents_dataset())

  warn('Using multiple threads results in a non-deterministic ordering of document in the index. For deterministic behavior, use threads=1')
  9%|▊         | 553058/6458670 [07:41<53:07, 1852.53it/s]  



 10%|█         | 653703/6458670 [08:44<1:02:25, 1550.05it/s]



 10%|█         | 654249/6458670 [08:44<51:12, 1888.88it/s]  



 10%|█         | 654452/6458670 [08:44<55:46, 1734.50it/s]



 10%|█         | 655141/6458670 [08:45<45:19, 2134.05it/s]  



 10%|█         | 655400/6458670 [08:45<55:10, 1752.72it/s]



 10%|█         | 663397/6458670 [08:50<1:03:03, 1531.64it/s]



 10%|█         | 663665/6458670 [08:50<59:26, 1624.73it/s]  



 10%|█         | 663923/6458670 [08:50<53:07, 1817.71it/s]



 10%|█         | 664143/6458670 [08:50<54:33, 1770.38it/s]



100%|██████████| 6458670/6458670 [58:20<00:00, 1845.23it/s]  


12:53:48.158 [ForkJoinPool-1-worker-7] WARN org.terrier.structures.indexing.Indexer - Indexed 9 empty documents
12:53:59.728 [ForkJoinPool-1-worker-11] WARN org.terrier.structures.indexing.Indexer - Indexed 8 empty documents
12:54:13.557 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 7 empty documents
12:54:14.997 [ForkJoinPool-1-worker-8] WARN org.terrier.structures.indexing.Indexer - Indexed 7 empty documents
12:54:26.847 [ForkJoinPool-1-worker-5] WARN org.terrier.structures.indexing.Indexer - Indexed 6 empty documents
12:54:33.188 [ForkJoinPool-1-worker-4] WARN org.terrier.structures.indexing.Indexer - Indexed 8 empty documents
12:54:33.403 [ForkJoinPool-1-worker-6] WARN org.terrier.structures.indexing.Indexer - Indexed 6 empty documents
12:54:34.442 [ForkJoinPool-1-worker-14] WARN org.terrier.structures.indexing.Indexer - Indexed 7 empty documents
12:54:40.256 [ForkJoinPool-1-worker-13] WARN org.terrier.structures.indexing.Indexer - Indexed 8 empty

# Get queries and qrels for each domain/split

In [9]:
# processes and cleans queries from Dict[qid, query] to the required dataframe format for pyterrier
# removes empty queries and queries without qrels
# returns queries_df, qrels_df
def process_queries_and_qrels(queries, qrels):
    qrel_qids = set(qrels.keys())
    # queries
    cleaned_queries = {
        qid : "".join([x if x.isalnum() else " " for x in query])
        for qid, query in queries.items()
        if query != "" and qid in qrel_qids
    }
    
    print(f"Removed {len(queries) - len(cleaned_queries)} empty/no-qrel queries from {len(cleaned_queries)} total queries")
    queries_df = pd.DataFrame(list(cleaned_queries.items()), columns=['qid', 'query'])

    # qrels
    qrels_df = pd.DataFrame([(qid, docno, 1) for qid, docno in qrels.items()], columns=['qid', 'docno', 'label'])
    return queries_df, qrels_df

domains = ['book', 'movie', 'music', 'game']
all_domain_queries, all_domain_qrels = {}, {}

for domain in domains:
    queries = pickle.load(open(f"../DATA/{domain}_queries.pkl", 'rb'))
    qrels = pickle.load(open(f"../DATA/{domain}_qrels.pkl", 'rb'))
    all_domain_queries[domain], all_domain_qrels[domain] = process_queries_and_qrels(queries, qrels)

Removed 10664 empty/no-qrel queries from 9898 total queries
Removed 29148 empty/no-qrel queries from 54958 total queries
Removed 47866 empty/no-qrel queries from 34250 total queries
Removed 6320 empty/no-qrel queries from 6899 total queries


In [11]:
ic(all_domain_queries['book'].index)


ic| all_domain_queries['book'].index: RangeIndex

(start=0, stop=9898, step=1)


RangeIndex(start=0, stop=9898, step=1)

In [12]:
all_domain_queries['book'].iloc[train_idxs]

NameError: name 'train_idxs' is not defined

In [None]:
all_domain_queries['book'].iloc[test_idxs]


# Create Experiment

In [13]:
# set up index and bm25 retriever
index_path = "/Users/rohanjha/Desktop/college/research/TipTongue/document_index/data.properties"
index = pt.IndexFactory.of(index_path, memory=True) #, memory=['lexicon']) #, memory=True)

bm25 = pt.BatchRetrieve(
    index, 
    wmodel="BM25", 
    verbose=True,
)

In [None]:

    # controls={"bm25.b" : 0.75, "bm25.k_1" : 0.75})

# train_topics = all_domain_queries['book'].iloc[:10]
# train_qrels = all_domain_qrels['book']
# pt.GridSearch(
#     bm25,
#     {
#         bm25: {
#             "bm25.b" : [0.3, 0.9], # [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1 ],
#             "bm25.k_1": [0.6, 1.2] # [0.3, 0.6, 0.9, 1.2, 1.4, 1.6, 2],
#         }
#     },
#     train_topics,
#     train_qrels,
#     metric="map",
#     verbose=True,
# )

- No Parallel: 1m18s for 20 queries, 6m49s for 100 queries
- Parallel(2): 1m6s for 20 queries
- parallel(4): 58s for 20 queries , 2m25s for 100 queries, [4m11s, 4m13s] for 200 queries with/without index(memory=True), 3m20s for 200 queries with index.meta.data-source=file instead of fileinmem (no OOM error just slow warning)
- Parallel(8): 1m40s for 20 queries, 3m2s for 100 queries, 4m11s for 200 queries | with heavy lag/legit OOM errors ending @ around 2min
- Parallel(16) 1m35s for 100 queries, [2m27s with memory=inverted, lexicon], 2m43s with memory=True for 200 queries, 12m7s for 1000 queries | (no oom w/ =file)
- Parallel(32) 2m58s for 200 queries (no oom w/ =file BUT 2 rounds of OOM errors - maybe because needs 2 rounds of saturating threads)

In [None]:
for domain in domains:
    train_idxs, val_test_idxs = train_test_split(all_domain_queries[domain].index, train_size=0.9, test_size=0.1, random_state=0)
    test_idxs, val_idxs = train_test_split(val_test_idxs, train_size=0.5, test_size=0.5, random_state=0)
    train_qids = all_domain_queries[domain].iloc[train_idxs]["qid"]
    test_qids = all_domain_queries[domain].iloc[test_idxs]["qid"]
    val_qids = all_domain_queries[domain].iloc[val_idxs]["qid"]

    # save train and test qids
    train_qids.to_csv(f"../BM25_DOCUMENT/{domain}_train_qids.csv", index=False, header=False)
    test_qids.to_csv(f"../BM25_DOCUMENT/{domain}_test_qids.csv", index=False, header=False)
    val_qids.to_csv(f"../BM25_DOCUMENT/{domain}_val_qids.csv", index=False, header=False)
    print(f"Saved train, val, and test qids for {domain} to ../BM25_DOCUMENT/")
    val_queries = all_domain_queries[domain].iloc[val_idxs]
    val_qrels = all_domain_qrels[domain]


    # run and save the results
    result = pt.Experiment(
        retr_systems=[bm25.parallel(16)], 
        topics=val_queries, 
        qrels=val_qrels, 
        eval_metrics=["map", "ndcg", "recall"], 
        names=[f"BM25_{domain}_val"],
        save_dir=f"../BM25_DOCUMENT/",
        filter_by_qrels=True, # many queries in topics dont have qrels
        # filter_by_queries=True, # many qrels dont have queries
        # verbose=True,
        # batch_size=\\
    )
    print(f"Saved results for {domain} to ../BM25_DOCUMENT/")
    print(result)

# Random tests stuff

In [None]:
val_book_qrels = pickle.load(open("../DATA/val_book_qrels.pkl", 'rb'))
val_book_queries_documents = pickle.load(open("../DATA/val_book_queries_documents.pkl", 'rb'))

In [None]:
val_book_qrels

In [None]:
val_book_queries_documents[0]

In [None]:
book_queries = pickle.load(open("../DATA/book_queries.pkl", 'rb'))

In [14]:
# New Pipeline using Luis's data

domains = ['book', 'movie', 'music', 'game']
all_domain_queries, all_domain_qrels = {}, {}
for domain in domains:
    qrels = {}
    queries = {}
    for split in ['train', 'val', 'test']:
        qrels_dict = pickle.load(open(f"../DATA/{split}_{domain}_qrels.pkl", 'rb'))
        queries_documents = pickle.load(open(f"../DATA/{split}_{domain}_queries_documents.pkl", 'rb'))
        queries_dict = {
            qid : "".join([x if x.isalnum() else " " for x in query]) 
            for qid, (query, _) in zip(qrels_dict.keys(), queries_documents)
        }
        qrels[split] = pd.DataFrame([(qid, docno, 1) for qid, docno in qrels_dict.items()], columns=['qid', 'docno', 'label'])
        queries[split] = pd.DataFrame(list(queries_dict.items()), columns=['qid', 'query'])

    all_domain_queries[domain] = queries
    all_domain_qrels[domain] = qrels



In [15]:
for domain in domains:
    for split in ['train', 'val', 'test']:
        assert set(all_domain_queries[domain][split].qid) == set(all_domain_qrels[domain][split].qid)


In [17]:
# set up index and bm25 retriever
index_path = "/Users/rohanjha/Desktop/college/research/TipTongue/document_index/data.properties"
index = pt.IndexFactory.of(index_path) #, memory=['lexicon']) #, memory=True)

bm25 = pt.BatchRetrieve(
    index, 
    wmodel="BM25", 
    # verbose=True,
    # controls = {"bm25.b" : 0.75, "bm25.k_1" : 1.2}
)

# Run Experiments with gridsearched params
(Gridsearch code is below this experiment)

In [18]:
gridsearched_bm25_params = {
    'book' : {"bm25.b" : 0.45, "bm25.k_1" : 0.3},
    'movie' : {"bm25.b" : 0.6, "bm25.k_1" : 1.6},
    'music' : {"bm25.b" : 0, "bm25.k_1" : 2},
    'game' : {"bm25.b" : 0.3, "bm25.k_1" : 0.6},
}

In [19]:
mode = "default"
# mode = "gridsearched"

results = {domain : {} for domain in domains}

for domain in domains:
    # load gridsearched optimal params / default params
    if mode == "gridsearched":
        controls = gridsearched_bm25_params[domain]
    elif mode == "default":
        controls = {"bm25.b" : 0.75, "bm25.k_1" : 1.2}
    else:
        raise ValueError("Invalid mode")
    
    bm25 = pt.BatchRetrieve(
        index, 
        wmodel="BM25", 
        controls=controls,
        verbose=True,
    )

    for split in ["val", "test"]:
        # data
        queries = all_domain_queries[domain][split]
        qrels = all_domain_qrels[domain][split]
        print(len(queries), len(qrels))
        # experiment
        result = pt.Experiment(
            retr_systems=[bm25.parallel(16)], 
            topics=queries, 
            qrels=qrels, 
            eval_metrics=["map", "recip_rank", "ndcg", *[f"recall_{k}" for k in [1, 5, 10, 50, 100, 250, 500, 1000]]], 
            names=[f"BM25_{domain}_{split}_{mode}_params"],
            save_dir=f"../BM25_DOCUMENT/",
        )
        print(f"Saved results for {domain} {split} {mode} to ../BM25_DOCUMENT/")
        print(result)
        results[domain][split] = result

100 100


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap confi

22:25:12.725 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:25:13.056 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:25:13.055 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:25:13.134 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:25:13.177 [main] WARN org.terrier.str

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrie

22:33:28.452 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:33:29.414 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:33:29.447 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:33:29.478 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:33:29.727 [main] WARN org.terrier.str

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")


22:37:46.220 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:37:46.225 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:37:46.232 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:37:46.222 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:37:46.227 [main] WARN org.22:37:46.23

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrie

22:41:57.152 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:41:57.763 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:41:57.783 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:41:57.832 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:41:58.025 [main] WARN org.terrier.str

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")


22:44:57.421 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:44:57.776 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:44:57.863 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:44:57.905 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:44:57.927 [main] WARN org.terrier.str

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")


22:47:48.944 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:47:49.400 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:47:49.411 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:47:49.486 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:47:49.580 [main] WARN org.terrier.str

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")
  warn("Avoiding reinit of PyTerrier")


22:50:43.474 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:50:43.474 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:50:43.501 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:50:43.611 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
22:50:43.649 [main] WARN org.terrier.str

In [20]:
# Save results to csv
all_results = pd.concat([results[domain][split] for domain in domains for split in ["val", "test"]])
all_results["domain"] = all_results["name"].apply(lambda x: x.split("_")[1])
all_results["split"] = all_results["name"].apply(lambda x: x.split("_")[2])
all_results["name"] = all_results["name"].apply(lambda x: x.split("_")[0] + "-document")
# add columns for b and k_1 from bm25
if mode == "gridsearched":
    all_results["bm25.b"] = all_results["domain"].apply(lambda domain: gridsearched_bm25_params[domain]["bm25.b"])
    all_results["bm25.k_1"] = all_results["domain"].apply(lambda domain: gridsearched_bm25_params[domain]["bm25.k_1"])
elif mode == "default":
    all_results["bm25.b"] = 0.75
    all_results["bm25.k_1"] = 1.2
else:
    raise ValueError("Invalid mode")
# move domain, split, b, k_1 to be second through fifth columns
cols = all_results.columns.tolist()
cols = cols[:1] + cols[-4:] + cols[1:-4]
all_results = all_results[cols]
all_results.head()
all_results.to_csv(f"../BM25_DOCUMENT/bm25_document_{mode}_results.csv", index=False)


In [22]:
all_results

Unnamed: 0,name,domain,split,bm25.b,bm25.k_1,map,recip_rank,ndcg,recall_1,recall_5,recall_10,recall_50,recall_100,recall_250,recall_500,recall_1000
0,BM25-passage,book,val,0.75,1.2,0.134365,0.134365,0.1893,0.1,0.16,0.2,0.26,0.29,0.36,0.41,0.49
0,BM25-passage,book,test,0.75,1.2,0.112696,0.112696,0.178842,0.07,0.15,0.19,0.33,0.36,0.41,0.43,0.51
0,BM25-passage,movie,val,0.75,1.2,0.046662,0.046662,0.105756,0.02,0.08,0.09,0.18,0.25,0.29,0.34,0.44
0,BM25-passage,movie,test,0.75,1.2,0.039897,0.039897,0.088859,0.01,0.06,0.09,0.16,0.2,0.23,0.29,0.36
0,BM25-passage,music,val,0.75,1.2,0.012846,0.012846,0.023878,0.01,0.02,0.02,0.02,0.03,0.03,0.06,0.1
0,BM25-passage,music,test,0.75,1.2,0.013485,0.013485,0.02703,0.01,0.02,0.03,0.04,0.04,0.07,0.07,0.11
0,BM25-passage,game,val,0.75,1.2,0.14494,0.14494,0.215829,0.1,0.18,0.23,0.29,0.38,0.47,0.57,0.6
0,BM25-passage,game,test,0.75,1.2,0.152924,0.152924,0.231247,0.11,0.19,0.26,0.39,0.46,0.52,0.57,0.62


# Run Gridsearch and experiments

In [24]:
# Run Experiments
mode = "gridsearched"
NUM_SAMPLES = 50
BM25_B_RANGE = [0, 0.15, 0.3, 0.45, 0.6, 0.75 , 0.9, 1]
BM25_K1_RANGE = [0.3, 0.6, 0.9, 1.2, 1.4, 1.6, 2]

results = {domain : {} for domain in domains}

bm25 = pt.BatchRetrieve(
    index, 
    wmodel="BM25", 
    verbose=True,
    controls = {"bm25.b" : 0.75, "bm25.k_1" : 1.2},
)

for domain in domains:

    # take random queries from train for gridsearch
    train_queries = all_domain_queries[domain]['train'].sample(NUM_SAMPLES, random_state=0)
    train_qrels = all_domain_qrels[domain]['train'].iloc[train_queries.index]
    print(len(train_queries), len(train_qrels))

    # run gridsearch over b and k1 parameters
    print(f"Running gridsearch over b and k1 parameters for {domain=}")
    bm25_opt = pt.GridSearch(
        bm25,
        {
            bm25: {
                "bm25.b" : BM25_B_RANGE,
                "bm25.k_1": BM25_K1_RANGE,
            }
        },
        train_queries,
        train_qrels,
        metric="map",
        jobs=4,
        verbose=True,
    )

    for split in ["val", "test"]:
        # data
        queries = all_domain_queries[domain][split]
        qrels = all_domain_qrels[domain][split]
        print(len(queries), len(qrels))
        # experiment
        result = pt.Experiment(
            retr_systems=[bm25_opt.parallel(16)], 
            topics=queries, 
            qrels=qrels, 
            eval_metrics=["map", "ndcg", "recall"], 
            names=[f"BM25_{domain}_{split}_{mode}_params"],
            save_dir=f"../BM25_DOCUMENT/",
        )
        results[domain][split] = result
        print(f"Saved results for {domain} {split} to ../BM25_DOCUMENT/")
        print(result)

50 50
Running gridsearch over b and k1 parameters for domain='book'


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


23:30:23.897 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
23:30:23.901 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
23:30:23.923 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.
23:30:23.928 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 8.7 GiB of memory would be required.


In [None]:
# Save results to csv
all_results = pd.concat([results[domain][split] for domain in domains for split in ["val", "test"]])
all_results["domain"] = all_results["name"].apply(lambda x: x.split("_")[1])
all_results["split"] = all_results["name"].apply(lambda x: x.split("_")[2])
all_results["name"] = all_results["name"].apply(lambda x: x.split("_")[0] + "-document")
# add columns for b and k_1 from bm25
if mode == "gridsearched":
    all_results["bm25.b"] = all_results["domain"].apply(lambda domain: gridsearched_bm25_params[domain]["bm25.b"])
    all_results["bm25.k_1"] = all_results["domain"].apply(lambda domain: gridsearched_bm25_params[domain]["bm25.k_1"])
elif mode == "default":
    all_results["bm25.b"] = 0.75
    all_results["bm25.k_1"] = 1.2
else:
    raise ValueError("Invalid mode")
# move domain, split, b, k_1 to be second through fifth columns
cols = all_results.columns.tolist()
cols = cols[:1] + cols[-4:] + cols[1:-4]
all_results = all_results[cols]
all_results.head()
all_results.to_csv(f"../BM25_DOCUMENT/bm25_document_{mode}_results.csv", index=False)