In [1]:
! pip install ir-datasets
! pip install rank-bm25
! pip install nltk
! pip install pandas
! pip install numpy





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Install dependencies if needed (uncomment if running in a notebook environment):
# !pip install ir-datasets
# !pip install rank-bm25
# !pip install nltk
# !pip install pandas
# !pip install numpy

import ir_datasets
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from collections import defaultdict
import nltk

nltk.download('punkt', quiet=True)

# -----------------------------------------------------
# Load datasets
# -----------------------------------------------------
def load_datasets():
    datasets = {
        "ru": ir_datasets.load("neuclir/1/ru/hc4-filtered"),
        "zh": ir_datasets.load("neuclir/1/zh/hc4-filtered"),
        "fa": ir_datasets.load("neuclir/1/fa/hc4-filtered")
    }
    return datasets

datasets = load_datasets()
dataset = datasets["ru"]

# -----------------------------------------------------
# Prepare documents and queries
# -----------------------------------------------------
docs_iter = list(dataset.docs_iter())
queries_iter = list(dataset.queries_iter())
qrels_iter = list(dataset.qrels_iter())

documents = pd.DataFrame(docs_iter)
queries = pd.DataFrame(queries_iter)
qrels_df = pd.DataFrame(qrels_iter)

print("Documents columns:", documents.columns)
print("Queries columns:", queries.columns)
print("Qrels columns:", qrels_df.columns)

# The code expects documents to have 'doc_id' and 'text'.
if 'doc_id' not in documents.columns:
    # Try to guess column name
    # If there's a different column name for IDs, rename it.
    # Example: if the dataset provides 'id' instead of 'doc_id'
    if 'id' in documents.columns:
        documents.rename(columns={'id':'doc_id'}, inplace=True)
    else:
        raise ValueError("Documents DataFrame missing 'doc_id' column and couldn't infer an alternative.")

if 'text' not in documents.columns:
    # Try some common alternatives. For this dataset, it should be 'text'.
    # If something else is present, rename accordingly.
    # Example: if 'content' is available: documents.rename(columns={'content':'text'}, inplace=True)
    # Just raise an error if not found.
    raise ValueError("Documents DataFrame missing 'text' column. Please rename the appropriate column to 'text'.")

if 'query_id' not in queries.columns:
    # Try to guess column name
    # If there's 'id' column, rename it to 'query_id'.
    if 'id' in queries.columns:
        queries.rename(columns={'id':'query_id'}, inplace=True)
    else:
        raise ValueError("Queries DataFrame missing 'query_id' column and couldn't infer an alternative.")

if 'text' not in queries.columns:
    # Use the 'title' column as the text if available
    if 'title' in queries.columns:
        queries.rename(columns={"title": "text"}, inplace=True)
    else:
        # If 'title' not available, try 'description'
        if 'description' in queries.columns:
            queries.rename(columns={"description": "text"}, inplace=True)
        else:
            raise ValueError("Queries DataFrame missing a suitable text column. No 'text', 'title', or 'description' found.")

if 'query_id' not in qrels_df.columns or 'doc_id' not in qrels_df.columns or 'relevance' not in qrels_df.columns:
    raise ValueError("Qrels DataFrame missing required columns (query_id, doc_id, relevance).")

print("Final Documents columns:", documents.columns)
print("Final Queries columns:", queries.columns)
print("Final Qrels columns:", qrels_df.columns)

# -----------------------------------------------------
# Build dictionaries and tokenize
# -----------------------------------------------------
doc_texts = {row['doc_id']: row['text'] for _, row in documents.iterrows()}
tokenized_docs = {d_id: word_tokenize(text.lower()) for d_id, text in doc_texts.items()}
doc_ids = list(tokenized_docs.keys())
doc_tokens_list = [tokenized_docs[d_id] for d_id in doc_ids]

def build_bm25(k1, b):
    return BM25Okapi(doc_tokens_list, k1=k1, b=b)

def compute_metrics(bm25, top_k=10):
    relevance_data = defaultdict(dict)
    for _, row in qrels_df.iterrows():
        qid, did, rel = row['query_id'], row['doc_id'], row['relevance']
        relevance_data[qid][did] = rel

    APs = []
    RR = []
    nDCG_vals = []

    for _, q_row in queries.iterrows():
        qid = q_row['query_id']
        q_text = q_row['text']
        if qid not in relevance_data:
            # No relevance judgments => skip
            continue

        q_tokens = word_tokenize(q_text.lower())
        scores = bm25.get_scores(q_tokens)
        ranked_indices = np.argsort(scores)[::-1]
        top_indices = ranked_indices[:top_k]
        retrieved_doc_ids = [doc_ids[i] for i in top_indices]

        rel_docs = relevance_data[qid]
        relevant_doc_ids = {d for d, r in rel_docs.items() if r > 0}
        if len(relevant_doc_ids) == 0:
            # no relevant docs for this query
            continue

        # AP
        num_relevant_found = 0
        precision_sum = 0.0
        for rank, d_id in enumerate(retrieved_doc_ids, start=1):
            if d_id in relevant_doc_ids:
                num_relevant_found += 1
                precision_sum += num_relevant_found / rank
        AP = precision_sum / len(relevant_doc_ids)
        APs.append(AP)

        # RR
        rr_val = 0.0
        for rank, d_id in enumerate(retrieved_doc_ids, start=1):
            if d_id in relevant_doc_ids:
                rr_val = 1.0 / rank
                break
        RR.append(rr_val)

        # nDCG@10
        def dcg(rels):
            return sum((2**r - 1)/np.log2(idx+2) for idx, r in enumerate(rels))

        retrieved_rels = [rel_docs.get(d_id,0) for d_id in retrieved_doc_ids]
        ideal_rels = sorted(rel_docs.values(), reverse=True)[:top_k]

        DCG = dcg(retrieved_rels)
        IDCG = dcg(ideal_rels) if ideal_rels else 0
        nDCG = (DCG / IDCG) if IDCG > 0 else 0.0
        nDCG_vals.append(nDCG)

    mean_map = np.mean(APs) if APs else 0.0
    mean_mrr = np.mean(RR) if RR else 0.0
    mean_ndcg = np.mean(nDCG_vals) if nDCG_vals else 0.0

    return mean_map, mean_mrr, mean_ndcg

# Parameter Tuning
k1 = 1.2
b = 0.75
iterations = 5
learning_rate = 0.1

for it in range(iterations):
    bm25_model = build_bm25(k1, b)
    base_map, base_mrr, base_ndcg = compute_metrics(bm25_model)

    # Try k1 + lr
    bm25_test = build_bm25(k1 + learning_rate, b)
    test_map1, _, _ = compute_metrics(bm25_test)

    # Try k1 - lr
    bm25_test = build_bm25(k1 - learning_rate, b)
    test_map2, _, _ = compute_metrics(bm25_test)

    # Update k1
    if test_map1 > base_map and test_map1 > test_map2:
        k1 = k1 + learning_rate
        base_map = test_map1
    elif test_map2 > base_map:
        k1 = k1 - learning_rate
        base_map = test_map2

    # Try b + lr
    bm25_test = build_bm25(k1, b + learning_rate)
    test_map3, _, _ = compute_metrics(bm25_test)

    # Try b - lr
    bm25_test = build_bm25(k1, b - learning_rate)
    test_map4, _, _ = compute_metrics(bm25_test)

    # Update b
    if test_map3 > base_map and test_map3 > test_map4:
        b = b + learning_rate
        base_map = test_map3
    elif test_map4 > base_map:
        b = b - learning_rate
        base_map = test_map4

    # Decay learning rate
    learning_rate *= 0.5

# Final evaluation
final_bm25 = build_bm25(k1, b)
final_map, final_mrr, final_ndcg = compute_metrics(final_bm25)

print("Final BM25 parameters:")
print("k1 =", k1)
print("b =", b)
print("MAP =", final_map)
print("MRR =", final_mrr)
print("nDCG@10 =", final_ndcg)


[INFO] If you have a local copy of https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.0.jsonl.gz, you can symlink it here to avoid downloading it again: C:\Users\OmkarKadam\.ir_datasets\downloads\4763df966f6ea953c731ef2d572044e5
[INFO] [starting] https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.0.jsonl.gz
[INFO] [finished] https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.0.jsonl.gz: [00:06] [26.8MB] [4.04MB/s]
[INFO] If you have a local copy of https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.1.jsonl.gz, you can symlink it here to avoid downloading it again: C:\Users\OmkarKadam\.ir_datasets\downloads\c19fb0dd1aceb0f6fd02f92818fa55b7
[INFO] [starting] https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.1.jsonl.gz
[INFO] [finished] https://raw.githubusercontent.com/hltcoe/HC4/main/resources/hc4/rus/ids.1.jsonl.gz: [00:09] [26.8MB] [2.87MB/s]
[INFO] If you have a local copy of htt

Documents columns: Index(['doc_id', 'title', 'text', 'url', 'time', 'cc_file'], dtype='object')
Queries columns: Index(['query_id', 'title', 'description', 'ht_title', 'ht_description',
       'mt_title', 'mt_description', 'narrative_by_relevance', 'report',
       'report_url', 'report_date', 'translation_lang'],
      dtype='object')
Qrels columns: Index(['query_id', 'doc_id', 'relevance', 'iteration'], dtype='object')
Final Documents columns: Index(['doc_id', 'title', 'text', 'url', 'time', 'cc_file'], dtype='object')
Final Queries columns: Index(['query_id', 'text', 'description', 'ht_title', 'ht_description',
       'mt_title', 'mt_description', 'narrative_by_relevance', 'report',
       'report_url', 'report_date', 'translation_lang'],
      dtype='object')
Final Qrels columns: Index(['query_id', 'doc_id', 'relevance', 'iteration'], dtype='object')
Final BM25 parameters:
k1 = 1.3
b = 0.6
MAP = 0.008406819517930628
MRR = 0.04012345679012346
nDCG@10 = 0.007769456573977662
