In [None]:
!pip install rank_bm25
!pip install pytrec_eval

In [None]:
import json
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import pytrec_eval
from nltk.tokenize import word_tokenize
import nltk

# Download the tokenizer data required by word_tokenize.
# Recent NLTK releases (3.9+) look up the 'punkt_tab' resource instead of the
# legacy pickled 'punkt' package, so fetch both to keep the notebook working
# on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Load and prepare dataset
path_to_json = '/kaggle/input/financial-times/merged_output.json'
# JSON text is UTF-8 by specification; decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open(path_to_json, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract DOCNO and TEXT (every entry must provide both keys)
extracted_data = [
    {"DOCNO": entry["DOCNO"], "TEXT": entry["TEXT"]}
    for entry in data
]

# Prepare documents for BM25.
# documents[i] and docnos[i] always describe the same entry, so a position in
# the BM25 score array can be mapped back to its DOCNO.
documents = [doc["TEXT"] for doc in extracted_data]
docnos = [doc["DOCNO"] for doc in extracted_data]

# Tokenize documents (lower-cased so that term matching is case-insensitive)
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# Create BM25 index
bm25 = BM25Okapi(tokenized_docs)

In [None]:
def load_queries(queries_file):
    """Load query texts and query ids from a JSON file.

    The file must contain a JSON array of objects. For each object the
    ``text`` value becomes the query text (``''`` when the key is missing)
    and the ``id`` value becomes the query id (the object's zero-based
    position, as a string, when the key is missing).

    Args:
        queries_file: Path to the JSON queries file.

    Returns:
        A ``(query_texts, query_ids)`` tuple of two equally long lists,
        both in file order.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale-dependent default encoding.
    with open(queries_file, 'r', encoding='utf-8') as f:
        queries_data = json.load(f)

    query_texts = [query.get('text', '') for query in queries_data]
    query_ids = [query.get('id', str(idx)) for idx, query in enumerate(queries_data)]
    return query_texts, query_ids

def load_qrels(qrels_path):
    """Load relevance judgments from a whitespace-separated qrels file.

    Each non-blank line must hold exactly four fields:
    ``query_id  <ignored>  doc_id  relevance``. Blank or whitespace-only
    lines (for example a trailing empty line) are skipped.

    Args:
        qrels_path: Path to the qrels text file.

    Returns:
        A nested dict ``{query_id: {doc_id: relevance}}`` where query and
        document ids are strings and relevance is an ``int``.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields,
            or its relevance field is not an integer.
    """
    qrels = {}
    with open(qrels_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: nothing to record (previously crashed the unpack).
                continue
            if len(fields) != 4:
                raise ValueError(
                    f"{qrels_path}:{line_no}: expected 4 fields "
                    f"(query_id, iteration, doc_id, relevance), got {len(fields)}"
                )
            query_id, _, doc_id, relevance = fields
            qrels.setdefault(query_id, {})[doc_id] = int(relevance)
    return qrels

def retrieve_bm25(query_texts, bm25_model, docnos, top_k=10):
    """Rank documents for every query with a BM25 model.

    Each query is lower-cased and tokenized with ``word_tokenize``, scored
    against the whole collection through ``bm25_model.get_scores``, and the
    ``top_k`` highest-scoring documents are kept.

    Args:
        query_texts: Iterable of query strings.
        bm25_model: Object exposing ``get_scores(tokens)``; the returned
            scores are indexed by document position.
        docnos: Document identifiers, aligned by position with the scores.
        top_k: Number of best-scoring documents to keep per query.

    Returns:
        A dict keyed by the query's position in ``query_texts`` (as a
        string) whose values map document identifier to ``float`` score.
    """
    run = {}

    for position, text in enumerate(query_texts):
        query_tokens = word_tokenize(text.lower())
        doc_scores = bm25_model.get_scores(query_tokens)

        # Positions ordered by descending score, cut to the best top_k
        ranking = np.argsort(doc_scores)[::-1]
        best_positions = ranking[:top_k]

        hits = {}
        for doc_pos in best_positions:
            hits[docnos[doc_pos]] = float(doc_scores[doc_pos])
        run[str(position)] = hits

    return run

def compute_aggregated_measures(metrics, metric_keys=None):
    """Aggregate per-query evaluation metrics into summary statistics.

    Args:
        metrics: Mapping of query id to a ``{metric name: value}`` dict.
        metric_keys: Optional iterable of metric names to aggregate. When
            omitted, the notebook's standard set is used: ``ndcg``, ``map``,
            ``recip_rank``, ``P_5``, ``P_10``, ``recall_5``, ``recall_10``.

    Returns:
        A dict with ``<metric>_mean``, ``<metric>_median`` and
        ``<metric>_std`` entries for every requested metric that is present
        for at least one query. Metrics with no values are omitted. The
        standard deviation is ``np.std`` with its default settings.
    """
    if metric_keys is None:
        metric_keys = (
            'ndcg', 'map', 'recip_rank',
            'P_5', 'P_10',
            'recall_5', 'recall_10',
        )

    aggregated_metrics = {}

    for metric in metric_keys:
        # Only queries that actually report this metric contribute to it
        metric_values = [
            query_metrics[metric]
            for query_metrics in metrics.values()
            if metric in query_metrics
        ]
        if not metric_values:
            continue

        aggregated_metrics[f'{metric}_mean'] = np.mean(metric_values)
        aggregated_metrics[f'{metric}_median'] = np.median(metric_values)
        aggregated_metrics[f'{metric}_std'] = np.std(metric_values)

    return aggregated_metrics

In [None]:
# Inputs: query texts/ids and the relevance judgments
query_texts, query_ids = load_queries('/kaggle/input/query-and-qrels/queries.json')
qrels = load_qrels('/kaggle/input/query-and-qrels/filtered_data.txt')

# BM25 retrieval; the run is keyed by each query's position in query_texts
run = retrieve_bm25(query_texts, bm25, docnos)

# Re-key the run from positional index to the real query id
run_with_query_ids = dict(
    (query_ids[int(position)], ranked_docs)
    for position, ranked_docs in run.items()
)

# Coerce ids to str and scores to float before handing the run to the evaluator
corrected_version = dict(
    (
        str(query_id),
        dict((str(doc_id), float(score)) for doc_id, score in doc_scores.items()),
    )
    for query_id, doc_scores in run_with_query_ids.items()
)

# Evaluator over the judged queries, with the measures reported below
evaluator = pytrec_eval.RelevanceEvaluator(
    qrels,
    {
        'ndcg',
        'map',
        'recip_rank',
        'P_5',
        'P_10',
        'recall_5',
        'recall_10',
    },
)

# Per-query metric values
metrics = evaluator.evaluate(corrected_version)

# Summarise across queries and print in alphabetical order of metric name
print("BM25 Evaluation Results:")
aggregated_measures = compute_aggregated_measures(metrics)
for metric in sorted(aggregated_measures):
    value = aggregated_measures[metric]
    print(f"{metric}: {value}")

# Persist the summary as a single-row CSV
results_df = pd.DataFrame([aggregated_measures])
results_df.to_csv('bm25_evaluation_results.csv', index=False)