# BM25

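## 1. BM25 from scratch

For a query $q$ and a document $d$, BM25 sums a saturating, length-normalised term-frequency weight over the query terms:

$$\text{score}(q, d) = \sum_{t \in q} \text{IDF}(t)\,\frac{f(t, d)\,(k_1 + 1)}{f(t, d) + k_1\left(1 - b + b\,\frac{|d|}{\text{avgdl}}\right)}, \qquad \text{IDF}(t) = \ln\!\left(\frac{N - n_t + 0.5}{n_t + 0.5} + 1\right)$$

Here $f(t, d)$ is the number of times term $t$ occurs in $d$, $|d|$ is the document length in tokens, $\text{avgdl}$ is the average document length in the corpus, $N$ is the number of documents, and $n_t$ is the number of documents that contain $t$.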

In [1]:
import math
from collections import Counter

def calculate_bm25(query, document, corpus, k1=1.5, b=0.75):
    """
    Calculate the BM25 score of one document for a given query.

    Text is tokenized by lower-casing and splitting on whitespace, so
    punctuation stays attached to words ("dog." and "dog" are different
    terms). Every occurrence of a term in the query contributes to the sum,
    so a term repeated in the query is counted once per repetition.

    Args:
        query (str): The search query.
        document (str): The document text to score. It does not have to be
            a member of ``corpus``.
        corpus (list): All documents (str) used for the corpus statistics
            (document frequencies and average length).
        k1 (float): Term-frequency saturation parameter, typically in
            [1.2, 2.0].
        b (float): Length-normalisation parameter, usually 0.75.

    Returns:
        float: BM25 score; 0.0 when no query term occurs in the document.

    Raises:
        ValueError: If ``corpus`` is empty, because no corpus statistics
            can be computed.
    """

    def tokenize(text):
        return text.lower().split()

    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one document")

    query_terms = tokenize(query)
    doc_terms = tokenize(document)
    doc_length = len(doc_terms)
    term_frequencies = Counter(doc_terms)

    # Tokenize the corpus exactly once; sets make the per-term
    # document-frequency lookups below O(1) per document.
    corpus_tokens = [tokenize(doc) for doc in corpus]
    corpus_vocabularies = [set(tokens) for tokens in corpus_tokens]
    N = len(corpus_tokens)
    avg_doc_length = sum(len(tokens) for tokens in corpus_tokens) / N

    # If every corpus document is empty the average length is 0; use a
    # neutral ratio of 1.0 (no length normalisation) instead of dividing
    # by zero.
    length_ratio = doc_length / avg_doc_length if avg_doc_length else 1.0
    # This part of the denominator is the same for every term.
    length_norm = k1 * (1 - b + b * length_ratio)

    idf_scores = {}
    score = 0.0
    for term in query_terms:
        tf = term_frequencies.get(term, 0)
        if tf == 0:
            # Terms absent from the document contribute nothing.
            continue
        if term not in idf_scores:
            # n = number of corpus documents containing the term.
            n = sum(1 for vocabulary in corpus_vocabularies if term in vocabulary)
            # The "+ 1" inside the log keeps the IDF positive even for
            # terms that appear in most documents.
            idf_scores[term] = math.log((N - n + 0.5) / (n + 0.5) + 1)
        score += idf_scores[term] * (tf * (k1 + 1) / (tf + length_norm))

    return score

In [2]:
# Toy corpus: all three query terms below occur in every document, so each
# term's IDF is low and the resulting score is small.
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog outfoxes a lazy fox",
    "The lazy fox is outfoxed by a quick brown dog",
]

# Score the first document of the corpus against the query.
query, document = "quick brown fox", corpus[0]

score = calculate_bm25(query=query, document=document, corpus=corpus)
print(f"BM25 score: {score}")

BM25 score: 0.4005941778735677


## 2. Using rank_bm25 package

Source: the [`rank-bm25` package on PyPI](https://pypi.org/project/rank-bm25/).

In [1]:
!pip install rank_bm25

Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from rank_bm25 import BM25Okapi

# Small demo corpus; each string is one document.
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?",
]

# The index is built from token lists, not raw strings. Splitting on a single
# space keeps case and punctuation as-is, so "man!" and "today?" are tokens.
tokenized_corpus = [text.split(" ") for text in corpus]

bm25 = BM25Okapi(tokenized_corpus)
bm25

<rank_bm25.BM25Okapi at 0x7ff1da2cb6a0>

In [6]:
query = "windy London"
# Same single-space split as used for the corpus; no lower-casing is applied,
# so "London" must match the corpus token exactly.
tokenized_query = query.split(" ")

# One score per corpus document.
doc_scores = bm25.get_scores(query=tokenized_query)
doc_scores

array([0.        , 0.93729472, 0.        ])

In [8]:
# Return the text of the single best-matching document from `corpus`.
bm25.get_top_n(query=tokenized_query, documents=corpus, n=1)

['It is quite windy in London']