In [None]:
import math
from collections import defaultdict
from collections import Counter

In [None]:
def create_index(documents):
    """Build an inverted index and the corpus statistics used for ranking.

    Args:
        documents: Iterable of tokenized documents. Each document is an
            iterable of hashable terms; its position in the iteration order
            becomes its document id (0, 1, 2, ...).

    Returns:
        A 4-tuple ``(index, doc_len, avgdl, idf)`` where

        * ``index`` is a ``defaultdict(dict)`` mapping
          ``term -> {doc_id: occurrences of term in that document}``;
        * ``doc_len`` maps ``doc_id -> number of terms in the document``;
        * ``avgdl`` is the mean of the document lengths, or ``0.0`` when
          there are no documents;
        * ``idf`` maps ``term -> math.log(N / df)``, with ``N`` the number of
          documents and ``df`` the number of documents containing the term.
          Terms appear in order of first occurrence in the corpus.
    """
    # term -> {doc_id: count}
    index = defaultdict(dict)

    # doc_id -> number of terms
    doc_len = {}

    for doc_id, document in enumerate(documents):
        # Counter keeps first-occurrence order, so terms enter the index in
        # the same order as a term-by-term scan of the document would give.
        counts = Counter(document)
        for term, count in counts.items():
            index[term][doc_id] = count

        # Summing the counts (rather than calling len) also works when the
        # document is a one-shot iterable.
        doc_len[doc_id] = sum(counts.values())

    # Count the documents actually seen: `documents` may not support len().
    n_docs = len(doc_len)

    # An empty corpus has no meaningful average; avoid dividing by zero.
    avgdl = sum(doc_len.values()) / n_docs if n_docs else 0.0

    # Every key of `index` has at least one posting, so df is never zero.
    idf = {
        term: math.log(n_docs / len(postings))
        for term, postings in index.items()
    }

    return index, doc_len, avgdl, idf


In [None]:
# Raw corpus: one whitespace-delimited string per document.
documents = [
    "just some text here text here here",
    "another text here",
    "just another here here ",
]

# Tokenized view of the same corpus (whitespace split, so the trailing blank
# in the last document produces no empty token).
documents2 = [text.split() for text in documents]

# Raw term counts per document, in corpus order.
tf = [dict(Counter(tokens)) for tokens in documents2]

# Inverted index plus the corpus statistics used by the ranking cells below.
index, doc_len, avgdl, idf = create_index(documents2)


In [None]:
# Show every structure built above, one per line.
print(
    f"Index: {index}",
    f"Document Lengths: {doc_len}",
    f"Average Document Length: {avgdl}",
    f"Term Frequencies per doc in order: {tf}",
    f"Inverse Document Frequencies: {idf}",
    sep="\n",
)


Index: defaultdict(<class 'dict'>, {'just': {0: 1, 2: 1}, 'some': {0: 1}, 'text': {0: 2, 1: 1}, 'here': {0: 3, 1: 1, 2: 2}, 'another': {1: 1, 2: 1}})
Document Lengths: {0: 7, 1: 3, 2: 4}
Average Document Length: 4.666666666666667
Term Frequencies per doc in order: [{'just': 1, 'some': 1, 'text': 2, 'here': 3}, {'another': 1, 'text': 1, 'here': 1}, {'just': 1, 'another': 1, 'here': 2}]
Inverse Document Frequencies: {'here': 0.0, 'text': 0.4054651081081644, 'just': 0.4054651081081644, 'some': 1.0986122886681098, 'another': 0.4054651081081644}


In [None]:
def tf_perc(iindex):
    """Rescale every inner mapping of ``iindex`` so its values sum to 1.

    Args:
        iindex: Mapping whose values are themselves mappings of
            ``key -> number``.

    Returns:
        A list with one dict per value of ``iindex`` (in iteration order).
        Each dict has the same keys as the corresponding inner mapping, with
        every number divided by the sum of that inner mapping's numbers.

    Raises:
        ZeroDivisionError: If a non-empty inner mapping sums to zero.

    NOTE(review): this notebook passes the inverted index
    (term -> {doc_id: count}), so each result dict is keyed by doc id and
    describes how one term's occurrences are spread across documents, not
    the term frequencies inside one document. Confirm this is intended.
    """
    shares = []
    for postings in iindex.values():
        occurrences = sum(postings.values())
        shares.append(
            {key: count / occurrences for key, count in postings.items()}
        )
    return shares

# Example usage: normalise each posting list of the inverted index `index`
# (built by create_index in an earlier cell) and display the result.
tf2 = tf_perc(index)
print(tf2)

[{0: 0.5, 2: 0.5}, {0: 1.0}, {0: 0.6666666666666666, 1: 0.3333333333333333}, {0: 0.5, 1: 0.16666666666666666, 2: 0.3333333333333333}, {1: 0.5, 2: 0.5}]


The BM25 ranking function below is inspired by: https://medium.com/@evertongomede/understanding-the-bm25-ranking-algorithm-19f6d45c6ce

In [None]:
#BM25 ranking algorithm

def bm25_ranking(tf, idf, docs, query, k1=1.2, b=0.75, avgdl=None):
    """Score every document against a query with the BM25 weighting formula.

    Each query term that occurs in a document adds

        idf[term] * f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc) / avgdl))

    to that document's score, where ``f`` is the term's count in the document.

    Args:
        tf: Sequence indexed by document id; ``tf[doc_id]`` maps
            ``term -> count`` for that document.
        idf: Mapping ``term -> inverse document frequency weight``.
        docs: Sequence of documents. Only ``len(docs[doc_id])`` is used, as
            the document length.
        query: Iterable of query terms. A term repeated in the query
            contributes once per repetition.
        k1: Term-frequency saturation parameter (default 1.2).
        b: Length-normalisation strength (default 0.75); ``0`` disables
            length normalisation.
        avgdl: Average document length. When ``None`` (default) it is
            computed from ``docs`` instead of being read from a global.

    Returns:
        dict mapping ``doc_id -> score`` with one entry per document.

    Raises:
        KeyError: If a query term is counted in ``tf`` but missing from ``idf``.
        IndexError: If ``tf`` has fewer entries than ``docs``.
    """
    # Materialise once so a one-shot iterator is not exhausted after the
    # first document.
    query_terms = list(query)

    if avgdl is None:
        n_docs = len(docs)
        avgdl = sum(len(doc) for doc in docs) / n_docs if n_docs else 0.0

    scores = {}
    for doc_id, doc in enumerate(docs):
        doc_tf = tf[doc_id]

        # The length-normalisation term is the same for every query term of
        # this document. avgdl == 0 only happens when there is no length
        # information at all; fall back to a ratio of 0 rather than dividing
        # by zero.
        length_ratio = len(doc) / avgdl if avgdl else 0.0
        norm = k1 * (1 - b + b * length_ratio)

        score = 0.0
        for term in query_terms:
            # Direct membership test on the dict, consistent with the lookup
            # below (no str() coercion, no temporary key list).
            if term in doc_tf:
                freq = doc_tf[term]
                score += idf[term] * ((freq * (k1 + 1)) / (freq + norm))

        scores[doc_id] = score
    return scores

# Example usage: score the three toy documents against a four-term query.
# `tf` and `idf` come from the corpus-setup cell; `documents2` supplies the
# document lengths.
query = ["just", "some", "text", "here"]
scores = bm25_ranking(tf, idf, documents2, query)
print(scores)


{0: 1.7374478838619467, 1: 0.47484126729016973, 2: 0.43063190792177464}


In [None]:
# NOW USE ON MS MARCO + VAL WITH GROUND TRUTH