In [1]:
# Basic imports needed
import math
from collections import defaultdict
from collections import Counter

In [2]:
def create_index(documents):
    """Build an inverted index and corpus statistics for tokenised documents.

    Parameters
    ----------
    documents : iterable of sequences of hashable terms
        Each element is one document, already split into terms. Documents are
        identified by their position (0, 1, 2, ...) in the iteration order.

    Returns
    -------
    index : defaultdict(dict)
        Maps term -> {doc_id: number of occurrences of the term in that doc}.
    doc_len : dict
        Maps doc_id -> number of terms in the document.
    avgdl : float
        Mean document length; 0.0 when no documents were supplied.
    idf : dict
        Maps term -> log(N / df), where N is the number of documents and df
        the number of documents containing the term (natural logarithm).
    """
    index = defaultdict(dict)
    doc_len = {}

    for doc_id, document in enumerate(documents):
        for term in document:
            postings = index[term]
            postings[doc_id] = postings.get(doc_id, 0) + 1
        doc_len[doc_id] = len(document)

    # Counting via doc_len (instead of len(documents)) also supports inputs
    # without a length, e.g. generators.
    n_docs = len(doc_len)

    # An empty collection has no meaningful average; avoid dividing by zero.
    avgdl = sum(doc_len.values()) / n_docs if n_docs else 0.0

    # Every key of the index is a term seen at least once, so df >= 1 here.
    idf = {
        term: math.log(n_docs / len(postings))
        for term, postings in index.items()
    }

    return index, doc_len, avgdl, idf


In [3]:
# documents = ["just some text here text here here", "another text here","just another here here "]

# Custom dataset: three small, already-tokenised documents.
documents2 = [
    ["just", "some", "text", "here", "text", "here", "here"],
    ["another", "text", "here"],
    ["just", "another", "here", "here"]
]

# Raw term counts per document, in the same order as documents2.
# A comprehension keeps the loop variable out of the notebook's global namespace.
tf = [dict(Counter(tokens)) for tokens in documents2]

# Inverted index, document lengths, average length and IDF for the corpus.
index, doc_len, avgdl, idf = create_index(documents2)


In [4]:
# Show the index and the corpus statistics, one labelled entry per line
print(
    f"Index: {index}",
    f"Document Lengths: {doc_len}",
    f"Average Document Length: {avgdl}",
    f"Term Frequencies per doc in order: {tf}",
    f"Inverse Document Frequencies: {idf}",
    sep="\n",
)


Index: defaultdict(<class 'dict'>, {'just': {0: 1, 2: 1}, 'some': {0: 1}, 'text': {0: 2, 1: 1}, 'here': {0: 3, 1: 1, 2: 2}, 'another': {1: 1, 2: 1}})
Document Lengths: {0: 7, 1: 3, 2: 4}
Average Document Length: 4.666666666666667
Term Frequencies per doc in order: [{'just': 1, 'some': 1, 'text': 2, 'here': 3}, {'another': 1, 'text': 1, 'here': 1}, {'just': 1, 'another': 1, 'here': 2}]
Inverse Document Frequencies: {'text': 0.4054651081081644, 'some': 1.0986122886681098, 'another': 0.4054651081081644, 'here': 0.0, 'just': 0.4054651081081644}


# TF as relative frequencies (fractions) rather than raw counts. Might be useful at some point; NOT USED NOW.

In [6]:
def tf_perc(iindex):
    """Normalise every count mapping in ``iindex`` so its values sum to 1.

    ``iindex`` is a mapping whose values are themselves mappings of
    key -> count. The result is a list with one dict per value of ``iindex``
    (in iteration order), where each count has been divided by the total of
    its own mapping. The outer keys of ``iindex`` are not part of the result.
    """

    def _as_fractions(counts):
        # Share of each entry relative to the sum of all entries.
        total = sum(counts.values())
        return {key: counts[key] / total for key in counts}

    return [_as_fractions(counts) for counts in iindex.values()]

# Example usage
# NOTE: `index` maps term -> {doc_id: count}, so each dict printed here is one
# term's occurrences split across documents (the keys are document ids), not a
# per-document term-frequency distribution.
tf2 = tf_perc(index)
print(tf2)

[{0: 0.5, 2: 0.5}, {0: 1.0}, {0: 0.6666666666666666, 1: 0.3333333333333333}, {0: 0.5, 1: 0.16666666666666666, 2: 0.3333333333333333}, {1: 0.5, 2: 0.5}]


Code below is inspired by: https://medium.com/@evertongomede/understanding-the-bm25-ranking-algorithm-19f6d45c6ce

In [12]:
#BM25 ranking algorithm

# def bm25_ranking_index(index, query):
#     k1 = 1.2
#     b = 0.75
#     scores = defaultdict(int)
#     avgdl = sum(map(len, index.values())) / len(index)
#     for term in query:
#         if term in index:
#             idf = len(index) / len(index[term])
#             for doc_id, tf in index[term].items():
#                 score = idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (len(index[doc_id]) / avgdl))))
#                 scores[doc_id] += score
#     return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

def bm25_ranking(tf, idf, docs, query, k1=1.2, b=0.75, avgdl=None):
    """Score every document against ``query`` with BM25 and rank the result.

    Parameters
    ----------
    tf : sequence of dict
        ``tf[doc_id]`` maps term -> raw count of the term in that document.
    idf : dict
        Maps term -> inverse document frequency weight.
    docs : sequence of sequences
        The tokenised documents; only their lengths are used here.
    query : iterable
        Query terms. A term repeated in the query contributes once per
        repetition.
    k1 : float, optional
        Term-frequency saturation parameter (default 1.2).
    b : float, optional
        Strength of document-length normalisation (default 0.75).
    avgdl : float, optional
        Average document length. When omitted it is computed from ``docs``,
        so the function does not depend on a notebook-level variable.

    Returns
    -------
    dict
        Maps doc_id -> score, ordered from highest to lowest score. Documents
        with equal scores keep their original relative order.

    Raises
    ------
    KeyError
        If a query term occurs in a document's ``tf`` but has no ``idf`` entry.
    """
    n_docs = len(docs)
    if avgdl is None:
        # Derive the average from the documents actually being ranked.
        avgdl = sum(len(doc) for doc in docs) / n_docs if n_docs else 0.0

    scores = {}
    for doc_id in range(n_docs):
        doc_tf = tf[doc_id]

        # Length of this document relative to the average; the guard avoids a
        # division by zero when every document is empty.
        rel_len = len(docs[doc_id]) / avgdl if avgdl else 0.0
        length_norm = k1 * (1 - b + b * rel_len)

        score = 0.0
        for term in query:
            # Direct dict membership; terms absent from the document add nothing.
            if term in doc_tf:
                freq = doc_tf[term]
                score += idf[term] * ((freq * (k1 + 1)) / (freq + length_norm))

        scores[doc_id] = score

    # sorted() is stable, so ties stay in doc_id order.
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

from collections import defaultdict


# Example usage
# Scores each document of documents2 against a four-term query, using the
# `tf` and `idf` built in the dataset cell.
# NOTE(review): the output recorded below looks inconsistent with the
# log-scaled IDF values printed earlier (scores that large do not seem
# reachable with those weights) — presumably stale; re-run the notebook from a
# fresh kernel to confirm.
query = ["just", "some", "text", "here"]
scores = bm25_ranking(tf, idf, documents2, query)
print(scores)


{0: 33.29319955406912, 2: 11.870680044593088, 1: 11.282051282051281}
{0: 33.29319955406912, 2: 11.870680044593088, 1: 11.282051282051281}


# END

In [None]:
# from relevancy import relevancy_lookup
# import csv

# def process_qrel_file(qrel_path):
#     relevancies = relevancy_lookup()

#     with open(qrel_path) as file:
#         qrel_file = csv.reader(file, delimiter="\t")
#         for line in qrel_file:
#             query, document, relevancy = parse_qrel_line(line)
#             relevancies.add(query, document, relevancy)
#     return relevancies

# def parse_qrel_line(line):
#     #query_id, _, document_id, relevance
#     line = line[0].split()
#     return int(line[0]), line[2], int(line[3])

# qrel_path = "msmarco-docdev-qrels.tsv"
# relevancies = process_qrel_file(qrel_path)
# print(relevancies.relevancies)

In [8]:
# Shell escape: report disk usage of the runtime's mounted filesystems in human-readable units
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         108G   27G   81G  26% /
tmpfs            64M     0   64M   0% /dev
shm             5.8G     0  5.8G   0% /dev/shm
/dev/root       2.0G  1.1G  885M  55% /usr/sbin/docker-init
tmpfs           6.4G  208K  6.4G   1% /var/colab
/dev/sda1        44G   28G   16G  65% /etc/hosts
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


In [9]:
# Shell escape: dump the CPU description of the machine running this notebook
!cat /proc/cpuinfo

processor	: 0
vendor_id	: AuthenticAMD
cpu family	: 23
model		: 49
model name	: AMD EPYC 7B12
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2249.998
cache size	: 512 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibrs ibpb stibp vmmcall fsgsbase tsc_adjust bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr arat npt nrip_save umip rdpid
bugs		: sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass retbleed smt_rsb
bogomips	: 4499.