In [None]:
import math
from collections import defaultdict
from collections import Counter

In [None]:
def create_index(documents):
    """Build an inverted index and the corpus statistics used for ranking.

    Args:
        documents: Sequence of tokenised documents. Each document must be
            iterable over hashable terms and support ``len()``.

    Returns:
        A 4-tuple ``(index, doc_len, avgdl, idf)`` where

        * ``index`` is a ``defaultdict(dict)`` mapping
          ``term -> {doc_id: occurrences of term in that document}``,
        * ``doc_len`` maps ``doc_id -> number of tokens in the document``,
        * ``avgdl`` is the mean document length (``0.0`` for an empty corpus),
        * ``idf`` maps ``term -> log(N / df)`` with ``N`` the number of
          documents and ``df`` the number of documents containing the term.
          Terms appear in first-seen order.
    """
    index = defaultdict(dict)
    doc_len = {}

    for doc_id, document in enumerate(documents):
        for term in document:
            postings = index[term]
            postings[doc_id] = postings.get(doc_id, 0) + 1
        doc_len[doc_id] = len(document)

    n_docs = len(doc_len)

    # An empty corpus has no meaningful average; return 0.0 instead of
    # raising ZeroDivisionError.
    avgdl = sum(doc_len.values()) / n_docs if n_docs else 0.0

    # The keys of the index are exactly the unique terms, so no separate set
    # is needed; iterating the index also keeps the idf ordering deterministic.
    idf = {
        term: math.log(n_docs / len(postings))
        for term, postings in index.items()
    }

    return index, doc_len, avgdl, idf


In [None]:
# Raw corpus: three short whitespace-separated example texts.
documents = ["just some text here text here here", "another text here","just another here here "]

# Pre-tokenised corpus, one list of tokens per document.
documents2 = [
    ["just", "some", "text", "here", "text", "here", "here"],
    ["another", "text", "here"],
    ["just", "another", "here", "here"]
]

# Per-document term counts, in the same order as `documents`.
tf = [dict(Counter(text.split())) for text in documents]

# Inverted index and corpus statistics for the tokenised corpus.
index, doc_len, avgdl, idf = create_index(documents2)


In [None]:
# Show every structure built above, one labelled line each.
print(
    f"Index: {index}",
    f"Document Lengths: {doc_len}",
    f"Average Document Length: {avgdl}",
    f"Term Frequencies per doc in order: {tf}",
    f"Inverse Document Frequencies: {idf}",
    sep="\n",
)


Index: defaultdict(<class 'dict'>, {'just': {0: 1, 2: 1}, 'some': {0: 1}, 'text': {0: 2, 1: 1}, 'here': {0: 3, 1: 1, 2: 2}, 'another': {1: 1, 2: 1}})
Document Lengths: {0: 7, 1: 3, 2: 4}
Average Document Length: 4.666666666666667
Term Frequencies per doc in order: [{'just': 1, 'some': 1, 'text': 2, 'here': 3}, {'another': 1, 'text': 1, 'here': 1}, {'just': 1, 'another': 1, 'here': 2}]
Inverse Document Frequencies: {'here': 0.0, 'text': 0.4054651081081644, 'just': 0.4054651081081644, 'some': 1.0986122886681098, 'another': 0.4054651081081644}


In [None]:
def tf_perc(iindex):
    """Normalise every posting list of an inverted index so it sums to 1.

    Args:
        iindex: Mapping ``term -> {doc_id: count}``.

    Returns:
        A list with one dict per term (in the iteration order of ``iindex``),
        mapping ``doc_id -> count / total count of that term``.
    """
    shares = []
    for postings in iindex.values():
        total = sum(postings.values())
        shares.append(
            {doc_id: count / total for doc_id, count in postings.items()}
        )
    return shares

# Example usage: normalise the posting lists of the index built earlier
# (one dict per term, keyed by document id) and print the result.
tf2 = tf_perc(index)
print(tf2)

[{0: 0.5, 2: 0.5}, {0: 1.0}, {0: 0.6666666666666666, 1: 0.3333333333333333}, {0: 0.5, 1: 0.16666666666666666, 2: 0.3333333333333333}, {1: 0.5, 2: 0.5}]


Code below is inspired by: https://medium.com/@evertongomede/understanding-the-bm25-ranking-algorithm-19f6d45c6ce

In [None]:
#BM25 ranking algorithm

def bm25_ranking(tf, idf, docs, query, k1=1.2, b=0.75, avgdl=None):
    """Score every document against a query with the BM25 formula.

    For each query term present in a document, the contribution is
    ``idf[term] * f * (k1 + 1) / (f + k1 * (1 - b + b * len(doc) / avgdl))``
    where ``f`` is the term's count in that document.

    Args:
        tf: Sequence of mappings, ``tf[doc_id][term] -> count`` of the term
            in that document.
        idf: Mapping ``term -> inverse document frequency``.
        docs: Sequence of documents; only ``len(doc)`` of each is used.
        query: Iterable of query terms. Terms absent from a document's ``tf``
            mapping contribute nothing to that document's score.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation parameter.
        avgdl: Average document length. When omitted it is computed from
            ``docs``, so the function no longer depends on a global
            ``avgdl`` variable being defined by an earlier cell.

    Returns:
        Dict mapping ``doc_id`` (position in ``docs``) to its score.

    Raises:
        KeyError: If a term found in a document's ``tf`` is missing from ``idf``.
    """
    if avgdl is None:
        n_docs = len(docs)
        avgdl = sum(len(doc) for doc in docs) / n_docs if n_docs else 0.0

    scores = {}
    for doc_id, doc in enumerate(docs):
        doc_tf = tf[doc_id]

        # The length normalisation is the same for every term of a document,
        # so compute it once. A zero average (all documents empty) is treated
        # as "no length penalty" rather than dividing by zero.
        length_ratio = len(doc) / avgdl if avgdl else 0.0
        norm = k1 * (1 - b + b * length_ratio)

        score = 0
        for term in query:
            # Look the term up directly (constant time) with the same key that
            # is used for scoring; the previous str(term) membership test could
            # pass and then fail on tf[doc_id][term] for non-string terms.
            freq = doc_tf.get(term)
            if freq is None:
                continue
            score += idf[term] * ((freq * (k1 + 1)) / (freq + norm))

        scores[doc_id] = score
    return scores

# Example usage: score each document of the tokenised corpus (documents2)
# against a four-term query, using the tf and idf tables built earlier.
query = ["just", "some", "text", "here"]
scores = bm25_ranking(tf, idf, documents2, query)
print(scores)


{0: 1.7374478838619467, 1: 0.47484126729016973, 2: 0.43063190792177464}


In [3]:
import pandas as pd

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
msmarco_short = pd.read_csv("train_msmarco_short.csv")

In [5]:
# Preview the loaded frame (up to the first 500 rows are requested).
msmarco_short.head(500)

Unnamed: 0.1,Unnamed: 0,answer,question,context
0,0,"Depona Ab is a library in Vilhelmina, Sweden.",Depona ab,"Depona Ab is a library in Vilhelmina, Sweden. ..."
1,1,"$43,746 for the 2014-2015 academic year.",Nyu tuition cost,tuition for new york university is $ 43746 for...
2,2,Before the age of 2–4 years.,At what age do kids start to hold memories?,"Childhood amnesia, also called infantile amnes..."
3,3,Americans brush for just under the two minutes...,Average teeth brushing time,"On average, Americans brush for just under the..."
4,4,"Yes, funner is a word.",Is funner a word?,"Funner is, of course, a word in the same sense..."
...,...,...,...,...
495,495,"If the left ear is ringing,someone the person ...",The meaning of your left ear is ringing,Quick Answer. The superstition about a person'...
496,496,Average high temperature of 83° and an average...,Average monthly temperatures in orlando florida?,When you stay at the Orlando World Center Marr...
497,497,"Yes, Debit is capitalized.",Is debit capitalized,Debit the cost of the asset and credit the loa...
498,498,The period of time during which a contract con...,Term contract definition,term of a contract - the period of time during...


In [1]:
# Download the msmarco_ans_small archive into the local `data` directory.
# Background on this subset:
# https://github.com/castorini/pygaggle/blob/master/docs/experiments-msmarco-passage-subset.md
!wget https://www.dropbox.com/s/5xa5vjbjle0c8jv/msmarco_ans_small.zip -P data

--2023-11-03 15:22:16--  https://www.dropbox.com/s/5xa5vjbjle0c8jv/msmarco_ans_small.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/5xa5vjbjle0c8jv/msmarco_ans_small.zip [following]
--2023-11-03 15:22:16--  https://www.dropbox.com/s/raw/5xa5vjbjle0c8jv/msmarco_ans_small.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc0861394438817505de0b26406d.dl.dropboxusercontent.com/cd/0/inline/CG3oWsa1x3QU_AeJX0Dz8zwW8CLy8dLwxRmJkQn33hH3j0toaMAg5hLwNhvQgq-Ki42XIds1gyU5hqzRVshvMIDaRcztf16W4tnW2aJvw5HUccUGV65v4TAtWJXW0ee5VAYz_6LSsb79HU4FfQj21G_5/file# [following]
--2023-11-03 15:22:16--  https://uc0861394438817505de0b26406d.dl.dropboxusercontent.com/cd/0/inline/CG3oWsa1x3QU_AeJX0Dz8zwW8CLy8dLwxRmJkQn33hH3j0toaMAg5hLwNhvQgq-Ki42XIds1gyU

In [2]:
# Extract the downloaded archive next to it, under `data/`.
!unzip data/msmarco_ans_small.zip -d data

Archive:  data/msmarco_ans_small.zip
   creating: data/msmarco_ans_small/
  inflating: data/msmarco_ans_small/queries.dev.small.tsv  
  inflating: data/msmarco_ans_small/run.dev.small.tsv  
  inflating: data/msmarco_ans_small/scores  
  inflating: data/msmarco_ans_small/qrels.dev.small.tsv  


In [31]:
# Load the header-less, tab-separated qrels file. The path is relative so it
# matches the `-d data` extraction target instead of assuming a /content
# working directory; read_table already returns a DataFrame.
df1 = pd.read_table("data/msmarco_ans_small/qrels.dev.small.tsv", sep="\t", header=None)

In [32]:
# The file was read with header=None, so name its four columns explicitly.
df1.columns = ['Topic', 'Iteration', 'Document#', 'Relevancy']

In [33]:
# Display the qrels table with its named columns.
df1

Unnamed: 0,Topic,Iteration,Document#,Relevancy
0,352818,0,7072358,1
1,1089760,0,7081626,1
2,1089312,0,7086188,1
3,1087904,0,7097707,1
4,1087589,0,7100724,1
...,...,...,...,...
109,877938,0,7978897,1
110,348594,0,7982142,1
111,188714,0,8003843,1
112,188714,0,4321745,1


In [None]:
# from relevancy import relevancy_lookup
# import csv

# def process_qrel_file(qrel_path):
#     relevancies = relevancy_lookup()

#     with open(qrel_path) as file:
#         qrel_file = csv.reader(file, delimiter="\t")
#         for line in qrel_file:
#             query, document, relevancy = parse_qrel_line(line)
#             relevancies.add(query, document, relevancy)
#     return relevancies

# def parse_qrel_line(line):
#     #query_id, _, document_id, relevance
#     line = line[0].split()
#     return int(line[0]), line[2], int(line[3])

# qrel_path = "msmarco-docdev-qrels.tsv"
# relevancies = process_qrel_file(qrel_path)
# print(relevancies.relevancies)

In [None]:
# TODO: run the indexing and BM25 ranking above on MS MARCO, export the index,
# and validate the rankings against the ground-truth relevance judgements.