In [1]:
import math
from collections import defaultdict
from collections import Counter
import pandas as pd

In [4]:
def create_index(documents):
    # Create a dict to store the term freqs
    index = defaultdict(dict)

    # Create a dict to store the document lengths
    doc_len = {}

    # Create a set to store the unique terms
    terms = set()

    # Loop over the documents
    for i, document in enumerate(documents):

        # Loop over the terms in the document
        for term in document:

            # Change the term frequency
            index[term][i] = index[term].get(i, 0) + 1

            # Add the term to the set
            terms.add(term)

        # Store the length of the document
        doc_len[i] = len(document)

    # Compute the average document length
    avgdl = sum(doc_len.values()) / len(doc_len)

    # Compute the idf for each term
    idf = {}

    for term in terms:
        df = len(index[term])
        idf[term] = math.log(len(documents)/df)


    return index, doc_len, avgdl, idf


In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
import gzip

class InvertedIndex:
    def __init__(self) -> None:
        self.index = defaultdict(dict)
        self.doc_lengths = {}
        self.avgdl = 0
        self.idf = {}

    @staticmethod
    def preprocess_text(text):
        stop_words = set(stopwords.words('english'))
        ps = PorterStemmer()
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [ps.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
        return tokens

    def build_index(self, file_name: str):
        self.docs = pd.read_csv(file_name)
        self.docs['passage'] = self.docs['passage'].apply(InvertedIndex.preprocess_text)

        total_tokens = 0
        for index, row in self.docs.iterrows():
            doc_id, tokens = row['pid'], row['passage']
            total_tokens += len(tokens)
            for term in tokens:
                self.index[term][doc_id] = self.index[term].get(doc_id, 0) + 1

            self.doc_lengths[doc_id] = len(tokens)

        self.avgdl = total_tokens / len(self.docs)
        self.compute_idf()

    def compute_idf(self):
        total_docs = len(self.docs)
        for term in self.index:
            doc_freq = len(self.index[term])
            self.idf[term] = math.log((total_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)

    def save_index(self, file_name: str):
        with gzip.open(file_name, 'wt', compresslevel=5) as file:
            json.dump({'index': self.index, 'doc_lengths': self.doc_lengths, 'avgdl': self.avgdl, 'idf': self.idf}, file)

    def load_index(self, file_name: str):
        with gzip.open(file_name, 'rt') as file:
            data = json.load(file)
            self.index = defaultdict(dict, data['index'])
            self.doc_lengths = data['doc_lengths']
            self.avgdl = data['avgdl']
            self.idf = data['idf']


class RetrivalModel:
    def __init__(self, index: InvertedIndex) -> None:
        self.index = index
        self.len_C = len(self.index.index)

    def query_likelihood(self, query, lambd):
        scores = {}

        for doc_id, len_doc in self.index.doc_lengths.items():
            p_q_Md = 0
            for term in query:
                df = self.index.index.get(term, {}).get(doc_id, 0)
                cf = sum(self.index.index[term].values())
                
                ts = (1 - lambd) * (df / len_doc) + (lambd * (cf / self.len_C))
                if ts != 0:
                    p_q_Md += math.log(ts)

            scores[doc_id] = p_q_Md

        sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
        return sorted_scores

    def bm25_ranking(self, query):
        # Hyperparams to specify
        k1 = 1.2
        b = 0.75
        scores = {}

        # Looping through the different docs
        for doc_id, len_doc in self.index.doc_lengths.items():
            score = 0
            
            # Loop for term in query in the doc
            for term in query:
                # Calculating/updating the score
                tf =  self.index.index.get(term, {}).get(doc_id, 0)
                score += self.index.idf[term] * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len_doc / self.index.avgdl)))

            scores[doc_id] = score 
        
        # sort scores / ranking
        sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
        return sorted_scores    

In [3]:
data_fine_name = r"MSMARCO_SMALL\collection_small.csv"
index_file_name = 'index.json.gz'

index = InvertedIndex()
build = True

if build:
    index.build_index(data_fine_name)
    index.save_index(index_file_name)
else:
    index.load_index(index_file_name)

In [12]:
queries = pd.read_csv(r"MSMARCO_SMALL\queries_small.csv")
queries = queries['query'].apply(InvertedIndex.preprocess_text)
retrival_model = RetrivalModel(index)


print(retrival_model.bm25_ranking(queries.iloc[0]))
print(retrival_model.query_likelihood(queries.iloc[0], 0.1))

{893118: 13.446318938777125, 6029644: 11.686150277961737, 2559987: 10.863866250240946, 823423: 10.00789494060924, 3753250: 9.874816556464424, 4212316: 9.648856533129337, 2164761: 9.56064016183682, 6755179: 9.375358100983327, 6138246: 9.226429757186523, 621262: 9.193744340314478, 4601640: 8.997060453431637, 6902826: 8.816471875291235, 6803055: 8.744140546033744, 3970885: 8.66103531329676, 3347895: 8.590828081139572, 2997444: 8.540745923730398, 5254586: 8.53364015052918, 668300: 8.515516799157282, 6276834: 8.317105416201317, 8135136: 8.292919373923684, 3698880: 8.27462786371696, 7514078: 8.203853060988333, 1347087: 8.177372004290506, 909077: 8.111553234294334, 8558487: 8.014811338676221, 779082: 8.009667531992337, 5755246: 7.904677197130646, 5644701: 7.874290463716213, 2713288: 7.8676262314904655, 8613686: 7.850573104122729, 4748738: 7.832913671799291, 6881072: 7.824993530104884, 7997993: 7.764216631951182, 6989005: 7.729851140293399, 4503005: 7.538186663014471, 8119943: 7.53302420611152

In [22]:
def query_likelihood(index, query, lambd):
    scores = {}

    for doc_id, len_doc in index.doc_lengths.items():
        p_q_Md = 0
        for term in query:
            df = index.index.get(term, {}).get(doc_id, 0)
            cf = sum(index.index[term].values())
            
            ts = (1 - lambd) * (df / len_doc) + (lambd * (cf / len(index.index)))
            if ts != 0:
                p_q_Md += math.log(ts)

        scores[doc_id] = p_q_Md

    sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return sorted_scores

def bm25_ranking(index, idf, avgdl, query):
    # Hyperparams to specify
    k1 = 1.2
    b = 0.75
    scores = {}

    # Looping through the different docs
    for doc_id, len_doc in index.doc_lengths.items():
        score = 0
        
        # Loop for term in query in the doc
        for term in query:
            # Calculating/updating the score
            tf =  index.index.get(term, {}).get(doc_id, 0)
            score += idf[term] * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len_doc / avgdl)))

        scores[doc_id] = score 
    
    # sort scores / ranking
    sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return sorted_scores


In [17]:
documents = [
    ["Apple", "Samsung"],
    ["Apple", "Apple", "Apple", "Samsung"],
    ["Phone", "Samsung", "Phone", "Apple", "Phone", "Apple", "Samsung"]
]

query = ["Apple", "Phone"]

tf = []

for doc in documents:
  tf.append(dict(Counter(doc)))

index, doc_len, avgdl, idf = create_index(documents)

In [59]:
collection = pd.DataFrame(pd.read_csv(r"MSMARCO_SMALL\collection_small.csv"))

print(collection['passage'][8770])

collection['passage'] = collection['passage'].str.split()

list_of_lists_collection = collection['passage'].tolist()

list_of_lists_collection_pid = collection['pid'].tolist()

tf = []

for doc in list_of_lists_collection:
  tf.append(dict(Counter(doc)))

index, doc_len, avgdl, idf = create_index(list_of_lists_collection)

This might be pretty basic, but I can't remember... I'm typing WORDS into a cell and need to insert a. hard return to start a new line in same cell. However, whenever I hit ENTER, my cursor moves to next cell.


{'http://www.uga.edu/profile/mission/.': 9.087268374386188,
 'Trial': 8.394121193826242,
 'deep': 5.476350461741964,
 'converts': 6.8900437970499695,
 'Modern-day': 9.087268374386188,
 '1852,': 9.087268374386188,
 'observance': 9.087268374386188,
 'Langlot': 9.087268374386188,
 'Poverty': 8.394121193826242,
 'A4.': 9.087268374386188,
 '(Splenda)': 9.087268374386188,
 'heir': 8.394121193826242,
 'mystery.': 8.394121193826242,
 'coffeemakers,': 9.087268374386188,
 'prey': 7.477830461952088,
 'under-current': 9.087268374386188,
 'checkboxes,': 9.087268374386188,
 '121.7': 9.087268374386188,
 'UNIT': 9.087268374386188,
 'Himmler,': 9.087268374386188,
 'september': 9.087268374386188,
 'Hospital,': 7.988656085718079,
 'armed': 6.689373101587818,
 'Iraq': 8.394121193826242,
 'RV.': 9.087268374386188,
 'myxoma': 9.087268374386188,
 '6in).': 9.087268374386188,
 '43,560': 9.087268374386188,
 'Regards.': 9.087268374386188,
 'soil': 5.7914315083818595,
 'utilization.': 9.087268374386188,
 'cite': 

In [23]:
# Print the results
# print(f"Index: {index}")
# print(f"Document Lengths: {doc_len}")
# print(f"Average Document Length: {avgdl}")
# print(f"Term Frequencies per doc in order: {tf}")
# print(f"Inverse Document Frequencies: {idf}")

In [34]:
def query_likelihood(tf, query):
    scores = []

    for doc in tf:
        score = 1
        for term in query:
            if term in doc:
                score *= doc[term] / sum(doc.values())
            else:
                score *= 0
        scores.append(score)

    return scores

global len_C
len_C = len(index)

def prob_t_Md(t, d, lambd):
    cf = sum(index[t].values())
    df = d[t] if t in d else 0
    len_doc = sum(d.values())
    
    p_t_Md = (1 - lambd) * (df / len_doc) + (lambd * (cf / len_C))

    return p_t_Md

def score_doc(q, d, lambd):
    p_q_Md = 0
    for t in q:
      ts = prob_t_Md(t, d, lambd)
      if ts != 0:
        p_q_Md += math.log(ts)

    return p_q_Md

def query_likelihood_smoothing(tf, query, lambd):
    scores = {}

    for doc_nr, doc in enumerate(tf):
        scores[list_of_lists_collection_pid[doc_nr]] = score_doc(query, doc, lambd) 

    sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return sorted_scores
    


In [None]:
#BM25 ranking algorithm

def bm25_ranking(tf, idf, docs, query):
    # Hyperparams to specify
    k1 = 1.2
    b = 0.75
    scores = {}

    # Looping through the different docs
    for doc_nr in range(len(docs)):
        score = 0
        doc = docs[doc_nr]
        
        # Loop for term in query in the doc
        for term in query:
            if str(term) in list(tf[doc_nr].keys()):

                # Calculating/updating the score
                score += idf[term] * ((tf[doc_nr][term] * (k1 + 1)) / (tf[doc_nr][term] + k1 * (1 - b + b * (len(doc) / avgdl))))

        scores[list_of_lists_collection_pid[doc_nr]] = score 
    
    # sort scores / ranking
    # print(scores)
    sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return sorted_scores

# Use on MSMARCO

# using the queries and the corpus from earlier on, qrel file only used for eval later on.

queries = pd.DataFrame(pd.read_csv(r"C:\Users\Raul\Documents\Information-Retravel-System\MSMARCO_SMALL\queries_small.csv"))

#print(queries.head(5))

queries['query'] = queries['query'].str.split()

list_of_lists_query = queries['query'].tolist()

list_of_qids = queries['qid'].tolist()

#print(list_of_lists_collection[0])

In [None]:
qrel = pd.DataFrame(pd.read_csv(r"C:\Users\Raul\Documents\Information-Retravel-System\MSMARCO_SMALL\qrel_small.csv"))

scores = []
count = 0
eval_lists = []
for query in list_of_lists_query:
  #print(count)
  scores.append([list_of_qids[count], bm25_ranking(tf, idf, list_of_lists_collection, query)])
  
  count += 1
    

print(scores[2])

In [38]:
# for i, d in enumerate(tf):
#     print(f'The score for document {i} is {score_doc(query, d, 0.1)}')

# query_likelihood(tf, query)

queries = pd.DataFrame(pd.read_csv(r"MSMARCO_SMALL\queries_small.csv"))

queries['query'] = queries['query'].str.split()

list_of_lists_query = queries['query'].tolist()

list_of_qids = queries['qid'].tolist()

In [40]:
collection

Unnamed: 0,pid,passage
0,2567,"[Your, storage, space., Your, iCloud, account,..."
1,2869,"[Designer, Lloyd, Klein, Accused, of, Naked, A..."
2,3031,"[The, eleven, ships, of, the, Carnival, Conque..."
3,3203,"[It's, a, good, time, to, buy, in, St., Michae..."
4,4769,"[There, are, two, types, of, rainforests,, tro..."
...,...,...
8837,8839469,"[Some, states, have, never, recognized, common..."
8838,8839752,"[What, is, Patient, Access?, With, Patient, Ac..."
8839,8841026,"[In, the, case, of, a, Tesla,, you, apparently..."
8840,8841287,"[Your, due, dates, will, be, calculated, based..."


In [53]:
# for i, d in enumerate(tf):
#     print(f'The score for document {i} is {score_doc(query, d, 0.1)}')
query = list_of_lists_query[2]

scores = query_likelihood_smoothing(tf, query, 0.1)
scores
# print(query)

# for pid in list(scores.keys())[:10]:
#     print(collection[collection['pid'] == pid]['passage'].values)

{7747592: -24.572814862415186,
 2592567: -25.254342857043905,
 3090023: -25.28490045128667,
 2816293: -25.566558483793276,
 2975441: -27.382496929577705,
 8408778: -27.509356389483628,
 726325: -27.70179515192501,
 178347: -28.00253662987029,
 2890165: -28.022008525803063,
 3627250: -28.123749877890216,
 8604986: -28.13228437528432,
 7454508: -28.212782086086378,
 1272433: -28.28375197896845,
 6858732: -28.325670935743908,
 929009: -28.36139905440673,
 6966000: -28.396247900924457,
 659506: -28.448432316916026,
 6696766: -28.715693455561226,
 7970737: -28.80782906403816,
 3591671: -28.88411664283845,
 8390831: -28.999387478775553,
 3222402: -29.02127319453399,
 5166541: -29.02127319453399,
 369313: -29.02905455980116,
 4805947: -29.030155299171614,
 6462322: -29.07483949530085,
 3304978: -29.12760296929963,
 692177: -29.205370152433733,
 4311564: -29.2129911124091,
 1265279: -29.22186584753843,
 493747: -29.254030765362458,
 3285998: -29.254030765362458,
 3922054: -29.254030765362458,


In [25]:
# query_likelihood(tf, query)
print(list_of_lists_query[0])

# query_likelihood(tf, list_of_lists_query[0])

['how', 'much', 'do', 'new', 'staff', 'accountants', 'make']


In [46]:
qrel = pd.DataFrame(pd.read_csv(r"C:\Users\Raul\Documents\Information-Retravel-System\MSMARCO_SMALL\qrel_small.csv"))



[0.0, 0.0, 0.12244897959183672]

In [None]:
from tqdm import tqdm
import numpy as np

eval_list = []

for query, docs in tqdm(scores):
    count = 0
    for doc, score in docs.items():
        count+=1
        if count > 20:
            break
        topic_doc_pair = (np.int64(query), np.int64(doc))

        if ((qrel['Topic'] == topic_doc_pair[0]) & (qrel['Document#'] == topic_doc_pair[1])).any():
        
            rel = 1
        
        else:
            
            rel = 0
            
        eval_list.append([query, rel, doc, list(docs.keys()).index(doc)+1, score, 'RUN1'])
        
print(eval_list[0])

In [None]:
with open('BM25_eval.txt', 'w') as f:
    for sublist in eval_list:
        line = ' '.join([str(elem) for elem in sublist])
        f.write(line + '\n')