In [55]:
import math
from collections import defaultdict
from collections import Counter
import pandas as pd

In [58]:
import time
import math
import sys
from collections import defaultdict
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import ndcg_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import gzipx
from rank_bm25 import BM25Okapi

from pympler import asizeof

class Index:
    def __init__(self) -> None:
        self.index = defaultdict(dict)
        self.embeddings = None
        self.bm25Okapi = None
        self.doc_lengths = {}
        self.avgdl = 0
        self.idf = {}
        self.cf = {}

    @staticmethod
    def preprocess_text(text):
        stop_words = set(stopwords.words('english'))
        ps = PorterStemmer()
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [ps.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
        return tokens

    def load_file(self, file_name):
        file_extension = file_name.split('.')[-1].lower()

        if file_extension == 'csv':
            self.docs = pd.read_csv(file_name)
        elif file_extension == 'tsv':
            self.docs = pd.read_csv(file_name, delimiter='\t',header=None)
            self.docs.columns = ['pid', 'passage']
        else:
            raise ValueError("Unsupported file format. Supported formats: CSV (.csv) and TSV (.tsv)")

    def build_index(self, file_name: str):
        times = []

        self.load_file(file_name=file_name)

        start_time = time.time()
        self.docs['passage'] = self.docs['passage'].apply(self.preprocess_text)
        end_time = time.time()
        times.append(end_time - start_time)

        start_time = time.time()
        total_tokens = 0
        for index, row in self.docs.iterrows():
            doc_id, tokens = row['pid'], row['passage']
            total_tokens += len(tokens)
            for term in tokens:
                self.index[term][doc_id] = self.index[term].get(doc_id, 0) + 1
                if term not in self.cf:
                    self.cf[term] = 1
                else:
                    self.cf[term] += 1


            self.doc_lengths[doc_id] = len(tokens)

        self.avgdl = total_tokens / len(self.docs)
        self.compute_idf()
        end_time = time.time()
        times.append(end_time - start_time)

        start_time = time.time()
        corpus = [' '.join(i) for i in list(self.docs['passage'])]
        self.vectorizer = TfidfVectorizer()
        self.embeddings = self.vectorizer.fit_transform(corpus)
        end_time = time.time()
        times.append(end_time - start_time)

        start_time = time.time()
        self.bm25Okapi = BM25Okapi(self.docs['passage'])
        end_time = time.time()
        times.append(end_time - start_time)

        return times

    def compute_idf(self):
        total_docs = len(self.docs)
        for term in self.index:
            doc_freq = len(self.index[term])
            self.idf[term] = np.log((total_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)

    def save_index(self, file_name: str):
        with gzip.open(file_name, 'wb', compresslevel=9) as file:
            pickle.dump({'index': self.index, 
                         'doc_lengths': self.doc_lengths, 
                         'avgdl': self.avgdl, 
                         'idf': self.idf, 
                         'cf': self.cf, 
                         'vectorizer': self.vectorizer, 
                         'embeddings': self.embeddings,
                         'bm25Okapi': self.bm25Okapi}, file)

    def load_index(self, file_name: str, get_size=False):
        with gzip.open(file_name, 'rb') as file:
            data = pickle.load(file)
            self.index = data['index']
            self.doc_lengths = data['doc_lengths']
            self.avgdl = data['avgdl']
            self.idf = data['idf']
            self.cf = data['cf']
            self.embeddings = data['embeddings']
            self.vectorizer = data['vectorizer']
            self.bm25Okapi = data['bm25Okapi']

            if get_size:
                return [self.memory_usage(self.index) + self.memory_usage(self.doc_lengths) + self.memory_usage(self.avgdl) + self.memory_usage(self.idf) + self.memory_usage(self.cf),
                        self.memory_usage(self.embeddings) + self.memory_usage(self.vectorizer),
                        self.memory_usage(self.bm25Okapi)]


    def memory_usage(self, obj):
        return asizeof.asizeof(obj) / (1024 * 1024)

class RetrievalModel:
    def __init__(self, index: Index) -> None:
        self.index = index
        self.len_C = len(self.index.index)

    def preselect_docs(self, query):
        query_terms = set(query)

        relevant_docs = set()

        for term in query_terms:
            relevant_docs.update(self.index.index.get(term, {}).keys())

        return relevant_docs


    def query_likelihood(self, query, lambd=0.35):
        scores = {}

        for doc_id in self.preselect_docs(query):
            len_doc = self.index.doc_lengths[doc_id]

            p_q_Md = 0

            for term in query:
                df = self.index.index.get(term, {}).get(doc_id, 0)
                if len(self.index.index[term]) == 0: continue

                cf = self.index.cf[term]
                p_q_Md += np.log((1 - lambd) * (df / len_doc) + (lambd * (cf / self.len_C)))

            scores[doc_id] = p_q_Md

        sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
        return sorted_scores

    def bm25_ranking(self, query, k1 = 1.2, b = 0.75):
        scores = {}

        for term in query:
            score = 0

            if len(self.index.index[term]) == 0: continue

            idf_value = self.index.idf[term]

            for doc_id, tf in self.index.index[term].items():
                len_doc = self.index.doc_lengths[doc_id]
                tf =  self.index.index.get(term, {}).get(doc_id, 0)
                score = idf_value * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len_doc / self.index.avgdl)))

                if not doc_id in scores:
                    scores[doc_id] = score
                else:
                    scores[doc_id] += score

        # sort scores / ranking
        sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}

        return sorted_scores


    def embeddings_cosign_sim(self, query):
        query = [" ".join(list(query))]
        vec_query = self.index.vectorizer.transform(query)

        cos = cosine_similarity(self.index.embeddings, vec_query)

        scores = dict(zip(self.index.doc_lengths.keys(), cos.flatten()))

        non_zero_scores = {k: v for k, v in scores.items() if v != 0}

        sorted_scores = {k: v for k, v in sorted(non_zero_scores.items(), key=lambda item: item[1], reverse=True)}

        return sorted_scores

    def bm25Okapi(self, query):

        scores = {}

        scores = self.index.bm25Okapi.get_scores(query)

        scores = dict(zip(list(self.index.doc_lengths.keys()), list(scores)))

        non_zero_scores = {k: v for k, v in scores.items() if v != 0}

        sorted_scores = {k: v for k, v in sorted(non_zero_scores.items(), key=lambda item: item[1], reverse=True)}

        return sorted_scores


    def bm25_impl_one(self, query):
        # Hyperparams to specify
        k1 = 1.2
        b = 0.75
        scores = {}

        for doc_id in self.preselect_docs(query):
            len_doc = self.index.doc_lengths[doc_id]
            if len_doc == 0:
                continue
            score = 0

            # Loop for term in query in the doc
            for term in query:

                # Calculating/updating the score
                tf =  self.index.index.get(term, {}).get(doc_id, 0)
                idf_value = self.index.idf.get(term, 0)  # Use 0 as the default value if term is not in idf
                score += idf_value * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * len_doc / self.index.avgdl)))

            scores[doc_id] = score


        sorted_scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}

        return sorted_scores

    def evaluate_model(self, qrel_file, query_file, lambd=0.5, output_file='evaluation_results.csv'):
        # Parse qrel file
        qrel_data = pd.read_csv(qrel_file)

        # Read query file and preprocess queries
        query_data = pd.read_csv(query_file)
        query_data['query'] = query_data['query'].apply(self.index.preprocess_text)

        # Create a DataFrame to store results
        results = pd.DataFrame(columns=['qid','ql_ndcg', 'bm25_ndcg', 'cos_sim_ndcg'])
        results['qid'] = query_data['qid']
        results['ql_ndcg'] = None
        results['bm25_ndcg'] = None
        results['cos_sim_ndcg'] = None
        results['bm25Okapi'] = None
        results['bm25_impl_one'] = None

        # query_data = query_data.loc[query_data['qid'] == 722737]
        ql_times = []
        bm25_times = []
        bm25_impl_one_times = []
        embeddings_times = []
        bm25Okapi_times = []
        # Evaluate each query
        for qid, query in zip(query_data['qid'], query_data['query']):
            relevant_docs = qrel_data[(qrel_data['Topic'] == qid) & (qrel_data['Relevancy'] == 1)]['Document#'].tolist()

            # Query Likelihood
            start_time = time.time()
            ql_scores = self.query_likelihood(query, lambd)
            end_time = time.time()
            ql_times.append(end_time - start_time)
            ranked_docs_ql = np.array(list(ql_scores.keys()))
            ranked_values_ql = np.array(list(ql_scores.values()))

            # Create binary list for relevant and non-relevant documents
            y_true_ql = np.isin(ranked_docs_ql, relevant_docs)

            # BM25
            start_time = time.time()
            bm25_scores = self.bm25_ranking(query)
            end_time = time.time()
            bm25_times.append(end_time - start_time)
            ranked_docs_bm25 = np.array(list(bm25_scores.keys()))
            ranked_values_bm25 = np.array(list(bm25_scores.values()))

            # Create binary list for relevant and non-relevant documents
            y_true_bm25 = np.isin(ranked_docs_bm25, relevant_docs)

            # Embeddings
            start_time = time.time()
            cos_sim_scores = self.embeddings_cosign_sim(query)
            end_time = time.time()
            embeddings_times.append(end_time - start_time)
            ranked_docs_cos_sim = np.array(list(cos_sim_scores.keys()))
            ranked_values_cos_sim = np.array(list(cos_sim_scores.values()))

            # Create binary list for relevant and non-relevant documents
            y_true_cos_sim = np.isin(ranked_docs_cos_sim, relevant_docs)


            #BM25Okapi
            start_time = time.time()
            BM25Okapi_scores = self.bm25Okapi(query)
            end_time = time.time()
            bm25Okapi_times.append(end_time - start_time)
            ranked_docs_bm25Okapi = np.array(list(BM25Okapi_scores.keys()))
            ranked_values_bm25Okapi = np.array(list(BM25Okapi_scores.values()))

            # Create binary list for relevant and non-relevant documents
            y_true_bm25Okapi = np.isin(ranked_docs_bm25Okapi, relevant_docs)

            #bm25_impl_one
            start_time = time.time()
            bm25_impl_one_scores = self.bm25_impl_one(query)
            end_time = time.time()
            bm25_impl_one_times.append(end_time - start_time)
            ranked_docs_bm25_impl_one = np.array(list(bm25_impl_one_scores.keys()))
            ranked_values_bm25_impl_one = np.array(list(bm25_impl_one_scores.values()))

            # Create binary list for relevant and non-relevant documents
            y_true_bm25_impl_one = np.isin(ranked_docs_bm25_impl_one, relevant_docs)

            if len(y_true_ql) < 2 or len(y_true_bm25) < 2 or len(y_true_cos_sim) < 2: continue

            # Calculate NDCG scores
            ndcg_ql = ndcg_score([y_true_ql], [ranked_values_ql])
            ndcg_bm25 = ndcg_score([y_true_bm25], [ranked_values_bm25])
            ndcg_cos_sim = ndcg_score([y_true_cos_sim], [ranked_values_cos_sim])
            ndcg_bm25Okapi = ndcg_score([y_true_bm25Okapi], [ranked_values_bm25Okapi])
            ndcg_bm25_impl_one = ndcg_score([y_true_bm25_impl_one], [ranked_values_bm25_impl_one])

            


            # Append results to the DataFrame
            results.loc[results['qid'] == qid] = [qid, ndcg_ql, ndcg_bm25, ndcg_cos_sim, ndcg_bm25Okapi, ndcg_bm25_impl_one]

        # Save results to a CSV file
        results.to_csv(output_file, index=False)

        return [np.average(ql_times), np.average(bm25_times), np.average(bm25_impl_one_times), np.average(embeddings_times), np.average(bm25Okapi_times)]

In [34]:
data_fine_name = r"MSMARCO_SMALL\collection_0.1.csv"
index_file_name = 'index.json.gz'

index = Index()
build = True

if build:
    times, index_size = index.build_index(data_fine_name)
    index.save_index(index_file_name)
else:
    index.load_index(index_file_name)


In [61]:
output_file = 'index_results.csv'

percentages = [0.1, 1, 2, 5, 10, 20]

results_index = pd.DataFrame(columns=['percentage_of_col', 'preprocessing_time', 'inverted_index_time', 'vercor_index_time', 'existing_implemention_time'])

for percentage in percentages:
    data_fine_name = f"MSMARCO_SMALL\collection_{percentage}.csv"
    index_file_name = f'index_{percentage}.json.gz'

    index = Index()

    times = index.build_index(data_fine_name)
    index.save_index(index_file_name)
    print(times)
    results_index.loc[len(results_index.index)] = [percentage] + times

results_index.to_csv(output_file, index=False)

[15.516738891601562, 0.8827648162841797, 0.2670776844024658, 0.13420748710632324]
[125.27284455299377, 8.292277097702026, 2.4589576721191406, 1.216179370880127]
[253.64327335357666, 16.65377426147461, 4.974998950958252, 2.3776464462280273]
[642.5453038215637, 45.31479740142822, 13.046379089355469, 6.780766487121582]
[1298.010850429535, 85.57694458961487, 25.10372543334961, 13.121039867401123]
[2502.9415130615234, 175.954283952713, 54.66093707084656, 29.36467409133911]


In [63]:
percentages = [0.1, 1, 2, 5, 10, 20]

sizes_df = pd.DataFrame(columns=['percentage_of_col', 
                                 'inverted_index', 
                                 'TF_IDF_embeddings',  
                                 'BM25Okapi'])

for percentage in percentages:
    index_file_name = f'index_{percentage}.json.gz'
    query_file = f"MSMARCO_SMALL\queries_{percentage}.csv"
    qrel_file = f"MSMARCO_SMALL\qrel_{percentage}.csv"

    index = Index()
    size = index.load_index(index_file_name, get_size=True)
    print(percentage, size)

    sizes_df.loc[len(sizes_df.index)] = [percentage] + size

sizes_df.to_csv('sizes.csv', index=False)

0.1 [26.539085388183594, 5.99493408203125, 21.3951416015625]
1 [186.50894165039062, 34.87218475341797, 200.80287170410156]
2 [358.7286834716797, 65.38561248779297, 401.15380096435547]
5 [854.5349502563477, 150.8685760498047, 996.8666381835938]
10 [1656.4396667480469, 281.41382598876953, 1982.8914031982422]
20 [3290.353202819824, 547.0382614135742, 3962.390151977539]


In [64]:
percentages = [0.1, 1, 2, 5, 10, 20]
# percentages = [0.1, 0.1, 0.1, 0.1]

times_df = pd.DataFrame(columns=['percentage_of_col', 
                                 'QL', 
                                 'BM25 final implementation', 
                                 'BM25 initial implementation', 
                                 'Embeddings', 
                                 'BM25Okapi'])

for percentage in percentages:
    index_file_name = f'index_{percentage}.json.gz'
    query_file = f"MSMARCO_SMALL\queries_{percentage}.csv"
    qrel_file = f"MSMARCO_SMALL\qrel_{percentage}.csv"

    index = Index()
    index.load_index(index_file_name)

    retrival_model = RetrievalModel(index)
    model_times = retrival_model.evaluate_model(qrel_file, query_file, output_file=f'evaluation_results_{percentage}.csv')
    print(model_times)
    times_df.loc[len(times_df.index)] = [percentage] + model_times

times_df.to_csv("Times_models.csv")

[0.002711301896630264, 0.0004750461113162157, 0.001381862454298066, 0.006189119525072052, 0.013134142247641959]
[0.029487557411193847, 0.0059511327743530275, 0.016143603324890135, 0.0498958683013916, 0.14025989532470703]
[0.049515140056610105, 0.010955033302307128, 0.027992122173309326, 0.09839768648147583, 0.2809553933143616]
[0.13241675853729248, 0.030601599216461182, 0.07611552953720092, 0.24761239290237427, 0.6738634943962097]
[0.2259249210357666, 0.06120824098587036, 0.1330464029312134, 0.520922110080719, 1.3213372540473938]
[0.5476163530349731, 0.13780436038970947, 0.3136431550979614, 1.0619917941093444, 2.9175866532325743]


In [11]:
query_file = r"MSMARCO_SMALL\queries_small.csv"
qrel_file = r"MSMARCO_SMALL\qrel_small.csv"
retrival_model = RetrievalModel(index)

retrival_model.evaluate_model(qrel_file, query_file)

ql_time:  0.0031063759868795223
bm25_time:  0.0004903696703188347
bm25_impl_one_time:  0.0017046919374754934
embeddings_time:  0.005431151751315955
bm25Okapi_time:  0.01197624612938274


In [39]:
import time

query_file = r"MSMARCO_SMALL\queries_small.csv"
qrel_file = r"MSMARCO_SMALL\qrel_small.csv"

retrival_model = RetrievalModel(index)

query_data = pd.read_csv(query_file)
query_data['query'] = query_data['query'].apply(Index.preprocess_text)

query = query_data['query'].iloc[0]
# query = query_data.loc[query_data['qid'] == 722737, 'query'].iloc[0]

start_time = time.time()
retrival_model.query_likelihood(query, 0.35)
end_time = time.time()
print(f"query_likelihood took {end_time - start_time} seconds")

start_time = time.time()
retrival_model.bm25_ranking(query)
end_time = time.time()
print(f"bm25_ranking took {end_time - start_time} seconds")

start_time = time.time()
retrival_model.embeddings_cosign_sim(query)
end_time = time.time()
print(f"embeddings_cosign_sim took {end_time - start_time} seconds")

query_likelihood took 0.22401165962219238 seconds
bm25_ranking took 0.06699109077453613 seconds
embeddings_cosign_sim took 0.5279767513275146 seconds
