In [1]:
import math
from collections import defaultdict
from collections import Counter
import pandas as pd

In [97]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import ndcg_score, average_precision_score
import numpy as np
import random
import pickle
import gzip

class InvertedIndex:
    """In-memory inverted index over a passage collection.

    Stores per-term postings together with the document-length and IDF
    statistics that the ranking code reads from it.
    """

    # NLTK resources are created on first use and then shared by every call to
    # preprocess_text, instead of being rebuilt for each passage.
    _stop_words = None
    _stemmer = None

    def __init__(self) -> None:
        # term -> {doc_id: number of occurrences of the term in that document}
        self.index = defaultdict(dict)
        # doc_id -> number of tokens kept after preprocessing
        self.doc_lengths = {}
        # mean number of tokens per document (0 until an index is built/loaded)
        self.avgdl = 0
        # term -> inverse document frequency, filled by compute_idf
        self.idf = {}
        # DataFrame read by load_file; stays None when the index was only loaded
        self.docs = None

    @staticmethod
    def preprocess_text(text):
        """Lower-case, tokenize, drop stop words / non-alphanumeric tokens, and stem.

        Args:
            text: Raw text of a passage or query.

        Returns:
            list[str]: The stemmed tokens, in their original order.
        """
        if InvertedIndex._stop_words is None:
            InvertedIndex._stop_words = frozenset(stopwords.words('english'))
            InvertedIndex._stemmer = PorterStemmer()
        stop_words = InvertedIndex._stop_words
        stem = InvertedIndex._stemmer.stem

        return [
            stem(token)
            for token in word_tokenize(text.lower())
            if token.isalnum() and token not in stop_words
        ]

    def load_file(self, file_name):
        """Read the collection into ``self.docs``.

        A ``.csv`` file is read with its own header (build_index expects the
        columns ``pid`` and ``passage``); a ``.tsv`` file is read without a
        header and its two columns are named ``pid`` and ``passage``.

        Raises:
            ValueError: If the file extension is neither ``csv`` nor ``tsv``.
        """
        file_extension = file_name.split('.')[-1].lower()

        if file_extension == 'csv':
            self.docs = pd.read_csv(file_name)
        elif file_extension == 'tsv':
            self.docs = pd.read_csv(file_name, delimiter='\t', header=None)
            self.docs.columns = ['pid', 'passage']
        else:
            raise ValueError("Unsupported file format. Supported formats: CSV (.csv) and TSV (.tsv)")

    def build_index(self, file_name: str):
        """Build postings, document lengths, average length and IDF from a file.

        Any previously built or loaded index content is discarded first, so the
        method can be called repeatedly without inflating term counts.

        Args:
            file_name: Path to a ``.csv`` or ``.tsv`` collection file.
        """
        self.load_file(file_name=file_name)

        # Start from a clean slate: the postings below are accumulated with +=.
        self.index = defaultdict(dict)
        self.doc_lengths = {}
        self.idf = {}

        # pandas reads an empty passage as NaN, which has no .lower(); treat it
        # as empty text so such a document simply gets length 0.
        self.docs['passage'] = self.docs['passage'].fillna('').apply(InvertedIndex.preprocess_text)

        total_tokens = 0
        for doc_id, tokens in zip(self.docs['pid'], self.docs['passage']):
            total_tokens += len(tokens)
            for term, count in Counter(tokens).items():
                postings = self.index[term]
                postings[doc_id] = postings.get(doc_id, 0) + count
            self.doc_lengths[doc_id] = len(tokens)

        num_docs = len(self.docs)
        # An empty collection has no meaningful average; avoid dividing by zero.
        self.avgdl = total_tokens / num_docs if num_docs else 0
        self.compute_idf()

    def compute_idf(self):
        """Recompute ``self.idf`` for every indexed term.

        Uses ``log((N - df + 0.5) / (df + 0.5) + 1)`` where ``df`` is the number
        of documents in the term's postings and ``N`` is the number of indexed
        documents. ``N`` is taken from ``doc_lengths`` so the method also works
        on an index restored with load_index, where ``self.docs`` is not set.
        """
        total_docs = len(self.doc_lengths)
        self.idf = {
            term: math.log((total_docs - len(postings) + 0.5) / (len(postings) + 0.5) + 1.0)
            for term, postings in self.index.items()
        }

    def save_index(self, file_name: str):
        """Write postings, document lengths, avgdl and IDF as a gzip-compressed pickle."""
        payload = {
            'index': self.index,
            'doc_lengths': self.doc_lengths,
            'avgdl': self.avgdl,
            'idf': self.idf,
        }
        with gzip.open(file_name, 'wb', compresslevel=5) as file:
            pickle.dump(payload, file)

    def load_index(self, file_name: str):
        """Restore the state written by save_index.

        SECURITY: unpickling executes arbitrary code embedded in the file. Only
        load index files that were produced locally by save_index.
        """
        with gzip.open(file_name, 'rb') as file:
            data = pickle.load(file)

        self.index = data['index']
        self.doc_lengths = data['doc_lengths']
        self.avgdl = data['avgdl']
        self.idf = data['idf']


class RetrievalModel:
    """Ranks the documents of an inverted index and evaluates the rankings.

    The ``index`` object must expose ``index`` (term -> {doc_id: tf}),
    ``doc_lengths`` (doc_id -> token count), ``avgdl``, ``idf`` and
    ``preprocess_text``.
    """

    def __init__(self, index: "InvertedIndex") -> None:
        self.index = index
        # Collection length |C| = total number of indexed tokens. It is the
        # denominator of the collection model cf / |C| in query_likelihood;
        # using the vocabulary size here would let that ratio exceed 1.
        self.len_C = sum(self.index.doc_lengths.values())

    @staticmethod
    def _rank(scores):
        """Return ``scores`` as a dict ordered from highest to lowest score."""
        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    def preselect_docs(self, query, min_selected_docs=2):
        """Return the ids of documents that contain at least one query term.

        If fewer than ``min_selected_docs`` documents match, the set is padded
        with randomly drawn other documents. The result can still be smaller
        than ``min_selected_docs`` when the collection itself has fewer
        documents; the method never loops waiting for documents that do not
        exist.

        Args:
            query: Iterable of preprocessed query terms.
            min_selected_docs: Desired minimum size of the candidate set.

        Returns:
            set: Candidate document ids.
        """
        relevant_docs = set()
        for term in set(query):
            relevant_docs.update(self.index.index.get(term, {}))

        shortfall = min_selected_docs - len(relevant_docs)
        if shortfall > 0:
            candidates = [doc_id for doc_id in self.index.doc_lengths if doc_id not in relevant_docs]
            relevant_docs.update(random.sample(candidates, min(shortfall, len(candidates))))

        return relevant_docs

    def query_likelihood(self, query, lambd):
        """Score candidate documents with a linearly smoothed query-likelihood model.

        Each query term contributes
        ``log((1 - lambd) * tf / |d| + lambd * cf / |C|)``; terms for which this
        mixture is exactly 0 are skipped. Documents of length 0 are not scored.

        Args:
            query: List of preprocessed query terms.
            lambd: Weight of the collection model in the mixture.

        Returns:
            dict: doc_id -> log-likelihood, ordered from best to worst.
        """
        postings = self.index.index

        # The collection probability does not depend on the document, so it is
        # computed once per distinct term. .get() keeps unseen terms from being
        # inserted into a defaultdict index (or raising on a plain dict).
        collection_prob = {}
        for term in set(query):
            cf = sum(postings.get(term, {}).values())
            collection_prob[term] = cf / self.len_C if self.len_C else 0.0

        scores = {}
        for doc_id in self.preselect_docs(query):
            len_doc = self.index.doc_lengths[doc_id]
            if len_doc == 0:
                continue

            log_likelihood = 0
            for term in query:
                tf = postings.get(term, {}).get(doc_id, 0)
                term_prob = (1 - lambd) * (tf / len_doc) + lambd * collection_prob[term]
                if term_prob != 0:
                    log_likelihood += math.log(term_prob)

            scores[doc_id] = log_likelihood

        return self._rank(scores)

    def bm25_ranking(self, query, k1=1.2, b=0.75):
        """Score candidate documents with BM25.

        Args:
            query: List of preprocessed query terms.
            k1: Term-frequency saturation parameter.
            b: Document-length normalization parameter.

        Returns:
            dict: doc_id -> BM25 score, ordered from best to worst. Documents of
            length 0 are not scored.
        """
        postings = self.index.index
        scores = {}

        for doc_id in self.preselect_docs(query):
            len_doc = self.index.doc_lengths[doc_id]
            if len_doc == 0:
                continue

            # Length normalization is the same for every term of this document.
            length_norm = k1 * (1 - b + b * len_doc / self.index.avgdl)

            score = 0
            for term in query:
                tf = postings.get(term, {}).get(doc_id, 0)
                # Terms without an IDF entry contribute nothing.
                idf_value = self.index.idf.get(term, 0)
                score += idf_value * ((tf * (k1 + 1)) / (tf + length_norm))

            scores[doc_id] = score

        return self._rank(scores)

    @staticmethod
    def _ndcg(scores, relevant_docs):
        """NDCG of one ranking against binary relevance; NaN for fewer than two documents."""
        if len(scores) < 2:
            # scikit-learn rejects single-document rankings, so report "undefined".
            return float('nan')
        ranked_docs = np.array(list(scores.keys()))
        ranked_values = np.array(list(scores.values()))
        y_true = np.isin(ranked_docs, relevant_docs)
        return ndcg_score([y_true], [ranked_values])

    def evaluate_model(self, qrel_file, query_file, lambd=0.5, output_file='evaluation_results.csv'):
        """Compute per-query NDCG for both rankers and write the table to CSV.

        NOTE: relevance labels are looked up only for the documents returned by
        the rankers, so relevant documents outside the preselected candidate
        set do not lower the reported NDCG.

        Args:
            qrel_file: CSV with the columns ``Topic``, ``Document#`` and
                ``Relevancy``; rows with ``Relevancy == 1`` count as relevant.
            query_file: CSV with the columns ``qid`` and ``query``.
            lambd: Smoothing weight passed to query_likelihood.
            output_file: Destination CSV path.

        Returns:
            pandas.DataFrame: Columns ``qid``, ``ql_ndcg`` and ``bm25_ndcg``,
            one row per query (the same table that is written to disk).
        """
        qrel_data = pd.read_csv(qrel_file)

        query_data = pd.read_csv(query_file)
        query_data['query'] = query_data['query'].apply(self.index.preprocess_text)

        # Group the relevant documents once instead of filtering the whole
        # qrel table again for every query.
        relevant_rows = qrel_data[qrel_data['Relevancy'] == 1]
        relevant_by_topic = relevant_rows.groupby('Topic')['Document#'].apply(list).to_dict()

        records = []
        for qid, query in zip(query_data['qid'], query_data['query']):
            relevant_docs = relevant_by_topic.get(qid, [])

            ql_scores = self.query_likelihood(query, lambd)
            bm25_scores = self.bm25_ranking(query)

            records.append({
                'qid': qid,
                'ql_ndcg': self._ndcg(ql_scores, relevant_docs),
                'bm25_ndcg': self._ndcg(bm25_scores, relevant_docs),
            })

        results = pd.DataFrame(records, columns=['qid', 'ql_ndcg', 'bm25_ndcg'])
        results.to_csv(output_file, index=False)
        return results

In [75]:
# Forward slashes keep these relative paths valid on Windows and POSIX alike;
# the previous backslash form only resolved on Windows.
data_fine_name = "MSMARCO_SMALL/collection_small.csv"
# data_fine_name = "data/collection.tsv"
# NOTE: despite the ".json" in its name, this file is the gzip-compressed
# pickle written by InvertedIndex.save_index.
index_file_name = 'index2.json.gz'

index = InvertedIndex()
# True: build the index from the collection file and save it.
# False: load the previously saved index instead.
build = True

if build:
    index.build_index(data_fine_name)
    index.save_index(index_file_name)
else:
    index.load_index(index_file_name)

pid         int64
passage    object
dtype: object


In [93]:
# Forward slashes keep these relative paths valid on Windows and POSIX alike.
query_file = "MSMARCO_SMALL/queries_small.csv"
qrel_file = "MSMARCO_SMALL/qrel_small.csv"
retrival_model = RetrievalModel(index)

# Scores every query with both rankers; results go to the default output CSV.
retrival_model.evaluate_model(qrel_file, query_file)

In [98]:
# Same evaluation as the preceding cell (this cell repeats it verbatim).
# Forward slashes keep these relative paths valid on Windows and POSIX alike.
query_file = "MSMARCO_SMALL/queries_small.csv"
qrel_file = "MSMARCO_SMALL/qrel_small.csv"
retrival_model = RetrievalModel(index)

retrival_model.evaluate_model(qrel_file, query_file)

In [73]:
# Sanity check: how many candidate documents does preselection return for one query?
# Forward slashes keep this relative path valid on Windows and POSIX alike.
query_file = "MSMARCO_SMALL/queries_small.csv"
query_data = pd.read_csv(query_file)
query_data['query'] = query_data['query'].apply(index.preprocess_text)

# Second query of the file, after preprocessing.
query = query_data['query'].iloc[1]

print(query)

# Relies on `retrival_model` created in an earlier cell.
len(retrival_model.preselect_docs(query))

['execution']


1