# DIS-Project-1: Document Retrieval

This notebook contains the code for generating the final "submission.csv" file for DIS Project 1. It preprocesses the queries and then retrieves the top 10 ranked documents for each query with a BM25 retrieval model. The final output is saved in the "submission.csv" file. This notebook does not preprocess the corpus or compute the TF and IDF statistics; those are precomputed elsewhere and only loaded here. You can find all the code with the full implementation in the GitHub repository: https://github.com/JiayiLi1608/DIS_LJY_HS_AS

## Kaggle Initial Setup and Data Loading

This section contains the code for loading the data and setting up the Kaggle environment.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree recursively and print the full path of every file.
# The second tuple item (the sub-directory names) is not needed, hence the "_".
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Libraries

All the Libraries used in the code are imported in this section.

In [None]:
import math
from collections import defaultdict
import pickle
import os
import gc
import nltk
from konlpy.tag import Okt
from nltk.stem import SnowballStemmer
import tqdm
import string
from nltk.util import ngrams
from joblib import dump
import time
from joblib import dump, load
import concurrent.futures
import time

# Fetch the NLTK resources used by the preprocessing code below: the "punkt"
# tokenizer models, the "stopwords" corpus and the "wordnet" corpus.
# quiet=True suppresses the download log output.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/ # Needed because in kaggle the wordnet is not unzipped

## Preprocessing
Below is the code for preprocessing the queries. The preprocessing steps include:
- Lowercasing
- Removing Punctuation
- Removing Stopwords
- Tokenization
- Lemmatization

Preprocessing differs slightly from language to language, especially for Korean, which requires a different tokenization approach.

In [None]:
# Load stopwords
def load_stopwords(languages=('english', 'french', 'german', 'spanish', 'italian')):
    """Collect the NLTK stop words of several languages into a single set.

    Args:
        languages: Iterable of NLTK stop-word list names to merge. The default
            is an immutable tuple so it can never be shared-and-mutated
            between calls.

    Returns:
        A set holding the union of the stop words of every language that
        could be loaded. A language whose list cannot be loaded is skipped
        (best effort), but a warning is printed instead of failing silently.
    """
    stop_words = set()
    for lang in languages:
        try:
            stop_words.update(nltk.corpus.stopwords.words(lang))
        except Exception as err:
            # Keep the original best-effort behaviour, but make the problem
            # visible. Unlike a bare "except:", this does not swallow
            # KeyboardInterrupt / SystemExit.
            print(f"Warning: could not load stopwords for '{lang}': {err!r}")
    return stop_words

def preprocess_text(text, lang):
    """Normalise one query/document string into a space-joined token string.

    Steps: strip ASCII punctuation (``string.punctuation``), tokenise in a
    language-dependent way, drop stop words, lemmatise (English) or stem
    (French/German/Spanish/Italian), and for the stemmed languages append
    underscore-joined bigrams of the resulting tokens.

    Note that the tokens themselves are not lower-cased; only the stop-word
    membership test is case-insensitive.

    Args:
        text: The raw text. Any non-string value is treated as an empty string.
        lang: Language code ('en', 'fr', 'de', 'es', 'it', 'ko', or anything
            else, which falls back to whitespace splitting).

    Returns:
        The processed tokens joined by single spaces.
    """
    stemmed_langs = ('fr', 'de', 'es', 'it')

    raw = text if isinstance(text, str) else ""
    raw = raw.translate(str.maketrans('', '', string.punctuation))

    # Language-specific tokenisation.
    if lang == 'ko':
        words = okt.morphs(raw)
    elif lang == 'en' or lang in stemmed_langs:
        words = nltk.word_tokenize(raw)
    else:
        words = raw.split()

    # Stop words are compared case-insensitively, tokens keep their casing.
    words = [w for w in words if w.lower() not in stop_words]

    if lang == 'en':
        words = [lemmatizer.lemmatize(w) for w in words]
    elif lang in stemmed_langs:
        stemmer = stemmer_dict.get(lang)
        if stemmer:
            words = [stemmer.stem(w) for w in words]
        # Bigrams are only added for the stemmed languages.
        if len(words) >= 2:
            words = words + ['_'.join(pair) for pair in ngrams(words, 2)]

    return ' '.join(words)

# Shared NLP resources used by preprocess_text; created once at module level.
okt = Okt()
lemmatizer = nltk.WordNetLemmatizer()

# One Snowball stemmer per language code.
stemmer_dict = {
    code: SnowballStemmer(name)
    for code, name in (
        ('fr', 'french'),
        ('de', 'german'),
        ('es', 'spanish'),
        ('it', 'italian'),
        ('en', 'english'),
    )
}

print("Loading stopwords...")
stop_words = load_stopwords()

## BM25 Retrieval Model
Below is the class for the BM25 retrieval model. The class contains the following methods:
- `__init__`: Initializes the BM25 model with the given parameters.
- `build`: Builds the term-frequency and document frequency dictionaries, and the inverted index. 
- `precompute_idf`: Precomputes the IDF values for all the terms in the corpus.
- `calculate_scores`: Calculates the BM25 scores for all the documents in the corpus.
- `retrieve_top_n`: Returns the top N ranked documents for a given query.


In [None]:
class BM25:
    """BM25 ranking model over a tokenized corpus.

    The model can either be built from scratch with :meth:`build`, or be
    populated by assigning precomputed values to its attributes
    (``term_freqs``, ``corpus_size``, ``avgdl``, ``idf``, ``inverted_index``,
    ``doc_lengths``, ``precomputed_idf``) before scoring.
    """

    def __init__(self, k1=1.5, b=0.75):
        """Create an empty model.

        Args:
            k1: Term-frequency saturation parameter of the BM25 formula.
            b: Document-length normalisation parameter of the BM25 formula.
        """
        self.k1 = k1
        self.b = b
        # Number of documents; an int because it is used as an array size.
        self.corpus_size = 0
        # Average document length in tokens.
        self.avgdl = 0.0
        # term -> number of documents containing the term.
        self.df = defaultdict(int)
        # term -> inverse document frequency.
        self.idf = {}
        # term -> list of ids of the documents containing the term.
        self.inverted_index = defaultdict(list)
        # Per-document mapping term -> term frequency, indexed by doc id.
        self.term_freqs = []
        # doc id -> document length; a mapping so an empty model can be scored.
        self.doc_lengths = {}
        # term -> idf, the table actually read by calculate_scores.
        self.precomputed_idf = {}

    def build(self, tokenized_corpus, lang):
        """Index a corpus and compute every statistic needed for scoring.

        Intended for a model that is populated only through ``build``; calling
        it again appends the new documents after the existing ones.

        Args:
            tokenized_corpus: Iterable of documents, each an iterable of tokens.
            lang: Unused; kept so existing callers keep working.
        """
        first_id = len(self.term_freqs)
        for doc_id, document in enumerate(tokenized_corpus, start=first_id):
            freq = defaultdict(int)
            for word in document:
                freq[word] += 1
            self.term_freqs.append(freq)
            self.doc_lengths[doc_id] = sum(freq.values())
            for word in freq:
                self.df[word] += 1
                self.inverted_index[word].append(doc_id)

        # These must be known before the IDF is computed; leaving corpus_size
        # at zero made every IDF value negative.
        self.corpus_size = len(self.term_freqs)
        if self.corpus_size:
            self.avgdl = sum(self.doc_lengths.values()) / self.corpus_size
        else:
            self.avgdl = 0.0

        for word, freq in self.df.items():
            # The "1 +" inside the log keeps the IDF non-negative.
            self.idf[word] = math.log(
                1 + (self.corpus_size - freq + 0.5) / (freq + 0.5)
            )
        self.precomputed_idf = dict(self.idf)

    def precompute_doc_lengths(self):
        """Return a dict doc id -> document length derived from ``term_freqs``.

        Shows a tqdm progress bar while iterating over ``corpus_size`` ids.
        """
        return {doc_id: sum(self.term_freqs[doc_id].values()) for doc_id in tqdm.tqdm(range(self.corpus_size))}

    def precompute_idf(self):
        """Return a plain-dict copy of ``idf`` (with a tqdm progress bar)."""
        return {word: self.idf[word] for word in tqdm.tqdm(self.idf.keys())}

    def calculate_scores(self, query):
        """Compute the BM25 score of every document for a tokenized query.

        Args:
            query: Iterable of query tokens; duplicates are counted once.

        Returns:
            A NumPy float array of length ``corpus_size`` holding the score of
            each document (0.0 for documents matching no query term).
        """
        scores = np.zeros(self.corpus_size)
        k1 = self.k1
        b = self.b
        avgdl = self.avgdl

        for word in set(query):
            if word not in self.precomputed_idf:
                continue
            idf = self.precomputed_idf[word]
            # .get() avoids inserting an empty posting list into the
            # defaultdict for a term that has an IDF but no postings.
            for doc_id in self.inverted_index.get(word, ()):
                tf = self.term_freqs[doc_id][word]
                dl = self.doc_lengths[doc_id]
                scores[doc_id] += idf * (
                    (tf * (k1 + 1)) / (tf + k1 * (1 - b + dl / avgdl))
                )

        return scores

    def retrieve_top_n(self, query, n=10):
        """Return the indices of the ``n`` best-scoring documents.

        Args:
            query: Iterable of query tokens.
            n: Number of documents to return.

        Returns:
            A NumPy integer array of document indices ordered by descending
            score. It holds at most ``n`` entries, fewer when the corpus is
            smaller, and is empty when ``n`` is not positive.
        """
        scores = self.calculate_scores(query)
        if n <= 0:
            # argpartition(...)[-0:] would otherwise return every document.
            return np.array([], dtype=np.intp)
        if n >= len(scores):
            return np.argsort(scores)[::-1]
        # Partial selection of the n best, then an exact sort of just those.
        candidates = np.argpartition(scores, -n)[-n:]
        return candidates[np.argsort(scores[candidates])[::-1]]

## Hybrid Search Class
The main purpose of this class is to load and preprocess the search queries.

In [None]:
def load_pickle(path):
    """Read the pickle file at ``path`` and return the deserialized object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(path, mode='rb') as stream:
        payload = pickle.load(stream)
    return payload

class HybridSearch:
    """Loads the test queries and preprocesses them for retrieval."""

    def __init__(self):
        pass

    def load_preprocessed_queries(self):
        """Read the test set and preprocess every query.

        The query languages are taken from the pickled language list when that
        file exists; otherwise they fall back to the ``lang`` column of the
        test CSV, so the method no longer fails with an unbound variable when
        the pickle is missing.

        Returns:
            A tuple ``(preprocessed_queries, query_langs)``: the list of
            preprocessed query strings in test-set order, and the languages of
            the queries.
        """
        print("Preprocessing the Queries")
        langs_path = '/kaggle/input/dis-project-1/test_query_langs.pkl'
        test_path = '/kaggle/input/dis-project-1-document-retrieval/test.csv'

        query_langs = None
        if os.path.exists(langs_path):
            query_langs = load_pickle(langs_path)

        print("Starting Preprocessing")
        test_cv = pd.read_csv(test_path)
        if query_langs is None:
            # No pickled language list: use the languages stored with the queries.
            query_langs = test_cv['lang'].tolist()

        preprocessed_q = [
            preprocess_text(query, lang)
            # zip() has no length, so give the progress bar its total explicitly.
            for query, lang in tqdm.tqdm(
                zip(test_cv['query'], test_cv['lang']), total=len(test_cv)
            )
        ]

        return preprocessed_q, query_langs

## Optimized Function for Loading Preprocessed Data
Below are the functions whose main purpose is to load the preprocessed data as quickly as possible. Due to the limitations of Kaggle I/O speed, loading the data takes a long time. For this reason we split all the data into batches and load those batches back in parallel. This reduced the loading time considerably, although loading the data still takes a long time.

In [None]:
def retrieve_top_n_batch(args):
    """Rank documents for a whole batch of tokenized queries.

    Args:
        args: A 3-tuple ``(bm25_model, tokenized_query_batch, k)``: the model
            to query, the batch of tokenized queries and the number of
            documents to return per query.

    Returns:
        A list with one ``retrieve_top_n`` result per query, in batch order.
    """
    model, query_batch, top_k = args
    ranked = []
    for tokens in tqdm.tqdm(query_batch):
        ranked.append(model.retrieve_top_n(tokens, n=top_k))
    return ranked

def save_model_compressed(model, path, compress=0):
    """Persist ``model`` to ``path`` with joblib.

    Despite its name, this function has always written the file without
    compression. That stays the default so existing files and callers are
    unaffected; compression is now available on request.

    Args:
        model: The object to serialize.
        path: Destination file path passed to ``joblib.dump``.
        compress: Compression setting forwarded to ``joblib.dump``. The
            default 0 means no compression; pass e.g. 3 to compress.
            NOTE(review): per the joblib docs, ``mmap_mode`` has no effect on
            compressed files, so keep 0 for files read by ``load_model_mmap``.
    """
    dump(model, path, compress=compress)

def load_model_mmap(path, mmap_mode='r+'):
    """Load a joblib file, memory-mapping it where joblib supports that.

    Args:
        path: Path of the file written by ``joblib.dump``.
        mmap_mode: Memory-map mode forwarded to ``joblib.load``. The default
            ``'r+'`` (unchanged) opens the backing file for reading and
            writing; pass ``'r'`` when the file lives in a read-only location
            such as the Kaggle input directory.

    Returns:
        The deserialized object.
    """
    return load(path, mmap_mode=mmap_mode)

def load_pickle_file(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Used as the per-file worker of the batched loaders. Only call it on
    trusted files, because unpickling can run arbitrary code.
    """
    with open(file_path, "rb") as handle:
        obj = pickle.load(handle)
    return obj

def load_model_picklebatches(path, lang, batch_size=None):
    """Load the per-document term-frequency tables of one language.

    The tables are stored as ``{path}/batch_{i}_{lang}.pkl`` files. They are
    read in groups of ``batch_size`` files, the files of a group being loaded
    concurrently by a thread pool, and concatenated in file order.

    Args:
        path: Directory containing the batch files.
        lang: Language code; must be a key of the batch-count table below.
        batch_size: Number of files loaded concurrently. ``None`` (default)
            selects the previous hard-coded values: 10 for English, 5 for any
            other language. An explicit value is now honoured instead of
            being silently overwritten.

    Returns:
        A list with the contents of all batch files, in file order.

    Raises:
        KeyError: If ``lang`` has no entry in the batch-count table.
        ValueError: If ``batch_size`` is given but smaller than 1.
    """
    # Number of batch files that exist for each language.
    batch_per_lang = {"ar": 9, "de": 11, "en": 208, "es": 12, "fr": 11, "it": 12, "ko": 8}

    if batch_size is None:
        batch_size = 10 if lang == "en" else 5
    elif batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    file_paths = [f"{path}/batch_{i}_{lang}.pkl" for i in range(batch_per_lang[lang])]

    # Split file paths into smaller batches
    file_batches = [file_paths[i:i + batch_size] for i in range(0, len(file_paths), batch_size)]

    term_freqs = []
    for file_batch in tqdm.tqdm(file_batches):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # executor.map preserves input order, so documents stay aligned
            # with their ids.
            results = list(executor.map(load_pickle_file, file_batch))

        for result in results:
            term_freqs.extend(result)

    return term_freqs

def load_model_picklebatches_idf(path, lang, batch_size=3):
    """Load and merge the IDF tables of one language.

    Reads the ``{path}/batch_idf{i}_{lang}.pkl`` files in groups of
    ``batch_size`` (the files of a group are loaded concurrently) and merges
    them, in file order, into a single dict.

    Args:
        path: Directory containing the batch files.
        lang: Language code; must be a key of the batch-count table below.
        batch_size: Number of files loaded concurrently per group.

    Returns:
        One dict holding the entries of every batch file; on duplicate keys
        the later file wins.
    """
    # Number of IDF batch files that exist for each language.
    batch_per_lang = {"ar": 2, "de": 19, "en": 6, "es": 14, "fr": 16, "it": 17, "ko": 1}

    merged = {}
    n_files = batch_per_lang[lang]
    file_paths = [f"{path}/batch_idf{i}_{lang}.pkl" for i in range(n_files)]

    chunk_starts = range(0, len(file_paths), batch_size)
    file_chunks = [file_paths[start:start + batch_size] for start in chunk_starts]

    for chunk in tqdm.tqdm(file_chunks, desc=f"Loading {lang} IDF"):
        with concurrent.futures.ThreadPoolExecutor() as pool:
            loaded = list(pool.map(load_pickle_file, chunk))

        for part in loaded:
            merged.update(part)

    return merged


def load_model_picklebatches_id(path, lang, batch_size=3):
    """Load and merge the inverted-index shards of one language.

    Reads the ``{path}/batch_inverse{i}_{lang}.pkl`` files in groups of
    ``batch_size`` (the files of a group are loaded concurrently). The value
    lists of keys that occur in several files are concatenated in file order.

    Args:
        path: Directory containing the batch files.
        lang: Language code; must be a key of the batch-count table below.
        batch_size: Number of files loaded concurrently per group.

    Returns:
        A ``defaultdict(list)`` mapping each key to its merged value list.
    """
    # Number of inverted-index batch files that exist for each language.
    batch_per_lang = {"ar": 2, "de": 19, "en": 6, "es": 14, "fr": 16, "it": 17, "ko": 1}

    inverted_index = defaultdict(list)
    n_files = batch_per_lang[lang]
    file_paths = [f"{path}/batch_inverse{i}_{lang}.pkl" for i in range(n_files)]

    chunk_starts = range(0, len(file_paths), batch_size)
    file_chunks = [file_paths[start:start + batch_size] for start in chunk_starts]

    for chunk in tqdm.tqdm(file_chunks, desc=f"Loading {lang} inverted index"):
        with concurrent.futures.ThreadPoolExecutor() as pool:
            loaded = list(pool.map(load_pickle_file, chunk))

        for partial in loaded:
            for term, postings in partial.items():
                inverted_index[term].extend(postings)

    return inverted_index

 ## Ranking the Documents

To make the ranking of the documents much faster, we process the queries in batches and compute the BM25 scores for each batch sequentially. This way we aimed to reduce the computation time considerably. The parameter values for the batch size, k1, b, and n are set to their optimal values, which were found by running the model multiple times and selecting the best-performing values.

In [None]:

def retrieve_test_queries_optimized(preprocessed_queries, query_langs, k=10):
    """Retrieve the top-k document ids for every query, one language at a time.

    Queries are grouped by language. For each language the precomputed BM25
    data (term frequencies, IDF table, inverted index, document ids) is loaded
    from the Kaggle input directory, the queries of that language are scored
    in batches, and the results are written back to the original query
    positions. All model data of a language is released before the next
    language is loaded to keep peak memory down.

    Args:
        preprocessed_queries: Preprocessed query strings (space-separated tokens).
        query_langs: Language code of each query, aligned with
            ``preprocessed_queries``.
        k: Number of documents to retrieve per query.

    Returns:
        A list aligned with ``preprocessed_queries``; each element is the list
        of retrieved document ids for that query, best match first.
    """
    retrieved_docs = [None] * len(preprocessed_queries)
    queries_df = pd.DataFrame(
        {
            "query": preprocessed_queries,
            "lang": query_langs,
            "original_idx": range(len(preprocessed_queries)),
        }
    )

    # Average document lengths
    avgdl_dict = {
        "ar": 4418.960584437648,
        "de": 5575.8404294032025,
        "en": 1339.3939950714448,
        "es": 6174.31717941737,
        "fr": 6834.264518546272,
        "it": 6483.073170731707,
        "ko": 4380.481946028126,
    }

    # Corpus sizes
    corpus_size_dict = {
        "ar": 8829,
        "de": 10992,
        "en": 207363,
        "es": 11019,
        "fr": 10676,
        "it": 11250,
        "ko": 7893,
    }

    grouped = queries_df.groupby("lang")

    for lang, group in tqdm.tqdm(grouped):

        # Paths to load the model
        lang_queries = group["query"].tolist()
        lang_original_indices = group["original_idx"].tolist()
        lang_doc_ids_path = f"/kaggle/input/dis-project-1/doc_ids_{lang}.pkl"
        lang_bm25_model_path = f"/kaggle/input/dis-project-1/bm25_model_{lang}.pkl"
        lang_bm25_model_path_joblib = "/kaggle/input/dis-project-1/"

        # Load the term frequencies from their pickle batches
        print(f"Loading {lang_bm25_model_path}")

        term_freqs = load_model_picklebatches(lang_bm25_model_path_joblib, lang)

        # Loading the rest of the model
        start_time = time.time()
        idf = load_model_picklebatches_idf("/kaggle/input/dis-project-1", lang)
        elapsed = time.time() - start_time
        print(f"IDF load time: {elapsed}")
        start_time = time.time()
        inverted_index = load_model_picklebatches_id("/kaggle/input/dis-project-1/", lang)
        elapsed = time.time() - start_time
        print(f"Inverted Index load time: {elapsed}")

        start_time = time.time()
        bm25_model = BM25()
        bm25_model.term_freqs = term_freqs
        bm25_model.corpus_size = corpus_size_dict[lang]
        bm25_model.avgdl = avgdl_dict[lang]
        bm25_model.idf = idf
        bm25_model.inverted_index = inverted_index
        bm25_model.doc_lengths = bm25_model.precompute_doc_lengths()
        bm25_model.precomputed_idf = bm25_model.precompute_idf()
        elapsed = time.time() - start_time
        print(f"BM25 model load time: {elapsed}")

        # Load the document IDs (row index -> document id) from their pickle file
        with open(lang_doc_ids_path, "rb") as f:
            doc_ids = pickle.load(f)

        batch_size = 200 if lang == "en" else 100
        tokenized_queries = [query.split() for query in lang_queries]
        num_batches = math.ceil(len(tokenized_queries) / batch_size)
        batches = [tokenized_queries[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]
        original_idx_batches = [lang_original_indices[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]

        # Retrieve top-k documents for each batch of queries
        print(f"Retrieving top-{k} documents for {lang} queries...")
        start_time = time.time()

        # Process batches sequentially
        results = [retrieve_top_n_batch((bm25_model, batch, k)) for batch in batches]

        retrieval_time = time.time() - start_time
        print(f"Retrieval time: {retrieval_time}")

        # Assign retrieved documents to original indices
        for batch_results, original_indices in zip(results, original_idx_batches):
            for original_idx, result in zip(original_indices, batch_results):
                retrieved_docs[original_idx] = [doc_ids[idx] for idx in result]

        # Drop every reference to this language's data, not only the model
        # object: the locals below would otherwise keep the large structures
        # alive while the next language is being loaded.
        del bm25_model, doc_ids, lang_queries, tokenized_queries
        del term_freqs, idf, inverted_index, batches, results
        gc.collect()

    return retrieved_docs



## Main Function
The function below is the main function used to generate the final submission file. It loads the preprocessed data and then retrieves the top 10 ranked documents for each query. The final output is saved in the "submission.csv" file.

In [None]:
def main():
    """Run the full pipeline and write ``submission.csv``.

    Preprocesses the test queries, retrieves the top 10 documents for each of
    them, prints a few samples of every intermediate result, and saves a
    two-column (``id``, ``docids``) CSV without the index.
    """
    # Load test data and preprocessed queries
    searcher = HybridSearch()
    preprocessed_test_queries, test_query_langs = searcher.load_preprocessed_queries()

    print(preprocessed_test_queries[:5])
    print(test_query_langs[:5])

    # Perform retrieval on test set
    retrieved_docs_test = retrieve_test_queries_optimized(
        preprocessed_test_queries,
        test_query_langs,
        k=10,
    )

    print(retrieved_docs_test[:5])

    # One row per query: the row id is the query position in the test set.
    row_ids = np.arange(len(retrieved_docs_test))
    submission_df = pd.DataFrame({'id': row_ids, 'docids': retrieved_docs_test})

    print(submission_df[:5])
    submission_df.to_csv('submission.csv', index=False)


if __name__ == "__main__":
    main()