In [1]:
import scipy.sparse
import numpy as np
from unidecode import unidecode

from KeywordSearch.loader import metadata, stemmer, processed_books, LOOKUP_TABLE_PATH
from KeywordSearch.kwsearch import regex_tokenise
from KeywordSearch.utils import save_pickle

Using `is` instead of `=` for comparison in performance-critical code is acceptable
Downloading stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10022\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All languages: ['afrikaans', 'arabic', 'breton', 'bulgarian', 'catalan', 'chinese', 'czech', 'danish', 'dutch', 'english', 'esperanto', 'estonian', 'finnish', 'french', 'galician', 'german', 'greek', 'hebrew', 'hungarian', 'icelandic', 'interlingua ', 'inuktitut', 'irish', 'italian', 'japanese', 'korean', 'latin', 'lithuanian', 'maori', 'norwegian', 'occitan ', 'persian', 'polish', 'portuguese', 'romanian', 'russian', 'sanskrit', 'serbian', 'slovenian', 'spanish', 'swedish', 'tagalog', 'telugu', 'tibetan', 'welsh', 'western frisian', 'yiddish']


In [2]:
def tokenise_text(input_str: str) -> list[str]:
    """Normalise a string and return its stemmed tokens.

    The input is passed through ``unidecode``, case-folded, split into tokens
    with ``regex_tokenise.findall`` and finally stemmed with
    ``stemmer.stemWords``.
    """
    normalised = unidecode(input_str).casefold()
    raw_tokens = regex_tokenise.findall(normalised)
    return stemmer.stemWords(raw_tokens)

In [3]:
def index_meta_data(metadata_dict: dict):
    """Build and save boolean token-by-book indexes for subjects, titles and authors.

    Args:
        metadata_dict: Maps each book id to a 4-item record. The first item is
            ignored; the remaining three are the book's subjects (an iterable
            of tokens, indexed as-is), its title and its author (both passed
            through ``tokenise_text``). Book ids are used directly as column
            indexes, so they are presumably non-negative integers no larger
            than ``max(processed_books)`` -- TODO confirm against the loader.

    Raises:
        ValueError: If ``LOOKUP_TABLE_PATH`` does not contain the file name
            ``lookup_table.npz`` exactly once, because the output paths are
            derived by substituting that file name.

    Side effects:
        Writes ``subject_index.npz``, ``title_index.npz`` and
        ``author_index.npz`` (CSR matrices of shape
        ``(vocabulary size, max(processed_books) + 1)`` with boolean entries)
        and ``metadata_index_lookup.pkl`` (a tuple of the three
        token -> row-number dicts, in subject/title/author order) at the
        location of ``LOOKUP_TABLE_PATH`` with the file name swapped.
    """
    template_name = "lookup_table.npz"
    # Validate before doing any work: without exactly one occurrence the
    # substitution below would either be a no-op (and overwrite the lookup
    # table itself) or touch more than the file name.
    if LOOKUP_TABLE_PATH.count(template_name) != 1:
        raise ValueError(
            "LOOKUP_TABLE_PATH must contain %r exactly once, got %r"
            % (template_name, LOOKUP_TABLE_PATH)
        )

    def output_path(file_name: str) -> str:
        # Substitute the file name directly instead of going through a "%s"
        # template, so a literal "%" elsewhere in the path cannot break or
        # alter the result.
        return LOOKUP_TABLE_PATH.replace(template_name, file_name)

    field_names = ("subject", "title", "author")

    # First pass: tokenise every book once and collect one vocabulary per field.
    vocabularies = [set() for _ in field_names]
    tokens_buffer = dict()
    for book_id, (_, subjects, title, author) in metadata_dict.items():
        # Subjects are already discrete tokens; title and author are free text.
        book_tokens = (subjects, tokenise_text(title), tokenise_text(author))
        for vocabulary, tokens in zip(vocabularies, book_tokens):
            vocabulary.update(tokens)
        tokens_buffer[book_id] = book_tokens

    # Sorting makes the token -> row assignment deterministic across runs.
    lookups = [
        {token: row for row, token in enumerate(sorted(vocabulary))}
        for vocabulary in vocabularies
    ]

    # One column per possible book id up to the largest processed book.
    book_limit = max(processed_books) + 1
    indexes = [
        scipy.sparse.dok_array((len(lookup), book_limit), dtype=np.bool_)
        for lookup in lookups
    ]

    # Second pass: mark every (token, book) pair that occurs.
    for book_id, book_tokens in tokens_buffer.items():
        for index, lookup, tokens in zip(indexes, lookups, book_tokens):
            for token in tokens:
                index[lookup[token], book_id] = True

    # Save only after all three indexes were built successfully.
    for field_name, index in zip(field_names, indexes):
        scipy.sparse.save_npz(output_path(field_name + "_index.npz"), index.tocsr())

    save_pickle(tuple(lookups), output_path("metadata_index_lookup.pkl"))

In [4]:
# Build the subject/title/author indexes from the loaded metadata and write
# them to paths derived from LOOKUP_TABLE_PATH.
index_meta_data(metadata)