In [1]:
import gc

import numpy as np
import scipy.sparse
from tqdm import tqdm, trange

from KeywordSearch import loader, indexing, utils, kwsearch, cloud_index

Using `is` instead of `=` for comparison in performance-critical code is acceptable
Downloading stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10022\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All languages: ['afrikaans', 'arabic', 'breton', 'bulgarian', 'catalan', 'chinese', 'czech', 'danish', 'dutch', 'english', 'esperanto', 'estonian', 'finnish', 'french', 'galician', 'german', 'greek', 'hebrew', 'hungarian', 'icelandic', 'interlingua ', 'inuktitut', 'irish', 'italian', 'japanese', 'korean', 'latin', 'lithuanian', 'maori', 'norwegian', 'occitan ', 'persian', 'polish', 'portuguese', 'romanian', 'russian', 'sanskrit', 'serbian', 'slovenian', 'spanish', 'swedish', 'tagalog', 'telugu', 'tibetan', 'welsh', 'western frisian', 'yiddish']


In [2]:
def calculate_raw_tf(counts_index: list[dict], max_book_id: int, scale: float = 100.0, dtype=np.float32):
    shape = (max_book_id + 1, len(counts_index))
    mat = scipy.sparse.dok_matrix(shape, dtype=dtype)
    for i, token_dict in tqdm(enumerate(counts_index), total=len(counts_index), ncols=80, miniters=5000):
        mat[list(token_dict.keys()), i] = np.array(list(token_dict.values()), np.uint32) / scale # uint32 and divide by scale avoid overflow
    return mat

In [3]:
def calculate_idf(counts_index: list[dict], N_doc: int, smoothing: int=1, **kwargs):
    n_t = np.array([len(d) for d in counts_index], **kwargs)
    return np.log2((N_doc + smoothing) / (n_t + smoothing))

In [4]:
N_doc = len(loader.processed_books)

In [5]:
doc_len = indexing.fetch_all_doc_length(batch_size=100)

Counting books: 100%|██████████| 70974/70974 [00:44<00:00, 1604.98it/s]


In [6]:
gc.collect()
counts_index = loader.load_merged_index(max_workers=8, dummy=True)

2987 segments to load


Loading segments: 100%|██████████| 2987/2987 [02:45<00:00, 18.09it/s]
Merging segments: 100%|██████████| 2987/2987 [00:09<00:00, 299.43it/s]



Garbage collection done
The index took 2 minutes  56 seconds to load
All done


In [7]:
gc.collect()
doc_len_arr = np.array(utils.dict2arr(doc_len, dtype=np.float32), dtype=np.float32, ndmin=2).T
del doc_len
gc.collect()

0

In [8]:
idf_arr = np.array(calculate_idf(counts_index, N_doc, dtype=np.float32), dtype=np.float16)

In [9]:
tf = calculate_raw_tf(counts_index, max(loader.processed_books), scale=100.0)
tf

  0%|                                              | 0/14930836 [00:00<?, ?it/s]

100%|████████████████████████████| 14930836/14930836 [16:20<00:00, 15224.86it/s]


<72535x14930836 sparse matrix of type '<class 'numpy.float32'>'
	with 357847382 stored elements in Dictionary Of Keys format>

In [10]:
tfidf = tf.multiply(1 / (doc_len_arr)).multiply(1 / idf_arr)
del tf, counts_index
gc.collect()

  tfidf = tf.multiply(1 / (doc_len_arr)).multiply(1 / idf_arr)


0

In [11]:
# scipy's spare matrix multiplication code is very flawed, ndmin=2 and transpose are necessary
norm = np.array(scipy.sparse.linalg.norm(tfidf, 2, axis=1), ndmin=2).T
tfidf = tfidf.multiply(1 / norm) # normalization

  tfidf = tfidf.multiply(1 / norm) # normalization


In [14]:
gc.collect()
scipy.sparse.save_npz("KeywordSearch/tfidf.npz", tfidf.tocsr().astype(np.float16))
np.savez_compressed("KeywordSearch/idf.npz", idf_arr)