In [127]:
import os
import pandas as pd
from bitarray import bitarray
from sys import getsizeof

In [23]:
# Folder holding the book text files, relative to the current working directory.
data_folder_path = os.path.join('.', 'data')
# NOTE: os.listdir returns every entry of the folder in arbitrary order; the
# incidence-matrix cells use positions in this list as bit offsets, so the order
# obtained here fixes the meaning of each bit.
books_list = os.listdir(data_folder_path)
books_list

['the-adventures-of-huckleberry-finn.txt',
 'pride-and-prejudice.txt',
 'anna-karenina.txt',
 'jane-eyre.txt',
 'frankenstein.txt',
 '1984.txt',
 'the-great-gatsby.txt',
 'grimms-fairy-tales.txt',
 'wuthering-heights.txt',
 'tender-is-the-night.txt',
 'great-expectations.txt']

In [24]:
index = {}  #  inverted index: {'term': set(document_names)}

def populate_index(doc_name, words):
    """Record that every term in ``words`` occurs in document ``doc_name``.

    Updates the module-level ``index`` in place: a term seen for the first time
    gets a new posting set, a known term has ``doc_name`` added to its set.
    """
    for term in words:
        index.setdefault(term, set()).add(doc_name)

In [25]:
import codecs
import re

dictionary = set()  # vocabulary: every distinct term found across all books

# Compiled once, outside the loop. A token is a run of lowercase ASCII letters
# that may contain one inner apostrophe (’ or ') or hyphen.
# NOTE(review): both letter groups need at least one character, so one-letter
# words such as "a" and "i" are never matched and never indexed.
word_pattern = re.compile("[a-z]+[’'-]?[a-z]+")

for book in books_list:
    book_path = os.path.join(data_folder_path, book)
    print(book_path)
    # Built-in open() replaces the legacy codecs.open(); "utf_8_sig" drops a
    # leading byte-order mark if the file has one. Text-mode newline translation
    # does not affect the letter-only tokens extracted below.
    with open(book_path, "r", encoding="utf_8_sig") as fileObj:
        text = fileObj.read()

    words = word_pattern.findall(text.lower())
    print('\tTotal words count:\t', len(words))
    words = set(words)  # from here on only distinct terms matter

    #  Populate reverse index
    populate_index(book, words)

    print('\tUniq words count:\t', len(words), '\n')
    dictionary.update(words)

print('Dictinary length:', len(dictionary))

./data/the-adventures-of-huckleberry-finn.txt
	Total words count:	 105289
	Uniq words count:	 6785 

./data/pride-and-prejudice.txt
	Total words count:	 117752
	Uniq words count:	 6457 

./data/anna-karenina.txt
	Total words count:	 340710
	Uniq words count:	 13545 

./data/jane-eyre.txt
	Total words count:	 175385
	Uniq words count:	 13330 

./data/frankenstein.txt
	Total words count:	 70739
	Uniq words count:	 7067 

./data/1984.txt
	Total words count:	 100662
	Uniq words count:	 9226 

./data/the-great-gatsby.txt
	Total words count:	 45918
	Uniq words count:	 5970 

./data/grimms-fairy-tales.txt
	Total words count:	 97951
	Uniq words count:	 4967 

./data/wuthering-heights.txt
	Total words count:	 110498
	Uniq words count:	 9645 

./data/tender-is-the-night.txt
	Total words count:	 105141
	Uniq words count:	 11242 

./data/great-expectations.txt
	Total words count:	 174719
	Uniq words count:	 11455 

Dictinary length: 35961


In [128]:
# Shallow size only: sys.getsizeof counts the dict object itself, not the term
# strings or the posting sets it references.
getsizeof(index)

1310808

## Bool search in inverted index

In [91]:
# Posting sets (names of the documents containing the term) for three sample
# terms, printed one per line.
print(index['footsteps'], index['affront'], index['mellow'], sep='\n')

{'the-great-gatsby.txt', 'tender-is-the-night.txt', 'great-expectations.txt', 'anna-karenina.txt', 'the-adventures-of-huckleberry-finn.txt', 'frankenstein.txt', 'wuthering-heights.txt'}
{'the-great-gatsby.txt', 'great-expectations.txt', 'pride-and-prejudice.txt', 'anna-karenina.txt', 'the-adventures-of-huckleberry-finn.txt', 'tender-is-the-night.txt'}
{'the-great-gatsby.txt', 'anna-karenina.txt', 'the-adventures-of-huckleberry-finn.txt', 'jane-eyre.txt', 'wuthering-heights.txt'}


In [95]:
def inot(books):
    """Boolean NOT: return the documents of ``books_list`` absent from ``books``.

    ``books`` must be a set (the ``-`` operator is applied to it), for example
    a posting set taken from ``index``.
    """
    all_documents = set(books_list)
    return all_documents - books
inot(index['mellow'])

{'1984.txt',
 'frankenstein.txt',
 'great-expectations.txt',
 'grimms-fairy-tales.txt',
 'pride-and-prejudice.txt',
 'tender-is-the-night.txt'}

In [96]:
# OR query: union of the two terms' posting sets.
index['footsteps'] | index['affront']

{'anna-karenina.txt',
 'frankenstein.txt',
 'great-expectations.txt',
 'pride-and-prejudice.txt',
 'tender-is-the-night.txt',
 'the-adventures-of-huckleberry-finn.txt',
 'the-great-gatsby.txt',
 'wuthering-heights.txt'}

In [None]:
#  or == union
#  and == intersection
#  not == difference (between all documents and sample)

In [97]:
#  (word1 or word2) and not word3
#  The parentheses matter: the union is taken first, then restricted to the
#  documents that do not contain word3.
(index['footsteps'] | index['affront']) & inot(index['mellow'])

{'frankenstein.txt',
 'great-expectations.txt',
 'pride-and-prejudice.txt',
 'tender-is-the-night.txt'}

## Bool search in incidence matrix

In [113]:
# Column layout: document name -> bit position (its position in books_list).
documetn_count = len(books_list)
document_index = {name: position for position, name in enumerate(books_list)}

# Row layout: term -> row number in the matrix.
terms = list(dictionary)
term_index = {term: row for row, term in enumerate(terms)}

# Term-document incidence matrix: one bitarray per term, one bit per document,
# with a bit set when the term occurs in that document.
matrix = []
for term in terms:
    row_bits = documetn_count * bitarray('0')
    for doc_name in index[term]:
        row_bits[document_index[doc_name]] = 1
    matrix.append(row_bits)


In [129]:
# Shallow size of the outer list only; the per-term bitarray rows it references
# are not included in this number.
getsizeof(matrix)

312024

In [134]:
# Shallow size of the vocabulary set itself (the term strings are not included).
getsizeof(dictionary)

2097368

In [135]:
# Sanity check: term_index should hold exactly one row number per distinct term.
len(dictionary), len(term_index)

(35961, 35961)

In [133]:
# Shallow sizes of the lookup structures the incidence matrix relies on
# (the f-string "=" specifier prints each expression next to its value).
print(f'{getsizeof(document_index)=}\n{getsizeof(term_index)=}\n{getsizeof(books_list)=}\n{getsizeof(terms)=}')

getsizeof(document_index)=640
getsizeof(term_index)=1310808
getsizeof(books_list)=184
getsizeof(terms)=287752


In [130]:
# Combined shallow size of the matrix's lookup structures; the strings they
# reference are not counted.
getsizeof(document_index) + getsizeof(term_index) + getsizeof(books_list) + getsizeof(terms)

1599384

In [119]:
# Incidence rows for three sample terms; bit i refers to books_list[i].
print(matrix[term_index['footsteps']], matrix[term_index['affront']], matrix[term_index['mellow']])

bitarray('10101010111') bitarray('11100010011') bitarray('10110010100')


In [123]:
# OR query: bitwise OR of the two terms' incidence rows.
matrix[term_index['footsteps']] | matrix[term_index['affront']]

bitarray('11101010111')

In [120]:
# NOT query: ~ flips every bit, selecting the documents that lack the term.
~matrix[term_index['mellow']]

bitarray('01001101011')

In [125]:
#  (word1 or word2) and not word3
#  The parentheses matter: the rows are OR-ed first, then masked with the
#  inverted row of word3.
search_result = (matrix[term_index['footsteps']] | matrix[term_index['affront']]) & ~matrix[term_index['mellow']]
search_result

bitarray('01001000011')

In [126]:
def get_document_list(search_result):
    """Translate a result bit vector into document names.

    Position ``i`` of ``search_result`` corresponds to ``books_list[i]``; the
    names whose bit is set are returned in ``books_list`` order.
    """
    return [books_list[position] for position, hit in enumerate(search_result) if hit]
get_document_list(search_result)

['pride-and-prejudice.txt',
 'frankenstein.txt',
 'tender-is-the-night.txt',
 'great-expectations.txt']