Обрати один з вказаних методів та стиснути словник і інвертований індекс колекції

In [110]:
import os
import numpy as np

In [111]:
# Corpus location: a "data" directory next to the notebook.
data_folder_path = os.path.join(os.curdir, 'data')
# NOTE(review): os.listdir returns entries in arbitrary order, and the numeric
# document ids assigned later follow this order -- confirm that is acceptable.
books_list = os.listdir(data_folder_path)
books_list

['the-adventures-of-huckleberry-finn.txt',
 'g1_1.txt',
 'g1_0.txt',
 'g1_10.txt',
 'g1_2.txt',
 'pride-and-prejudice.txt',
 'g1_3.txt',
 'anna-karenina.txt',
 'g1_7.txt',
 'jane-eyre.txt',
 'g1_6.txt',
 'g1_4.txt',
 'g1_5.txt',
 'frankenstein.txt',
 'g1_8.txt',
 'g1_9.txt',
 '1984.txt',
 'the-great-gatsby.txt',
 'grimms-fairy-tales.txt',
 'wuthering-heights.txt',
 'tender-is-the-night.txt',
 'great-expectations.txt']

In [112]:
index = {}  # inverted index: {'term': set(document_names)}

def populate_index(doc_name, words):
    """Register ``doc_name`` as a document containing each term in ``words``.

    The module-level ``index`` is updated in place: a term seen for the first
    time gets a fresh set, and ``doc_name`` is added to that term's set.

    Args:
        doc_name: Identifier of the document (here: its file name).
        words: Iterable of terms found in the document.
    """
    for term in words:
        index.setdefault(term, set()).add(doc_name)

In [113]:
import codecs
import re

# Read every book, extract its vocabulary and register it in the inverted index.
for book in books_list:
    book_path = os.path.join(data_folder_path, book)
    # The "utf_8_sig" codec decodes UTF-8 and drops a leading BOM when present.
    with codecs.open(book_path, mode="r", encoding="utf_8_sig") as fileObj:
        text = fileObj.read()

    # A token is a run of letters, optionally joined once by an apostrophe or
    # hyphen to another run of letters; the set keeps one copy of each token.
    # NOTE(review): the pattern needs at least two letters in total, so
    # one-letter words are never indexed -- confirm this is intended.
    words = set(re.findall("[a-z]+[’'-]?[a-z]+", text.lower()))

    # Add this book to the posting set of every term it contains.
    populate_index(book, words)

In [114]:
# Inspect the inverted index built above (term -> set of file names).
index

{'marked': {'1984.txt',
  'anna-karenina.txt',
  'frankenstein.txt',
  'g1_0.txt',
  'g1_1.txt',
  'g1_10.txt',
  'g1_2.txt',
  'g1_3.txt',
  'g1_4.txt',
  'g1_5.txt',
  'g1_6.txt',
  'g1_7.txt',
  'g1_8.txt',
  'g1_9.txt',
  'great-expectations.txt',
  'grimms-fairy-tales.txt',
  'jane-eyre.txt',
  'pride-and-prejudice.txt',
  'tender-is-the-night.txt',
  'the-adventures-of-huckleberry-finn.txt',
  'the-great-gatsby.txt',
  'wuthering-heights.txt'},
 'cited': {'anna-karenina.txt',
  'g1_10.txt',
  'g1_6.txt',
  'g1_9.txt',
  'the-adventures-of-huckleberry-finn.txt'},
 'waw-path': {'g1_0.txt',
  'g1_1.txt',
  'g1_10.txt',
  'g1_5.txt',
  'g1_7.txt',
  'the-adventures-of-huckleberry-finn.txt'},
 'consekens': {'g1_0.txt',
  'g1_1.txt',
  'g1_4.txt',
  'g1_8.txt',
  'the-adventures-of-huckleberry-finn.txt'},
 'crossbones': {'g1_10.txt',
  'g1_5.txt',
  'g1_9.txt',
  'the-adventures-of-huckleberry-finn.txt'},
 'acquainted': {'anna-karenina.txt',
  'frankenstein.txt',
  'g1_0.txt',
  'g1_1.

In [115]:
# Vocabulary in lexicographic order: iterating the index yields its terms.
dictionary = list(index)
dictionary.sort()
dictionary[:10]

['a-a',
 'a-barking',
 'a-beaming',
 'a-bear',
 'a-bed',
 'a-begging',
 'a-bilin',
 'a-biling',
 'a-bitch',
 'a-blazing']

In [116]:
# "Dictionary as a string" compression: all terms are stored back to back in
# one string, and each term is represented only by the offset where it starts.
compresed_dictionary = []
offset = 0
for term in dictionary:
    compresed_dictionary.append(offset)
    # Track the running length as an integer instead of growing the string
    # inside the loop.
    offset += len(term)
# Build the concatenated term string once rather than with repeated `+=`.
term_string = ''.join(dictionary)
compresed_dictionary[:10], term_string[:60]

([0, 3, 12, 21, 27, 32, 41, 48, 56, 63],
 'a-aa-barkinga-beaminga-beara-beda-begginga-bilina-bilinga-bi')

In [117]:
from pympler.asizeof import asizeof

# Compare the footprint of the plain sorted term list with the compressed
# representation (one concatenated term string plus the list of offsets).
# NOTE(review): asizeof presumably measures in-memory Python objects, so the
# offsets list is counted with per-object overhead -- confirm against pympler docs.
print(f'Original dictionary size: {asizeof(dictionary)}')
compr_dict_size = asizeof(compresed_dictionary)
term_str_size = asizeof(term_string)
# The total is reported together with its two components.
print(f'Compressed dictionary size: {compr_dict_size+term_str_size} ({term_str_size} for term string and {compr_dict_size} for pointers)')

Original dictionary size: 2506464
Compressed dictionary size: 2041176 (578408 for term string and 1462768 for pointers)


In [118]:
# File name -> numeric document id (the file's position in books_list).
doc_id_dict = {name: doc_id for doc_id, name in enumerate(books_list)}
doc_id_dict

{'the-adventures-of-huckleberry-finn.txt': 0,
 'g1_1.txt': 1,
 'g1_0.txt': 2,
 'g1_10.txt': 3,
 'g1_2.txt': 4,
 'pride-and-prejudice.txt': 5,
 'g1_3.txt': 6,
 'anna-karenina.txt': 7,
 'g1_7.txt': 8,
 'jane-eyre.txt': 9,
 'g1_6.txt': 10,
 'g1_4.txt': 11,
 'g1_5.txt': 12,
 'frankenstein.txt': 13,
 'g1_8.txt': 14,
 'g1_9.txt': 15,
 '1984.txt': 16,
 'the-great-gatsby.txt': 17,
 'grimms-fairy-tales.txt': 18,
 'wuthering-heights.txt': 19,
 'tender-is-the-night.txt': 20,
 'great-expectations.txt': 21}

In [119]:
# Numeric document id -> file name (reverse of the name -> id mapping).
id_doc_dict = dict(enumerate(books_list))
id_doc_dict

{0: 'the-adventures-of-huckleberry-finn.txt',
 1: 'g1_1.txt',
 2: 'g1_0.txt',
 3: 'g1_10.txt',
 4: 'g1_2.txt',
 5: 'pride-and-prejudice.txt',
 6: 'g1_3.txt',
 7: 'anna-karenina.txt',
 8: 'g1_7.txt',
 9: 'jane-eyre.txt',
 10: 'g1_6.txt',
 11: 'g1_4.txt',
 12: 'g1_5.txt',
 13: 'frankenstein.txt',
 14: 'g1_8.txt',
 15: 'g1_9.txt',
 16: '1984.txt',
 17: 'the-great-gatsby.txt',
 18: 'grimms-fairy-tales.txt',
 19: 'wuthering-heights.txt',
 20: 'tender-is-the-night.txt',
 21: 'great-expectations.txt'}

In [120]:
# Replace file names with numeric ids and keep each posting list sorted
# ascending, which the gap encoding in the next step relies on.
posting_index = {
    term: sorted(doc_id_dict[name] for name in docs)
    for term, docs in index.items()
}
posting_index

{'marked': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21],
 'cited': [0, 3, 7, 10, 15],
 'waw-path': [0, 1, 2, 3, 8, 12],
 'consekens': [0, 1, 2, 11, 14],
 'crossbones': [0, 3, 12, 15],
 'acquainted': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 21],
 'forsook': [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 14, 15, 16, 21],
 'down-stream': [0, 3, 4, 6, 8, 10, 11, 12, 14, 15],
 'hole’s': [0, 1, 2, 3, 4, 10, 11, 14, 15, 21],
 'studied': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  21],
 'boom': [0, 1, 2, 3, 4, 6, 10, 11, 12, 14, 15, 17, 21],
 'center': [0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 17],
 'budge': [0, 1, 3, 4, 6, 10, 11, 12, 14, 15],
 'slammed': [0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 19],
 'yourselves': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  10,
  11,
  12,
  13,
  14,
  15,
  18,
  19,
  20,
  21],
 'harem': [

In [121]:
interval_posting_index = {}
for term, doc_ids in posting_index.items():
    interval_posting_index[term] = [doc_ids[0]] + [doc_ids[i+1]-doc_ids[i] for i in range(len(doc_ids)-1)]
interval_posting_index

{'marked': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'cited': [0, 3, 4, 3, 5],
 'waw-path': [0, 1, 1, 1, 5, 4],
 'consekens': [0, 1, 1, 9, 3],
 'crossbones': [0, 3, 9, 3],
 'acquainted': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2],
 'forsook': [0, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 5],
 'down-stream': [0, 3, 1, 2, 2, 2, 1, 1, 2, 1],
 'hole’s': [0, 1, 1, 1, 1, 6, 1, 3, 1, 6],
 'studied': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
 'boom': [0, 1, 1, 1, 1, 2, 4, 1, 1, 2, 1, 2, 4],
 'center': [0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2],
 'budge': [0, 1, 2, 1, 2, 4, 1, 1, 2, 1],
 'slammed': [0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2],
 'yourselves': [0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1],
 'harem': [0, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 5],
 'searched': [0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1],
 'givin': [0, 3, 1, 6, 4],
 'likely': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [122]:
# Scratch check: binary digits of 824 with the '0b' prefix sliced off.
bin(824)[2:]

'1100111000'

In [123]:
def binaryze_number(number, bit_in_byte=8):
    """Return ``number`` in binary, left-padded with zeros to whole groups.

    Args:
        number: Non-negative integer to render.
        bit_in_byte: Group width in bits; the length of the result is always
            a multiple of this value.

    Returns:
        A string of '0'/'1' characters holding the shortest zero-padded
        representation whose length is a multiple of ``bit_in_byte``.

    Raises:
        ValueError: If ``number`` is negative (slicing ``bin()`` output of a
            negative value would leave a stray 'b' in the result).
    """
    if number < 0:
        raise ValueError(f'binaryze_number expects a non-negative integer, got {number}')
    bin_number = bin(number)[2:]
    # (-length) % width is 0 when the length is already aligned, so no
    # all-zero group is prepended in that case.
    padding = -len(bin_number) % bit_in_byte
    return '0' * padding + bin_number

In [124]:
def vb_encoding(number):
    """Encode a non-negative integer with variable-byte (VB) encoding.

    The value is split into 7-bit groups, most significant group first. Each
    group becomes one 8-character chunk: a flag bit followed by the 7 payload
    bits. The flag is '1' on the last chunk of the number and '0' on every
    chunk before it.

    Args:
        number: Non-negative integer to encode.

    Returns:
        A string of '0'/'1' characters whose length is a multiple of 8. The
        shortest possible code is produced: a value below 128 always takes
        exactly one chunk.

    Raises:
        ValueError: If ``number`` is negative.
    """
    if number < 0:
        raise ValueError(f'vb_encoding expects a non-negative integer, got {number}')
    # Peel off 7-bit groups, least significant first; zero still yields one group.
    groups = [number & 0x7F]
    number >>= 7
    while number:
        groups.append(number & 0x7F)
        number >>= 7
    # All groups except the least significant one, most significant first.
    leading = groups[:0:-1]
    res = ''.join('0' + format(group, '07b') for group in leading)
    # The least significant group closes the code, marked by flag bit '1'.
    return res + '1' + format(groups[0], '07b')
vb_encoding(824), vb_encoding(5), vb_encoding(214577)

('0000011010111000', '10000101', '000011010000110010110001')

In [125]:
def vb_decoding(code):
    """Decode a string of concatenated variable-byte codes into integers.

    The input is consumed in 8-character chunks. The first character of a
    chunk is a flag and the rest is payload; payloads are accumulated until a
    chunk flagged '1' is seen, at which point the accumulated bits are parsed
    as one base-2 integer.

    Args:
        code: String of '0'/'1' characters holding zero or more VB codes.

    Returns:
        List of decoded integers in input order. Payload left over after the
        last '1'-flagged chunk is not emitted.
    """
    decoded = []
    pending = []
    for start in range(0, len(code), 8):
        chunk = code[start:start + 8]
        flag, payload = chunk[:1], chunk[1:]
        pending.append(payload)
        if flag == '1':
            # Flag '1' marks the final chunk of the current number.
            decoded.append(int(''.join(pending), 2))
            pending = []
    return decoded
vb_decoding('000001101011100010000101000011010000110010110001')

[824, 5, 214577]

In [126]:
# Variable-byte encode every gap; each posting list becomes a list of bit strings.
vb_interval_posting_index = {
    term: list(map(vb_encoding, doc_ids))
    for term, doc_ids in interval_posting_index.items()
}
vb_interval_posting_index

{'marked': ['10000000',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001'],
 'cited': ['10000000', '10000011', '10000100', '10000011', '10000101'],
 'waw-path': ['10000000',
  '10000001',
  '10000001',
  '10000001',
  '10000101',
  '10000100'],
 'consekens': ['10000000', '10000001', '10000001', '10001001', '10000011'],
 'crossbones': ['10000000', '10000011', '10001001', '10000011'],
 'acquainted': ['10000000',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000100',
  '10000010'],
 'forsook': ['10000000',
  '10000001',
  '10000001',
  '10000001',
  '10000001',
  '10000010',
  '100000

In [127]:
# Scratch check: parse a base-2 digit string back into an integer.
int('1100111000', 2)

824

In [128]:
# Baseline binary file: raw document ids of every term, in dictionary order,
# written back to back with no separator between terms.
# NOTE(review): no dtype is given, so NumPy picks its default integer width --
# confirm that this is the intended baseline.
with open('posting_index', 'wb') as wr:
    for term in dictionary:
        postings = np.array(posting_index[term])
        postings.tofile(wr)

In [129]:
# Binary file with the gap-encoded posting lists, in dictionary order and
# without separators between terms.
# NOTE(review): no dtype is given, so NumPy picks its default integer width --
# confirm that this is the intended baseline.
with open('interval_posting_index', 'wb') as wr:
    for term in dictionary:
        gaps = np.array(interval_posting_index[term])
        gaps.tofile(wr)

In [130]:
# Write the VB-encoded gap lists as raw bytes, in dictionary order.
# Parsing each code to an int and dumping it through a NumPy integer array
# would store every one-byte code as a full machine integer, losing the
# space saving this file is meant to show.
with open('vb_interval_posting_index', 'wb') as wr:
    for d in dictionary:
        bits = ''.join(vb_interval_posting_index[d])
        # Every VB code is a whole number of 8-bit chunks, so the joined bit
        # string can be cut into octets and written one byte per chunk.
        wr.write(bytes(int(bits[i:i + 8], 2) for i in range(0, len(bits), 8)))

In [131]:
# Text dump: one line per term (dictionary order), each document id rendered
# as zero-padded binary with the default group width of binaryze_number.
with open('posting_index.txt', 'w') as wr:
    for term in dictionary:
        bit_line = ''.join(map(binaryze_number, posting_index[term]))
        wr.write(f"{bit_line}\n")

In [132]:
# Text dump of the gap-encoded lists: one line per term (dictionary order),
# each gap rendered as zero-padded binary with the default group width.
with open('interval_posting_index.txt', 'w') as wr:
    for term in dictionary:
        bit_line = ''.join(map(binaryze_number, interval_posting_index[term]))
        wr.write(f"{bit_line}\n")

In [133]:
# Text dump of the VB codes: one line per term (dictionary order), with the
# already-encoded bit strings of that term concatenated.
with open('vb_interval_posting_index.txt', 'w') as wr:
    for term in dictionary:
        bit_line = ''.join(vb_interval_posting_index[term])
        wr.write(f"{bit_line}\n")