In [45]:
def variable_byte_decode(encoded_bytes):
    """Decodes a sequence of bytes using variable byte encoding.

    Args:
    encoded_bytes (list of int): The encoded bytes.

    Returns:
    list of int: The decoded integers.
    """
    decoded_numbers = []
    current_number = 0
    for byte in encoded_bytes:
        # Add the 7 least significant bits of the byte to the current number
        current_number = (current_number << 7) | (byte & 0x7F)
        # Check if this is the last byte in the number
        if (byte & 0x80) == 0:  # If the most significant bit is not set
            decoded_numbers.append(current_number)
            current_number = 0  # Reset for the next number
    return decoded_numbers

In [27]:
def variable_byte_encode(number):
    """Encodes a number using variable byte encoding."""
    if number == 0:
        return [0]

    bytes_list = []
    while number > 0:
        bytes_list.insert(0, number % 128)
        number >>= 7

    # Set the most significant bit to 1 for all but the last byte
    for i in range(len(bytes_list) - 1):
        bytes_list[i] |= 0x80
    bytes_list[-1] |= 0x00  # Ensure the last byte is in the range 0-127

    return bytes_list

In [91]:
def ungap_compression(posting_array):
    # Calculate document ID from gap and update postings
    last_doc_id = 0
    posting_list = {}
    for index in range(0,len(posting_array),2):
        doc_id = last_doc_id + posting_array[index]
        posting_list[str(doc_id)] = posting_array[index+1]
        last_doc_id = doc_id
    return posting_list

In [104]:
def load_inverted_index_binary(filename):
    """
    Loads the inverted index from a binary file.

    Args:
    filename (str): The filename of the inverted index file.

    Returns:
    dict: The inverted index.
    """
    inverted_index = {}
    with open(filename, 'rb') as file:
        while True:
            term_id = 0
            term_id_byte = file.read(1)
            if not term_id_byte:
                break  # End of file reached

            term_id = (term_id << 7) | (term_id_byte[0] & 0x0F)
            print("1 : ",term_id)
            while (term_id_byte[0] & 0x80) != 0:
                term_id_byte = file.read(1)
                term_id = (term_id << 7) | (term_id_byte[0] & 0x0F)
                print("2 : ",term_id)
            posting_length = 0
            while True:
                posting_length_byte = file.read(1)

                posting_length = (posting_length << 7) | (posting_length_byte[0] & 0x7F)

                if (posting_length_byte[0] & 0x80) == 0:
                    break  # Last byte of the posting_length

            posting_byte = file.read(posting_length)
            posting_array = variable_byte_decode(posting_byte)
            inverted_index[term_id] = ungap_compression(posting_array)

    return inverted_index

In [105]:
target = "../data/inverted_index_1000.bin"
inverted_index = load_inverted_index_binary(target)
print(inverted_index)

1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  0
1 :  1
1 :  2
1 :  3
1 :  4
1 :  5
1 :  6
1 :  7
1 :  8
1 :  9
1 :  10
1 :  11
1 :  12
1 :  13
1 :  14
1 :  15
1 :  1
2 :  128
1 :  1
2 :  129
1 :  1
2 :  130
1 :  1
2

In [84]:
print(variable_byte_decode(b'\x00\x01\x82\\\x01'))

print(variable_byte_decode(variable_byte_encode(1028)))

[0, 1, 348, 1]
[1028]


In [6]:
print(inverted_index)

{0: {'10': 0, '11': 434, '12': 27, '13': 238, '14': 1, '16': 0, '17': 2, '22': 0, '23': 955, '24': 3, '26': 0, '27': 4, '38': 0, '39': 116, '40': 16, '41': 721, '42': 81, '43': 5, '78': 0, '79': 41, '80': 142, '81': 1, '82': 420, '83': 95, '84': 1, '85': 1, '86': 1, '87': 1, '88': 2, '89': 1, '90': 1, '91': 1, '92': 166, '93': 75, '94': 6, '104': 0, '105': 383, '106': 43, '107': 481, '108': 7, '135': 0, '136': 2, '137': 243, '138': 320, '139': 102, '140': 33, '141': 3, '142': 22, '143': 20, '144': 2, '145': 166, '146': 67, '147': 8, '170': 0, '171': 7, '172': 160, '173': 20, '174': 11, '175': 88, '176': 91, '177': 266, '178': 121, '179': 223, '180': 9, '238': 0, '239': 10, '240': 1, '241': 1, '242': 1, '244': 2, '245': 1, '246': 3, '247': 122, '248': 4, '249': 19, '250': 86, '251': 9, '252': 1, '253': 51, '254': 88, '255': 8, '256': 18, '257': 214, '258': 42, '259': 4, '260': 1, '261': 101, '262': 42, '263': 2, '264': 7, '265': 145, '266': 14, '267': 10, '272': 0, '273': 238, '274': 11