In [12]:
# Sample passage that the later cells rank-encode and compress with zlib.
# Adjacent string literals are concatenated by the parser, so this is one
# single-line string; each piece ends with the space that separates sentences.
text = (
    "Embrace the journey, for every step forward shapes the path that lies ahead. "
    "The road may twist and turn, but with curiosity and courage, you’ll discover "
    "opportunities hidden in the most unexpected places. "
    "Stay bold in your dreams and steady in your actions, letting each choice echo "
    "the vision of the future you wish to create. "
    "Remember, growth lies not in the destination but in the lessons learned along the way."
)

In [13]:
import sys
import os

# Make the notebook's working directory importable so the local `compressor`
# module resolves. The membership check keeps this cell idempotent: re-running
# it no longer appends a duplicate entry to sys.path each time.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())
from compressor import Compressor

# Default-constructed compressor; the cells below use its rank_encode /
# rank_decode methods and its `tokenizer` attribute.
compressor = Compressor()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Smoke test: rank-encode a short string and decode it back.
test = "abracadabra"
compressed = compressor.rank_encode(test)
print(compressed)
decompressed = compressor.rank_decode(compressed)
print(decompressed)
# Fail loudly on a lossy round trip instead of relying on eyeballing the output.
assert decompressed == test, f"round trip mismatch: {decompressed!r} != {test!r}"

[52548, 1794, 0, 0]
abracadabra


In [15]:
# Rank-encode the full sample passage; the result is turned into bit strings in the next cell.
# NOTE(review): presumably a list of non-negative ints, one per token — confirm in Compressor.rank_encode.
ranks = compressor.rank_encode(text)

In [16]:
# Render each rank as its binary digits (format spec 'b': no "0b" prefix, no
# zero padding), so the strings have varying lengths.
ranks_in_bits = list(map("{:b}".format, ranks))
print(ranks_in_bits)

['110111100001000', '10010', '0', '101100', '1011', '1111', '1001', '0', '0', '10010011', '0', '1', '1001', '10', '0', '0', '1', '1', '11', '110', '0', '0', '0', '0', '101', '100010', '0', '1010', '0', '1', '101', '1', '1111', '10010', '1', '10', '1', '1', '0', '0', '10010', '111110', '10', '0', '10', '1', '11011000', '10', '0', '101100', '0', '100100', '1000', '10001', '1001010', '100', '10111101', '1', '10', '0', '10', '111', '0', '0', '0', '1', '0', '10001', '110', '10', '0', '0', '10', '0', '0', '0', '101100011', '0', '0', '0', '0', '0']


In [17]:
# Display the tokenizer's reported vocabulary size.
# NOTE(review): presumably an upper bound on any rank value; the "special tokens
# have been added" warning suggests added tokens may not be counted — confirm.
compressor.tokenizer.vocab_size

151643

In [18]:
# Number of binary digits needed to write vocab_size itself.
# NOTE(review): presumably used to size the length prefix for per-rank bit strings — confirm.
len(f"{compressor.tokenizer.vocab_size:b}")

18

In [24]:
def encode_with_length_prefix(sequences, prefix_bits=5):
    """Concatenate bit strings, each preceded by a fixed-width binary length.

    Every sequence is written as ``<length in prefix_bits binary digits><sequence>``
    so the stream can be split again without delimiters.

    Args:
        sequences: Iterable of strings (expected to hold '0'/'1' characters).
        prefix_bits: Width of the length field. The default of 5 can express
            lengths 0..31. The decoder must read the same width.

    Returns:
        One string containing all prefixed sequences back to back; ``""`` for
        an empty input.

    Raises:
        ValueError: If ``prefix_bits`` is less than 1, or a sequence is longer
            than ``2**prefix_bits - 1``. Without this check the formatted
            length would silently grow wider than the field and the stream
            could no longer be decoded.
    """
    if prefix_bits < 1:
        raise ValueError(f"prefix_bits must be >= 1, got {prefix_bits}")
    max_length = (1 << prefix_bits) - 1

    # Collect the pieces and join once; repeated `+=` on a str is quadratic.
    pieces = []
    for seq in sequences:
        length = len(seq)
        if length > max_length:
            raise ValueError(
                f"sequence of length {length} does not fit in a "
                f"{prefix_bits}-bit length prefix (max {max_length})"
            )
        pieces.append(f"{length:0{prefix_bits}b}")
        pieces.append(seq)
    return "".join(pieces)

def decode_with_length_prefix(encoded, prefix_bits=5):
    """Split a length-prefixed bit string back into its sequences.

    The input is read as repeated ``<length in prefix_bits binary digits><payload>``
    records.

    Args:
        encoded: String of '0'/'1' characters made of length-prefixed records.
        prefix_bits: Width of each length field; must match the width used
            when encoding. Defaults to 5.

    Returns:
        List of payload strings in stream order; ``[]`` for an empty input.

    Raises:
        ValueError: If ``prefix_bits`` is less than 1, if the stream ends in
            the middle of a length field or payload, or if a length field
            contains characters that are not binary digits.
    """
    if prefix_bits < 1:
        raise ValueError(f"prefix_bits must be >= 1, got {prefix_bits}")

    sequences = []
    total = len(encoded)
    i = 0
    while i < total:
        # A partial prefix would otherwise be parsed as a (wrong) length.
        if total - i < prefix_bits:
            raise ValueError(f"truncated length prefix at offset {i}")
        length = int(encoded[i:i + prefix_bits], 2)
        i += prefix_bits
        # Slicing past the end would otherwise return a silently shortened payload.
        if total - i < length:
            raise ValueError(
                f"truncated sequence at offset {i}: expected {length} bits, "
                f"found {total - i}"
            )
        sequences.append(encoded[i:i + length])
        i += length
    return sequences

import zlib

# Round-trip the rank bit strings through the length-prefix framing.
encoded = encode_with_length_prefix(ranks_in_bits)
print(encoded)
decoded = decode_with_length_prefix(encoded)
print(decoded)
assert decoded == ranks_in_bits

# Sizes are reported in bits. len() of a bytes object counts bytes, so it is
# multiplied by 8; `encoded` holds one character per bit, so its len() is
# already a bit count.
text_bytes = text.encode('utf-8')
print("Number of bits for original text:", len(text_bytes) * 8)
print("Number of bits for compressed text:", len(encoded))

# zlib at maximum compression level (9) on three inputs: the raw text, the
# length-prefixed bit string, and the space-delimited bit strings. The last
# two are compressed as text, i.e. one byte per '0'/'1' character.
text_compress = zlib.compress(text_bytes, 9)
ranks_compress = zlib.compress(encoded.encode('utf-8'), 9)
ranks_delimited_compress = zlib.compress(" ".join(ranks_in_bits).encode('utf-8'), 9)
print("Number of bits for original text compressed with zlib:", len(text_compress) * 8)
print("Number of bits for ranks compressed with zlib:", len(ranks_compress) * 8)
print("Number of bits for ranks with delimiters compressed with zlib:", len(ranks_delimited_compress) * 8)

# Bits per byte: compressed bits divided by the size of the original text in
# UTF-8 bytes. len(text) counts characters, which undercounts the input because
# the passage contains a multi-byte character.
print("BPB for original compressed text:", len(text_compress) * 8 / len(text_bytes))
print("BPB for rank compressed text with length prefix:", len(ranks_compress) * 8 / len(text_bytes))
print("BPB for rank compressed text with delimiters:", len(ranks_delimited_compress) * 8 / len(text_bytes))

# Recover the integer ranks from the delimited, zlib-compressed form.
ranks_decoded = [int(bits, 2) for bits in zlib.decompress(ranks_delimited_compress).decode('utf-8').split(" ")]
print(ranks_decoded)
# The zlib round trip must give back exactly the values the bit strings encode.
assert ranks_decoded == [int(bits, 2) for bits in ranks_in_bits]

01111110111100001000001011001000001000110101100001001011001001111001001001000010000010010001001001100001000001100100100100010100000100000100000110000110001011000111100000100000100000100000100001110100110100010000010001001010000010000011000111010000110010011110010110010000011000101000001100001100001000001000101100100011011111000010100000100001010000011010001101100000010100000100011010110000001000110100100001001000001011000100111100101000011100010001011110100001100010100000100001010000111110000100000100000100000110000100010110001000111100001010000010000010000101000001000001000001001001101100011000010000010000010000010000010
['110111100001000', '10010', '0', '101100', '1011', '1111', '1001', '0', '0', '10010011', '0', '1', '1001', '10', '0', '0', '1', '1', '11', '110', '0', '0', '0', '0', '101', '100010', '0', '1010', '0', '1', '101', '1', '1111', '10010', '1', '10', '1', '1', '0', '0', '10010', '111110', '10', '0', '10', '1', '11011000', '10', '0', '101100', '0', '100100', '1000', '10001