In [105]:
from math import log

In [1]:
def load_text(filename, data_dir="../Data"):
    """Read an entire text file from the data directory and return it as a str.

    Args:
        filename: Name of the file inside ``data_dir`` (e.g. ``'czech.txt'``).
        data_dir: Directory containing the corpus files. Defaults to
            ``"../Data"``, so existing ``load_text(name)`` calls open exactly
            the same path as before. A relative directory is resolved against
            the current working directory.

    Returns:
        The full decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark when one
    # is present, so the BOM never ends up as the first character of the text.
    with open(f"{data_dir}/{filename}", 'r', encoding='utf-8-sig') as f:
        return f.read()

In [2]:
# Reference corpora, one per known language. The files are read in the order
# listed, and every text stays available under its own module-level name.
czech, english, french, german, hungarian = map(
    load_text,
    ('czech.txt', 'english.txt', 'french.txt', 'german.txt', 'hungarian.txt'),
)

# Samples whose language is not labelled.
unknown0, unknown1, unknown2 = map(
    load_text,
    ('unknown0.txt', 'unknown1.txt', 'unknown2.txt'),
)

# (display name, full text) pairs for the labelled corpora.
texts = list(zip(
    ('Czech', 'English', 'French', 'German', 'Hungarian'),
    (czech, english, french, german, hungarian),
))

In [99]:
def find_longest_prefix(text, prefix):
    """Locate the longest leading part of ``prefix`` that occurs in ``text``.

    Candidates are tried from the whole ``prefix`` downwards, one character
    shorter each time. For the first candidate that occurs in ``text``, its
    right-most occurrence is used.

    Args:
        text: String to search in.
        prefix: String whose leading characters are matched.

    Returns:
        A pair ``(distance, length)``: ``distance`` is counted from the end of
        ``text`` back to the start of the occurrence and ``length`` is the
        number of matched characters. ``(0, 0)`` means that not even the first
        character of ``prefix`` occurs in ``text``.
    """
    for length in range(len(prefix), 0, -1):
        position = text.rfind(prefix[:length])
        if position >= 0:
            return len(text) - position, length
    return 0, 0

# Inline regression checks for find_longest_prefix(text, prefix).
# Empty text: nothing can match.
assert find_longest_prefix('', 'abba') == (0, 0)
# The first character of the prefix ('b') does not occur in the text.
assert find_longest_prefix('a', 'bbab') == (0, 0)
# Only 'b' matches; it starts one character back from the end of the text.
assert find_longest_prefix('ab', 'babb') == (1, 1)
# 'bb' is the longest match; it starts three characters back from the end.
assert find_longest_prefix('abba', 'bbba') == (3, 2)
# 'ab' occurs twice; the right-most occurrence (four characters back) is used.
assert find_longest_prefix('abbabbb', 'abaa') == (4, 2)

In [104]:
def encode_lz77(text, window_size, text_size):
    """Lazily encode ``text`` as a stream of LZ77 triplets.

    At each step the longest prefix of the look-ahead buffer that also occurs
    in the search window (the most recently encoded characters) is found with
    ``find_longest_prefix``. The step emits that match plus the single
    character following it, then advances past both.

    Args:
        text: The string to encode.
        window_size: Maximum number of already-encoded characters kept as the
            search window. Values <= 0 give an empty window, so every
            character is emitted as a literal.
        text_size: Maximum number of upcoming characters considered for a
            match (the look-ahead buffer). Values <= 0 disable matching.

    Yields:
        ``(offset, length, char)`` tuples. ``offset`` is the distance from the
        end of the window back to the start of the match and ``length`` is the
        match length (both 0 when nothing matched). ``char`` is the character
        right after the match, or ``''`` when the match runs to the very end
        of ``text``.
    """
    # Walk a cursor over the unchanged input instead of re-slicing ``text``
    # and rebuilding the window string after every triplet. Re-slicing copied
    # the whole remaining input on each step.
    position = 0
    total = len(text)
    lookahead_size = max(text_size, 0)

    while position < total:
        # The search window is the tail of what has been encoded so far.
        # Slicing by index also handles window_size == 0 correctly, whereas
        # ``window[-0:]`` would have kept the entire history.
        window = text[max(0, position - window_size):position]
        lookahead = text[position:position + lookahead_size]

        offset, length = find_longest_prefix(window, lookahead)

        # The literal is the character right after the match. Slicing rather
        # than indexing yields '' instead of raising at the end of the input.
        literal_at = position + length
        yield offset, length, text[literal_at:literal_at + 1]

        # Skip the matched characters and the literal.
        position = literal_at + 1

#list(encode_lz77('abbabbbabaa', 10, 4))

In [115]:
# Compression report: one row per (corpus, LZ77 parameter set) showing the
# number of triplets, the input length in characters, the window size, the
# maximum match length, the estimated encoded size in bits and the resulting
# bits per input character.
print(f"{'Name':<12} {'Triplets':>10} {'FileSize':>10} {'WindowSize':>12} {'Max.Match':>10} {'Enc.Size':>10} {'bps':>10}")
for name, text in texts:
    print('-' * 80)
    for window_size, max_match in [(2**12, 16), (2**14, 32), (2**15, 64)]:
        triplets = list(encode_lz77(text, window_size, max_match))
        # Field widths use exact integer arithmetic: (n - 1).bit_length() is
        # ceil(log2(n)) for n >= 1. The previous int(log(..., 2) + ...) summed
        # floats and truncated, so any rounding error in log(x, 2) could
        # silently drop a bit. For the power-of-two parameters used here the
        # result is unchanged (24, 27 and 29 bits per triplet).
        offset_bits = (window_size - 1).bit_length()
        length_bits = (max_match - 1).bit_length()
        # NOTE(review): this keeps the notebook's nominal accounting of
        # log2(window) + log2(max match) + 8 bits per character. The encoder
        # can emit lengths 0..max_match (max_match + 1 distinct values), and
        # non-ASCII characters presumably need more than 8 bits, so treat the
        # absolute numbers as approximate - confirm before relying on them.
        size_per_triplet = offset_bits + length_bits + 8
        encoded_size = len(triplets) * size_per_triplet
        bps = encoded_size / len(text)
        print(f"{name:<12} {len(triplets):>10} {len(text):>10} {window_size:>12} {max_match:>10} {encoded_size:>10} {bps:>10.3f}")

Name           Triplets   FileSize   WindowSize  Max.Match   Enc.Size        bps
--------------------------------------------------------------------------------
Czech             27261     136448         4096         16     654264      4.795
Czech             23048     136448        16384         32     622296      4.561
Czech             21651     136448        32768         64     627879      4.602
--------------------------------------------------------------------------------
English           28162     146480         4096         16     675888      4.614
English           23503     146480        16384         32     634581      4.332
English           21477     146480        32768         64     622833      4.252
--------------------------------------------------------------------------------
French            26989     143274         4096         16     647736      4.521
French            22331     143274        16384         32     602937      4.208
French            20841     