# Kompresja danych - kodowanie Huffmana

Zadanie polega na implementacji dwóch algorytmów kompresji:
1. statycznego algorytmu Huffmana (2 p)
2. dynamicznego algorytmu Huffmana (3 p)

In [2]:
from heapq import heapify, heappop, heappush
from bitarray import bitarray
import random
import os
import pandas as pd
from time import time

In [3]:
class Node:
    def __init__(self, freq, letter=None, left=None, right=None, parent=None) -> None:
        self.freq = freq
        self.letter = letter
        self.left = left
        self.right = right

    def __lt__(self, other):
        return self.freq < other.freq
    
    def is_leaf(self):
        return self.letter is not None
        

### Static

In [15]:
def count_letters(text):
    letter_freq = {}
    for letter in text:
        if letter in letter_freq:
            letter_freq[letter] += 1
        else:
            letter_freq[letter] = 1
    return letter_freq

def build_tree(text, preproessing=True):
    if preproessing:
        letter_freq = count_letters(text)
    else:
        letter_freq = text
        
    nodes = [Node(freq, letter) for letter, freq in letter_freq.items()]
    heapify(nodes)

    while len(nodes) > 1:
        left = heappop(nodes)
        right = heappop(nodes)
        heappush(nodes, Node(left.freq + right.freq, left=left, right=right))

    return nodes[0]

def build_code_table(root):
    codes = {}
    def build_code(node, code=''):
        if node.is_leaf():
            codes[node.letter] = code
        else:
            build_code(node.left, code + '0')
            build_code(node.right, code + '1')
    
    build_code(root)
    return codes

def encode_static(text, root=None):
    if root is None:
        root = build_tree(text)

    code_table = build_code_table(root)
    encoded = bitarray()
    for letter in text:
        encoded.extend(code_table[letter])
    return encoded

def decode_static(code, root):
    decoded = []
    node = root

    for bit in code:
        if bit:
            node = node.right
        else:
            node = node.left

        if node.is_leaf():
            decoded.append(node.letter)
            node = root

    return ''.join(decoded)

In [5]:
def encode_adaptive(text):
    ...

def decode_adaptive(code):
    ...


In [6]:
kB = 2**10
MB = 2**20

dir = './input_files'

def delete_files():
    for file_name in os.listdir(dir):
        file_path = os.path.join(dir, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)

In [7]:
guttenberg_file = './source_files/guttenberg.txt'
linux_file = './source_files/linux.txt'

def create_partial_file(input_file_name, output_file_name, size):
    with open(input_file_name, 'rb') as input_file, open(output_file_name, 'wb') as output_file:
        output_file.write(input_file.read(size))

gutenberg_files = [('gutenberg_1kB.txt', 1 * kB), ('gutenberg_10kB.txt', 10 * kB), ('gutenberg_100kB.txt', 100 * kB), ('gutenberg_1MB.txt', 1 * MB)]
linux_files = [('linux_1kB.txt', 1 * kB), ('linux_10kB.txt', 10 * kB), ('linux_100kB.txt', 100 * kB), ('linux_1MB.txt', 1 * MB)]

for file_name, size in gutenberg_files:
    create_partial_file(guttenberg_file, os.path.join(dir, file_name), size)

for file_name, size in linux_files:
    create_partial_file(linux_file, os.path.join(dir, file_name), size)

In [8]:
files = [("random_1kB.txt", 1*kB), ("random_10kB.txt", 10*kB), ("random_100kB.txt", 100*kB), ("random_1MB.txt", 1*MB)]

def generate_random_files():
    for name, size in files:
        path = dir + '/' + name
        with open(path, 'wb') as f:
            f.write(bytes(random.randint(0, 255) for _ in range(size)))
        f.close()

generate_random_files()

In [26]:
save_path = './output_files'
list_dir = os.listdir(dir)

def test():
    compression_res = []
    time_res = []
    for file_name in list_dir:
        # static
        with open(os.path.join(dir, file_name), 'rb') as f_in:
            bytes_data = f_in.read()
            text = bytes_data.decode('utf-8', errors='ignore')

            huffman = build_tree(text)
            start = time()
            encoded = encode_static(text, huffman)
            end = time()
            st_enc_time = end - start
            with open(os.path.join(save_path, 'static_' + file_name), 'wb') as f_out:
                encoded.tofile(f_out)
            f_out.close()

            start = time()
            decoded = decode_static(encoded, huffman)
            end = time()
            st_dec_time = end - start
            if decoded != text:
                print('error')

            compression_ratio_st = f'{(1 - os.path.getsize(os.path.join(save_path, "static_" + file_name)) / os.path.getsize(os.path.join(dir, file_name))) * 100:.2f}%'
        f_in.close()

        # adaptive

        compression_ratio_ad = None
        ad_enc_time = None
        ad_dec_time = None

        compression_res.append((compression_ratio_st, compression_ratio_ad))
        time_res.append((st_enc_time, ad_enc_time, st_dec_time, ad_dec_time))
    return compression_res, time_res


comp_result, time_res = test()
comp_df = pd.DataFrame(comp_result, columns=['static', 'adaptive'], index=list_dir)
time_df = pd.DataFrame(time_res, columns=['st_enc', 'ad_enc', 'st_dec', 'ad_dec'], index=list_dir)

print("COMPRASSION RATIO")
print(comp_df, '\n')
print("TIME")
print(time_df)

COMPRASSION RATIO
                     static adaptive
gutenberg_100kB.txt  42.88%     None
gutenberg_10kB.txt   42.14%     None
gutenberg_1kB.txt    39.55%     None
gutenberg_1MB.txt    43.24%     None
linux_100kB.txt      37.59%     None
linux_10kB.txt       37.94%     None
linux_1kB.txt        36.43%     None
linux_1MB.txt        37.55%     None
random_100kB.txt     49.33%     None
random_10kB.txt      50.06%     None
random_1kB.txt       50.88%     None
random_1MB.txt       49.11%     None 

TIME
                       st_enc ad_enc    st_dec ad_dec
gutenberg_100kB.txt  0.022624   None  0.079132   None
gutenberg_10kB.txt   0.002005   None  0.007566   None
gutenberg_1kB.txt    0.000000   None  0.001004   None
gutenberg_1MB.txt    0.159406   None  0.669312   None
linux_100kB.txt      0.017010   None  0.075776   None
linux_10kB.txt       0.001000   None  0.006061   None
linux_1kB.txt        0.000000   None  0.001027   None
linux_1MB.txt        0.162879   None  0.695429   None
random_1