In [193]:
from heapq import heappush, heappop
import os, pickle, random, glob
from collections import defaultdict
from time import time
import heapq

In [194]:
class Node:
    def __init__(self, symbol=None, freq=None, left=None, right=None):
        self.symbol = symbol
        self.freq = freq
        self.left = left
        self.right = right

    def __lt__(self, other):
        return self.freq < other.freq

    def is_leaf(self):
        return self.left is None and self.right is None

## ZAD1

In [195]:
def huffman_encoding(data):
    # count the frequency of each symbol in the input data
    freq = {}
    for symbol in data:
        freq[symbol] = freq.get(symbol, 0) + 1

    # build the heap
    heap = []
    for symbol, freq in freq.items():
        node = Node(symbol, freq)
        heappush(heap, node)

    # build the Huffman tree
    while len(heap) > 1:
        left = heappop(heap)
        right = heappop(heap)
        parent = Node(None, left.freq + right.freq, left, right)
        heappush(heap, parent)

    # create root
    root = heap[0]

    # build the Huffman codes for each symbol
    codes = {}
    def build_codes(node, code):
        if node is None:
            return
        if node.is_leaf():
            codes[node.symbol] = code
            return
        build_codes(node.left, code + '0')
        build_codes(node.right, code + '1')
    build_codes(root, '')

    # encode the input data using the Huffman codes
    encoded_data = ''.join(codes[symbol] for symbol in data)

    # pad the encoded data with zeroes to make its length a multiple of 8
    pad_length = 8 - len(encoded_data) % 8
    encoded_data += '0' * pad_length

    # convert the encoded data to bytes
    encoded_bytes = bytearray()
    for i in range(0, len(encoded_data), 8):
        byte = int(encoded_data[i:i+8], 2)
        encoded_bytes.append(byte)

    return encoded_bytes, root, codes

In [196]:
def huffman_decoding(encoded_data, root):
    # convert the encoded data from bytes to binary string
    encoded_bits = ''.join(format(byte, '08b') for byte in encoded_data)

    # decode the input data using the Huffman tree
    decoded_data = ''
    node = root
    for bit in encoded_bits:
        if bit == '0':
            node = node.left
        else:
            node = node.right
        if node.is_leaf():
            decoded_data += node.symbol
            node = root

    return decoded_data

In [197]:
data = "this is a test string\n"
encoded_bytes, root, codes = huffman_encoding(data)
decoded_data = huffman_decoding(encoded_bytes, root)

print("Input data:", data)
print("Encoded data:", encoded_bytes)
print("Decoded data:", decoded_data)
print("Codes:", codes)

Input data: this is a test string

Encoded data: bytearray(b'-33t\xe6\x1czW\xfa')
Decoded data: this is a test string

Codes: {'t': '00', ' ': '01', 'i': '100', 'n': '1010', 'h': '10110', 'a': '10111', 's': '110', 'e': '11100', '\n': '11101', 'r': '11110', 'g': '11111'}


In [198]:
def huffman(input_file, output_file, remove = False):
    binary_file = "encoded_gutenberg.bin"
    t1 = time()
    with open(input_file, "r", encoding="latin-1") as f:
        data = f.read()
    encoded_bytes, root, codes = huffman_encoding(data)
    data_length = len(data)
    with open(binary_file, "wb") as f:
        pickle.dump(encoded_bytes, f)
    t2 = time()
    with open(binary_file, "rb") as f:
        encoded_bytes = pickle.load(f)
    decoded_data = huffman_decoding(encoded_bytes, root)[:data_length]
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(decoded_data)
    t3 = time()

    res = round(100 - 100*os.stat(binary_file).st_size/os.stat(input_file).st_size)
    if remove:
        os.remove(binary_file)
        os.remove(output_file)
    return res, t2-t1, t3-t2



### Creating rand

In [199]:
# random_input_file = "rand_1kB.txt"
# with open(random_input_file, 'wb') as f:
#     for i in range(10**3):
#         value = random.randint(0, 255)
#         f.write(bytes([value]))

## Time measurement

In [200]:
for input_file in glob.glob("*.txt"):    
    output_file = "output_"+input_file
    compression, c_time, d_time = huffman(input_file, output_file, True)
    print(f"{input_file}\tsize: {round(os.stat(input_file).st_size/1000)}kB \tcopression result: {compression}%\t  Compression time:{round(c_time,3)}s\t Decompression time: {round(d_time,3)}s")


gut_1000kb.txt	size: 988kB 	copression result: 42%	  Compression time:0.365s	 Decompression time: 0.952s
gut_100kb.txt	size: 91kB 	copression result: 44%	  Compression time:0.032s	 Decompression time: 0.086s
gut_10kb.txt	size: 11kB 	copression result: 38%	  Compression time:0.006s	 Decompression time: 0.013s
gut_1kb.txt	size: 1kB 	copression result: 28%	  Compression time:0.002s	 Decompression time: 0.002s
lin_1000kB.txt	size: 1014kB 	copression result: 35%	  Compression time:0.373s	 Decompression time: 1.132s
lin_100kB.txt	size: 28kB 	copression result: 35%	  Compression time:0.013s	 Decompression time: 0.033s
lin_10kB.txt	size: 10kB 	copression result: 34%	  Compression time:0.006s	 Decompression time: 0.013s
lin_1kB.txt	size: 3kB 	copression result: 33%	  Compression time:0.003s	 Decompression time: 0.005s
rand_1000kB.txt	size: 1000kB 	copression result: 0%	  Compression time:0.467s	 Decompression time: 1.567s
rand_100kB.txt	size: 100kB 	copression result: 0%	  Compression time:0.04

## ZAD2

In [201]:
class HuffmanEncoder:
    def __init__(self):
        self.root = None
        self.codes = {}

    def build_tree(self, freq_table):
        # build the heap
        heap = []
        for symbol, freq in freq_table.items():
            node = Node(symbol, freq)
            heapq.heappush(heap, node)

        # build the Huffman tree
        while len(heap) > 1:
            left = heapq.heappop(heap)
            right = heapq.heappop(heap)
            parent = Node(None, left.freq + right.freq, left, right)
            heapq.heappush(heap, parent)

        # create root
        self.root = heap[0]

        # build the Huffman codes for each symbol
        def build_codes(node, code):
            if node is None:
                return
            if node.is_leaf():
                self.codes[node.symbol] = code
                return
            build_codes(node.left, code + '0')
            build_codes(node.right, code + '1')

        build_codes(self.root, '')

    def encode(self, data):
        # update frequency table with new symbols
        freq_table = {}
        for symbol in data:
            freq_table[symbol] = freq_table.get(symbol, 0) + 1

        # build Huffman tree and codes
        self.build_tree(freq_table)

        # encode the input data using the Huffman codes
        encoded_data = ''.join(self.codes[byte] for byte in data)

        # pad the encoded data with zeroes to make its length a multiple of 8
        pad_length = 8 - len(encoded_data) % 8
        encoded_data += '0' * pad_length

        # convert the encoded data to bytes
        encoded_bytes = bytearray()
        for i in range(0, len(encoded_data), 8):
            byte = int(encoded_data[i:i+8], 2)
            encoded_bytes.append(byte)

        return encoded_bytes

class HuffmanDecoder:
    def __init__(self, root):
        self.root = root

    def decode(self, encoded_data):
        # convert the encoded data from bytes to binary string
        encoded_bits = ''.join(format(byte, '08b') for byte in encoded_data)

        # decode the input data using the Huffman tree
        decoded_data = ''
        node = self.root
        for bit in encoded_bits:
            if bit == '0':
                node = node.left
            else:
                node = node.right
            if node.is_leaf():
                decoded_data += node.symbol
                node = self.root

        return decoded_data


In [202]:
data = "this is a test string\n"

encoder = HuffmanEncoder()
encoded_data = encoder.encode(data)
decoder = HuffmanDecoder(encoder.root)
decoded_data = decoder.decode(encoded_data)

print("Original data: {}".format(data))
print("Encoded data: {}".format(encoded_data))
print("Decoded data: {}".format(decoded_data))


Original data: this is a test string

Encoded data: bytearray(b'-33t\xe6\x1czW\xfa')
Decoded data: this is a test string



In [203]:
def dynamic_huffman(input_file, output_file, remove = False):
    binary_file = "encoded_gutenberg.bin"
    t1 = time()
    with open(input_file, "r", encoding="latin-1") as f:
        data = f.read()
    encoder = HuffmanEncoder()
    encoded_bytes = encoder.encode(data)
    data_length = len(data)
    with open(binary_file, "wb") as f:
        pickle.dump(encoded_bytes, f)
    t2 = time()
    with open(binary_file, "rb") as f:
        encoded_bytes = pickle.load(f)
    decoder = HuffmanDecoder(encoder.root)
    decoded_data = decoder.decode(encoded_data)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(decoded_data)
    t3 = time()
    res = round(100 - 100*os.stat(binary_file).st_size/os.stat(input_file).st_size)
    if remove:
        os.remove(binary_file)
        os.remove(output_file)
    return res, t2-t1, t3-t2


## Time measurement

In [204]:
for input_file in glob.glob("*.txt"): 
    output_file = "output_"+input_file
    compression, c_time, d_time = dynamic_huffman(input_file, output_file, True)
    print(f"{input_file}\tsize: {round(os.stat(input_file).st_size/1000)}kB \tcopression result: {compression}%\t  Compression time:{round(c_time,3)}s\t Decompression time: {round(d_time,3)}s")


gut_1000kb.txt	size: 988kB 	copression result: 42%	  Compression time:0.347s	 Decompression time: 0.002s
gut_100kb.txt	size: 91kB 	copression result: 44%	  Compression time:0.032s	 Decompression time: 0.001s
gut_10kb.txt	size: 11kB 	copression result: 38%	  Compression time:0.005s	 Decompression time: 0.002s
gut_1kb.txt	size: 1kB 	copression result: 28%	  Compression time:0.002s	 Decompression time: 0.002s
lin_1000kB.txt	size: 1014kB 	copression result: 35%	  Compression time:0.381s	 Decompression time: 0.004s
lin_100kB.txt	size: 28kB 	copression result: 35%	  Compression time:0.013s	 Decompression time: 0.001s
lin_10kB.txt	size: 10kB 	copression result: 34%	  Compression time:0.007s	 Decompression time: 0.002s
lin_1kB.txt	size: 3kB 	copression result: 33%	  Compression time:0.003s	 Decompression time: 0.002s
rand_1000kB.txt	size: 1000kB 	copression result: 0%	  Compression time:0.503s	 Decompression time: 0.002s
rand_100kB.txt	size: 100kB 	copression result: 0%	  Compression time:0.05