# Lab 3 Huffman Jakub Janicki 

In [1]:
from heapq import heappop, heappush, heapify
from bitarray import bitarray
import os
from time import perf_counter
from bitarray import bitarray
from collections import defaultdict
import tabulate

## Static Huffman

In [2]:
class Leaf:
    """Node of the static Huffman tree.

    Despite the name, the class is used for both leaves (which carry a
    symbol in ``char``) and internal nodes (which carry children in
    ``left`` / ``right``). Instances are ordered by ``value`` only.
    """

    def __init__(self, value, char=None, left=None, right=None):
        # Symbol stored in the node; stays None when no symbol is given.
        self.char = char
        # Weight the node is ordered by.
        self.value = value
        # Child nodes; both stay None when no children are given.
        self.right = right
        self.left = left

    def __lt__(self, other):
        # heapq orders its items with ``<``. Defining it explicitly avoids
        # relying on Python's reflected fallback to ``other.__gt__(self)``.
        return self.value < other.value

    def __gt__(self, other):
        return self.value > other.value


def count_letters(text):
    """Return a dict mapping every item of ``text`` to its number of occurrences.

    Keys appear in order of first occurrence.
    """
    frequencies = {}
    for symbol in text:
        if symbol in frequencies:
            frequencies[symbol] += 1
        else:
            frequencies[symbol] = 1
    return frequencies


def create_codes(node, codes, code):
    """Record in ``codes`` the bit path leading to every symbol below ``node``.

    Args:
        node: Tree node exposing ``char``, ``left`` and ``right``.
        codes: Mapping updated in place; each symbol found is mapped to the
            bit sequence of its path (0 = left branch, 1 = right branch).
        code: Bit sequence of the path leading to ``node``. Only ``copy()``
            and ``append()`` are required of it, and it is never modified.
    """
    # Compare against None rather than testing truthiness, so that a falsy
    # symbol (e.g. 0) still receives a code.
    if node.char is not None:
        codes[node.char] = code

    # Every child gets its own extended copy of the prefix; the sequence
    # passed in by the caller is left untouched.
    for bit, child in ((0, node.left), (1, node.right)):
        if child is not None:
            child_code = code.copy()
            child_code.append(bit)
            create_codes(child, codes, child_code)


def build_static_huffman_tree(letter_counts):
    """Build a Huffman tree from symbol frequencies and return its root.

    Args:
        letter_counts: Mapping of symbol -> number of occurrences.

    Returns:
        The root ``Leaf`` of the tree. For an empty mapping this is a bare
        node without symbol or children; for a single symbol it is an
        internal node whose only (left) child is that symbol's leaf.
    """
    leafs = [Leaf(weight, char) for char, weight in letter_counts.items()]

    if not leafs:
        # Nothing to encode: return a placeholder root instead of failing
        # with IndexError on ``leafs[0]``.
        return Leaf(0)

    if len(leafs) == 1:
        # A lone leaf used as the root would get an empty code, so the text
        # would encode to zero bits and could not be decoded. Hanging it
        # under a parent gives it the one-bit code 0.
        only = leafs[0]
        return Leaf(only.value, left=only)

    # heappop/heappush are only correct on a list that already satisfies
    # the heap invariant; without this the two nodes popped are not the two
    # lightest ones and the resulting code is not optimal.
    heapify(leafs)

    while len(leafs) > 1:
        first, second = heappop(leafs), heappop(leafs)
        heappush(leafs, Leaf(first.value + second.value, left=first, right=second))

    return leafs[0]


def encode_static(tree, text):
    """Return the bitarray formed by concatenating ``tree.codes[char]`` for every char of ``text``."""
    encoded = bitarray()
    for symbol_code in (tree.codes[symbol] for symbol in text):
        encoded.extend(symbol_code)
    return encoded


def decode_static(tree, bitarray):
    """Decode a bit sequence by walking the tree from ``tree.root``.

    A falsy bit follows the left child and a truthy bit the right child.
    Whenever a node without children is reached its ``char`` is emitted and
    the walk restarts at the root. Bits of an unfinished trailing path are
    ignored.

    Note: the parameter name shadows the imported ``bitarray`` class inside
    this function; it is kept because it is part of the signature.
    """
    decoded = []
    current = tree.root
    for bit in bitarray:
        current = current.right if bit else current.left
        if not (current.left or current.right):
            decoded.append(current.char)
            current = tree.root
    return "".join(decoded)


class StaticHuffmanTree:
    """Static Huffman tree of a text together with its symbol -> code table."""

    def __init__(self, text):
        frequencies = count_letters(text)
        # Root node of the tree built from the symbol frequencies.
        self.root = build_static_huffman_tree(frequencies)
        # Code table filled in by create_codes, starting from an empty prefix.
        self.codes = {}
        create_codes(self.root, self.codes, bitarray())


In [3]:
# Round-trip demo of the static coder: build the tree for a sample sentence,
# encode the sentence and print the decoded result.
text = "ala ma kota kot ma ale"
tree = StaticHuffmanTree(text)
res = encode_static(tree, text)
print(decode_static(tree, res))

ala ma kota kot ma ale


## Adaptive Huffman

In [4]:
class Node:
    """Node of the adaptive Huffman tree.

    Every node knows its parent, which allows swapping two subtrees in
    place and reading a node's code by walking up to the root.
    """

    def __init__(self, weight=1, char=None, index=0, left=None, right=None, parent=None):
        # Symbol carried by the node (None when no symbol is given).
        self.char = char
        self.weight = weight
        # Ordering number of the node; exchanged between nodes by swap().
        self.index = index
        self.right = right
        self.left = left
        self.parent = parent

    def swap(self, other):
        """Exchange the tree positions (and indices) of ``self`` and ``other``.

        Both nodes must have a parent. The subtrees below the two nodes
        move along with them.
        """
        self.index, other.index = other.index, self.index
        self_parent, other_parent = self.parent, other.parent

        # Determine both slots BEFORE rewiring anything. With siblings the
        # first assignment would otherwise change the outcome of the second
        # test, and swapping a right child with its left sibling would
        # leave the children in their old places.
        self_is_right = self_parent.right is self
        other_is_right = other_parent.right is other

        if self_is_right:
            self_parent.right = other
        else:
            self_parent.left = other

        if other_is_right:
            other_parent.right = self
        else:
            other_parent.left = self

        # A no-op for siblings, where both parents are the same node.
        self.parent, other.parent = other_parent, self_parent

    def get_code(self):
        """Return the path from the root to this node as a bitarray (0 = left, 1 = right)."""
        node, code = self, bitarray()
        # Bits are collected bottom-up and reversed at the end.
        while node.parent:
            code.append(0 if node is node.parent.left else 1)
            node = node.parent
        code.reverse()
        return code

    def print(self, level=0):
        """Print the subtree rooted here, one node per line, indented by one tab per level."""
        print('\t' * level + repr(self.char) + repr(self.weight))
        for child in (self.left, self.right):
            if child:
                child.print(level + 1)


class AdaptiveHuffmanTree:
    """Adaptive Huffman tree that is updated symbol by symbol.

    The tree starts as a single zero-weight node registered under the
    ``NYT`` key. ``add`` splits that node for a new symbol and ``insert``
    increments weights from a node up to the root.
    """

    def __init__(self):
        # Next index to hand out; indices are assigned in decreasing order.
        # Original note: "because ascii is 256" (presumably sized for about
        # 2 * 256 nodes; TODO confirm).
        self.index = 520
        self.root = Node(weight=0, index=self.index)
        self.index -= 1
        # Private-use Unicode code point used as the dictionary key of the
        # NYT node.
        self.NYT = "\U000F0000"
        # symbol (or the NYT key) -> node
        self.nodes = {self.NYT: self.root}
        # weight -> set of nodes currently having that weight
        self.weights = defaultdict(set)
        self.upload_weights(self.root)

    def get_NYT(self):
        """Return the node currently registered under the NYT key."""
        return self.nodes[self.NYT]

    def upload_weights(self, node):
        """Register ``node`` in the set of nodes sharing its current weight."""
        self.weights[node.weight].add(node)

    def update_leader(self, node):
        """Swap ``node`` with the highest-index node of its weight class.

        Nothing happens when the node's parent has the same weight or when
        the node already is that leader.
        """
        if node.weight == node.parent.weight:
            return
        same_weight = self.weights[node.weight]
        leader = max(same_weight, key=lambda candidate: candidate.index)
        if leader is not node:
            node.swap(leader)

    def add(self, char):
        """Split the NYT node into a new NYT node (left) and a leaf for ``char`` (right)."""
        parent = self.get_NYT()
        new_nyt = Node(weight=0, index=self.index - 1, parent=parent)
        new_leaf = Node(index=self.index, parent=parent, char=char)
        parent.left, parent.right = new_nyt, new_leaf

        self.nodes[self.NYT] = new_nyt
        self.nodes[char] = new_leaf
        self.index -= 2
        for fresh in (new_nyt, new_leaf):
            self.upload_weights(fresh)

        # The former NYT node and all its ancestors gain one occurrence.
        self.insert(parent)

    def insert(self, node):
        """Increment the weight of ``node`` and of every ancestor up to the root.

        Before each increment a node that has a parent is first moved to
        the leader position of its weight class (see ``update_leader``).
        """
        current = node
        while True:
            if current.parent:
                self.update_leader(current)

            # Move the node from its old weight class to the next one.
            self.weights[current.weight].remove(current)
            current.weight += 1
            self.upload_weights(current)

            # Read the parent only now: update_leader may have changed it.
            if not current.parent:
                break
            current = current.parent

    def print(self):
        """Print the whole tree starting at the root."""
        self.root.print(0)


def encode_adaptive(text, *, print_tree=True):
    """Encode ``text`` with adaptive Huffman coding and return the bits.

    The first occurrence of a character is written as the current code of
    the NYT node followed by the character's UTF-8 bytes. Every later
    occurrence is written as the character's current code. The tree is
    updated after each character.

    Args:
        text: String to encode.
        print_tree: When true (the default, matching the previous
            behaviour) the final tree is printed to stdout. Pass ``False``
            to suppress the dump, e.g. when timing the encoder or encoding
            large inputs.

    Returns:
        A ``bitarray`` with the encoded text.
    """
    tree = AdaptiveHuffmanTree()
    res = bitarray()
    for char in text:
        if char not in tree.nodes:
            res.extend(tree.get_NYT().get_code())
            res.frombytes(char.encode())
            tree.add(char)
        else:
            node = tree.nodes[char]
            # The code must be read before the tree update changes it.
            res.extend(node.get_code())
            tree.insert(node)
    if print_tree:
        tree.root.print()
    return res


def decode_adaptive(bits):
    """Decode a bit sequence produced by ``encode_adaptive``.

    The decoder rebuilds the tree while reading. Bits are followed from
    the root until a node without a right child is reached. If that node
    is the NYT node, the following 1-4 bytes are read as one UTF-8
    character and added to the tree; otherwise the node's character is
    emitted and its weight is updated.

    Args:
        bits: ``bitarray`` holding the encoded text.

    Returns:
        The decoded string; an empty string for an empty bit sequence.

    Raises:
        ValueError: If no complete, valid UTF-8 character follows an NYT
            code (truncated or corrupted input).
    """
    tree = AdaptiveHuffmanTree()
    current = tree.root
    decoded = []  # joined once at the end instead of repeated ``+=``
    total = len(bits)
    i = 0
    while i < total:
        if current.right:
            current = current.right if bits[i] else current.left
            i += 1
            continue

        if current is not tree.get_NYT():
            char = current.char
            tree.insert(tree.nodes[char])
        else:
            # A UTF-8 character occupies 1 to 4 bytes; take the shortest
            # prefix that decodes. A proper prefix of a multi-byte
            # sequence never decodes, so this yields exactly one character.
            char = None
            for size in range(8, 33, 8):
                if i + size > total:
                    break
                try:
                    char = bits[i:i + size].tobytes().decode()
                except UnicodeDecodeError:
                    continue
                i += size
                break
            if char is None:
                raise ValueError(
                    f"no valid UTF-8 character at bit offset {i}"
                )
            tree.add(char)

        decoded.append(char)
        current = tree.root

    # The last code ends exactly at the end of the input, so its leaf has
    # not been emitted by the loop yet. The NYT node (char None) is skipped,
    # which also makes an empty input decode to an empty string.
    if not current.right and current.char is not None:
        decoded.append(current.char)
    return ''.join(decoded)

In [5]:
# Round-trip demo of the adaptive coder; encode_adaptive also prints the
# final tree before the decoded text is printed.
res = encode_adaptive("aabrc")
print(decode_adaptive(res))

None5
	'a'2
	None3
		'b'1
		None2
			None1
				None0
				'c'1
			'r'1
aabrc


## Tests

In [6]:
# Input files passed to correctness_test and compression_test below.
paths = ["assets/1KB.txt", "assets/10KB.txt", "assets/100KB.txt", "assets/1MB.txt", "assets/guttenberg.txt", "assets/linux.txt", "assets/uniform.txt"]

In [7]:
def correctness_test(input):
    """Check that both coders reproduce the contents of the file ``input``.

    The file is read as text, encoded and decoded with the static and then
    with the adaptive coder, and one line per coder is printed saying
    whether the decoded text equals the original. A failed round trip is
    reported explicitly instead of printing nothing.

    Args:
        input: Path of the text file to test. The name shadows the builtin
            but is kept because it is part of the signature.
    """
    with open(input, "r") as file:
        text = file.read()

    static_tree = StaticHuffmanTree(text)
    res = encode_static(static_tree, text)
    status = "passed" if decode_static(static_tree, res) == text else "FAILED"
    print(f'Static huffman   {status} {input} test')

    res = encode_adaptive(text)
    status = "passed" if decode_adaptive(res) == text else "FAILED"
    print(f'Adaptive huffman {status} {input} test')

In [8]:
# Run the round-trip check on every input file.
for path in paths:
    correctness_test(path)

Static huffman   passed 1KB.txt test
None1024
	None420
		None196
			None93
				None45
					'Q'22
					'n'23
				None48
					'O'23
					None25
						'P'12
						'm'13
			None103
				None51
					'A'25
					'r'26
				None52
					'd'26
					'E'26
		None224
			None109
				None54
					'v'27
					'H'27
				None55
					'G'27
					None28
						'U'13
						'x'15
			None115
				None56
					'p'28
					'L'28
				None59
					'I'29
					None30
						'l'15
						'X'15
	None604
		None269
			None125
				None61
					None30
						'Y'15
						'i'15
					None31
						'T'15
						'F'16
				None64
					None32
						'R'16
						'o'16
					None32
						'j'16
						't'16
			None144
				None71
					None35
						'D'17
						None18
							None7
								None1
									None0
									'\n'1
								'Z'6
							'C'11
					None36
						'c'18
						'a'18
				None73
					None36
						'w'18
						's'18
					None37
						'S'18
						'z'19
		None335
			None162
				None78
					None38
						'K'19
						'N'19
					None40
						

Adaptive huffman passed guttenberg.txt test
Static huffman   passed linux.txt test
None666546
	None271461
		None129183
			None62242
				'\t'30928
				None31314
					'b'15487
					None15827
						None7807
							None3812
								'M'1887
								'4'1925
							'C'3995
						None8020
							'w'3997
							None4023
								None1974
									None954
										None453
											None213
												'~'103
												'Z'110
											'8'240
										'6'501
									None1020
										'|'505
										None515
											'<'245
											'Y'270
								'B'2049
			' '66941
		None142278
			None67972
				't'32597
				'e'35375
			None74306
				None36106
					None17171
						None8301
							'='4085
							None4216
								None2075
									'2'1036
									'G'1039
								None2141
									'3'1068
									'H'1073
						'*'8870
					'p'18935
				None38200
					None18940
						None9240
							None4613
								None2221
									None1108
										'K'532
										'+'576
									'X'1113
								No

In [9]:
def _timed(func, *args):
    """Call ``func(*args)``; return its result and the elapsed seconds rounded to 3 places."""
    start = perf_counter()
    value = func(*args)
    end = perf_counter()
    return value, round(end - start, 3)


def _saved_percent(bits, output, original_size):
    """Write ``bits`` to ``output``; return the percentage saved relative to ``original_size``."""
    with open(output, "wb") as file:
        bits.tofile(file)
    huffman_size = os.path.getsize(output)
    return round((1 - (huffman_size / original_size)) * 100, 2)


def compression_test(input):
    """Measure coding times and compression for the file ``input``.

    Both coders encode and decode the file's text. The encoded bits are
    written to ``result.txt`` (overwritten for each coder) and the size of
    that file is compared with the size of ``input``. For the static coder
    only the encoded bits are written, not the code table. Building the
    static tree is not part of the measured encode time.

    Args:
        input: Path of the text file to compress. The name shadows the
            builtin but is kept because it is part of the signature.

    Returns:
        A list of six numbers: static encode time, static decode time,
        static size reduction in percent, then the same three values for
        the adaptive coder. Times are in seconds.
    """
    output = "result.txt"
    size = os.path.getsize(input)
    with open(input, "r") as file:
        text = file.read()

    # The output file is opened only once the bits exist, so a failing
    # coder does not leave a freshly truncated result file behind.
    static_tree = StaticHuffmanTree(text)
    bits, static_encode = _timed(encode_static, static_tree, text)
    _, static_decode = _timed(decode_static, static_tree, bits)
    static_saved = _saved_percent(bits, output, size)

    bits, adaptive_encode = _timed(encode_adaptive, text)
    _, adaptive_decode = _timed(decode_adaptive, bits)
    adaptive_saved = _saved_percent(bits, output, size)

    return [
        static_encode,
        static_decode,
        static_saved,
        adaptive_encode,
        adaptive_decode,
        adaptive_saved,
    ]


In [10]:
# One table per metric; each row starts with its label and then receives one
# value per input file.
encode = [["Static Huffman"], ["Adaptive Huffman"], ["Static - Adaptive"]]
decode = [["Static Huffman"], ["Adaptive Huffman"], ["Static - Adaptive"]]
compression = [["Static Huffman"], ["Adaptive Huffman"], ["Static - Adaptive"]]


for path in paths:
    # result: [static enc, static dec, static %, adaptive enc, adaptive dec, adaptive %]
    result = compression_test(path)
    encode[0].append(result[0])
    encode[1].append(result[3])
    # Round the differences like the compression row below, so the table
    # does not show float noise such as 0.30000000000000004.
    encode[2].append(round(result[0] - result[3], 3))
    decode[0].append(result[1])
    decode[1].append(result[4])
    decode[2].append(round(result[1] - result[4], 3))
    compression[0].append(str(result[2]) + '%')
    compression[1].append(str(result[5]) + '%')
    compression[2].append(str(round(result[2] - result[5],2)) + '%')


print(tabulate.tabulate(encode,headers=["ENCODE"]+paths, tablefmt="fancy_grid"))
print()
print(tabulate.tabulate(decode,headers=["DECODE"]+paths, tablefmt="fancy_grid"))
print()
print(tabulate.tabulate(compression,headers=["COMPRESSION"]+paths, tablefmt="fancy_grid"))

None1024
	None420
		None196
			None93
				None45
					'Q'22
					'n'23
				None48
					'O'23
					None25
						'P'12
						'm'13
			None103
				None51
					'A'25
					'r'26
				None52
					'd'26
					'E'26
		None224
			None109
				None54
					'v'27
					'H'27
				None55
					'G'27
					None28
						'U'13
						'x'15
			None115
				None56
					'p'28
					'L'28
				None59
					'I'29
					None30
						'l'15
						'X'15
	None604
		None269
			None125
				None61
					None30
						'Y'15
						'i'15
					None31
						'T'15
						'F'16
				None64
					None32
						'R'16
						'o'16
					None32
						'j'16
						't'16
			None144
				None71
					None35
						'D'17
						None18
							None7
								None1
									None0
									'\n'1
								'Z'6
							'C'11
					None36
						'c'18
						'a'18
				None73
					None36
						'w'18
						's'18
					None37
						'S'18
						'z'19
		None335
			None162
				None78
					None38
						'K'19
						'N'19
					None40
						'u'20
						'b'20
				None84
					Non

None666546
	None271461
		None129183
			None62242
				'\t'30928
				None31314
					'b'15487
					None15827
						None7807
							None3812
								'M'1887
								'4'1925
							'C'3995
						None8020
							'w'3997
							None4023
								None1974
									None954
										None453
											None213
												'~'103
												'Z'110
											'8'240
										'6'501
									None1020
										'|'505
										None515
											'<'245
											'Y'270
								'B'2049
			' '66941
		None142278
			None67972
				't'32597
				'e'35375
			None74306
				None36106
					None17171
						None8301
							'='4085
							None4216
								None2075
									'2'1036
									'G'1039
								None2141
									'3'1068
									'H'1073
						'*'8870
					'p'18935
				None38200
					None18940
						None9240
							None4613
								None2221
									None1108
										'K'532
										'+'576
									'X'1113
								None2392
									None1176
										None587
											None287
												None143
	