In [7]:
import heapq
import math
from collections import Counter

In [35]:
def load_text(filename):
    """Read a text file from the ``../Data`` directory and return its contents.

    Args:
        filename: Name of the file inside ``../Data`` (relative to the
            current working directory).

    Returns:
        The whole file as a single ``str``. The file is decoded with the
        ``utf-8-sig`` codec, so a leading UTF-8 byte-order mark is dropped.
    """
    path = f"../Data/{filename}"
    with open(path, mode='r', encoding='utf-8-sig') as handle:
        return handle.read()

In [121]:
def entropy(text):
    """Return the Shannon entropy of ``text`` in bits per symbol.

    Computes ``H = -sum(p_i * log2(p_i))`` where ``p_i`` is the relative
    frequency of each distinct symbol in ``text``.

    Args:
        text: A sized iterable of hashable symbols (typically a ``str``).

    Returns:
        The entropy as a non-negative ``float``. Empty input and input made
        of a single repeated symbol both yield ``0.0``.
    """
    counts = Counter(text)
    n = len(text)
    total = 0.0
    for count in counts.values():
        p = count / n
        # Subtracting each term (instead of negating the final sum) keeps the
        # result at +0.0 rather than -0.0 when every term is zero.
        # math.log2 is more accurate than math.log(p, 2).
        total -= p * math.log2(p)
    return total

In [37]:
# Load the Czech sample and compute its character-level entropy (log base 2).
text = load_text("czech.txt")
# Bare last expression: the returned value is shown as this cell's output.
entropy(text)

5.067659446102941

In [123]:
class Node:
    """A node of the binary tree built by ``encode_huffman``."""

    def __init__(self, key, probability):
        # Symbol represented by a leaf; internal nodes created by
        # encode_huffman use None.
        self.key = key
        # Relative frequency of the symbol (leaf) or the sum of the
        # children's probabilities (internal node).
        self.probability = probability
        # Bit string ('0'/'1' characters) assigned to a leaf once the tree
        # has been walked; stays "" on internal nodes.
        self.encoded = ""
        # Tree links; all None until the node is attached to a tree.
        self.parent = None
        self.left = None
        self.right = None


def encode_huffman(input):
    """Huffman-encode ``input`` and return the result as a string of bits.

    A code tree is built from the symbol frequencies of ``input`` by
    repeatedly merging the two lightest nodes. Each symbol is then replaced
    by its code word, written with the characters ``'0'`` (left branch) and
    ``'1'`` (right branch).

    Args:
        input: A sized, re-iterable sequence of hashable symbols, typically
            a ``str``.

    Returns:
        A ``str`` consisting only of ``'0'`` and ``'1'``: the concatenation
        of the code words of all symbols in ``input``. Empty input gives
        ``""``. Input with a single distinct symbol is encoded with the
        one-bit code ``"0"`` per symbol, so the output length still reflects
        the input length.
    """
    counter = Counter(input)
    if not counter:
        return ""
    n = len(input)

    # One leaf per distinct symbol. Heap entries are
    # (integer count, -creation order, node): integer counts keep the merge
    # order exact, and the negated creation order breaks ties in favour of
    # the most recently created node while guaranteeing that two Node
    # objects are never compared with each other.
    leaves = {}
    heap = []
    for order, (symbol, count) in enumerate(counter.items()):
        leaf = Node(symbol, count / n)
        leaves[symbol] = leaf
        heap.append((count, -order, leaf))
    heapq.heapify(heap)
    next_order = len(heap)

    # Merge the two lightest nodes until only the root is left.
    while len(heap) >= 2:
        weight_b, _, node_b = heapq.heappop(heap)  # lightest
        weight_a, _, node_a = heapq.heappop(heap)  # second lightest
        parent = Node(None, node_a.probability + node_b.probability)
        parent.left = node_a
        parent.right = node_b
        node_a.parent = parent
        node_b.parent = parent
        heapq.heappush(heap, (weight_a + weight_b, -next_order, parent))
        next_order += 1

    root = heap[0][2]
    if root.left is None:
        # Only one distinct symbol: the root is itself a leaf. Give it a
        # one-bit code instead of the empty string.
        root.encoded = "0"
    else:
        # Iterative walk (no recursion limit to worry about): extend the
        # prefix with '0' going left and '1' going right.
        pending = [(root, "")]
        while pending:
            node, prefix = pending.pop()
            if node.left is None:
                node.encoded = prefix
                continue
            pending.append((node.right, prefix + "1"))
            pending.append((node.left, prefix + "0"))

    return "".join(leaves[symbol].encoded for symbol in input)
        
#len(encode_huffman("factory function for creating tuple subclasses with named fields"))
#len(encode_huffman(load_text("czech.txt")[:100]))

In [143]:
# Sample texts to compare; load_text looks each name up under ../Data/.
files = ['czech.txt', 'german.txt', 'english.txt', 'french.txt', 'hungarian.txt']

# Per file, print one left-aligned row:
#   FileSize = number of characters in the decoded text (len of the str),
#   Enc.Size = length of the Huffman output, i.e. one character per bit,
#   Entropy  = entropy(text) in bits per character, to two decimals.
# Note that FileSize (characters) and Enc.Size (bits) are in different units.
for file in files:
    text = load_text(file)
    ent = entropy(text)
    encoded = encode_huffman(text)
    
    print(f"File = {file:<15} FileSize = {len(text):<10} Enc.Size = {len(encoded):<10} Entropy = {ent:<10.2f}")

File = czech.txt       FileSize = 136448     Enc.Size = 695565     Entropy = 5.07      
File = german.txt      FileSize = 218751     Enc.Size = 1040648    Entropy = 4.73      
File = english.txt     FileSize = 146480     Enc.Size = 684343     Entropy = 4.65      
File = french.txt      FileSize = 143274     Enc.Size = 709381     Entropy = 4.91      
File = hungarian.txt   FileSize = 194416     Enc.Size = 942106     Entropy = 4.82      
