In [98]:
from heapq import heapify, heappop, heappush
# Given the frequency of each symbol,
# compute the minimum average code length achieved by Huffman coding.

class Node(object):
    '''
    One node of a Huffman tree.

    symbol -- label stored on the node; merged (internal) nodes are created
              with None here
    freq   -- weight used to order nodes
    left / right -- child nodes, both None until assigned by the caller
    '''

    def __init__(self, symbol, freq):
        self.symbol, self.freq = symbol, freq
        self.left = self.right = None

    def __lt__(self, other):
        # heapq orders its items with `<`; defining it on freq lets Node
        # objects be stored in a heap directly. Returns a bool.
        return self.freq < other.freq

class Huffman(object):
    '''
    Huffman coder driven by a frequency file.

    The file at `path` holds one integer frequency per line. The first line is
    skipped (presumably a header such as the symbol count -- TODO confirm
    against the data files). Every remaining non-blank line becomes a symbol
    named by its zero-based position, as a string ('0', '1', ...).

    Typical use: make_heap(), merge_node(), encode(), then avg_length().
    '''

    def __init__(self, path):
        self.path = path
        self.heap = []          # min-heap of Node objects, ordered by freq
        self.codes = {}         # symbol -> bit string
        self.reverse_map = {}   # bit string -> symbol

    def _read_freqs(self):
        '''
        Return the frequencies stored in the file as a list of ints.

        The first line is skipped and blank lines are ignored. The file is
        closed before returning.
        '''
        with open(self.path, 'r') as file:
            lines = file.readlines()[1:]
        return [int(line) for line in lines if line.strip()]

    def freq_dict(self):
        '''
        Create a frequency dictionary based on the file.

        Returns a dict mapping symbol name (str) to frequency (int).
        '''
        return {str(i): f for i, f in enumerate(self._read_freqs())}

    def make_heap(self):
        '''
        Create the heap of leaf nodes based on the file.

        The heap is rebuilt from scratch, so calling this again (e.g. when a
        notebook cell is re-run) does not duplicate leaves.
        '''
        self.heap = [Node(str(i), f) for i, f in enumerate(self._read_freqs())]
        heapify(self.heap)

    def merge_node(self):
        '''
        Merge nodes according to the Huffman coding rule.

        Repeatedly replaces the two lowest-frequency nodes with a parent whose
        frequency is their sum, until at most one node (the root) is left.
        '''
        while len(self.heap) > 1:
            node1 = heappop(self.heap)
            node2 = heappop(self.heap)
            merged = Node(None, node1.freq + node2.freq)

            merged.left = node1
            merged.right = node2

            heappush(self.heap, merged)

    def __code_helper(self, root, current_code):
        '''Record a code for every leaf below `root`, prefixed by `current_code`.'''
        if root is None:
            return None

        # Only leaves carry a symbol; merged nodes are created with None.
        if root.symbol is not None:
            self.codes[root.symbol] = current_code
            self.reverse_map[current_code] = root.symbol
            return None

        # The neat part: the code grows by one bit per step down the tree,
        # '0' for a left branch and '1' for a right branch.
        self.__code_helper(root.left, current_code + '0')
        self.__code_helper(root.right, current_code + '1')

    def encode(self):
        '''
        Fill `codes` and `reverse_map` from the tree held in the heap.

        The root is popped, so the heap is empty afterwards.

        Raises IndexError if the heap is empty.
        '''
        if not self.heap:
            raise IndexError('cannot encode: the heap is empty (call make_heap() first)')

        # Does nothing if the caller already merged; otherwise it prevents
        # encoding just the smallest leaf of an unmerged heap.
        self.merge_node()

        root = heappop(self.heap)
        # A tree that is a single leaf would otherwise get the empty string
        # as its code, which cannot be decoded; give it one bit instead.
        current_code = '0' if root.symbol is not None else ''
        self.__code_helper(root, current_code)

    def avg_length(self):
        '''
        Calculate the average encoding length.

        Returns the frequency-weighted mean of the code lengths, using the
        frequencies re-read from the file. Prints a message and returns None
        when encode() has not produced any codes yet.
        '''
        if len(self.codes) == 0:
            print('No valid encoding !!!')
            return None
        freqs = self.freq_dict()
        acc_len = sum(len(code) * freqs[symb] for symb, code in self.codes.items())
        freq_sum = sum(freqs[symb] for symb in self.codes)

        return acc_len / freq_sum

In [51]:
# Build the leaf heap for the first test file by hand, then give it to a
# Huffman instance so the next cell can merge and encode it.
# NOTE(review): unlike Huffman.make_heap, this loop does not skip the first
# line -- confirm that huffman_test_1.txt has no header row.
path = 'huffman_test_1.txt'
fs = []
freq_dict = {}
with open(path, 'r') as file:
    for i, f in enumerate(file.readlines()):
        fs.append(Node(str(i), int(f)))
        freq_dict[str(i)] = int(f)

heapify(fs)
# Huffman takes the file path; the prepared heap is attached separately.
huffman = Huffman(path)
huffman.heap = fs

In [52]:
# Collapse the heap into a single tree, then walk it to fill
# huffman.codes and huffman.reverse_map.
huffman.merge_node()
huffman.encode()

In [99]:
# Full pipeline on the second test file: load the frequencies into a heap,
# merge the heap into one tree, then assign a bit string to every symbol.
path = './huffman_test_2.txt'
huffman = Huffman(path)
huffman.make_heap()
huffman.merge_node()
huffman.encode()

In [90]:
# Frequency-weighted mean code length of the encoding built above.
huffman.avg_length()

3.6681626187961984

In [91]:
# Symbol -> bit-string table filled in by encode().
huffman.codes

{'9': '000',
 '4': '0010',
 '12': '0011',
 '5': '010',
 '0': '011',
 '14': '100',
 '3': '101',
 '1': '110000',
 '8': '110001',
 '11': '11001',
 '7': '1101',
 '10': '1110',
 '13': '11110',
 '6': '111110',
 '2': '111111'}

In [84]:
# Symbol -> frequency mapping, re-read from the file at huffman.path.
huffman.freq_dict()

{'0': 10,
 '1': 37,
 '2': 59,
 '3': 43,
 '4': 27,
 '5': 30,
 '6': 96,
 '7': 96,
 '8': 71,
 '9': 8,
 '10': 76}

In [93]:
# Dump the instance attributes; 'heap' is empty because encode() pops the root.
vars(huffman)

{'path': './huffman_test_2.txt',
 'heap': [],
 'codes': {'9': '000',
  '4': '0010',
  '12': '0011',
  '5': '010',
  '0': '011',
  '14': '100',
  '3': '101',
  '1': '110000',
  '8': '110001',
  '11': '11001',
  '7': '1101',
  '10': '1110',
  '13': '11110',
  '6': '111110',
  '2': '111111'},
 'reverse_map': {'000': '9',
  '0010': '4',
  '0011': '12',
  '010': '5',
  '011': '0',
  '100': '14',
  '101': '3',
  '110000': '1',
  '110001': '8',
  '11001': '11',
  '1101': '7',
  '1110': '10',
  '11110': '13',
  '111110': '6',
  '111111': '2'}}

## Notes:
Huffman 编码实现的精髓在于 `__code_helper()` 这个函数。它解决了从根到叶生成编码的问题，做法是递归地遍历树：每向左子节点走一步，就在当前编码后追加 `'0'`；每向右子节点走一步，就追加 `'1'`；到达叶子节点时，沿途累积的字符串就是该符号的编码。

## Reference:
1. Huffman coding: http://bhrigu.me/blog/2017/01/17/huffman-coding-python-implementation/
2. Enable heap to compare objects: https://stackoverflow.com/questions/3954530/how-to-make-heapq-evaluate-the-heap-off-of-a-specific-attribute