In [21]:
import heapq
from urllib.request import urlopen
import shutil
import gzip
import os
from collections import defaultdict
from bitarray import bitarray
import pickle
import math


# Download the file if need be:
def download_file(url, filename):
    if not os.path.exists(filename):
        response = urlopen(url + filename)
        shutil.copyfileobj(
            gzip.GzipFile(fileobj=response), open(filename, 'wb'))


# build a frequency table:
def build_freq(filename):
    freq = defaultdict(int)
    with open(filename, 'rb') as f:
        for line in f:
            for char in line:
                freq[char] += 1
    total = float(sum(freq.values()))
    return {char: count / total for (char, count) in freq.items()}

def min_heapify(arr, i):
    l = 2*i + 1
    r = 2*i + 2
    
    min_idx = i
    if l < len(arr) and arr[l] < arr[i]:
        min_idx = l
    if r < len(arr) and arr[r] < arr[min_idx]:
        min_idx = r
        
    if min_idx != i:
        arr[min_idx], arr[i] = arr[i], arr[min_idx]
        min_heapify(arr, min_idx)
    
def build_min_heap(arr):
    for i in range(math.floor(len(arr) / 2), -1, -1):
        min_heapify(arr, i)
        
def heap_pop(arr):
    min_val = arr[0]
    arr[0], arr[-1] = arr[-1], arr[0]
    arr.pop()
    min_heapify(arr, 0)
    
    return min_val

def heap_push(arr, key):
    arr.append(key)
    arr[0], arr[-1] = arr[-1], arr[0]
    min_heapify(arr, 0)
    
class Node:
    symb = ''
    left = None
    right = None
    freq = 0
    code = '0'
    
    def __lt__(self, other):
        return self.freq < other.freq
    
    def __str__(self):
        return "\nCharacter: " + str(self.symb) + "; Code: " + str(self.code)

def huff_code(z):
    if z.left != None:
        z.left.code = bitarray(z.code + '0')
        huff_code(z.left)
    if z.right != None:
        z.right.code = bitarray(z.code + '1')
        huff_code(z.right)
    
def inorder(root, dct):
    if root != None:
        inorder(root.left, dct)
        if root.symb != '':
            dct[root.symb] = root.code
        inorder(root.right, dct)    
    
# Now build the Huffman encoding:
def encode(symb2freq):
    heap_arr = []
    for key in symb2freq:
        curChar = Node()
        curChar.symb = key
        curChar.freq = symb2freq[key]
        heap_arr.append(curChar)
    build_min_heap(heap_arr)
    
    for i in range(len(heap_arr) - 1):
        z = Node()
        z.left = heap_pop(heap_arr)
        z.right = heap_pop(heap_arr)
        z.freq = z.left.freq + z.right.freq
        heap_push(heap_arr, z)
        
    root = heap_pop(heap_arr)
    huff_code(root)
    symb2code = {}
    inorder(root, symb2code)
    print("symb2code:", symb2code)
    
    return symb2code


# Now compress the file:
def compress(filename, encoding, compressed_name=None):
    if compressed_name is None:
        compressed_name = filename + ".huff"
    output = bitarray()
    with open(filename, 'rb') as f:
        for line in f:
            for char in line:
                output.extend(encoding[char])
    N = len(output)
    with open(compressed_name, 'wb') as f:
        pickle.dump(N, f)
        pickle.dump(encoding, f)
        output.tofile(f)


# Now decompress the file:
def decompress(filename, decompressed_name=None):
    if decompressed_name is None:
        decompressed_name = filename + ".dehuff"
    with open(filename, 'rb') as f:
        N = pickle.load(f)
        encoding = pickle.load(f)
        bits = bitarray()
        bits.fromfile(f)
        bits = bits[:N]

    # Totally cheating here and using a builtin method:
    output = bits.decode(encoding)
    with open(decompressed_name, 'wb') as f:
        f.write(bytes(output))


url = "http://www.gutenberg.org/ebooks/"
filename = "100.txt.utf-8"

download_file(url, filename)
freq = build_freq(filename)
encoding = encode(freq)
compress(filename, encoding)
decompress(filename + ".huff")
# Do you get identical files?

symb2code: {32: bitarray('000'), 44: bitarray('0010000'), 84: bitarray('00100010'), 67: bitarray('001000110'), 40: bitarray('00100011100000'), 88: bitarray('00100011100001'), 81: bitarray('0010001110001'), 113: bitarray('001000111001'), 120: bitarray('00100011101'), 89: bitarray('0010001111'), 39: bitarray('00100100'), 59: bitarray('001001010'), 87: bitarray('001001011'), 45: bitarray('0010011000'), 49: bitarray('0010011001000'), 57: bitarray('0010011001001'), 58: bitarray('001001100101'), 86: bitarray('00100110011'), 66: bitarray('001001101'), 107: bitarray('00100111'), 108: bitarray('001010'), 102: bitarray('0010110'), 118: bitarray('00101110'), 83: bitarray('00101111'), 97: bitarray('00110'), 117: bitarray('001110'), 103: bitarray('0011110'), 73: bitarray('0011111'), 110: bitarray('010000'), 104: bitarray('010001'), 85: bitarray('0100100000'), 75: bitarray('01001000010'), 34: bitarray('010010000110000'), 60: bitarray('010010000110001'), 62: bitarray('010010000110010'), 50: bitarray(

#### 2. Run the code and compress the complete works of Shakespeare. How big is the original file, what is the size of the compressed version? When the compressed version is decompressed is it identical to the original?
4 MB vs 6.3 MB

#### 3. Compress the complete works of Shakespeare using other compression methods and see what a poor job they do (eg. gzip, bzip2, zip). Bear in mind that we are using the optimal symbol code with frequencies constructed from the exact corpus that we are looking to compress, so nothing can beat us right?