In [1]:
# Huffman Coding is for decompression of string

# Characters with higher frequency will have more space than characters with lower frequency

In [2]:
# Different data structures in Huffman Coding

# A HashMap to store frequencies
# A tree
# A MinHeap to store minimum frequency
# A HashMap for character, bits

# "a" -> 00
# "d" -> 11
# "b" -> 101
# "c" -> 1001
# "x" -> 10000

# Please note that 0 and 1 are bits and not characters or integers

# The codes above are prefix free codes because all these characters are leaf nodes. For example, if 'a' is 00 and 'a' is a leaf node,
# then no other character can start with 00.
# No node is below another node. The nodes are either on the right or left of each other.

In [3]:
# Implement HuffmanCoding

In [40]:
import heapq
import os

class BinaryTreeNode:
    def __init__(self, value, freq):
        self.value = value
        self.freq  = freq
        self.left = None
        self.right = None
    
    def __lt__(self, other):
        return self.freq < other.freq
    
    def __gt__(self, other):
        return self.freq > other.freq
    
    def __eq__(self, other):
        return self.freq == other.freq

class HuffmanCoding:
    def __init__(self, path):
        self.path = path
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}
    
    def __make_frequency_dict(self, text):
        
        freq_dict = {}
        for letter in text:
            freq_dict[letter] = freq_dict.get(letter, 0) + 1
        
        return freq_dict
    
    def __buildHeap(self, freq_dict):
        
        for key in freq_dict:
            value = freq_dict[key]
            node = BinaryTreeNode(key, value)
            heapq.heappush(self.__heap, node)
    
    def __buildTree(self):
        while len(self.__heap) > 1:
            node1 = heapq.heappop(self.__heap)
            node2 = heapq.heappop(self.__heap)
            freq = node1.freq + node2.freq
            node3 = BinaryTreeNode(None, freq)
            node3.left = node1
            node3.right = node2
            heapq.heappush(self.__heap, node3)
        return
    
    def __buildCodesHelper(self, root, code):
        
        if root is None:
            return
        if root.value is not None:
            self.__codes[root.value] = code
            self.__reverseCodes[code] = root.value
            return
        self.__buildCodesHelper(root.left, code + "0")
        self.__buildCodesHelper(root.right, code + "1")
            
    def __buildCodes(self):
        root = heapq.heappop(self.__heap)
        self.__buildCodesHelper(root, "")
    
    def __getEncodedText(self, text):
        encoded_text = ""
        for char in text:
            encoded_text += self.__codes[char]
        
        return encoded_text
        
    def __getPaddedEncodedText(self, encoded_text):
        l = len(encoded_text)
        r = 8 - (l % 8)
        padded_encoded_text = encoded_text + ("0"*r)
        binary = "{0:08b}".format(r)
        padded_encoded_text = binary + padded_encoded_text
        return padded_encoded_text
    
    def __getBytesArray(self, padded_encoded_text):
        bytes_array = []
        for i in range(0, len(padded_encoded_text), 8):
            byte = padded_encoded_text[i:i+8]
            num = int(byte, 2)
            bytes_array.append(num)
        return bytes_array
            
    def compress(self):
        
        file_name, file_extention = os.path.splitext(self.path)
        output_path = file_name + ".bin"
        
        # 'r+' format is read and write. 'wb' is 'write in binary' format
        with open(self.path, 'r+') as file, open(output_path, 'wb') as output:
            text = file.read() # if you want to remove whitespace from both the left and right ends of the string, 
            # you can use the strip() method instead of rstrip(). If you only want to remove whitespace from the left end of the 
            # string, you can use the lstrip() method.
            text = text.rstrip()
            freq_dict = self.__make_frequency_dict(text)
            self.__buildHeap(freq_dict)
            self.__buildTree()
            self.__buildCodes()
            encoded_text = self.__getEncodedText(text)
            padded_encoded_text = self.__getPaddedEncodedText(encoded_text)
            bytes_array = self.__getBytesArray(padded_encoded_text)
            final_bytes = bytes(bytes_array)
            output.write(final_bytes)
        
        print("Compressed")
        return output_path
    
    def __removePadding(self, bit_string):
        padding = bit_string[0:8]
        padding_num = int(padding, 2)
        actual_text = bit_string[8:(-1*padding_num)]
        return actual_text
    
    def __decode(self, actual_text):
        decoded_text = ""
        current_bits = ""
        for bit in actual_text:
            current_bits += bit
            if current_bits in self.__reverseCodes:
                character = self.__reverseCodes[current_bits]
                decoded_text += character
                current_bits = ""
            
        return decoded_text
    
    # In decompress function, we read the binary file created by compress function and convert it into the 
    # actual decompressed original text
    def decompress(self, input_path): # Here the input path is the path of the binary file created by compress function
        filename, file_extention = os.path.splitext(self.path)
        output_path = filename + "_decompressed" + ".txt"
        # 'rb' is read binary format
        with open(input_path, 'rb') as file, open(output_path, 'w') as output:
            bit_string = ""
            byte = file.read(1) # Reads the first byte
            while byte:
                byte = ord(byte) # Gives the ascii value of the byte
                bits = bin(byte)[2:].rjust(8, '0') # bin converts the number into the binary format
                bit_string += bits
                byte = file.read(1) # Reads the byte one by one. When the byte becomes null (reaches the end), the loop stops.
            actual_text = self.__removePadding(bit_string)
            decoded_text = self.__decode(actual_text)
            output.write(decoded_text)
        return

In [41]:
h = HuffmanCoding("sample.txt")
output_path = h.compress()
h.decompress(output_path) # Here the output path is the path of the binary file created by compress function

Compressed


In [18]:
bin(5)

'0b101'

In [20]:
int('0001',2)

1

In [27]:
a = 'abvffdfrgrgrfgfg11111111'
a[8:-3]

'grgrfgfg11111'

In [30]:
a = {'a':5,'b':6,'c':"10"}
a.get_key(5)


True

In [31]:
a.get_key(5)

AttributeError: 'dict' object has no attribute 'get_key'