## Huffman Coding applied in Data Science

#### by Jorge Agustín Erosa Herrera and Saúl Luna Estévez

##### 1. Importing necessary libraries <a class="anchor" id="import-libraries"></a>

In [1]:
import pandas as pd
import numpy as np

##### 2. Opening the file <a class="anchor" id="open-file"></a>

In [2]:
print("English")
with open('GenesisE.txt','r') as file:
    lines = []
    for line in file:
        lines.append(line[:-1])
        
originalE = " ".join(lines)
print("Start of text:",originalE[:100])
print("End of text:", originalE[-100:])

print("Spanish")
with open('GenesisS.txt','r') as file:
    lines = []
    for line in file:
        lines.append(line[:-1])
        
originalS = " ".join(lines)
print("Start of text:",originalS[:100])
print("End of text:", originalS[-100:])

print("Greek")
with open('GenesisG.txt','r') as file:
    lines = []
    for line in file:
        lines.append(line[:-1])
        
originalG = " ".join(lines)
print("Start of text:",originalG[:100])
print("End of text:", originalG[-100:])

English
Start of text: The Beginning In the beginning God created the heavens and the earth. Now the earth was formless and
End of text: l that he had made, and it was very good. And there was evening, and there was morning—the sixth day
Spanish
Start of text: La creación  En el principio creó Dios los cielos y la tierra.  Y la tierra estaba desordenada y vac
End of text: lo que había hecho, y he aquí que era bueno en gran manera. Y fue la tarde y la mañana el día sexto.
Greek
Start of text: ἐν ἀρχῇ ἐποίησεν ὁ θεὸς τὸν οὐρανὸν καὶ τὴν γῆν ἡ δὲ γῆ ἦν ἀόρατος καὶ ἀκατασκεύαστος καὶ σκότος ἐπά
End of text:  εἶδεν ὁ θεὸς τὰ πάντα ὅσα ἐποίησεν καὶ ἰδοὺ καλὰ λίαν καὶ ἐγένετο ἑσπέρα καὶ ἐγένετο πρωί ἡμέρα ἕκτ


##### 3. Setting up a starting dictionary of the frequency of found characters <a class="anchor" id="freq-dict"></a>

In [3]:
#DataCamp, ehm I mean, UPY courses paying off
freqE={key: originalE.count(key) for key in originalE}

print ("Per char frequency in English is :\n {}".format(str(freqE)))

Per char frequency in English is :
 {'T': 5, 'h': 206, 'e': 404, ' ': 769, 'B': 3, 'g': 92, 'i': 186, 'n': 218, 'I': 3, 't': 326, 'b': 29, 'G': 30, 'o': 174, 'd': 221, 'c': 55, 'r': 181, 'a': 277, 'v': 57, 's': 173, '.': 45, 'N': 1, 'w': 63, 'f': 41, 'm': 50, 'l': 98, 'p': 18, 'y': 38, ',': 45, 'k': 31, 'u': 50, 'S': 4, 'A': 24, '“': 16, 'L': 8, '”': 16, '—': 9, ':': 3, '-': 2, 'H': 1, '[': 1, ']': 1, ';': 2, 'R': 1, 'x': 1}


In [4]:
freqS={key: originalS.count(key) for key in originalS}

print ("Per char frequency in Spanish is :\n {}".format(str(freqS)))

Per char frequency in Spanish is :
 {'L': 3, 'a': 386, ' ': 771, 'c': 49, 'r': 189, 'e': 374, 'i': 160, 'ó': 24, 'n': 140, 'E': 6, 'l': 196, 'p': 44, 'o': 196, 'D': 36, 's': 261, 'y': 54, 't': 90, '.': 43, 'Y': 31, 'b': 58, 'd': 104, 'v': 30, 'í': 24, ',': 51, 'f': 22, 'z': 18, 'm': 57, 'u': 127, 'g': 41, 'j': 18, ':': 11, 'S': 1, ';': 10, 'q': 35, 'N': 1, 'h': 22, 'ñ': 13, 'H': 4, 'x': 10, 'C': 1, 'é': 14, 'J': 1, 'ú': 12, 'á': 8, 'T': 1, 'M': 1, 'P': 4, 'F': 2}


In [5]:
freqG={key: originalG.count(key) for key in originalG}

print ("Per char frequency in Greek is :\n {}".format(str(freqG)))

Per char frequency in Greek is :
 {'ἐ': 65, 'ν': 273, ' ': 740, 'ἀ': 17, 'ρ': 115, 'χ': 26, 'ῇ': 1, 'π': 111, 'ο': 169, 'ί': 29, 'η': 48, 'σ': 122, 'ε': 209, 'ὁ': 34, 'θ': 64, 'ὸ': 49, 'ς': 161, 'τ': 288, 'ὐ': 28, 'α': 282, 'κ': 170, 'ὶ': 120, 'ὴ': 7, 'γ': 76, 'ῆ': 57, 'ἡ': 18, 'δ': 33, 'ὲ': 1, 'ἦ': 2, 'ό': 38, 'ύ': 13, 'ά': 42, 'ω': 71, 'β': 6, 'υ': 26, 'ῦ': 39, 'μ': 57, 'φ': 17, 'έ': 81, 'ὕ': 18, 'ἶ': 17, 'ή': 12, 'ῶ': 30, 'ὅ': 9, 'ι': 53, 'λ': 36, 'ώ': 9, 'ὰ': 44, 'ἑ': 17, 'ῳ': 1, 'ἔ': 7, 'ζ': 7, 'ὃ': 3, 'ὑ': 6, 'ἰ': 18, 'ὀ': 1, 'ξ': 12, 'ὤ': 1, 'ῖ': 10, '’': 5, 'ὗ': 2, 'ῷ': 9, 'ὺ': 9, 'ὥ': 2, 'ἄ': 5, 'ψ': 4, 'ᾶ': 6, 'ῴ': 1, 'ἃ': 1, 'ἕ': 2}


In [6]:
english=pd.DataFrame(data = {'Character': list(freqE.keys()),'Frequency': list(freqE.values())}).sort_values(by='Frequency').reset_index().drop(columns='index')
display(english.head(),english.tail())

Unnamed: 0,Character,Frequency
0,x,1
1,],1
2,[,1
3,H,1
4,R,1


Unnamed: 0,Character,Frequency
39,d,221
40,a,277
41,t,326
42,e,404
43,,769


In [7]:
spanish=pd.DataFrame(data = {'Character': list(freqS.keys()),'Frequency': list(freqS.values())}).sort_values(by='Frequency').reset_index().drop(columns='index')
display(spanish.head(),spanish.tail())

Unnamed: 0,Character,Frequency
0,M,1
1,T,1
2,J,1
3,C,1
4,N,1


Unnamed: 0,Character,Frequency
43,l,196
44,s,261
45,e,374
46,a,386
47,,771


In [8]:
greek=pd.DataFrame(data = {'Character': list(freqG.keys()),'Frequency': list(freqG.values())}).sort_values(by='Frequency').reset_index().drop(columns='index')
display(greek.head(),greek.tail())

Unnamed: 0,Character,Frequency
0,ῴ,1
1,ῇ,1
2,ὲ,1
3,ῳ,1
4,ὤ,1


Unnamed: 0,Character,Frequency
65,ε,209
66,ν,273
67,α,282
68,τ,288
69,,740


In [9]:
total_char_freqE=sum(english.Frequency)
print("Total character frequency found in English",total_char_freqE)
total_char_freqS=sum(spanish.Frequency)
print("Total character frequency found in Spanish",total_char_freqS)
total_char_freqG=sum(greek.Frequency)
print("Total character frequency found in Greek",total_char_freqG)

Total character frequency found in English 3978
Total character frequency found in Spanish 3754
Total character frequency found in Greek 4062


In [10]:
total_charE=english.Character.count()
print("Total number of characters found in English (linespace counts as a regular space)", total_charE)
total_charS=spanish.Character.count()
print("Total number of characters found in Spanish (linespace counts as a regular space)", total_charS)
total_charG=greek.Character.count()
print("Total number of characters found in Greek (linespace counts as a regular space)", total_charG)

Total number of characters found in English (linespace counts as a regular space) 44
Total number of characters found in Spanish (linespace counts as a regular space) 48
Total number of characters found in Greek (linespace counts as a regular space) 70


##### 3. Calculating the encoding (Huffman Tree) <a class="anchor" id="huff-tree"></a>

In [11]:
def sorted_tuple(tuple_):
    def getKey(item):
        return item[1]
    return sorted(tuple_, key = getKey)

In [12]:
freq_tuplesE = [(key,freq) for key,freq in zip(english.Character.values,english.Frequency.values)]
encoding_dictE = {key: '' for key in english.Character.values}
freq_tuplesS = [(key,freq) for key,freq in zip(spanish.Character.values,spanish.Frequency.values)]
encoding_dictS = {key: '' for key in spanish.Character.values}
freq_tuplesG = [(key,freq) for key,freq in zip(greek.Character.values,greek.Frequency.values)]
encoding_dictG = {key: '' for key in greek.Character.values}

In [13]:
while True:
    new_node = freq_tuplesE[0:2]
    # asignar el 0 al sub arbol izquierdo
    for char in new_node[0][0]:
        encoding_dictE[char] = '0' + encoding_dictE[char]
        
    # asignar el 1 a los sub arbol derecho
    for char in new_node[1][0]:
        encoding_dictE[char] = '1' + encoding_dictE[char]
    
    
    new_node = (new_node[0][0]+new_node[1][0], new_node[0][1]+new_node[1][1])
    freq_tuplesE = freq_tuplesE[2:]
    freq_tuplesE.append(new_node)
    freq_tuplesE = sorted_tuple(freq_tuplesE)
    if len(freq_tuplesE) == 1:
        break

In [14]:
print(encoding_dictE)

{'x': '101011011000', ']': '101011011001', '[': '101011011010', 'H': '101011011011', 'R': '01110110000', 'N': '01110110001', ';': '10101011110', '-': '10101011111', 'B': '0111011001', ':': '0111011010', 'I': '0111011011', 'S': '1010101110', 'T': '1010110111', 'L': '101010110', '—': '101011010', '”': '01110111', '“': '10101010', 'p': '10101100', 'A': '0111010', 'b': '1010000', 'G': '1010001', 'k': '1010100', 'y': '1010111', 'f': '1110000', ',': '1110001', '.': '1111100', 'm': '1111101', 'u': '011100', 'c': '011110', 'v': '011111', 'w': '101001', 'g': '111001', 'l': '111111', 's': '11010', 'o': '11011', 'r': '11101', 'i': '11110', 'h': '0110', 'n': '1000', 'd': '1001', 'a': '1011', 't': '1100', 'e': '010', ' ': '00'}


In [15]:
while True:
    new_node = freq_tuplesS[0:2]
    # asignar el 0 al sub arbol izquierdo
    for char in new_node[0][0]:
        encoding_dictS[char] = '0' + encoding_dictS[char]
        
    # asignar el 1 a los sub arbol derecho
    for char in new_node[1][0]:
        encoding_dictS[char] = '1' + encoding_dictS[char]
    
    
    new_node = (new_node[0][0]+new_node[1][0], new_node[0][1]+new_node[1][1])
    freq_tuplesS = freq_tuplesS[2:]
    freq_tuplesS.append(new_node)
    freq_tuplesS = sorted_tuple(freq_tuplesS)
    if len(freq_tuplesS) == 1:
        break

In [16]:
print(encoding_dictS)

{'M': '110101010110', 'T': '110101010111', 'J': '111000001000', 'C': '111000001001', 'N': '111000001010', 'S': '111000001011', 'F': '11010101010', 'L': '1010111000', 'H': '1010111001', 'P': '1101010100', 'E': '1110000011', 'á': '101011101', 'x': '110101011', ';': '111000000', ':': '111001110', 'ú': '111001111', 'ñ': '10100100', 'é': '10100101', 'j': '10101111', 'z': '11010100', 'h': '11100001', 'f': '11100110', 'ó': '11101100', 'í': '11101101', 'v': '1010011', 'Y': '1010110', 'q': '1101000', 'D': '1101001', 'g': '1101011', '.': '1110001', 'p': '1110010', 'c': '1110111', ',': '100110', 'y': '100111', 'm': '101000', 'b': '101010', 't': '111010', 'd': '10010', 'u': '11000', 'n': '11001', 'i': '11011', 'r': '0100', 'o': '0101', 'l': '1000', 's': '1011', 'e': '1111', 'a': '011', ' ': '00'}


In [17]:
while True:
    new_node = freq_tuplesG[0:2]
    # asignar el 0 al sub arbol izquierdo
    for char in new_node[0][0]:
        encoding_dictG[char] = '0' + encoding_dictG[char]
        
    # asignar el 1 a los sub arbol derecho
    for char in new_node[1][0]:
        encoding_dictG[char] = '1' + encoding_dictG[char]
    
    
    new_node = (new_node[0][0]+new_node[1][0], new_node[0][1]+new_node[1][1])
    freq_tuplesG = freq_tuplesG[2:]
    freq_tuplesG.append(new_node)
    freq_tuplesG = sorted_tuple(freq_tuplesG)
    if len(freq_tuplesG) == 1:
        break

In [18]:
print(encoding_dictG)

{'ῴ': '101100100010', 'ῇ': '101100100011', 'ὲ': '101100100100', 'ῳ': '101100100101', 'ὤ': '101100100110', 'ἃ': '101100100111', 'ὀ': '111111000110', 'ἕ': '111111000111', 'ἦ': '01101111110', 'ὥ': '01101111111', 'ὗ': '10110010000', 'ὃ': '11111100010', 'ψ': '0110111110', 'ἄ': '1111011000', '’': '1111011001', 'β': '1111011110', 'ὑ': '1111011111', 'ᾶ': '1111110000', 'ζ': '011011100', 'ἔ': '011011101', 'ὴ': '011011110', 'ὅ': '101100101', 'ῷ': '101111000', 'ώ': '101111001', 'ὺ': '101111010', 'ῖ': '101111011', 'ή': '111101101', 'ξ': '111101110', 'ύ': '111111001', 'ἀ': '10001110', 'ἶ': '10001111', 'ἑ': '10110000', 'φ': '10110001', 'ἡ': '10110011', 'ὕ': '10111010', 'ἰ': '10111011', 'υ': '11111101', 'χ': '0101100', 'ὐ': '0101101', 'ί': '0110110', 'ῶ': '1000000', 'δ': '1000001', 'ὁ': '1000110', 'λ': '1011100', 'ό': '1011111', 'ῦ': '1111000', 'ά': '1111001', 'ὰ': '1111010', 'η': '1111100', 'ὸ': '1111101', 'ι': '1111111', 'ῆ': '010111', 'μ': '011010', 'θ': '100001', 'ἐ': '100010', 'ω': '101101', 'γ':

##### 4. Creating columns of found encoding, its character length and total character length  <a class="anchor" id="encoding-len"></a>

In [19]:
english['Encoding']=encoding_dictE.values()
english['Length']=english.Encoding.str.len()
english['Total_Length']=english.Length*english.Frequency

spanish['Encoding']=encoding_dictS.values()
spanish['Length']=spanish.Encoding.str.len()
spanish['Total_Length']=spanish.Length*spanish.Frequency

greek['Encoding']=encoding_dictG.values()
greek['Length']=greek.Encoding.str.len()
greek['Total_Length']=greek.Length*greek.Frequency

In [20]:
display(english.head())
display(spanish.head())
display(greek)

Unnamed: 0,Character,Frequency,Encoding,Length,Total_Length
0,x,1,101011011000,12,12
1,],1,101011011001,12,12
2,[,1,101011011010,12,12
3,H,1,101011011011,12,12
4,R,1,1110110000,11,11


Unnamed: 0,Character,Frequency,Encoding,Length,Total_Length
0,M,1,110101010110,12,12
1,T,1,110101010111,12,12
2,J,1,111000001000,12,12
3,C,1,111000001001,12,12
4,N,1,111000001010,12,12


Unnamed: 0,Character,Frequency,Encoding,Length,Total_Length
0,ῴ,1,101100100010,12,12
1,ῇ,1,101100100011,12,12
2,ὲ,1,101100100100,12,12
3,ῳ,1,101100100101,12,12
4,ὤ,1,101100100110,12,12
...,...,...,...,...,...
65,ε,209,0100,4,836
66,ν,273,1001,4,1092
67,α,282,1010,4,1128
68,τ,288,1100,4,1152


In [21]:
total_encoding_freqE=sum(english.Total_Length)
print("Total encoding frequency in English: ",total_encoding_freqE)
total_encoding_freqS=sum(spanish.Total_Length)
print("Total encoding frequency in Spanish: ",total_encoding_freqS)
total_encoding_freqG=sum(greek.Total_Length)
print("Total encoding frequency in Greek: ",total_encoding_freqG)

Total encoding frequency in English:  16980
Total encoding frequency in Spanish:  16205
Total encoding frequency in Greek:  19704


##### 5. "Compressing"  <a class="anchor" id="compress"></a>

In [22]:
#hash-ish
code_dictE=pd.Series(english.Encoding.values,index=english.Character.values).to_dict()
print(code_dictE)
code_dictS=pd.Series(spanish.Encoding.values,index=spanish.Character.values).to_dict()
print(code_dictS)
code_dictG=pd.Series(greek.Encoding.values,index=greek.Character.values).to_dict()
print(code_dictG)

{'x': '101011011000', ']': '101011011001', '[': '101011011010', 'H': '101011011011', 'R': '01110110000', 'N': '01110110001', ';': '10101011110', '-': '10101011111', 'B': '0111011001', ':': '0111011010', 'I': '0111011011', 'S': '1010101110', 'T': '1010110111', 'L': '101010110', '—': '101011010', '”': '01110111', '“': '10101010', 'p': '10101100', 'A': '0111010', 'b': '1010000', 'G': '1010001', 'k': '1010100', 'y': '1010111', 'f': '1110000', ',': '1110001', '.': '1111100', 'm': '1111101', 'u': '011100', 'c': '011110', 'v': '011111', 'w': '101001', 'g': '111001', 'l': '111111', 's': '11010', 'o': '11011', 'r': '11101', 'i': '11110', 'h': '0110', 'n': '1000', 'd': '1001', 'a': '1011', 't': '1100', 'e': '010', ' ': '00'}
{'M': '110101010110', 'T': '110101010111', 'J': '111000001000', 'C': '111000001001', 'N': '111000001010', 'S': '111000001011', 'F': '11010101010', 'L': '1010111000', 'H': '1010111001', 'P': '1101010100', 'E': '1110000011', 'á': '101011101', 'x': '110101011', ';': '111000000'

In [23]:
encoded_textE=originalE
encoded2textE=originalE
for i in code_dictE:
    encoded2textE = encoded2textE.replace(i, code_dictE[i])
    encoded_textE = encoded_textE.replace(i, code_dictE[i])

encoded_textS=originalS
encoded2textS=originalS
for i in code_dictS:
    encoded2textS = encoded2textS.replace(i, code_dictS[i])
    encoded_textS = encoded_textS.replace(i, code_dictS[i])
    
encoded_textG=originalG
encoded2textG=originalG
for i in code_dictG:
    encoded2textG = encoded2textG.replace(i, code_dictG[i])
    encoded_textG = encoded_textG.replace(i, code_dictG[i])

In [24]:
print("Character frequency found in compressed English text",len(encoded_textE))
print("Character frequency found in compressed English text",len(encoded_textS))
print("Character frequency found in compressed English text",len(encoded_textG))

Character frequency found in compressed English text 16980
Character frequency found in compressed English text 16205
Character frequency found in compressed English text 19704


In [25]:
def to_bytes(bits, size=8, pad='0'):
    chunks = [bits[n:n+size] for n in range(0, len(bits), size)]
    if pad:
        chunks[-1] = chunks[-1].ljust(size, pad)
    return bytearray([int(c, 2) for c in chunks])

In [26]:
byte_arrayE = to_bytes(encoded2textE)
byte_arrayS = to_bytes(encoded2textS)
byte_arrayG = to_bytes(encoded2textG)

In [27]:
print("English Compressed File Size:",len(byte_arrayE))
print("Spanish Compressed File Size:",len(byte_arrayS))
print("Greek Compressed File Size:",len(byte_arrayG))

English Compressed File Size: 2123
Spanish Compressed File Size: 2026
Greek Compressed File Size: 2463


In [28]:
newFile = open("mean/binaryE.bin", "wb")
newFile.write(byte_arrayE)
newFile = open("mean/binaryS.bin", "wb")
newFile.write(byte_arrayS)
newFile = open("mean/binaryG.bin", "wb")
newFile.write(byte_arrayG)

2463

##### 6. "Decompressing"  <a class="anchor" id="decompress"></a>

In [29]:
decoded_textE = ''
for i in range(len(originalE)):
    for char, code in code_dictE.items():
        try:
            post = encoded_textE.index(code)
            if post == 0:
                decoded_textE = decoded_textE + char
                encoded_textE = encoded_textE[len(code):]
        except:
            continue

In [30]:
decoded_textS = ''
for i in range(len(originalS)):
    for char, code in code_dictS.items():
        try:
            post = encoded_textS.index(code)
            if post == 0:
                decoded_textS = decoded_textS + char
                encoded_textS = encoded_textS[len(code):]
        except:
            continue

In [31]:
decoded_textG = ''
for i in range(len(originalG)):
    for char, code in code_dictG.items():
        try:
            post = encoded_textG.index(code)
            if post == 0:
                decoded_textG = decoded_textG + char
                encoded_textG = encoded_textG[len(code):]
        except:
            continue

In [32]:
print("Character frequency found in English decompressed file",len(decoded_textE))
print("Character frequency found in Spanish decompressed file",len(decoded_textS))
print("Character frequency found in Greek decompressed file",len(decoded_textG))

Character frequency found in English decompressed file 3978
Character frequency found in Spanish decompressed file 3754
Character frequency found in Greek decompressed file 4062


In [33]:
print("Total character frequency found in English",total_char_freqE)
print("Total character frequency found in Spanish",total_char_freqS)
print("Total character frequency found in Greek",total_char_freqG)

Total character frequency found in English 3978
Total character frequency found in Spanish 3754
Total character frequency found in Greek 4062
