In [1]:
from Project2Tools import *
# Project2Tools contains functions to perform the bulk tasks, and contains the main algorithms
import numpy as np
# numpy contains many functions for mathematical operations, 
# which are vectorised and therefore can operate on entire Data-Series objects (DataFrame columns)
import pandas as pd
# Provides access to an R-like DataFrame object, great for working with tabular data
import string
# gives access to lists of characters, including ascii_lowercase which we will use throughout

# SECTION 1: Huffman Codes for Several Languages

## Part 1.1: Reading In The Data

In [2]:
# Load the per-language letter-frequency table; the CSV's first column becomes the row index.
df = pd.read_csv('LetterFrequenciesFromWiki.csv', index_col=0)
# Every column after the first one holds the frequencies for one language.
print('Available Languages: {}'.format(df.columns[1:].tolist()))
df.head()

Available Languages: ['English', 'French', 'German', 'Spanish', 'Portuguese', 'Esperanto', 'Italian', 'Turkish', 'Swedish', 'Polish', 'Dutch', 'Danish', 'Icelandic', 'Finnish', 'Czech']


Unnamed: 0,Letter,English,French,German,Spanish,Portuguese,Esperanto,Italian,Turkish,Swedish,Polish,Dutch,Danish,Icelandic,Finnish,Czech
0,a,0.081671,0.078537,0.066708,0.118787,0.152593,0.124111,0.118743,0.135727,0.098191,0.113743,0.074743,0.062115,0.11667,0.127287,0.10513
1,b,0.01492,0.009267,0.019308,0.02283,0.010876,0.010038,0.009372,0.029877,0.016063,0.018843,0.015815,0.020619,0.012036,0.002928,0.010262
2,c,0.02782,0.033529,0.027969,0.041424,0.040479,0.007948,0.045506,0.015369,0.015551,0.042181,0.012401,0.005825,0.0,0.002928,0.009238
3,d,0.04253,0.037736,0.051966,0.051638,0.052053,0.031179,0.037771,0.05469,0.049205,0.04034,0.059237,0.060393,0.018176,0.010867,0.043383
4,e,0.127021,0.151345,0.167856,0.125549,0.131071,0.092134,0.119218,0.104127,0.106207,0.079619,0.188804,0.159313,0.074064,0.083017,0.094406


## Part 1.2: Calculating The Binary Entropies

In [3]:
# One value per language, as returned by Project2Tools.binary_entropies.
bin_ents = binary_entropies(df)
for language, entropy in bin_ents.items():
    print(language, entropy)

English 4.175787122056571
French 4.043449697747604
German 4.083791709001442
Spanish 4.06666650808618
Portuguese 3.9776461867774056
Esperanto 4.038832760051349
Italian 3.9816664458574564
Turkish 4.053289589591276
Swedish 4.159418764371151
Polish 4.242104940268861
Dutch 4.069551191418464
Danish 4.094787316514763
Icelandic 4.092024052016186
Finnish 3.9619711342068595
Czech 4.121934522161667


## Part 1.3: Constructing Binary Huffman Codes From Given Frequencies

In [4]:
# Build one binary (d=2) Huffman code per language column.
codes = {}
for lang in df.columns[1:]:
    codes[lang] = Huffman_encode_language(df, lang, d=2)

# Keep only the first column ('Letter'); a '<language>_codes' column is appended per language below.
df_binary_codes = df.copy().drop(df.columns[1:], axis=1)
for name, code in codes.items():
    # code[0] maps a letter to its codeword; code[1] is the value printed as 'AWL'.
    codewords, awl = code[0], code[1]
    entropy = bin_ents[name]
    rel_err_percent = 100*(awl-entropy)/entropy
    print(name, '\nAWL: ', awl, '\nBE: ', entropy, '\nRel.Err.: ', rel_err_percent, '%\n')
    df_binary_codes[name+'_codes'] = [codewords[letter] for letter in df_binary_codes.Letter]

English 
AWL:  4.205062050620507 
BE:  4.175787122056571 
Rel.Err.:  0.701063720640963 %

French 
AWL:  4.081581437446003 
BE:  4.043449697747604 
Rel.Err.:  0.9430496865001105 %

German 
AWL:  4.115869327081561 
BE:  4.083791709001442 
Rel.Err.:  0.7854861453735188 %

Spanish 
AWL:  4.09769949083713 
BE:  4.06666650808618 
Rel.Err.:  0.7631061629775527 %

Portuguese 
AWL:  4.005359637963754 
BE:  3.9776461867774056 
Rel.Err.:  0.6967299222961151 %

Esperanto 
AWL:  4.071094950322647 
BE:  4.038832760051349 
Rel.Err.:  0.798799855998183 %

Italian 
AWL:  4.011656944121483 
BE:  3.9816664458574564 
Rel.Err.:  0.7532147323698717 %

Turkish 
AWL:  4.078631383219001 
BE:  4.053289589591276 
Rel.Err.:  0.6252154717196207 %

Swedish 
AWL:  4.197940539352652 
BE:  4.159418764371151 
Rel.Err.:  0.9261336057689644 %

Polish 
AWL:  4.276770630279402 
BE:  4.242104940268861 
Rel.Err.:  0.8171813403640968 %

Dutch 
AWL:  4.111854388609882 
BE:  4.069551191418464 
Rel.Err.:  1.0395052230974218 %

D

## Part 1.4: Compare To Trivial Block Code

In [5]:
# Evaluate AWL for the trivial block code against each language's frequencies.
AWL_trivial_block_codes = {}
for lang in df.columns[1:]:
    # A fresh block code is requested for every language, as in the original cell.
    AWL_trivial_block_codes[lang] = AWL(df, trivial_block_code(), lang)

for language, average in AWL_trivial_block_codes.items():
    print(language, average)

English 5.000000000000001
French 5.0
German 5.0
Spanish 5.000000000000001
Portuguese 5.0
Esperanto 5.0
Italian 5.0
Turkish 5.0
Swedish 5.0
Polish 5.000000000000001
Dutch 5.0
Danish 4.999999999999999
Icelandic 5.0
Finnish 5.000000000000001
Czech 4.999999999999999


# SECTION 2: Encoding Text

## Part 2.1: Encoding Single Phrase

In [6]:
"""
Cats & Dogs:
"""

# The same idiom in four languages; each is lower-cased and stripped of spaces
# so that every remaining symbol can be looked up in that language's code table.
sentences = {'English': 'It is raining cats and dogs',
             'French':  'Il pleut des chiens et des chats',
             'Italian': 'Sta piovendo cani e gatti',
             'German':  'Es regnet Hunde und Katzen'}
phrases = {lang: sentence.lower().replace(' ','') for lang, sentence in sentences.items()}

# Encode symbol by symbol: codes[lang][0] maps a letter to its codeword.
encoded_phrases = {lang: [codes[lang][0][symbol] for symbol in text]
                   for lang, text in phrases.items()}

# Total encoded length = sum of the individual codeword lengths.
lengths_encoded_phrases = {lang: np.sum([len(word) for word in words])
                           for lang, words in encoded_phrases.items()}

for name, phrase in encoded_phrases.items():
    original_length = len(phrases[name])
    encoded_length = lengths_encoded_phrases[name]
    # Label text is reproduced exactly as in the original cell.
    print('Orginal Length: {}'.format(original_length))
    print('ratio: {}'.format(original_length/encoded_length))
    print('Total Length {}: {}\nencoded: {}\n\n'.format(name, encoded_length, phrase))

Orginal Length: 22
ratio: 0.23655913978494625
Total Length English: 93
encoded: ['1011', '000', '1011', '0111', '0101', '1110', '1011', '1010', '1011', '1010', '110011', '01001', '1110', '000', '0111', '1110', '1010', '11111', '11111', '1101', '110011', '0111']


Orginal Length: 26
ratio: 0.25
Total Length French: 104
encoded: ['1110', '0100', '00111', '0100', '110', '0111', '1011', '0010', '110', '000', '10010', '0101001', '1110', '110', '1010', '000', '110', '1011', '0010', '110', '000', '10010', '0101001', '1111', '1011', '000']


Orginal Length: 21
ratio: 0.25609756097560976
Total Length Italian: 82
encoded: ['11111', '0101', '011', '11000', '001', '000', '111010', '100', '1101', '11100', '000', '11110', '011', '1101', '001', '100', '110010', '011', '0101', '0101', '001']


Orginal Length: 22
ratio: 0.2619047619047619
Total Length German: 84
encoded: ['111', '1100', '1011', '111', '10000', '001', '111', '0111', '0001', '0000', '001', '0100', '111', '0000', '001', '0100', '010111', 

## Part 2.2: Encoding Large Text

In [7]:
# Encode each cleaned UDHR text symbol by symbol with the code built for that language.
encoded_UDHR = {}
for lang in ('English', 'French', 'Italian', 'German'):
    with open('UDHR/{}_fixed'.format(lang),'r') as f:
        text = f.read().replace('\n','')
    codewords = codes[lang][0]
    encoded_UDHR[lang] = [codewords[symbol] for symbol in text]

# Encoded length per language = sum of the codeword lengths.
lengths_encoded_UDHR = {lang: np.sum([len(word) for word in words])
                        for lang, words in encoded_UDHR.items()}

# One codeword was emitted per source symbol, so the list length is the original text length.
original_lengths = {lang: len(words) for lang, words in encoded_UDHR.items()}

awl_in_practice = {lang: lengths_encoded_UDHR[lang]/original_lengths[lang] for lang in encoded_UDHR}
awl_in_theory = {lang: code[1] for lang, code in codes.items() if lang in encoded_UDHR}

print('Encoded Lengths: ', lengths_encoded_UDHR)
print('Original Lengths: ', original_lengths)
print('AWL (In Practice): \n', awl_in_practice)
print('AWL (In Theory): \n', awl_in_theory)

Encoded Lengths:  {'English': 36156, 'French': 38309, 'Italian': 41464, 'German': 41079}
Original Lengths:  {'English': 8673, 'French': 9626, 'Italian': 10459, 'German': 10102}
AWL (In Practice): 
 {'English': 4.168799723279142, 'French': 3.9797423644296694, 'Italian': 3.9644325461325174, 'German': 4.066422490595921}
AWL (In Theory): 
 {'English': 4.205062050620507, 'French': 4.081581437446003, 'German': 4.115869327081561, 'Italian': 4.011656944121483}


## Part 2.3: Cleaning The Sample Texts

### Data-Cleaning with RegEx and Transliteration:

First, obtain a large sample of text in a plaintext format - this will be < original file >.

Next, use the following BASH script to clean the text sample, and save it to a new file, < parsed file >.

    $ sed 's/[^a-zA-Z]//g' <original file> | iconv -f utf8 -t ascii//TRANSLIT | sed 's/[A-Z]/\L&/g' > <parsed file>
    
    
* 's/[^a-zA-Z]//g' : delete every character in <original file> that isn't a 'normal' lower-/upper-case letter (i.e. replace it with the empty string '')

* iconv : tool for converting text from one encoding to another. We tell it to convert -f (from) UTF-8 encoding -t (to) ASCII encoding, using transliteration (//TRANSLIT) to approximate symbols where needed (e.g. é → e)

* 's/[A-Z]/\L&/g' : from the remaining lower-/upper-case letters, replace the upper case with lower case equivalents


\* *Note: the above commands (sed, iconv) are installed by default on Ubuntu 18.04+, and I expect they are available on almost all other Linux distributions.* 

\* *Note: For this project, the original texts are in format ./UDHR/< language >, and the cleaned ones are in format ./UDHR/< language >_fixed*

## Part 2.4: Constructing Ad-Hoc Codes

In [8]:
"""
Generating the new dataframe:
"""
# One letter-frequency frame per language, generated from the cleaned UDHR texts.
languages = ['English', 'French', 'Italian', 'German']
frames = [generate_letter_frequencies_ad_hoc('UDHR/{}_fixed'.format(lang), lang)
          for lang in languages]
df1, df2, df3, df4 = frames

# Start from the first frame and append every column after the first from each of the others.
# NOTE: this rebinds `df`, replacing the table loaded from the CSV in Section 1.
df = df1
for extra in (df2, df3, df4):
    df = df.join(extra[extra.columns[1:]])

# Rebuild the binary (d=2) codes from these ad-hoc frequencies (rebinds `codes` as well).
codes = {}
for lang in df.columns[1:]:
    codes[lang] = Huffman_encode_language(df, lang, d=2)
df.head()

Unnamed: 0,Letter,English,French,Italian,German
0,a,0.081287,0.079784,0.101922,0.058305
1,b,0.012568,0.007895,0.008605,0.012968
2,c,0.033668,0.035321,0.030691,0.031974
3,d,0.037357,0.04571,0.050961,0.057018
4,e,0.124178,0.170995,0.118271,0.174421


In [9]:
# Re-encode the cleaned UDHR texts, this time with whatever `codes` currently holds.
encoded_UDHR = {}
for lang in ('English', 'French', 'Italian', 'German'):
    with open('UDHR/{}_fixed'.format(lang),'r') as f:
        text = f.read().replace('\n','')
    codewords = codes[lang][0]
    encoded_UDHR[lang] = [codewords[symbol] for symbol in text]

# Encoded length per language = sum of the codeword lengths.
lengths_encoded_UDHR = {lang: np.sum([len(word) for word in words])
                        for lang, words in encoded_UDHR.items()}

# One codeword per source symbol, so the list length equals the original text length.
original_lengths = {lang: len(words) for lang, words in encoded_UDHR.items()}

# The same quotient is printed under both the 'ratios' and 'AWL (In Practice)' labels.
symbols_per_letter = {lang: lengths_encoded_UDHR[lang]/original_lengths[lang] for lang in encoded_UDHR}

print('Encoded Lengths: ', lengths_encoded_UDHR)
print('Original Lengths: ', original_lengths)
print('ratios: ', symbols_per_letter)
print('AWL (In Practice): \n', symbols_per_letter)
print('AWL (In Theory): \n', {lang: code[1] for lang, code in codes.items()})

Encoded Lengths:  {'English': 35955, 'French': 38046, 'Italian': 41253, 'German': 41034}
Original Lengths:  {'English': 8673, 'French': 9626, 'Italian': 10459, 'German': 10102}
ratios:  {'English': 4.145624351435489, 'French': 3.952420527737378, 'Italian': 3.944258533320585, 'German': 4.06196792714314}
AWL (In Practice): 
 {'English': 4.145624351435489, 'French': 3.952420527737378, 'Italian': 3.944258533320585, 'German': 4.06196792714314}
AWL (In Theory): 
 {'English': 4.1456243514354885, 'French': 3.952420527737378, 'Italian': 3.944258533320585, 'German': 4.06196792714314}


# SECTION 3: Extensions

## Part 3.1: d-ary Huffman Codes

In [10]:
"""
Ternary:
"""
# Rebuild `codes` with a 3-symbol alphabet (d=3), one code per language column.
codes = {}
for lang in df.columns[1:]:
    codes[lang] = Huffman_encode_language(df, lang, d=3)

# Keep only the first column ('Letter'), then add one codeword column per language.
df_ternary_codes = df.copy().drop(df.columns[1:], axis=1)
for name, code in codes.items():
    print(name, '\nAWL: ', code[1],'\n')
    codewords = code[0]
    df_ternary_codes[name+'_codes'] = [codewords[letter] for letter in df_ternary_codes.Letter]

df_ternary_codes

English 
AWL:  2.6447595987547565 

French 
AWL:  2.529815084147102 

Italian 
AWL:  2.521177932880773 

German 
AWL:  2.605622648980399 



Unnamed: 0,Letter,English_codes,French_codes,Italian_codes,German_codes
0,a,1,2,11,212
1,b,1221,20112,2100,2000
2,c,120,112,21,12
3,d,121,200,212,211
4,e,20,21,12,22
5,f,2222,1100,22021,11
6,g,2201,1102,2200,121
7,h,211,20110,22022,201
8,i,0,10,20,2
9,j,122211,2011102,2202000,200102


In [11]:
"""
Octal:
"""
# Rebuild `codes` with an 8-symbol alphabet (d=8), one code per language column.
codes = {}
for lang in df.columns[1:]:
    codes[lang] = Huffman_encode_language(df, lang, d=8)

# Keep only the first column ('Letter'), then add one codeword column per language.
df_octal_codes = df.copy().drop(df.columns[1:], axis=1)
for name, code in codes.items():
    print(name, '\nAWL: ', code[1],'\n')
    codewords = code[0]
    df_octal_codes[name+'_codes'] = [codewords[letter] for letter in df_octal_codes.Letter]

df_octal_codes


English 
AWL:  1.4645451400899343 

French 
AWL:  1.4160606690214004 

Italian 
AWL:  1.4073047136437518 

German 
AWL:  1.454068501286874 



Unnamed: 0,Letter,English_codes,French_codes,Italian_codes,German_codes
0,a,0,0,0,0
1,b,63,753,64,65
2,c,72,72,72,71
3,d,73,73,75,77
4,e,1,1,1,1
5,f,71,754,62,70
6,g,65,756,67,72
7,h,75,752,63,74
8,i,2,2,2,2
9,j,605,7506,600,606


In [12]:
"""
Hex:
"""
# Rebuild `codes` with a 16-symbol alphabet (d=16), one code per language column.
codes = {}
for lang in df.columns[1:]:
    codes[lang] = Huffman_encode_language(df, lang, d=16)

# Keep only the first column ('Letter'), then add one codeword column per language.
df_hex_codes = df.copy().drop(df.columns[1:], axis=1)
for name, code in codes.items():
    print(name, '\nAWL: ', code[1],'\n')
    codewords = code[0]
    df_hex_codes[name+'_codes'] = [codewords[letter] for letter in df_hex_codes.Letter]

df_hex_codes

English 
AWL:  1.0952380952380951 

French 
AWL:  1.0455017660502803 

Italian 
AWL:  1.0575580839468404 

German 
AWL:  1.0807760839437732 



Unnamed: 0,Letter,English_codes,French_codes,Italian_codes,German_codes
0,a,0,0,0,0
1,b,FC,FC,FD,FE
2,c,1,1,1,1
3,d,2,2,2,2
4,e,3,3,3,3
5,f,4,FD,FB,4
6,g,FE,FF,4,5
7,h,5,FB,FC,6
8,i,6,4,5,7
9,j,F7,F8,F0,F8


## Part 3.2: Source Extension For Block Codes

## Part 3.3: Source Extension for Huffman Codes

## Part 3.4: Letter Frequencies for Short Text Samples (tweets)

In [13]:
# Cleaned tweet samples, one file per news account, located under NEWS_TWEETS/.
tweets = ['English_BBC_tweets_fixed',
          'French_lemondefr_tweets_fixed',
          'German_BILD_tweets_fixed',
          'Italian_repubblica_tweets_fixed',
          'Spanish_el_pais_tweets_fixed']

# Languages actually analysed; paired with `tweets` by position.
languages = ['English',
             'French',
             'German',
             'Italian']

# zip stops at the shorter list, so the trailing Spanish file is listed but not loaded
# (same as the original index-based pairing over `languages`).
df1,df2,df3,df4 = [generate_letter_frequencies_ad_hoc('NEWS_TWEETS/{}'.format(tweet_file), lang)
                   for tweet_file,lang in zip(tweets, languages)]

# Combine into one frame: the first frame in full, plus every column after the first
# from each of the remaining frames.
df_twitter = df1.join(
     df2[df2.columns[1:]]).join(
     df3[df3.columns[1:]]).join(
     df4[df4.columns[1:]])

# Build the binary (d=2) codes from the tweet frequencies. The language list is taken
# from df_twitter itself rather than from the UDHR frame `df`, so this cell no longer
# depends on `df` happening to have the same language columns.
codes = {lang: Huffman_encode_language(df_twitter,lang,d=2) for lang in df_twitter.columns[1:]}

In [14]:
# Side-by-side table of UDHR and tweet frequencies: the overlapping language columns
# are disambiguated with the '_UDHR' / '_twitter' suffixes.
comparison = df.join(df_twitter[languages], lsuffix='_UDHR', rsuffix='_twitter')
# Keep 'Letter' first and sort the remaining columns by name so that each language's
# UDHR and twitter columns sit next to each other.
comparison = comparison.reindex(['Letter']+sorted(comparison.columns[1:]), axis=1)

# Same data starting from the tweet frame. The right-hand suffix is '_UDHR' (with the
# underscore) so the column names follow the same '<language>_<source>' pattern as
# `comparison`; the original 'UDHR' suffix produced names such as 'EnglishUDHR'.
combined = df_twitter.join(df[df.columns[1:]], lsuffix='_twitter', rsuffix='_UDHR')
comparison

Unnamed: 0,Letter,English_UDHR,English_twitter,French_UDHR,French_twitter,German_UDHR,German_twitter,Italian_UDHR,Italian_twitter
0,a,0.081287,0.063032,0.079784,0.086877,0.058305,0.072004,0.101922,0.111778
1,b,0.012568,0.063577,0.007895,0.016117,0.012968,0.032845,0.008605,0.029889
2,c,0.033668,0.044165,0.035321,0.036151,0.031974,0.037851,0.030691,0.049392
3,d,0.037357,0.031191,0.04571,0.040636,0.057018,0.052269,0.050961,0.027644
4,e,0.124178,0.115928,0.170995,0.154602,0.174421,0.118994,0.118271,0.092663
5,f,0.025827,0.018663,0.008207,0.019023,0.020986,0.015256,0.008031,0.0102
6,g,0.019025,0.008031,0.008519,0.009212,0.037616,0.026467,0.016636,0.020219
7,h,0.051424,0.037142,0.004779,0.012834,0.049891,0.0431,0.008031,0.015921
8,i,0.080595,0.059203,0.082069,0.058703,0.076124,0.078283,0.137489,0.107714
9,j,0.00173,0.002759,0.001143,0.006621,0.003564,0.006611,0.0,0.002445


In [15]:
# Per-letter absolute gap between tweet and UDHR frequencies, with 'Letter' kept as a label column.
abs_differences = df[['Letter']].join(np.abs(df_twitter[languages] - df[languages]))
# The same gap expressed relative to the tweet frequency.
# NOTE(review): a letter whose tweet frequency is 0 would make this division produce
# inf/NaN and distort the mean below — confirm no such letters occur in the samples.
rel_differences = df[['Letter']].join((np.abs(df_twitter[languages] - df[languages]))/df_twitter[languages])
# Average over the language columns only. Both frames also carry the string 'Letter'
# column, and DataFrame.mean() raises TypeError on non-numeric columns in pandas >= 2.0
# (older versions silently dropped them), so the numeric columns are selected explicitly.
print("""
s.t.d. probabilities UDHR:\n{}\n\n
s.t.d. probabilities Twitter:\n{}\n\n
mean abs differences in probabilities:\n{}\n\n
mean rel differences in probabilities:\n{}\n\n
         """.format(df[languages].std(), df_twitter[languages].std(), abs_differences[languages].mean(), rel_differences[languages].mean()))


s.t.d. probabilities UDHR:
English    0.034748
French     0.041554
German     0.039772
Italian    0.040242
dtype: float64


s.t.d. probabilities Twitter:
English    0.036080
French     0.036357
German     0.030283
Italian    0.034156
dtype: float64


mean abs differences in probabilities:
English    0.012247
French     0.006708
German     0.011201
Italian    0.008671
dtype: float64


mean rel differences in probabilities:
English    0.435794
French     0.356082
German     0.382233
Italian    0.402040
dtype: float64


         
