# Zipf's law of compression



In [1]:
from io import open
import numpy as np
import pandas as pd
import re
import os 
from conllu import parse_incr
from collections import Counter
from string import digits, punctuation
from csv import writer
import pycountry

# romanize languages
import pinyin                                                                               # chinese
import pykakasi                                                                             # japanese
kks = pykakasi.kakasi()
from korean_romanizer.romanizer import Romanizer                                            # korean


In [2]:
# Collect the languages that have a PUD treebank.
# Treebank folders are named "UD_<Language>-PUD"; the slice strips the
# 3-character prefix and the 4-character suffix, leaving the language name.
my_list = os.listdir('ud-treebanks-v2.9')
pud_langs = sorted(folder[3:-4] for folder in my_list if re.search(r'\bPUD\b', folder))

# Languages treated as non-Latin-script by the token filter
non_latin_langs = ['Arabic', 'Chinese', 'Hindi', 'Japanese', 'Korean', 'Russian', 'Thai']

# Map every language name to the three-letter code pycountry reports for it
langs_dictionary = {}
for language in pud_langs:
    langs_dictionary[language] = pycountry.languages.get(name=language).alpha_3

# Persist the name -> code table (index = language name, one unnamed column)
langs_df = pd.DataFrame.from_dict(langs_dictionary, orient="index")
langs_df.to_csv("utils/PUD_languages.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'ud-treebanks-v2.9'

In [4]:
# define filtering function
# Characters that disqualify a token: ASCII digits and punctuation plus a set of
# non-ASCII punctuation marks and symbols. The ASCII part goes through re.escape
# so that '\', ']', '^' and '-' are literal members of the character class;
# interpolated raw, the backslash escaped the following ']' and was itself
# never matched.
regex = f'[{re.escape(digits + punctuation)}“。、・”«»—，）（·》《；：？％』『？」「–„–£…°€’‘¿№＝〜℃ⅲ＆１２２１６／α•／]+'
non_lat_regex = '[a-zA-Z]+'

# Compiled once, because is_valid runs for every distinct token of every treebank.
_invalid_chars_re = re.compile(regex)
_latin_letters_re = re.compile(non_lat_regex)


def is_valid(language, value):
    """Tell whether `value` should be kept as a word of `language`.

    Args:
        language: Language name, compared against ``non_latin_langs``.
        value: Token text (word form or lemma).

    Returns:
        False when `value` is empty, contains any digit/punctuation/symbol
        from ``regex``, or contains ASCII letters although `language` is
        listed in ``non_latin_langs``; True otherwise.
    """
    # Every failing check returns False, so their order does not affect the
    # result; the cheapest tests simply come first.
    if not value:
        return False
    # contains digits or punctuation?
    if _invalid_chars_re.search(value):
        return False
    # does it contain latin letters while being non latin language?
    if language in non_latin_langs and _latin_letters_re.search(value):
        return False
    return True

In [5]:
# define function to romanize languages (!! not currently used)
def romanize(language, word):
    """Return a Latin-script rendering of `word` for Chinese, Japanese or Korean.

    Args:
        language: Language name; only "Chinese", "Japanese" and "Korean"
            trigger a conversion.
        word: The word to convert.

    Returns:
        The romanized string for the three languages above (the Chinese
        output separates syllables with spaces), or `word` unchanged for
        any other language.
    """
    if language == "Chinese":
        return pinyin.get(word, format="strip", delimiter=" ")
    if language == "Japanese":
        # kks.convert yields one entry per text segment; join all of them so a
        # word split into several segments is not cut off after the first one
        # (and an empty word gives "" instead of an IndexError).
        return "".join(segment['hepburn'] for segment in kks.convert(word))
    if language == "Korean":
        return Romanizer(word).romanize()
    # Every other language is passed through untouched.
    return word

In [5]:

# # MAIN FUNCTION
# def freq_length_df(language):
#     folder = "UD_"+language+"-PUD"
#     print(folder)
#     [right_file] = [f for f in os.listdir(f'ud-treebanks-v2.9/{folder}') if f.endswith('.conllu')]
#     data_file = open(f'ud-treebanks-v2.9/{folder}/{right_file}', "r", encoding="utf-8")

#     word_counts_raw = Counter()  

#     for sentence in parse_incr(data_file):
#         tokens = [token["form"].lower() for token in sentence]
#         word_counts_raw.update(tokens)

#     word_counts =  {} # will hold word count and length 

#     for word, count in word_counts_raw.items():
#         if is_valid(language, word): 
#             romanized_word = romanize(language, word)     # romanize
#             romanized_word = romanized_word.replace(' ', '')        # replace all white spaces
#             # annotate frequency and length
#             word_counts[romanized_word] = {
#                 'word': word,
#                 'frequency': count,
#                 'length': len(romanized_word)
#             }


#     # save frequency and length 
#     df = pd.DataFrame.from_dict(word_counts, orient='index')
#     print(df.head())
#     df = df.sort_values(by=['frequency'],ascending=False)
#     df.to_csv(f'data/{langs_dictionary[language]}.csv')
#     #df.to_csv(f'data/{langs_dictionary[language]}_pud.txt', index=None, sep=',', mode='a')
#     return df


# MAIN FUNCTION
def freq_length_df(language):
    """Build, save and return the word frequency/length table of one PUD treebank.

    Reads the single ``.conllu`` file in ``ud-treebanks-v2.9/UD_<language>-PUD``,
    counts lower-cased token forms, drops the forms rejected by ``is_valid``,
    and writes the result to ``data/<code>.csv`` where ``<code>`` is looked up
    in ``langs_dictionary``.

    Args:
        language: Language name as used in the treebank folder name.

    Returns:
        A DataFrame indexed by word with columns ``word``, ``frequency`` and
        ``length``, sorted by descending frequency.

    Raises:
        ValueError: If the folder does not contain exactly one ``.conllu`` file
            (raised by the single-element unpacking).
    """
    folder = "UD_"+language+"-PUD"
    print(folder)
    treebank_dir = f'ud-treebanks-v2.9/{folder}'
    [right_file] = [f for f in os.listdir(treebank_dir) if f.endswith('.conllu')]

    word_counts_raw = Counter()

    # parse_incr reads lazily, so the file must stay open for the whole loop;
    # the with-block guarantees it is closed afterwards.
    with open(f'{treebank_dir}/{right_file}', "r", encoding="utf-8") as data_file:
        for sentence in parse_incr(data_file):
            word_counts_raw.update(token["form"].lower() for token in sentence)

    # Strip inner spaces from the accepted forms. Two forms can become equal
    # after stripping; their counts are summed rather than overwritten.
    merged_counts = Counter()
    for word, count in word_counts_raw.items():
        if is_valid(language, word):
            merged_counts[word.replace(' ', '')] += count

    # annotate frequency and length
    word_counts = {
        word: {'word': word, 'frequency': count, 'length': len(word)}
        for word, count in merged_counts.items()
    }

    # save frequency and length
    df = pd.DataFrame.from_dict(word_counts, orient='index')
    print(df.head())
    df = df.sort_values(by=['frequency'], ascending=False)
    df.to_csv(f'data/{langs_dictionary[language]}.csv')
    #df.to_csv(f'data/{langs_dictionary[language]}_pud.txt', index=None, sep=',', mode='a')
    return df

In [6]:
# BUILD DATAFRAMES
# Run the pipeline once per PUD language; the list of returned frames is the
# value of this cell.
list(map(freq_length_df, pud_langs))

UD_Arabic-PUD
              word  frequency  length
كتبت          كتبت          1       4
كوري          كوري          4       4
شولمان      شولمان          1       6
المساعدة  المساعدة          3       8
الخاصة      الخاصة         26       6
UD_Chinese-PUD
    word  frequency  length
雖然    雖然          5       2
美國    美國         30       2
的      的       1362       1
許多    許多         16       2
數字化  數字化          1       3
UD_Czech-PUD
                word  frequency  length
v                  v        466       1
tomto          tomto          3       5
procesu      procesu          1       7
předávání  předávání          3       9
moci            moci          5       4
UD_English-PUD
            word  frequency  length
while      while         13       5
much        much         12       4
of            of        620       2
the          the       1441       3
digital  digital          3       7
UD_Finnish-PUD
                        word  frequency  length
vaikka                vaikka

[          word  frequency  length
 و            و        710       1
 في          في        652       2
 من          من        544       2
 ب            ب        536       1
 ل            ل        442       1
 ...        ...        ...     ...
 ذكية      ذكية          1       4
 أناساً  أناساً          1       6
 رأيت      رأيت          1       4
 ستايل    ستايل          1       5
 لسلام    لسلام          1       5
 
 [6602 rows x 3 columns],
    word  frequency  length
 的     的       1362       1
 在     在        415       1
 了     了        380       1
 一     一        249       1
 是     是        215       1
 ..  ...        ...     ...
 談     談          1       1
 死者   死者          1       2
 橋樑   橋樑          1       2
 界限   界限          1       2
 宣言   宣言          1       2
 
 [4974 rows x 3 columns],
             word  frequency  length
 v              v        466       1
 a              a        466       1
 se            se        320       2
 na            na        244       2
 je

In [154]:
# save alphabet size

def countDis(str):
    """Return the number of distinct elements in `str` (distinct characters for a string)."""
    # A Counter holds one key per distinct element, so its length is the answer.
    return len(Counter(str))

# Number of distinct characters used by each language's word list.
# NOTE(review): freq_length_df writes data/<iso code>.csv with columns
# word/frequency/length, while this cell reads data/<language name>.csv and a
# 'roman_word' column — confirm which files this cell is meant to consume.
alphabet_dict = {}
for lang in pud_langs:
    df = pd.read_csv(f'data/{lang}.csv')
    # Join the whole column into one string. The result is bound to a name
    # other than `str`: assigning to `str` at cell level replaced the builtin
    # for every later cell of the session.
    all_chars = df['roman_word'].str.cat()
    alphabet_dict[lang] = countDis(all_chars)

# One row per language, single column "ab_size"
df = pd.DataFrame.from_dict(alphabet_dict, orient='index', columns=["ab_size"])
df.to_csv("utils/ab_sizes.csv")


In [155]:
# count empty lemma instances and check for which languages more than 30% is missing or less than 2000 remain.

empty_lemma_dict = {}

for language in pud_langs:
    folder = "UD_"+language+"-PUD"
    treebank_dir = f'ud-treebanks-v2.9/{folder}'
    [right_file] = [f for f in os.listdir(treebank_dir) if f.endswith('.conllu')]

    lemma_counter = Counter()

    # Keep the file open only while parse_incr is reading from it, so handles
    # are not leaked across languages.
    with open(f'{treebank_dir}/{right_file}', "r", encoding="utf-8") as data_file:
        for sentence in parse_incr(data_file):
            # "_" is rejected by is_valid (it is punctuation) but must still be
            # counted here, since it marks a missing lemma.
            lemma_counter.update(
                token['lemma'] for token in sentence
                if is_valid(language, token['lemma']) or token['lemma'] == "_"
            )

    # Share of lemma occurrences that are "_"; 0.0 when nothing was counted,
    # in which case the language is dropped below by the tot_lemmas threshold.
    total_occurrences = sum(lemma_counter.values())
    share_empty = lemma_counter['_'] / total_occurrences if total_occurrences else 0.0
    empty_lemma_dict[language] = {
        "_": round(share_empty, 3),
        "tot_lemmas": len(lemma_counter)   # number of distinct lemmas
    }


# Languages with at least 30% missing lemmas or at most 2000 distinct lemmas
# are moved out of empty_lemma_dict into removed_lang_list.
removed_lang_list = [
    lang_name for lang_name, stats in empty_lemma_dict.items()
    if stats["_"] >= 0.3 or stats["tot_lemmas"] <= 2000
]
for lang_name in removed_lang_list:
    empty_lemma_dict.pop(lang_name)

print(empty_lemma_dict)
print(removed_lang_list)

{'Czech': {'_': 0.004, 'tot_lemmas': 5090}, 'English': {'_': 0.0, 'tot_lemmas': 4439}, 'Finnish': {'_': 0.001, 'tot_lemmas': 3763}, 'French': {'_': 0.027, 'tot_lemmas': 4302}, 'German': {'_': 0.018, 'tot_lemmas': 5123}, 'Icelandic': {'_': 0.0, 'tot_lemmas': 4468}, 'Indonesian': {'_': 0.024, 'tot_lemmas': 3464}, 'Italian': {'_': 0.069, 'tot_lemmas': 4542}, 'Japanese': {'_': 0.0, 'tot_lemmas': 4636}, 'Polish': {'_': 0.003, 'tot_lemmas': 4789}, 'Portuguese': {'_': 0.269, 'tot_lemmas': 3766}, 'Russian': {'_': 0.0, 'tot_lemmas': 4644}, 'Swedish': {'_': 0.0, 'tot_lemmas': 4708}, 'Turkish': {'_': 0.054, 'tot_lemmas': 4340}}
['Arabic', 'Chinese', 'Hindi', 'Korean', 'Spanish', 'Thai']
