# Zipf's law of compression



In [1]:
from io import open
import numpy as np
import pandas as pd
import re
import os 
from conllu import parse_incr
from collections import Counter
from string import digits, punctuation
from csv import writer
import pycountry

In [7]:
# Collect the languages that ship a PUD treebank. A folder qualifies when its
# name contains the standalone token "PUD"; the [3:-4] slice drops the first
# three and last four characters (the "UD_" prefix and "-PUD" suffix).
my_list = os.listdir('ud-treebanks-v2.9')
pud_langs = sorted(
    folder[3:-4]
    for folder in my_list
    if re.search(r'\bPUD\b', folder)
)

# Languages in which words containing Latin letters are to be discarded
non_latin_langs = [
    'Arabic',
    'Chinese',
    'Hindi',
    'Japanese',
    'Korean',
    'Russian',
    'Thai',
]

# Language name -> three-letter code (pycountry's ``alpha_3`` attribute)
langs_dictionary = {}
for language in pud_langs:
    language_item = pycountry.languages.get(name=language)
    langs_dictionary[language] = language_item.alpha_3

# Same mapping as a one-column DataFrame indexed by language name
langs_df = pd.DataFrame.from_dict(langs_dictionary, orient="index")

In [8]:
# define filtering function
# `regex` matches any run of ASCII digits, ASCII punctuation or one of the
# extra symbols listed after them. The punctuation is passed through
# re.escape so that characters that are special inside a character class
# (`]`, `\`, `^`, `-`) are matched literally instead of being interpreted.
regex = f'[{digits}{re.escape(punctuation)}“。、・”«»—，）（·》《；：？％』『？」「–„–£…°€’‘¿№＝〜℃ⅲ＆１２２１６／α•／]+'
# Despite its name, this pattern matches ASCII Latin letters; it is applied to
# words of the languages listed in `non_latin_langs`.
non_lat_regex = r'[a-zA-Z]+'


def is_valid(language, value):
    """Tell whether a word form should be kept.

    Args:
        language: Language name; compared against the module-level
            ``non_latin_langs`` list.
        value: The word form to check.

    Returns:
        False when ``value`` is empty, when it contains a digit, punctuation
        mark or other symbol matched by ``regex``, or when ``language`` is in
        ``non_latin_langs`` and ``value`` contains an ASCII Latin letter.
        True otherwise.
    """
    # empty form?
    if not value:
        return False
    # contains digits or punctuation?
    if re.search(regex, value):
        return False
    # does it contain latin letters while being non latin language?
    if language in non_latin_langs and re.search(non_lat_regex, value):
        return False
    return True

In [15]:

# MAIN FUNCTION
def freq_length_df(language):
    """Build, save and return the word frequency/length table of one treebank.

    Reads the single ``.conllu`` file in ``ud-treebanks-v2.9/UD_<language>-PUD``,
    counts the lower-cased forms of all tokens whose ``upos`` is not ``PUNCT``,
    keeps the forms accepted by ``is_valid`` and removes spaces from them.

    Args:
        language: Language name as used in the treebank folder name; it must
            also be a key of the module-level ``langs_dictionary``.

    Returns:
        A DataFrame indexed by word with columns ``word``, ``frequency`` and
        ``length``, sorted by descending frequency. The same table is written
        to ``../data/pud/<code>_pud.csv``.

    Raises:
        ValueError: If the treebank folder does not contain exactly one
            ``.conllu`` file.
    """
    folder = "UD_" + language + "-PUD"
    print(folder)
    treebank_dir = f'ud-treebanks-v2.9/{folder}'
    # The single-element unpacking enforces "exactly one .conllu file".
    [right_file] = [f for f in os.listdir(treebank_dir) if f.endswith('.conllu')]

    word_counts_raw = Counter()
    # The context manager closes the treebank file even if parsing fails.
    with open(f'{treebank_dir}/{right_file}', "r", encoding="utf-8") as data_file:
        for sentence in parse_incr(data_file):
            word_counts_raw.update(
                token["form"].lower()
                for token in sentence
                if token["upos"] != 'PUNCT'
            )

    # Spaces are removed only after counting, so distinct raw forms may end up
    # as the same word; their counts are summed rather than overwritten.
    frequencies = {}
    for raw_word, count in word_counts_raw.items():
        if not is_valid(language, raw_word):
            continue
        word = raw_word.replace(' ', '')        # replace all white spaces
        if not word:
            # a form made only of spaces would give a zero-length "word"
            continue
        frequencies[word] = frequencies.get(word, 0) + count

    # annotate frequency and length
    word_counts = {
        word: {'word': word, 'frequency': count, 'length': len(word)}
        for word, count in frequencies.items()
    }

    # save frequency and length
    df = pd.DataFrame.from_dict(word_counts, orient='index')
    print(df.head())
    df = df.sort_values(by=['frequency'], ascending=False)
    df.to_csv(f'../data/pud/{langs_dictionary[language]}_pud.csv')
    return df

In [None]:
# BUILD DATAFRAMES
# Each call is made for its side effect (writing the per-language CSV), so a
# plain loop is used and the returned DataFrames are not accumulated in a list.
for lang in pud_langs:
    freq_length_df(lang)