In [1]:
from io import open
import numpy as np
import pandas as pd
import re
import os 
from conllu import parse_incr
from collections import Counter
from string import digits, punctuation
from csv import writer
import pycountry

The following code assumes that the UD treebanks are under the same directory, in a folder called 'ud-treebanks-v2.9'. 
The treebanks can be downloaded at https://universaldependencies.org/#download.
Here version 2.9 is used (it can be downloaded from https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4611#show-files), but a different one can be used by specifying it in the 'ud_version' variable.

In [2]:
# Get PUD languages
ud_version = 9

my_list = os.listdir( f'ud-treebanks-v2.{ud_version}')
pud_langs = [folder[3:-4] for folder in my_list if re.search(r'\bPUD\b', folder)]      # get PUD languages
pud_langs.sort()                                                                       # sort alphabetically

# iso codes
langs_dictionary = {}

for language in pud_langs:
    language_item = pycountry.languages.get(name=language)
    langs_dictionary[language] = language_item.alpha_3

langs_df = pd.DataFrame.from_dict(langs_dictionary, orient= "index")

In [3]:
# define filtering function
regex = f'[{digits}{punctuation}“。、・”«»—，）（·》《；：？％』『？」「–„–£…°€’‘¿№＝〜℃ⅲ＆１２２１６／α•／]+'

def is_valid(language,value):
    # contains digits or punctuation?
    if bool(re.findall(regex, value)):
        return False
    elif value == "":
        return False
    
    return True

In [4]:

# MAIN FUNCTION
def freq_length_df(language):
    folder = "UD_"+language+"-PUD"
    print(folder)
    [right_file] = [f for f in os.listdir(f'ud-treebanks-v2.9/{folder}') if f.endswith('.conllu')]
    data_file = open(f'ud-treebanks-v2.9/{folder}/{right_file}', "r", encoding="utf-8")

    word_counts_raw = Counter()  

    for sentence in parse_incr(data_file):
        tokens = [token["form"].lower() for token in sentence if token["upos"] != 'PUNCT']
        word_counts_raw.update(tokens)

    word_counts =  {} # will hold word count and length 

    for word, count in word_counts_raw.items():
        if is_valid(language, word): 
            word = word.replace(' ', '')        # replace all white spaces
            # annotate frequency and length
            word_counts[word] = {
                'word': word,
                'frequency': count,
                'n_characters': len(word)
            }


    # save frequency and length 
    df = pd.DataFrame.from_dict(word_counts, orient='index')
    print(df.head())
    df = df.sort_values(by=['frequency'],ascending=False)
    df.to_csv(f'./../../../data/non_filtered/corpora/pud/{langs_dictionary[language]}_pud.csv',index=False)
    return df


In [5]:
# BUILD DATAFRAMES
[freq_length_df(lang) for lang in pud_langs]

UD_Arabic-PUD
              word  frequency  n_characters
كتبت          كتبت          1             4
كوري          كوري          4             4
شولمان      شولمان          1             6
المساعدة  المساعدة          3             8
الخاصة      الخاصة         26             6
UD_Chinese-PUD
    word  frequency  n_characters
雖然    雖然          5             2
美國    美國         30             2
的      的       1362             1
許多    許多         16             2
數字化  數字化          1             3
UD_Czech-PUD
                word  frequency  n_characters
v                  v        466             1
tomto          tomto          3             5
procesu      procesu          1             7
předávání  předávání          3             9
moci            moci          5             4
UD_English-PUD
            word  frequency  n_characters
while      while         13             5
much        much         12             4
of            of        620             2
the          the       1441    

[          word  frequency  n_characters
 و            و        710             1
 في          في        652             2
 من          من        544             2
 ب            ب        536             1
 ل            ل        442             1
 ...        ...        ...           ...
 ذكية      ذكية          1             4
 أناساً  أناساً          1             6
 رأيت      رأيت          1             4
 ستايل    ستايل          1             5
 لسلام    لسلام          1             5
 
 [6600 rows x 3 columns],
    word  frequency  n_characters
 的     的       1362             1
 在     在        415             1
 了     了        380             1
 一     一        249             1
 是     是        215             1
 ..  ...        ...           ...
 警官   警官          1             2
 葉子   葉子          1             2
 多樣   多樣          1             2
 綠色   綠色          1             2
 宣言   宣言          1             2
 
 [5224 rows x 3 columns],
             word  frequency  n_characters
 