In [14]:
from io import open
import numpy as np
import pandas as pd
import re
import os 
from conllu import parse_incr
from collections import Counter
from string import digits, punctuation
from csv import writer
import pycountry

The following code assumes that the UD treebanks are located in the same directory as this notebook, in a folder called 'ud-treebanks-v2.9'. 
The treebanks can be downloaded from https://universaldependencies.org/#download.
Version 2.9 is used here (it can be downloaded from https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4611#show-files); to use a different 2.x release, set the 'ud_version' variable to its minor version number (e.g. 9 for v2.9).

In [30]:
# Get PUD languages
ud_version = 9  # minor release number: the N in 'ud-treebanks-v2.N'

# Listing the release folder raises FileNotFoundError right away if the
# treebanks have not been downloaded next to the notebook.
my_list = os.listdir( f'ud-treebanks-v2.{ud_version}')

# Languages to process (each must match a 'UD_<name>-PUD' treebank folder).
pud_langs=['Spanish']

# iso codes: language name -> alpha_3 code reported by pycountry.
# Built from pud_langs so that every listed language gets an entry, instead of
# hard-coding a single name that has to be kept in sync by hand.
langs_dictionary = {}
for language_name in pud_langs:
    language_item = pycountry.languages.get(name=language_name)
    if language_item is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise LookupError(f'No pycountry language entry found for {language_name!r}')
    langs_dictionary[language_name] = language_item.alpha_3


langs_df = pd.DataFrame.from_dict(langs_dictionary, orient= "index")

In [31]:
# define filtering function
# Extra characters (typographic quotes, dashes, CJK / full-width punctuation,
# currency signs, ...) that disqualify a word, in addition to ASCII digits and
# ASCII punctuation.
_EXTRA_SYMBOLS = '“。、・”«»—，）（·》《；：？％』『？」「–„–£…°€’‘¿№＝〜℃ⅲ＆１２２１６／α•／'

# re.escape() is required: string.punctuation contains a backslash directly
# followed by ']'. Interpolated raw into a character class, that pair is read
# as an escaped ']' and the backslash itself silently drops out of the set.
regex = f'[{digits}{re.escape(punctuation)}{_EXTRA_SYMBOLS}]+'
_INVALID_CHARS = re.compile(regex)

def is_valid(language,value):
    """Return True if ``value`` is a non-empty string free of digits/punctuation.

    Parameters
    ----------
    language : str
        Currently unused; kept so existing callers keep working.
    value : str
        The word form to check.

    Returns
    -------
    bool
        False when ``value`` is empty or contains at least one character from
        the ``regex`` character class, True otherwise.
    """
    # The pattern needs at least one character, so the empty string has to be
    # rejected explicitly.
    if value == "":
        return False

    # search() stops at the first offending character; no need to collect
    # every match just to test for existence.
    return _INVALID_CHARS.search(value) is None

In [62]:

# MAIN FUNCTION
def freq_length_df(language, treebank_root=None):
    """Build and save the word frequency / word length table of a PUD treebank.

    Reads the single ``.conllu`` file found in
    ``<treebank_root>/UD_<language>-PUD``, counts lower-cased word forms
    (tokens tagged ``PUNCT`` and the form ``'del'`` are skipped), drops forms
    rejected by ``is_valid`` and writes the table to
    ``./../../../data/non_filtered/corpora/pud/<code>_pud.csv`` where
    ``<code>`` is ``langs_dictionary[language]``.

    Parameters
    ----------
    language : str
        Language name as it appears in the treebank folder name; must also be
        a key of ``langs_dictionary``.
    treebank_root : str, optional
        Folder that contains the UD treebank folders. Defaults to
        ``ud-treebanks-v2.<ud_version>`` so the notebook-level ``ud_version``
        setting is honoured.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with columns ``word``, ``frequency`` and
        ``n_characters``, sorted by descending frequency.

    Raises
    ------
    ValueError
        If the treebank folder does not contain exactly one ``.conllu`` file.
    KeyError
        If ``language`` is not a key of ``langs_dictionary``.
    """
    if treebank_root is None:
        treebank_root = f'ud-treebanks-v2.{ud_version}'

    folder = "UD_"+language+"-PUD"
    print(folder)
    treebank_dir = os.path.join(treebank_root, folder)
    # Unpacking into a one-element list enforces "exactly one .conllu file".
    [right_file] = [f for f in os.listdir(treebank_dir) if f.endswith('.conllu')]

    word_counts_raw = Counter()

    # Context manager so the treebank file is closed even if parsing fails.
    with open(os.path.join(treebank_dir, right_file), "r", encoding="utf-8") as data_file:
        for sentence in parse_incr(data_file):
            for token in sentence:
                if token["upos"] == 'PUNCT':
                    continue
                form = token["form"].lower()
                # NOTE(review): 'del' is skipped for every language, presumably
                # because the Spanish contraction is also listed as its parts
                # 'de' + 'el' -- confirm, and check whether 'al' needs the same.
                if form != 'del':
                    word_counts_raw[form] += 1

    # Frequency per cleaned word. Forms that become identical once inner
    # white space is removed are summed instead of overwriting each other.
    word_counts = Counter()
    for word, count in word_counts_raw.items():
        if is_valid(language, word):
            word_counts[word.replace(' ', '')] += count

    # annotate frequency and length; explicit columns keep the frame well-formed
    # (and sortable) even when no word survives the filter
    words = list(word_counts)
    df = pd.DataFrame(
        {
            'word': words,
            'frequency': [word_counts[w] for w in words],
            'n_characters': [len(w) for w in words],
        },
        index=words,
    )
    print(df.head())

    # save frequency and length
    df = df.sort_values(by=['frequency'],ascending=False)
    df.to_csv(f'./../../../data/non_filtered/corpora/pud/{langs_dictionary[language]}_pud.csv',index=False)
    return df


In [63]:
# BUILD DATAFRAMES
# Run the pipeline once per PUD language; the list of resulting frames is the
# value of the cell.
list(map(freq_length_df, pud_langs))

UD_Spanish-PUD
                    word  frequency  n_characters
aunque            aunque          8             6
no                    no        124             2
haya                haya          2             4
precedentes  precedentes          1            11
para                para        144             4


[                  word  frequency  n_characters
 de                  de       1730             2
 el                  el       1102             2
 la                  la        795             2
 en                  en        691             2
 a                    a        551             1
 ...                ...        ...           ...
 krasnoyark  krasnoyark          1            10
 maldivas      maldivas          1             8
 índico          índico          1             6
 granito        granito          1             7
 declaraba    declaraba          1             9
 
 [5764 rows x 3 columns]]