In [1]:
import pandas as pd

#NLP libraries
import nltk 
from nltk.corpus import udhr #corpora with texts 
import re
import spacy

# Registry of spaCy language objects, keyed by language code.
nlp = {}

# Languages for which a packaged pipeline is loaded by name.
for iso_code, package_name in [
    ("es", "es_core_news_sm"),  # Spanish
    ("ko", "ko_core_news_sm"),  # Korean
    ("fi", "fi_core_news_sm"),  # Finnish
    ("ja", "ja_core_news_sm"),  # Japanese
    ("pl", "pl_core_news_sm"),  # Polish
    ("de", "de_core_news_sm"),  # German
]:
    nlp[iso_code] = spacy.load(package_name)

# Less explored languages: no package is loaded for these; the language class
# itself is instantiated instead.
from spacy.lang.tr import Turkish
from spacy.lang.id import Indonesian
from spacy.lang.tl import Tagalog
from spacy.lang.eu import Basque
from spacy.lang.et import Estonian
from spacy.lang.kn import Kannada
from spacy.lang.yo import Yoruba
from spacy.lang.ms import Malay
from spacy.lang.ga import Irish
from spacy.lang.tn import Setswana
from spacy.lang.bg import Bulgarian

for iso_code, language_class in [
    ("tr", Turkish),
    ("id", Indonesian),
    ("tl", Tagalog),
    ("eu", Basque),
    ("et", Estonian),
    ("kn", Kannada),
    ("yo", Yoruba),
    ("ms", Malay),
    ("ga", Irish),
    ("tn", Setswana),
    ("bg", Bulgarian),
]:
    nlp[iso_code] = language_class()

# TODO: languages still to be added (kept commented out until they can be loaded).
# from spacy.lang.ch import Chamorro
# nlp["ch"] = Chamorro()
# from spacy.lang.kl import Greenlandic
# nlp["kl"] = Greenlandic()
# TODO: add Quechua as nlp["que"]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the semicolon-separated language table and show it as the cell output.
language_table_path = "../data/language_data.csv"
language_data = pd.read_csv(language_table_path, sep=";")
language_data

Unnamed: 0,Language,Family,ISO code
0,Spanish,Indo-European,es
1,Korean,Koreanic,ko
2,Finnish,Uralic,fi
3,Turkish,Turkic,tr
4,Indonesian,Austronesian,id
5,Japanese,Japonic,ja
6,Tagalog,Austronesian,tl
7,Basque,Language isolate,eu
8,Estonian,Uralic,et
9,Kannada,Dravian,kn


In [3]:
# Download the UDHR corpus through NLTK.
nltk.download('udhr')

# For every language in the table, save the raw text of the matching UDHR file
# as ../data/<ISO code>.txt.
all_file_ids = nltk.corpus.udhr.fileids()
for _, row in language_data.iterrows():
    language_pattern = row['Language']
    output_path = "../data/" + str(row["ISO code"]) + ".txt"
    for file_id in all_file_ids:
        # The language name is applied as a regular expression and may match
        # anywhere inside the file id.
        if re.search(language_pattern, file_id):
            # Several file ids can match one language; each match overwrites
            # the same output file, so the last matching id wins.
            # UTF-8 is requested explicitly so the text is not encoded with
            # the platform default, and `with` closes the file even if the
            # write raises.
            with open(output_path, "w", encoding="utf-8") as file:
                file.write(nltk.corpus.udhr.raw(file_id))

[nltk_data] Downloading package udhr to /Users/bunetz/nltk_data...
[nltk_data]   Package udhr is already up-to-date!


In [4]:
def get_dict_raw_texts(list_of_codes, data_dir="../data"):
    """Read the saved raw text of each language into a dictionary.

    Parameters
    ----------
    list_of_codes : iterable
        Language codes. Each code is converted with ``str`` and used as the
        stem of the file name ``<code>.txt``.
    data_dir : str, optional
        Directory that contains the text files. Defaults to ``"../data"``.

    Returns
    -------
    dict
        Maps every code (as given) to the full contents of its text file.

    Raises
    ------
    OSError
        If a file cannot be opened (for example ``FileNotFoundError``).
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    import os  # local import so the function does not depend on a notebook-level import

    dict_raw_texts = {}
    for code in list_of_codes:
        path = os.path.join(data_dir, str(code) + ".txt")
        # UTF-8 is specified explicitly instead of relying on the platform
        # default encoding; `with` closes the file even if reading fails.
        with open(path, "r", encoding="utf-8") as file:
            dict_raw_texts[code] = file.read()
    return dict_raw_texts

def tokenizer(text, model_lang):
    """Return the text of every word-like token in ``text``.

    ``model_lang`` is called on ``text`` and the result is iterated; tokens
    flagged as whitespace, punctuation or digits are left out.

    Parameters
    ----------
    text : str
        Text to tokenize.
    model_lang : callable
        Callable applied to ``text``; the items it yields must expose
        ``text``, ``is_space``, ``is_punct`` and ``is_digit``.

    Returns
    -------
    list
        The ``text`` attribute of each kept token, in document order.
    """
    kept = []
    for tok in model_lang(text):
        # Skip anything that is not a word-like token.
        if tok.is_space or tok.is_punct or tok.is_digit:
            continue
        kept.append(tok.text)
    return kept

def tokens(dict_raw_texts):
    """Tokenize every raw text with the model registered for its code.

    Parameters
    ----------
    dict_raw_texts : dict
        Maps a language code to its raw text. Every code must be a key of
        the module-level ``nlp`` registry.

    Returns
    -------
    dict
        Maps each code to the token list returned by ``tokenizer``.
    """
    return {
        code: tokenizer(raw_text, nlp[code])
        for code, raw_text in dict_raw_texts.items()
    }

# Read the saved text for every ISO code in the language table, then tokenize
# each text with the model registered for that code.
iso_codes = language_data['ISO code'].values
raw_texts_by_code = get_dict_raw_texts(iso_codes)
tokenized_languages = tokens(raw_texts_by_code)


In [5]:
from collections import Counter
import csv

def process(tokens):
    """Build a frequency table of the distinct tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text, in document order.

    Returns
    -------
    list of list
        One ``[token, length, frequency]`` row per distinct token, sorted by
        frequency in descending order. Tokens with equal frequency keep the
        order in which they first appear in ``tokens``, so the result is
        identical on every run.
    """
    # Counter keeps first-occurrence order and most_common() sorts stably, so
    # ties are ordered deterministically. Iterating a set of strings instead
    # would make the tie order vary between interpreter runs.
    return [
        [token, len(token), frequency]
        for token, frequency in Counter(tokens).most_common()
    ]

# One frequency matrix per language code, built from that language's tokens.
tokenized_languages_matrices = {
    code: process(language_tokens)
    for code, language_tokens in tokenized_languages.items()
}

def matrix_to_csv(matrix, filename):
    """Write a token matrix to a UTF-8 CSV file.

    Parameters
    ----------
    matrix : iterable of sequences
        Rows to write after the header; each row is written as given.
    filename : str
        Path of the CSV file. An existing file is overwritten.

    Raises
    ------
    OSError
        If the file cannot be opened for writing, e.g. because the target
        directory does not exist.
    """
    # newline='' lets the csv module control line endings; `with` guarantees
    # the file is closed even if a row fails to serialize.
    with open(filename, 'w', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(['Token', 'Length', 'Frequency'])
        writer.writerows(matrix)

# Export each language's frequency matrix to its own CSV file.
# NOTE(review): output goes to "data/" relative to the working directory,
# while the inputs are read from "../data/" — confirm this is intended.
for code, language_matrix in tokenized_languages_matrices.items():
    matrix_to_csv(language_matrix, f"data/output_{code.lower()}.csv")