# Data Processing/Cleaning

In [5]:
import glob

raw_dir = '../raw/dictionaries/'
out_dir = '../webapp/data/languages/'

In [6]:
### English
# load english words from index.dic.txt in raw/en/

en_5words = []
with open(raw_dir + 'en/index.dic', 'r') as f:
    for line in f:
        word = line.split('/')[0]
        # if word is capitalized, skip
        if word[0].isupper() or not word.isalpha():
            continue
        if len(word) == 5 and word not in en_5words:
            en_5words.append(word)


import random
random.seed(42)
random.shuffle(en_5words)

print(f"English words: {len(en_5words)}")

en_5words[:5]

English words: 2864


['fluky', 'mourn', 'telly', 'probe', 'burly']

In [7]:
def dict_to_5words(lang):
    """
    filters a word dictionary to 5-letter words and saves them to a file
    """
    words = []
    with open(raw_dir + lang + '/index.dic', 'r') as f:
        for line in f:
            word = line.split('/')[0]
            # if word is capitalized, skip
            if word[0].isupper() or not word.isalpha():
                continue
            if len(word) == 5 and word not in words:
                words.append(word)

    random.seed(42)
    random.shuffle(words)

    with open(out_dir + lang + '/' + lang + '_5words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

In [15]:
# for every folder in raw/ (except en), create a folder in webapp/data/

import os
import glob
import random

# problematic: tlh, vi, sr, ne, el, rw  

def dict_to_5words(lang):
    """
    filters a word dictionary to 5-letter words and saves them to a file
    """
    words = []
    with open(raw_dir + lang + '/index.dic', 'r') as f:
        i = 0
        for line in f:
            i += 1
            if line[0] == '/':
                continue
            try:
                word = line.split('/')[0]
                word = word.strip().lower()
                forbidden_charset = "0123456789/-"
                if any(c in forbidden_charset for c in word):
                    continue
                if len(word) == 5 and word not in words:
                    words.append(word.lower())
            except Exception as e:
                print(f"Error at line {i}: {e} in {lang}")
                pass
            
    # if '_characters.txt' file exists in out_dir, load it
    try:
        with open(out_dir + lang + '/' + lang + '_characters.txt', 'r') as f:
            characters = [line.strip() for line in f]
    except FileNotFoundError:
        characters = []
    
    # filter words by characters
    words = [word for word in words if all([char in characters for char in word])]

    random.seed(42)
    random.shuffle(words)

    with open(out_dir + lang + '/' + lang + '_5words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

    # make list of all appearing characters
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # order characters
    characters.sort()
    # write characters to file if not already there
    if not os.path.exists(out_dir + lang + '/' + lang + '_characters.txt'):
        with open(out_dir + lang + '/' + lang + '_characters.txt', 'w') as f:
            for char in characters:
                f.write(char + '\n')
    
    print(f"{lang} words: {len(words)} characters: {len(characters)}")  

    return words

In [16]:
folders = [folder for folder in glob.glob(raw_dir + '*') if "-" not in folder]

for folder in folders:
    lang = folder.split('/')[-1]
    if lang == 'en':  # use official word list for english
        continue
    if not glob.glob(out_dir + lang):
        os.mkdir(out_dir + lang)
    words = dict_to_5words( lang)

ie words: 2146 characters: 33
ko words: 8921 characters: 65
ne words: 6561 characters: 63
sl words: 12091 characters: 33
nl words: 8075 characters: 52
lb words: 1798 characters: 40
hu words: 6998 characters: 46
fr words: 4944 characters: 61
oc words: 4203 characters: 38
ia words: 2506 characters: 36
mk words: 6004 characters: 34
ga words: 5778 characters: 36
tr words: 9276 characters: 32
is words: 8305 characters: 35
it words: 2790 characters: 33
ru words: 4687 characters: 32
pt words: 9480 characters: 45
es words: 3601 characters: 33
lv words: 2783 characters: 34
br words: 7252 characters: 38
ca words: 9173 characters: 74
hr words: 3591 characters: 28
et words: 9583 characters: 31
ltg words: 387 characters: 34
sr words: 17967 characters: 30
pl words: 10389 characters: 35
he words: 64771 characters: 29
vi words: 738 characters: 88
fa words: 11393 characters: 42
eu words: 7575 characters: 41
uk words: 9707 characters: 34
ro words: 8629 characters: 32
fur words: 3576 characters: 37
fy wo

In [None]:
words = dict_to_5words('de')


In [5]:
# Custom arabic word source
import pandas as pd
df = pd.read_excel(raw_dir + 'ar/Top-50000-Arabic-Words-Masterlist_ModernStandardArabic.com_.xlsx')

Unnamed: 0,Word Number,Frequency,Arabic,English,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,1,2285403,لا,No,,,,,
1,2,2229495,من,Of,,,,,
2,3,1789391,في,In,,,,,
3,4,1761748,أن,That,,,,,
4,5,1624794,هذا,This is,,,,,
...,...,...,...,...,...,...,...,...,...
4995,4996,2304,احتفظ,Keep a,,,,,
4996,4997,2303,الصخور,Rock,,,,,
4997,4998,2303,أخبروني,They told me,,,,,
4998,4999,2303,اخرجوا,Get out,,,,,


In [40]:
raw_words = df[["Arabic"]].values.tolist()
words = []
for word in raw_words:
    word = word[0]
    # ـ
    forbidden_charset = 'abcdefghijklmnopqrstuvwxyz -0123456789'
    try:
        if any (char.isupper() for char in word) or any (char in forbidden_charset for char in word) or any (char.isdigit() for char in word):
            continue
        if len(word) == 5 and word not in words:
            words.append(word.lower())
    except Exception as e:
        print(f"{e}")
        pass

# make list of all appearing characters
characters = []
for word in words:
    for char in word:
        if char not in characters:
            characters.append(char)

print(f"Arabic 5 words: {len(words)}")
print(f"Arabic characters: {len(characters)}")
print(" ".join(characters))

'bool' object is not iterable
Arabic 5 words: 14157
Arabic characters: 37
ع ن د م ا ل ذ أ ر ت ق و ي ك ج س ح ة ب ه ش ء ؤ خ ص ظ ث آ ئ ض ط ى غ ز ف إ ـ


In [59]:
# Let's do a more reusable version of all this
import random

def process_wordlist(wordlist, language_code, desired_word_length=5, forbidden_charset=None, acceptable_charset=None):
    """Takes in a wordlist and processes it for wordle consumption"""
    words = []
    for word in wordlist:
        word = word.strip().lower()
        if forbidden_charset:
            if any (char in forbidden_charset for char in word):
                continue
        if acceptable_charset:
            if not any (char in acceptable_charset for char in word):
                continue
        if len(word) == desired_word_length and word not in words:
            words.append(word)
    
    # mix up the words in case they were in a certain order
    random.seed(42)
    random.shuffle(words)
    
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # write to file
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_' + str(desired_word_length) + 'words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')
    
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_characters.txt', 'w') as f:
        for char in characters:
            f.write(char + '\n')
    
    print(f"{language_code} words: {len(words)} characters: {len(characters)}")
    print(f"characterset for {language_code}: {' '.join(characters)}")
    return words, characters

In [None]:


process_wordlist(words, 'tlh',

In [60]:
words, characters = process_wordlist(words, "ar", 5, forbidden_charset="abcdefghijklmnopqrstuvwxyz -0123456789 ـ")

ar words: 13882 characters: 36
characterset for ar: و ا ل س ق ب ه م ح أ ج ت ض ي ر ف ة ذ ع ك ن ز ش ص إ د ئ خ ط ظ ء ث غ آ ى ؤ
