# Data Processing/Cleaning

In [2]:
import glob
import datetime

raw_dir = '../raw/'
out_dir = '../webapp/data/'


def get_todays_idx():
    n_days = (datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).days
    idx = n_days - 18992 + 195
    return idx

get_todays_idx()

248

In [3]:
### English
# load english words from index.dic.txt in raw/en/

en_5words = []
with open(raw_dir + 'en/index.dic.txt', 'r') as f:
    for line in f:
        word = line.split('/')[0]
        # if word is capitalized, skip
        if word[0].isupper() or not word.isalpha():
            continue
        if len(word) == 5 and word not in en_5words:
            en_5words.append(word)


import random
random.seed(42)
random.shuffle(en_5words)

print(f"English words: {len(en_5words)}")

en_5words

English words: 2864


['fluky',
 'mourn',
 'telly',
 'probe',
 'burly',
 'glued',
 'flier',
 'usage',
 'rugby',
 'singe',
 'muggy',
 'hatch',
 'spook',
 'cutie',
 'pappy',
 'melon',
 'faint',
 'twang',
 'atlas',
 'choir',
 'klutz',
 'outgo',
 'tizzy',
 'diver',
 'grebe',
 'sieve',
 'huffy',
 'tarty',
 'hogan',
 'scull',
 'fleck',
 'crust',
 'groat',
 'chant',
 'belay',
 'extra',
 'women',
 'medic',
 'ghost',
 'mezzo',
 'baldy',
 'roach',
 'shirr',
 'beach',
 'shoot',
 'tress',
 'ethyl',
 'spend',
 'nylon',
 'timid',
 'viand',
 'frack',
 'haiku',
 'lurch',
 'bijou',
 'plank',
 'smith',
 'nacre',
 'amigo',
 'idler',
 'macho',
 'going',
 'tramp',
 'cutup',
 'grain',
 'unfit',
 'antic',
 'queer',
 'nippy',
 'senor',
 'score',
 'prism',
 'grope',
 'dotty',
 'spume',
 'anise',
 'ceder',
 'fruit',
 'copay',
 'smell',
 'horde',
 'sinus',
 'nadir',
 'pinon',
 'miler',
 'geode',
 'above',
 'spicy',
 'hammy',
 'shade',
 'laird',
 'satyr',
 'girth',
 'titch',
 'tweed',
 'storm',
 'shrub',
 'forge',
 'virus',
 'motto',


In [5]:
def dict_to_5words(lang):
    """
    filters a word dictionary to 5-letter words and saves them to a file
    """
    words = []
    with open(raw_dir + lang + '/index.dic.ytxt', 'r') as f:
        for line in f:
            word = line.split('/')[0]
            # if word is capitalized, skip
            if word[0].isupper() or not word.isalpha():
                continue
            if len(word) == 5 and word not in words:
                words.append(word)

    random.seed(42)
    random.shuffle(words)

    with open(out_dir + lang + '/' + lang + '_5words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

len(dict_to_5words('en'))

FileNotFoundError: [Errno 2] No such file or directory: '../webapp/data/en/en_5words.txt'

In [65]:
# for every folder in raw/ (except en), create a folder in webapp/data/

import os
import glob
import random

def dict_to_5words(lang):
    """
    filters a word dictionary to 5-letter words and saves them to a file
    """
    words = []
    with open(raw_dir+ 'dictionaries/' + lang + '/index.dic', 'r') as f:
        i = 0
        for line in f:
            i += 1
            if line[0] == '/':
                continue
            try:
                word = line.split('/')[0]
                # if word is capitalized, skip
                if word[0].isupper() or not word.isalpha():
                    continue
                if len(word) == 5 and word not in words:
                    words.append(word.lower())
            except Exception as e:
                print(f"Error at line {i}: {e} in {lang}")
                pass
            

    random.seed(42)
    random.shuffle(words)

    with open(out_dir + lang + '/' + lang + '_5words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')

    # make list of all appearing characters
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # write characters to file
    with open(out_dir + lang + '/' + lang + '_characters.txt', 'w') as f:
        for char in characters:
            f.write(char + '\n')
    
    print(f"{lang} words: {len(words)} characters: {len(characters)}")  

    return words

folders = [folder for folder in glob.glob(raw_dir + 'dictionaries/*') if "-" not in folder]

for folder in folders:
    lang = folder.split('/')[-1]
    if lang == 'en':  # use official word list for english
        continue
    if not glob.glob(out_dir + "languages/" + lang):
        os.mkdir(out_dir + "languages/" + lang)
    words = dict_to_5words( "languages/" + lang)

ie words: 1668 characters: 31
ko words: 8887 characters: 65
ne words: 18 characters: 25
sl words: 6813 characters: 27
nl words: 2956 characters: 37
lb words: 311 characters: 31
hu words: 4604 characters: 35
fr words: 3373 characters: 46
oc words: 3977 characters: 37
ia words: 2261 characters: 34
mk words: 4126 characters: 31
ga words: 2843 characters: 25
tr words: 6652 characters: 30
is words: 632 characters: 33
it words: 1385 characters: 28
ru words: 3821 characters: 32
pt words: 7089 characters: 41
es words: 3029 characters: 32
lv words: 2305 characters: 33
br words: 2253 characters: 30
ca words: 6436 characters: 37
hr words: 2932 characters: 27
et words: 9577 characters: 28
ltg words: 367 characters: 34
sr words: 0 characters: 0
pl words: 5181 characters: 37
he words: 64475 characters: 27
vi words: 0 characters: 0
fa words: 7703 characters: 37
eu words: 5985 characters: 33
uk words: 4354 characters: 33
ro words: 5022 characters: 32
fur words: 2590 characters: 32
fy words: 180 charac

In [5]:
# Custom arabic word source
import pandas as pd
df = pd.read_excel(raw_dir + 'ar/Top-50000-Arabic-Words-Masterlist_ModernStandardArabic.com_.xlsx')

Unnamed: 0,Word Number,Frequency,Arabic,English,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,1,2285403,لا,No,,,,,
1,2,2229495,من,Of,,,,,
2,3,1789391,في,In,,,,,
3,4,1761748,أن,That,,,,,
4,5,1624794,هذا,This is,,,,,
...,...,...,...,...,...,...,...,...,...
4995,4996,2304,احتفظ,Keep a,,,,,
4996,4997,2303,الصخور,Rock,,,,,
4997,4998,2303,أخبروني,They told me,,,,,
4998,4999,2303,اخرجوا,Get out,,,,,


In [40]:
raw_words = df[["Arabic"]].values.tolist()
words = []
for word in raw_words:
    word = word[0]
    # ـ
    forbidden_charset = 'abcdefghijklmnopqrstuvwxyz -0123456789'
    try:
        if any (char.isupper() for char in word) or any (char in forbidden_charset for char in word) or any (char.isdigit() for char in word):
            continue
        if len(word) == 5 and word not in words:
            words.append(word.lower())
    except Exception as e:
        print(f"{e}")
        pass

# make list of all appearing characters
characters = []
for word in words:
    for char in word:
        if char not in characters:
            characters.append(char)

print(f"Arabic 5 words: {len(words)}")
print(f"Arabic characters: {len(characters)}")
print(" ".join(characters))

'bool' object is not iterable
Arabic 5 words: 14157
Arabic characters: 37
ع ن د م ا ل ذ أ ر ت ق و ي ك ج س ح ة ب ه ش ء ؤ خ ص ظ ث آ ئ ض ط ى غ ز ف إ ـ


In [59]:
# Let's do a more reusable version of all this
import random

def process_wordlist(wordlist, language_code, desired_word_length=5, forbidden_charset=None, acceptable_charset=None):
    """Takes in a wordlist and processes it for wordle consumption"""
    words = []
    for word in wordlist:
        word = word.strip().lower()
        if forbidden_charset:
            if any (char in forbidden_charset for char in word):
                continue
        if acceptable_charset:
            if not any (char in acceptable_charset for char in word):
                continue
        if len(word) == desired_word_length and word not in words:
            words.append(word)
    
    # mix up the words in case they were in a certain order
    random.seed(42)
    random.shuffle(words)
    
    characters = []
    for word in words:
        for char in word:
            if char not in characters:
                characters.append(char)
    
    # write to file
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_' + str(desired_word_length) + 'words.txt', 'w') as f:
        for word in words:
            f.write(word + '\n')
    
    with open(out_dir + 'languages/' + language_code + '/' + language_code + '_characters.txt', 'w') as f:
        for char in characters:
            f.write(char + '\n')
    
    print(f"{language_code} words: {len(words)} characters: {len(characters)}")
    print(f"characterset for {language_code}: {' '.join(characters)}")
    return words, characters

In [60]:
words, characters = process_wordlist(words, "ar", 5, forbidden_charset="abcdefghijklmnopqrstuvwxyz -0123456789 ـ")

ar words: 13882 characters: 36
characterset for ar: و ا ل س ق ب ه م ح أ ج ت ض ي ر ف ة ذ ع ك ن ز ش ص إ د ئ خ ط ظ ء ث غ آ ى ؤ
