## Introduction

## Setup

## Dataset - Wordlists

Imports

In [None]:
from tensorflow.keras import utils
import numpy as np
import os
import re

List of constants

In [None]:
# Width of the label matrix built by `build_word_list`; that function asserts
# that `max_string_len` does not exceed this value.
ABSOLUTE_MAX_STRING_LEN = 16
# `build_word_list` asserts that `num_words` is a multiple of this value.
MINIBATCH_SIZE = 32
# `build_word_list` asserts that VAL_SPLIT * num_words is also a multiple of
# MINIBATCH_SIZE (presumably the validation fraction -- confirm against the
# training code, which is not shown here).
VAL_SPLIT = 0.2
# A character's integer label is its index in this string (the space is 26).
ALPHABET = u'abcdefghijklmnopqrstuvwxyz '

Download and uncompress archive of raw word source lists.

In [None]:
# Fetch the word-list archive and resolve both word files relative to the
# directory that contains the path reported by `get_file`.
# NOTE(review): the origin is plain HTTP and no `file_hash` is passed, so the
# download is not integrity-checked.
fdir = os.path.dirname(utils.get_file(
    'wordlists.tgz',
    origin='http://www.mythic-ai.com/datasets/wordlists.tgz',
    untar=True,
))
monogram_file, bigram_file = (
    os.path.join(fdir, list_name)
    for list_name in ('wordlist_mono_clean.txt', 'wordlist_bi_clean.txt')
)

`build_word_list`: Function that builds a list of words satisfying the following criteria:
* Only words with lowercase alphabetic characters and spaces are included
* Words longer than `max_string_len` characters are excluded
* `mono_fraction` sets the fraction of the list taken from monograms; the remainder is filled with bigrams
* Common words are interlaced with uncommon words (based on their frequency in English speech)
* A blank word is inserted at every fourth position (previously handled by the data generator)

In [None]:
def build_word_list(num_words, max_string_len=None, mono_fraction=0.5):
    """Build an interlaced list of words together with their integer labels.

    Words are read from `monogram_file` first and then from `bigram_file`
    until `num_words` candidates are collected. The first and second halves
    of the candidates are interlaced, and a blank word is placed at every
    fourth position of the final list.

    Args:
        num_words: Number of entries to return. Must be a multiple of
            MINIBATCH_SIZE, and VAL_SPLIT * num_words must be one as well.
        max_string_len: Longest word, in characters, that is accepted.
            `None` or `-1` means "no caller-imposed limit"; words are then
            still capped at ABSOLUTE_MAX_STRING_LEN, because that is the
            width of the label matrix.
        mono_fraction: Fraction of `num_words` taken from the monogram file
            before the remainder is filled from the bigram file.

    Returns:
        A tuple `(X_text, Y_data, Y_len)`:
        X_text is a list of `num_words` strings; Y_data is a float array of
        shape `(num_words, ABSOLUTE_MAX_STRING_LEN)` holding each character's
        index in ALPHABET, padded with -1; Y_len is an array of shape
        `(num_words, 1)` holding the length of each word.

    Raises:
        AssertionError: If `max_string_len` exceeds ABSOLUTE_MAX_STRING_LEN or
            the minibatch divisibility requirements are not met.
        IOError: If the two files cannot supply `num_words` acceptable words.
    """
    # Resolve the "no limit" sentinels up front so the assert below never
    # compares None with an int, and so over-long words can never overflow
    # a row of the label matrix.
    unlimited = max_string_len is None or max_string_len == -1
    length_limit = ABSOLUTE_MAX_STRING_LEN if unlimited else max_string_len

    assert length_limit <= ABSOLUTE_MAX_STRING_LEN
    assert num_words % MINIBATCH_SIZE == 0
    assert (VAL_SPLIT * num_words) % MINIBATCH_SIZE == 0

    # Compiled once instead of on every validity check.
    valid_chars = re.compile(r'^[a-z ]+$', re.UNICODE)

    def _text_to_labels(text):
        return [ALPHABET.find(char) for char in text]

    def _is_valid_str(in_str):
        return bool(valid_chars.search(in_str))

    def _is_length_of_word_valid(word):
        return len(word) <= length_limit

    tmp_string_list = []
    mono_target = int(num_words * mono_fraction)

    # monogram file contains words sorted by frequency in english speech;
    # entries are only length-checked here, not character-validated
    with open(monogram_file, mode='r', encoding='utf-8') as f:
        for line in f:
            if len(tmp_string_list) == mono_target:
                break
            word = line.rstrip()
            if _is_length_of_word_valid(word):
                tmp_string_list.append(word)

    # bigram file contains common word pairings in english speech
    with open(bigram_file, mode='r', encoding='utf-8') as f:
        for line in f:
            if len(tmp_string_list) == num_words:
                break
            columns = line.lower().split()
            if len(columns) < 2:
                # blank or malformed line: there is no word pair to build
                continue
            word = columns[0] + ' ' + columns[1]
            if _is_valid_str(word) and _is_length_of_word_valid(word):
                tmp_string_list.append(word)

    if len(tmp_string_list) != num_words:
        raise IOError('Could not pull enough words '
                      'from supplied monogram and bigram files.')

    # interlace to mix up the easy and hard words
    half = num_words // 2
    interlaced = [''] * num_words
    interlaced[::2] = tmp_string_list[:half]
    interlaced[1::2] = tmp_string_list[half:]

    # Put a blank word at every 4th position and keep the list at num_words
    # entries; the interlaced words fill the other slots in order, so the
    # trailing words that no longer fit are dropped.
    remaining = iter(interlaced)
    string_list = ['' if i % 4 == 0 else next(remaining)
                   for i in range(num_words)]

    # -1 marks padding after the end of each word
    Y_data = np.full((num_words, ABSOLUTE_MAX_STRING_LEN), -1.0)
    for i, word in enumerate(string_list):
        Y_data[i, :len(word)] = _text_to_labels(word)
    Y_len = np.expand_dims(np.array([len(word) for word in string_list]), 1)

    return string_list, Y_data, Y_len

Build an initial word list of 16000 entries from short monograms (length ≤ 4); every fourth entry is a blank word.

In [None]:
# Build the monogram-only list, then show its size and a five-row preview of
# the words, their integer labels and their lengths.
X_t, Y_d, Y_l = build_word_list(
    num_words=16000,
    max_string_len=4,
    mono_fraction=1,
)

# One print call; sep="\n" yields the same line-per-item output as
# printing each item separately.
print(
    len(X_t),
    "First five words:",
    X_t[:5],
    "\n" + "First five words converted to integer labels:",
    Y_d[:5],
    "\n" + "Length of each word:",
    Y_l[:5],
    sep="\n",
)