## Introduction

## Setup

## Dataset - Wordlists

Imports

In [None]:
from tensorflow.keras import utils
import numpy as np
import os
import re

List of constants

In [None]:
# Width of every label row produced by build_word_list; no word may be longer.
ABSOLUTE_MAX_STRING_LEN = 16
# Word counts handed to build_word_list must be a multiple of this value.
MINIBATCH_SIZE = 32
# Share of the word list set aside for validation; build_word_list requires
# that share to be a whole number of minibatches as well.
VAL_SPLIT = 0.2
# A character's position in this string is its integer label.
ALPHABET = 'abcdefghijklmnopqrstuvwxyz '

Download and uncompress archive of raw word source lists.

In [None]:
# Fetch the word-list archive (presumably cached by Keras between runs --
# confirm against the get_file documentation) and keep the directory that
# holds the path it returns.
archive_path = utils.get_file(
    'wordlists.tgz',
    origin='http://www.mythic-ai.com/datasets/wordlists.tgz',
    untar=True,
)
fdir = os.path.dirname(archive_path)

# Both word lists are expected to sit next to each other in that directory.
monogram_file, bigram_file = (
    os.path.join(fdir, list_name)
    for list_name in ('wordlist_mono_clean.txt', 'wordlist_bi_clean.txt')
)

`build_word_list`: Function that builds a list of words satisfying the following criteria:
* Only words with lowercase alphabetic characters and spaces are included
* Words longer than `max_string_len` are excluded
* The fraction of monograms among the selected words is made equal to `mono_fraction` (the remainder are bigrams)
* Common words are interlaced with uncommon words (based on their frequency in English speech)
* Blank words are mixed in (every 4th entry). This was previously handled by the data generator.

In [None]:
def build_word_list(num_words, max_string_len=None, mono_fraction=0.5):
    """Build a word list and its integer-label encoding.

    Words are read from ``monogram_file`` first (up to
    ``int(num_words * mono_fraction)`` entries) and the remainder from
    ``bigram_file``.  The two halves of the collected list are then
    interlaced, an empty string is placed at every 4th position, and the
    result is truncated to ``num_words`` entries (so the tail of the
    interlaced list is dropped to make room for the blanks).

    Args:
        num_words: Number of entries to return.  Must be a multiple of
            ``MINIBATCH_SIZE``, and ``VAL_SPLIT * num_words`` must be too.
        max_string_len: Longest word to accept.  ``None`` or ``-1`` means
            "no caller limit", in which case ``ABSOLUTE_MAX_STRING_LEN`` is
            used, since a longer word cannot fit in a label row.
        mono_fraction: Fraction of the collected words taken from the
            monogram file.

    Returns:
        A tuple ``(X_text, Y_data, Y_len)``:
        ``X_text`` is the list of ``num_words`` strings;
        ``Y_data`` is a float array of shape
        ``(num_words, ABSOLUTE_MAX_STRING_LEN)`` holding each character's
        index in ``ALPHABET``, padded with ``-1``;
        ``Y_len`` is an array of shape ``(num_words, 1)`` with word lengths.

    Raises:
        AssertionError: If the size constraints above are violated.
        IOError: If the two files do not supply ``num_words`` usable words.
    """
    # Resolve "unbounded" to the hard cap *before* comparing, otherwise the
    # default ``None`` fails the comparison below and an unbounded run could
    # admit words that overflow a label row.
    if max_string_len is None or max_string_len == -1:
        length_limit = ABSOLUTE_MAX_STRING_LEN
    else:
        length_limit = max_string_len

    assert length_limit <= ABSOLUTE_MAX_STRING_LEN
    assert num_words % MINIBATCH_SIZE == 0
    assert (VAL_SPLIT * num_words) % MINIBATCH_SIZE == 0

    # Compiled once per call instead of once per candidate word.
    is_valid_str = re.compile(r'^[a-z ]+$', re.UNICODE).search

    candidates = []

    # monogram file contains words sorted by frequency in english speech
    # NOTE(review): monogram entries are only length-checked, not matched
    # against the alphabet -- the "clean" file is trusted here.
    mono_target = int(num_words * mono_fraction)
    with open(monogram_file, mode='r', encoding='utf-8') as f:
        for line in f:
            if len(candidates) == mono_target:
                break
            word = line.rstrip()
            if len(word) <= length_limit:
                candidates.append(word)

    # bigram file contains common word pairings in english speech
    with open(bigram_file, mode='r', encoding='utf-8') as f:
        for line in f:
            if len(candidates) == num_words:
                break
            columns = line.lower().split()
            if len(columns) < 2:
                # Blank or malformed line: nothing to pair up.
                continue
            word = columns[0] + ' ' + columns[1]
            if is_valid_str(word) and len(word) <= length_limit:
                candidates.append(word)

    if len(candidates) != num_words:
        raise IOError('Could not pull enough words '
                      'from supplied monogram and bigram files.')

    # interlace to mix up the easy and hard words
    half = num_words // 2
    interlaced = [''] * num_words
    interlaced[::2] = candidates[:half]
    interlaced[1::2] = candidates[half:]

    # Blank word at every 4th position, remaining slots filled in order from
    # the interlaced list; built in one pass rather than repeated inserts.
    remaining = iter(interlaced)
    X_text = ['' if pos % 4 == 0 else next(remaining)
              for pos in range(num_words)]

    # -1 marks unused label slots.
    Y_data = np.full([num_words, ABSOLUTE_MAX_STRING_LEN], -1.0)
    for row, word in enumerate(X_text):
        Y_data[row, 0:len(word)] = [ALPHABET.find(char) for char in word]
    Y_len = np.expand_dims(np.array([len(word) for word in X_text]), 1)

    return X_text, Y_data, Y_len

Build initial wordlist of 16000 entries: short monograms (len <= 4) with blank words mixed in.

In [None]:
X_t, Y_d, Y_l = build_word_list(
    num_words=16000,
    max_string_len=4,
    mono_fraction=1,
)

print(len(X_t))

# Show the first five entries of each returned structure under its heading.
previews = (
    ("First five words:", X_t),
    ("\n" + "First five words converted to integer labels:", Y_d),
    ("\n" + "Length of each word:", Y_l),
)
for heading, values in previews:
    print(heading)
    print(values[:5])

## Dataset: Callback class -> Sequence class

Imports

In [None]:
from tensorflow.keras import utils
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing import image
from scipy import ndimage
import cairocffi as cairo
import matplotlib.pyplot as plt

Constants

In [None]:
# Number of samples rendered per batch.
BATCH_SIZE = 32
# Canvas size in pixels for the synthetic text images.
IMG_W, IMG_H = 128, 64
# Divides the image width when the sequence computes 'input_length'.
DOWNSAMPLE_FACTOR = 4

Helper functions for generating synthetic images from text

In [None]:
# this creates larger "blotches" of noise which look
# more realistic than just adding gaussian noise
# assumes greyscale with pixels ranging from 0 to 1

def speckle(img):
    """Return a copy of ``img`` with blurred random noise added.

    A noise strength is drawn uniformly from [0, 0.6), Gaussian noise of that
    strength is smoothed with a sigma-1 Gaussian filter, added to the image,
    and the sum is limited to the range [0, 1].  ``img`` is not modified.
    """
    noise_scale = np.random.uniform(0, 0.6)
    raw_noise = np.random.randn(*img.shape) * noise_scale
    blotches = ndimage.gaussian_filter(raw_noise, 1)
    return np.clip(img + blotches, 0, 1)


# paints the string at a random location within the bounding box
# also uses a random font, a slight random rotation,
# and a random amount of speckle noise

def paint_text(text, w, h, rotate=False, ud=False, multi_fonts=False):
    """Render ``text`` in black on a white ``w`` x ``h`` canvas, then add noise.

    Args:
        text: String to draw.
        w: Canvas width in pixels.
        h: Canvas height in pixels.
        rotate: If true, pass the image through ``image.random_rotation``.
        ud: If true, pick the vertical position at random; otherwise the
            text is anchored at ``h // 2``.
        multi_fonts: If true, pick the font face and weight at random;
            otherwise bold Courier is used.

    Returns:
        The result of ``speckle`` applied to the rendered image, built from
        a float array of shape ``(1, h, w)`` scaled to [0, 1].

    Raises:
        IOError: If the text extents do not fit inside the canvas borders.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_RGB24, w, h)
    with cairo.Context(surface) as context:
        context.set_source_rgb(1, 1, 1)  # White
        context.paint()
        # this font list works in CentOS 7
        if multi_fonts:
            fonts = [
                'Century Schoolbook', 'Courier', 'STIX',
                'URW Chancery L', 'FreeMono']
            context.select_font_face(
                np.random.choice(fonts),
                cairo.FONT_SLANT_NORMAL,
                np.random.choice([cairo.FONT_WEIGHT_BOLD, cairo.FONT_WEIGHT_NORMAL]))
        else:
            context.select_font_face('Courier',
                                     cairo.FONT_SLANT_NORMAL,
                                     cairo.FONT_WEIGHT_BOLD)
        context.set_font_size(25)
        box = context.text_extents(text)
        # box[0], box[1] are used below as the text's drawing offsets and
        # box[2], box[3] as its width and height.
        border_w, border_h = 4, 4
        if box[2] > (w - 2 * border_w) or box[3] > (h - 2 * border_h):
            raise IOError('Could not fit string into image. '
                          'Max char count is too large for given image width.')

        # teach the RNN translational invariance by
        # fitting text box randomly on canvas, with some room to rotate
        max_shift_x = w - box[2] - border_w
        max_shift_y = h - box[3] - border_h
        top_left_x = np.random.randint(0, int(max_shift_x))
        if ud:
            top_left_y = np.random.randint(0, int(max_shift_y))
        else:
            top_left_y = h // 2
        context.move_to(top_left_x - int(box[0]), top_left_y - int(box[1]))
        context.set_source_rgb(0, 0, 0)
        context.show_text(text)

    # The surface buffer is read as 4 bytes per pixel; reshape returns a view
    # instead of mutating the array's shape attribute in place.
    pixels = np.frombuffer(surface.get_data(), np.uint8).reshape(h, w, 4)
    a = pixels[:, :, 0]  # grab single channel
    a = a.astype(np.float32) / 255
    a = np.expand_dims(a, 0)
    if rotate:
        # The rotation range grows the further left the text starts.
        a = image.random_rotation(a, 3 * (w - top_left_x) / w + 1)
    a = speckle(a)

    return a

Sequence class definition

In [None]:
class TextImageSequence(utils.Sequence):
    """Keras ``Sequence`` that renders word images batch by batch.

    Each item is one batch of ``batch_size`` consecutive words from
    ``X_text``, painted with ``paint_text``, together with their labels and
    lengths.  The paint options (``ud``, ``multi_fonts``, ``rotate``) start
    disabled and are switched on by ``on_epoch_end`` as epochs accumulate.

    Args:
        X_text: Sequence of words to render.
        Y_data: Per-word label rows, ``ABSOLUTE_MAX_STRING_LEN`` wide.
        Y_len: Per-word label lengths.
        batch_size: Number of samples per batch.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        downsample_factor: Divides ``img_w`` when computing ``input_length``.
        start_epoch: Initial value of the epoch counter.
    """

    def __init__(self, X_text, Y_data, Y_len, batch_size,
                 img_w, img_h, downsample_factor, start_epoch=0):
        super().__init__()
        self.X_text = X_text
        self.Y_data = Y_data
        self.Y_len = Y_len
        self.batch_size = batch_size
        self.img_w = img_w
        self.img_h = img_h
        self.downsample_factor = downsample_factor
        self.epoch_num = start_epoch
        self.rotate = False
        self.ud = False
        self.multi_fonts = False

    def __len__(self):
        """Return the number of full batches available."""
        # Counting samples here would make the highest indices read past the
        # end of the data; only whole batches are served because the batch
        # arrays are allocated at a fixed size.
        return len(self.X_text) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(inputs, outputs)`` pair."""
        channels_first = K.image_data_format() == 'channels_first'

        # width and height are backwards from typical Keras convention
        # because width is the time dimension when it gets fed into the RNN
        if channels_first:
            X_data = np.ones([self.batch_size, 1, self.img_w, self.img_h])
        else:
            X_data = np.ones([self.batch_size, self.img_w, self.img_h, 1])

        labels = np.ones([self.batch_size, ABSOLUTE_MAX_STRING_LEN])
        input_length = np.zeros([self.batch_size, 1])
        label_length = np.zeros([self.batch_size, 1])
        source_str = []

        # Batch ``index`` covers samples [index * batch_size,
        # (index + 1) * batch_size), so consecutive batches do not overlap.
        first_sample = index * self.batch_size
        for i in range(self.batch_size):
            sample = first_sample + i
            text = self.X_text[sample]
            # Transposed so that the first image axis is the width.
            canvas = paint_text(text,
                                self.img_w, self.img_h,
                                self.rotate,
                                self.ud,
                                self.multi_fonts)[0, :, :].T
            if channels_first:
                X_data[i, 0, :, :] = canvas
            else:
                X_data[i, :, :, 0] = canvas
            labels[i, :] = self.Y_data[sample]
            input_length[i] = self.img_w // self.downsample_factor - 2
            label_length[i] = self.Y_len[sample]
            source_str.append(text)

        inputs = {'the_input': X_data,
                  'the_labels': labels,
                  'input_length': input_length,
                  'label_length': label_length,
                  'source_str': np.array(source_str)  # used for visualization only
                  }
        outputs = {'ctc': np.zeros([self.batch_size])}  # dummy data for dummy loss function
        return inputs, outputs

    def on_epoch_end(self):
        """Advance the epoch counter and enable harder paint options."""
        # update paint function parameters to implement curriculum learning
        self.epoch_num += 1
        if self.epoch_num >= 2:
            self.ud = True
        if self.epoch_num >= 5:
            self.multi_fonts = True
        if self.epoch_num >= 8:
            self.rotate = True

Create Sequence dataset

In [None]:
# Spell out every argument by name so the width/height order is explicit.
sequence = TextImageSequence(
    X_text=X_t,
    Y_data=Y_d,
    Y_len=Y_l,
    batch_size=BATCH_SIZE,
    img_w=IMG_W,
    img_h=IMG_H,
    downsample_factor=DOWNSAMPLE_FACTOR,
)

Visualize images from a sample batch

In [None]:
sample_batch = next(iter(sequence))
sample_images = sample_batch[0]["the_input"]

f, axarr = plt.subplots(3, 3)
# Pair each of the nine axes with the image at the same position in the batch.
for ax, sample_image in zip(f.axes, sample_images):
    # Image is in (W, H, 1) format. Squeeze changes this to (W, H), and .T
    # transposes it to (H, W), allowing it to be displayed as grayscale image
    upright = np.squeeze(sample_image).T
    ax.imshow(upright, cmap='gray', vmin=0, vmax=1)

List of changes during migration from TextImageGenerator to TextImageSequence:
* `on_epoch_begin` -> `on_epoch_end` (-1 to each epoch value)
* Move blank word insertion to `build_word_list` (so blank words will be used for validation, too)
* Leave out second wordlist (32000, with 12-len word) from on_epoch_end in favor of making a new sequence and a new model.fit() call.
* Remove lambda function usage for wrapping paint_text

## CTC loss

## Model setup

## Training procedure

## Sample inference on new data