In [1]:
# Based on Keras "Handwriting recognition" code example
import numpy as np
import os
import pickle

In [2]:
# Load the sample info lines, drop the ones that are not usable samples,
# and shuffle what is left.
words_info = []  # will store all word sample info here
words_info_path = "data/words.txt"  # path of file that holds all sample info data
SEED = 42  # fixed seed so the shuffle (and every split derived from it) is reproducible

# The context manager closes the file even if reading raises.
with open(words_info_path, "r") as words_info_file:
    words_info_lines = words_info_file.readlines()  # store all lines in list format

for line in words_info_lines:
    # Skip blank lines, and lines starting with "#" as they explain the sample format.
    if not line.strip() or line[0] == "#":
        continue
    if line.split(" ")[1] != "err":  # We don't need to deal with errored entries.
        words_info.append(line)

# A dedicated, seeded generator shuffles in place without touching
# NumPy's global random state.
np.random.default_rng(SEED).shuffle(words_info)

In [3]:
# Partition the shuffled sample lines into train, validation and test subsets.
p = 0.8  # fraction of all samples used for training (between 0 and 1)
vp = 0.5  # fraction of the held-out samples used for validation (between 0 and 1)

# Everything before split_idx is used for training; the rest is held out.
split_idx = int(p * len(words_info))
train_samples, held_out_samples = words_info[:split_idx], words_info[split_idx:]

# The front of the held-out samples becomes the validation set,
# whatever remains is the test set.
val_split_idx = int(vp * len(held_out_samples))
validation_samples = held_out_samples[:val_split_idx]
test_samples = held_out_samples[val_split_idx:]

# Sanity check: the three subsets together must account for every sample,
# otherwise an AssertionError is raised.
assert len(train_samples) + len(validation_samples) + len(test_samples) == len(
    words_info
)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total training samples: 77164
Total validation samples: 9646
Total test samples: 9646


In [4]:
# Note: an I/O issue, where the IOPub data rate limit stopped the process, was
# fixed by starting JupyterLab with a higher limit (see
# https://towardsdatascience.com/leveraging-the-power-of-jupyter-notebooks-26b4b8d7c622):
#   jupyter lab --NotebookApp.iopub_data_rate_limit=10000000000
# Data input pipeline: image paths
images_path = "data/words" # root directory that holds the sample images corresponding to the sample data

def get_image_paths_and_labels(samples, images_dir=None):
    """Resolve each sample line to its image file and keep the usable ones.

    Every sample line starts with an image name of the form
    ``part1-part2-part3``; the matching image is looked up at
    ``<images_dir>/part1/part1-part2/part1-part2-part3.png``.
    Blank lines are skipped, and so are samples whose image file is empty
    (size 0).

    Args:
        samples: Iterable of sample info lines (space-separated fields,
            image name first).
        images_dir: Root directory of the image tree. When omitted, the
            module-level ``images_path`` is used.

    Returns:
        A ``(paths, corrected_samples)`` tuple of two parallel lists: the
        paths of the non-empty image files, and the corresponding sample
        lines truncated at their first newline.

    Raises:
        IndexError: If an image name contains no "-" separator.
        OSError: If an image file does not exist or is inaccessible
            (raised by ``os.path.getsize``).
    """
    # Only fall back to the global when the caller did not pick a directory.
    root_dir = images_path if images_dir is None else images_dir

    paths = []
    corrected_samples = []
    for file_line in samples:
        stripped_line = file_line.strip()  # remove white spaces at beginning and end
        if not stripped_line:
            continue  # a blank line carries no sample

        # The image name is the first space-separated field of the line.
        image_name = stripped_line.split(" ")[0]

        # Each image lives at: part1/part1-part2/part1-part2-part3.png
        name_parts = image_name.split("-")
        partI, partII = name_parts[0], name_parts[1]
        img_path = os.path.join(
            root_dir, partI, partI + "-" + partII, image_name + ".png"
        )

        # A size of 0 means the image file is empty, so the sample is dropped.
        if os.path.getsize(img_path):
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n", 1)[0])

    return paths, corrected_samples


# Resolve image paths and keep the matching sample lines for each subset.
train_img_paths, train_labels = get_image_paths_and_labels(
    samples=train_samples
)
validation_img_paths, validation_labels = get_image_paths_and_labels(
    samples=validation_samples
)
test_img_paths, test_labels = get_image_paths_and_labels(
    samples=test_samples
)

In [5]:
# Ground truth labels: derive the cleaned training words, the character
# vocabulary and the maximum word length from the training data.

# The word is the last space-separated field of each sample line.
train_labels_cleaned = [raw_line.split(" ")[-1].strip() for raw_line in train_labels]

# Vocabulary: every distinct character seen in the training words, sorted.
characters = sorted(set("".join(train_labels_cleaned)))

# Length of the longest training word (0 when there are no labels).
max_len = max((len(word) for word in train_labels_cleaned), default=0)

print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))

# Check some label samples.
train_labels_cleaned[:10]

Maximum length:  21
Vocab size:  78


['in',
 'same',
 "Dan's",
 'flowering',
 'the',
 'thought',
 'coregoni',
 ',',
 'with',
 '-']

In [6]:
# Clean validation, and test labels
def clean_labels(labels):
    """Return the word of every sample line.

    The word is the last space-separated field of a line, with
    surrounding whitespace stripped.

    Args:
        labels: Iterable of sample info lines.

    Returns:
        List with one cleaned word per input line, in input order.
    """
    return [sample_line.split(" ")[-1].strip() for sample_line in labels]

# Reduce the validation and test sample lines to their words.
validation_labels_cleaned = clean_labels(labels=validation_labels)
test_labels_cleaned = clean_labels(labels=test_labels)

In [7]:
# Store prepared data in pickle file.
# The objects are written as consecutive pickles in the order listed below,
# so a reader has to call pickle.load once per object, in the same order.
# The context manager closes the file even if one of the dumps raises.
with open('iam_dataset.pickle', 'wb') as file:
    for prepared_object in (
        train_img_paths,
        train_labels_cleaned,
        validation_img_paths,
        validation_labels_cleaned,
        test_img_paths,
        test_labels_cleaned,
        characters,
        max_len,
    ):
        # pickle.dump(source object, destination file)
        pickle.dump(prepared_object, file)