In [5]:
# !wget -q https://github.com/sayakpaul/Handwriting-Recognizer-in-Keras/releases/download/v1.0.0/IAM_Words.zip
# !unzip -qq IAM_Words.zip
# !
# !mkdir data
# !mkdir data/words
# !tar -xf IAM_Words/words.tgz -C data/words
# !mv IAM_Words/words.txt data


In [6]:
# !head -20 data/words.txt

In [1]:
#Importing all libraries and setting up logger and seeds
import os
import logging
import json

import keras
from keras.layers import StringLookup
from keras import ops
import tensorflow as tf
import numpy as np
import mlflow

logger = logging.getLogger('Training')
logging.basicConfig(filename='log.log', filemode='w', encoding='utf-8', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

np.random.seed(42)
keras.utils.set_random_seed(42)
logger.info("Initialized logger and set up random seeds")

2025-04-13 22:24:38.002918: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 22:24:40.418667: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744563281.229443    4266 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744563281.477553    4266 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744563283.312307    4266 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Setting up our mlflow environment
try:
    mlflow.set_tracking_uri("http://127.0.0.1:8080")
    mlflow.set_experiment("handwriting-recognition")
    logger.info("MLFlow server set up at http://127.0.0.1:8080 and experiment initialized")
except Exception as e:
    logger.exception(f"Failed to set up MLFlow server : {e}")

In [3]:
# Get the texts corresponding to the image data
base_path = "data"
words_list = []

try:
    words = open(f"{base_path}/words.txt", "r").readlines()
    for line in words:
        if line[0] == "#":
            continue
        if line.split(" ")[1] != "err":  # We don't need to deal with errored entries.
            words_list.append(line)
    logger.info("Successfully read label file")
except Exception as e:
    logger.exception(f"Unable to open label file : {e}")

np.random.shuffle(words_list)

In [29]:
# Splitting up the labels into train, val, test
train_val_split = 0.5
val_test_split = 0.1
split_idx = int(train_val_split * len(words_list))
train_samples = words_list[:split_idx]
test_samples = words_list[split_idx:]

val_split_idx = int(val_test_split * len(test_samples))
validation_samples = test_samples[:val_split_idx]
test_samples = test_samples[val_split_idx:]

assert len(words_list) == len(train_samples) + len(validation_samples) + len(
    test_samples
)
logger.info("Successfully split train-valid-test=0.9:0.05:0.05 labels")
logger.info(f"Original dataset has {len(train_samples)+len(validation_samples)+len(test_samples)} datapoints")


In [30]:
# Retrieving the images from the corresponding text
base_image_path = os.path.join(base_path, "words")


def get_image_paths_and_labels(samples):
    paths = []
    corrected_samples = []
    for i, file_line in enumerate(samples):
        line_split = file_line.strip()
        line_split = line_split.split(" ")

        # Each line split will have this format for the corresponding image:
        # part1/part1-part2/part1-part2-part3.png
        image_name = line_split[0]
        partI = image_name.split("-")[0]
        partII = image_name.split("-")[1]
        img_path = os.path.join(
            base_image_path, partI, partI + "-" + partII, image_name + ".png"
        )
        try:
            if os.path.getsize(img_path):
                paths.append(img_path)
                corrected_samples.append(file_line.split("\n")[0])
        except Exception as e:
            logger.exception(f"{img_path} is corrupt or does not exist : {e}")

    return paths, corrected_samples


train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)
logger.info(f"Corrected dataset has {len(train_labels)+len(validation_labels)+len(test_labels)} datapoints")


In [31]:
# Find maximum length and the size of the vocabulary in the training data.
train_labels_cleaned = []
characters = set()
max_len = 0

for label in train_labels:
    label = label.split(" ")[-1].strip()
    for char in label:
        characters.add(char)

    max_len = max(max_len, len(label))
    train_labels_cleaned.append(label)

characters = sorted(list(characters))

logger.info(f"Maximum length: {max_len}")
logger.info(f"Vocab size: {len(characters)}")


In [32]:
# Clean the labels
def clean_labels(labels):
    cleaned_labels = []
    for label in labels:
        label = label.split(" ")[-1].strip()
        cleaned_labels.append(label)
    return cleaned_labels


validation_labels_cleaned = clean_labels(validation_labels)
test_labels_cleaned = clean_labels(test_labels)
logger.info("Successfully cleaned all labels")

In [33]:
# Cleaning the images themselves
def distortion_free_resize(image, img_size):
    w, h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)

    # Check tha amount of padding needed to be done.
    pad_height = h - ops.shape(image)[0]
    pad_width = w - ops.shape(image)[1]

    # Only necessary if you want to do same amount of padding on both sides.
    if pad_height % 2 != 0:
        height = pad_height // 2
        pad_height_top = height + 1
        pad_height_bottom = height
    else:
        pad_height_top = pad_height_bottom = pad_height // 2

    if pad_width % 2 != 0:
        width = pad_width // 2
        pad_width_left = width + 1
        pad_width_right = width
    else:
        pad_width_left = pad_width_right = pad_width // 2

    image = tf.pad(
        image,
        paddings=[
            [pad_height_top, pad_height_bottom],
            [pad_width_left, pad_width_right],
            [0, 0],
        ],
    )

    image = ops.transpose(image, (1, 0, 2))
    image = tf.image.flip_left_right(image)
    return image


In [34]:
# Vocabulary preparation
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

logger.info("Successfully initialized lookup tables for character to number conversion")

try:
    with open("vocab.json", "w", encoding="utf-8") as f:
        json.dump(char_to_num.get_vocabulary(), f, ensure_ascii=False, indent=2)
    logger.info("Succesfully saved the vocabulary of our StringLookup layer")
except Exception as e:
    logger.exception("Unable to save the vocabulary of our StringLookup layer")

In [35]:
# Defining hyper parameters and preprocessing functions for our dataset

batch_size = 64
padding_token = 99
image_width = 128
image_height = 32



def preprocess_image(image_path, img_size=(image_width, image_height)):
    try:
        image = tf.io.read_file(image_path)
        image = tf.image.decode_png(image, 1)
        image = distortion_free_resize(image, img_size)
        image = ops.cast(image, tf.float32) / 255.0
    except Exception as e:
        logger.exception(f"Error processing {image_path} : {e}")
    return image


def vectorize_label(label):
    try:
        label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
        length = ops.shape(label)[0]
        pad_amount = max_len - length
        label = tf.pad(label, paddings=[[0, pad_amount]], constant_values=padding_token)
    except Exception as e:
        logger.exception(f"Error processing {label} : {e}")
    return label


def process_images_labels(image_path, label):
    image = preprocess_image(image_path)
    label = vectorize_label(label)
    return {"image": image, "label": label}


def prepare_dataset(image_paths, labels):
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map(
        process_images_labels, num_parallel_calls=AUTOTUNE
    )
    return dataset.batch(batch_size).cache().prefetch(AUTOTUNE)


In [36]:
# Preparing datasets for our model and inference

try:
    train_ds = prepare_dataset(train_img_paths, train_labels_cleaned)
    validation_ds = prepare_dataset(validation_img_paths, validation_labels_cleaned)
    test_ds = prepare_dataset(test_img_paths, test_labels_cleaned)
    logger.info("Successfully created train, valid, test datasets")
except Exception as e:
    logger.exception(f"Error preparing datasets : {e}")

In [37]:
# Function to decode input tensor

def decode_input(label):
    indices = tf.gather(label, tf.where(tf.math.not_equal(label, padding_token)))
    label = tf.strings.reduce_join(num_to_char(indices))
    label = label.numpy().decode("utf-8")

    return label
# Preparing a sample image for inference

samples = next(iter(test_ds.take(1)))
images, labels = samples["image"], samples["label"]
rand_idx = np.random.randint(samples["image"].shape[0])
x_test = tf.expand_dims(images[rand_idx],0).numpy().astype('float32')
y_test = labels[rand_idx]

print(f"True label of the sample is \"{decode_input(y_test)}\"")
payload = {
    "inputs": x_test.tolist(), # or use "dataframe_split",
}

with open("sample.json", "w") as f:
    json.dump(payload, f)

2025-04-13 23:01:17.260446: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


True label of the sample is "his"


In [38]:
# Model building

class CTCLayer(keras.layers.Layer):
    def __init__(self, name=None,**kwargs):
        super().__init__(name=name,**kwargs)
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        batch_len = ops.cast(ops.shape(y_true)[0], dtype="int64")
        input_length = ops.cast(ops.shape(y_pred)[1], dtype="int64")
        label_length = ops.cast(ops.shape(y_true)[1], dtype="int64")

        input_length = input_length * ops.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * ops.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions.
        return y_pred


def build_model():
    # Inputs to the model
    input_img = keras.Input(shape=(image_width, image_height, 1), name="image")
    labels = keras.layers.Input(name="label", shape=(None,))

    # First conv block.
    x = keras.layers.Conv2D(
        32,
        (3, 3),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)
    x = keras.layers.MaxPooling2D((2, 2), name="pool1")(x)

    # Second conv block.
    x = keras.layers.Conv2D(
        64,
        (3, 3),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv2",
    )(x)
    x = keras.layers.MaxPooling2D((2, 2), name="pool2")(x)

    # We have used two max pool with pool size and strides 2.
    # Hence, downsampled feature maps are 4x smaller. The number of
    # filters in the last layer is 64. Reshape accordingly before
    # passing the output to the RNN part of the model.
    new_shape = ((image_width // 4), (image_height // 4) * 64)
    x = keras.layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = keras.layers.Dense(64, activation="relu", name="dense1")(x)
    x = keras.layers.Dropout(0.2)(x)

    # RNNs.
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, dropout=0.25)
    )(x)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(64, return_sequences=True, dropout=0.25)
    )(x)

    # +2 is to account for the two special tokens introduced by the CTC loss.
    # The recommendation comes here: https://git.io/J0eXP.
    x = keras.layers.Dense(
        len(char_to_num.get_vocabulary()) + 2, activation="softmax", name="dense2"
    )(x)

    # Add CTC layer for calculating CTC loss at each step.
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model.
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="handwriting_recognizer"
    )
    # Optimizer.
    opt = keras.optimizers.Adam()
    # Compile the model and return.
    model.compile(optimizer=opt)
    return model


# Get the model.
try:
    model = build_model()
    logger.info(f"Model successfully built")
except Exception as e:
    logger.exception(f"Model build error : {e}")


In [39]:
# Preparing the validation set

validation_images = []
validation_labels = []

for batch in validation_ds:
    validation_images.append(batch["image"])
    validation_labels.append(batch["label"])


2025-04-13 23:02:59.472530: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [40]:
# Additional metrics and callbacks

def calculate_edit_distance(labels, predictions):
    # Get a single batch and convert its labels to sparse tensors.
    sparse_labels = ops.cast(tf.sparse.from_dense(labels), dtype=tf.int64)

    # Make predictions and convert them to sparse tensors.
    input_len = np.ones(predictions.shape[0]) * predictions.shape[1]
    predictions_decoded = keras.ops.nn.ctc_decode(
        predictions, sequence_lengths=input_len
    )[0][0][:, :max_len]
    sparse_predictions = ops.cast(
        tf.sparse.from_dense(predictions_decoded), dtype=tf.int64
    )

    # Compute individual edit distances and average them out.
    edit_distances = tf.edit_distance(
        sparse_predictions, sparse_labels, normalize=False
    )
    return tf.reduce_mean(edit_distances)


class MLFlowMetricCallback(keras.callbacks.Callback):
    def __init__(self, pred_model):
        super().__init__()
        self.prediction_model = pred_model

    def on_epoch_end(self, epoch, logs=None):
        edit_distances = []

        for i in range(len(validation_images)):
            labels = validation_labels[i]
            predictions = self.prediction_model.predict(validation_images[i])
            edit_distances.append(calculate_edit_distance(labels, predictions).numpy())
        logs["avg_edit_distance"] = np.mean(edit_distances)
        try:
            for k,v in logs.items():
                mlflow.log_metric(f"{k}", v, step=epoch)
            logger.info("Successfully logged metrics in the server")
        except Exception as e:
            logger.exception(f"Unable to log metrics in the server : {e}")


In [41]:
# Training and logging

epochs = 10  # To get good results this should be at least 50.
model = build_model()
prediction_model = keras.models.Model(
    model.get_layer(name="image").output, model.get_layer(name="dense2").output
)


custom_metric_callback = MLFlowMetricCallback(prediction_model)

# Train the model.
with mlflow.start_run(run_name='split=0.5') as run:
    mlflow.log_param("batch_size",batch_size)
    mlflow.log_param("padding_token",padding_token)
    mlflow.log_param("image_width",image_width)
    mlflow.log_param("image_height",image_height)
    mlflow.log_param("epochs",epochs)
    mlflow.log_param("max_len",max_len)
    history = model.fit(
        train_ds,
        validation_data=validation_ds,
        epochs=epochs,
        callbacks=[custom_metric_callback],
    )
    mlflow.log_artifact('vocab.json')
    mlflow.tensorflow.log_model(prediction_model,"models")

logger.info("Succesfully finished training and logger")


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step step - loss: 1206.77
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━



🏃 View run split=0.5 at: http://127.0.0.1:8080/#/experiments/754929904376585580/runs/a6bd709b2650409baf5fe49c279f4839
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/754929904376585580
