In [None]:
!pip install pandas matplotlib scikit-learn tensorflow torchvision torchaudio


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.18.0


In [2]:
import torch
print("PyTorch version:", torch.__version__)


PyTorch version: 2.6.0+cu124


In [3]:
#download and prepare the dataset
!wget -O synth90k.zip https://thor.robots.ox.ac.uk/~vgg/data/text/
!unzip synth90k.zip -d ./synth90k/


--2025-05-23 02:07:19--  https://thor.robots.ox.ac.uk/~vgg/data/text/
Resolving thor.robots.ox.ac.uk (thor.robots.ox.ac.uk)... 129.67.95.98
Connecting to thor.robots.ox.ac.uk (thor.robots.ox.ac.uk)|129.67.95.98|:443... connected.
HTTP request sent, awaiting response... 308 Permanent Redirect
Location: https://thor.robots.ox.ac.uk/text/ [following]
--2025-05-23 02:07:20--  https://thor.robots.ox.ac.uk/text/
Reusing existing connection to thor.robots.ox.ac.uk:443.
HTTP request sent, awaiting response... 403 Forbidden
2025-05-23 02:07:20 ERROR 403: Forbidden.

Archive:  synth90k.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of synth90k.zip or
        synth90k.zip.zip, and cannot find synth90k.zip.ZIP, period.


In [9]:
# --- Start of Code Cell 4: download_dataset ---
# Assistant: Generate a synthetic dataset of 200 images directly in the notebook.
# This avoids downloading the very large MJSynth dataset and ensures a runnable example.

print("Generating a synthetic dataset of 200 images...")

dataset_root = './synthetic_ocr_dataset'
os.makedirs(dataset_root, exist_ok=True)
image_dir = os.path.join(dataset_root, 'images')
os.makedirs(image_dir, exist_ok=True)

from PIL import Image, ImageDraw, ImageFont
import random
import string

# Define image dimensions
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 128

# Function to generate a random word
def generate_random_word(min_len=3, max_len=10):
    length = random.randint(min_len, max_len)
    return ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))

# Function to create a synthetic image of a word
def create_synthetic_image(word, filename, size=(IMAGE_WIDTH, IMAGE_HEIGHT)):
    img = Image.new('L', size, color=255) # White background (L for grayscale)
    d = ImageDraw.Draw(img)

    # Try to use a common font available in Colab, or fallback
    try:
        # Path to a common font on Colab VMs (might vary slightly)
        font_path = "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"
        font = ImageFont.truetype(font_path, 20)
    except IOError:
        font = ImageFont.load_default() # Fallback to default PIL font

    # Calculate text size to center it
    bbox = d.textbbox((0,0), word, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    x = (size[0] - text_width) / 2
    y = (size[1] - text_height) / 2

    d.text((x, y), word, fill=0, font=font) # Black text
    img.save(os.path.join(image_dir, filename))

# Generate 200 images
num_images_to_generate = 200
all_image_paths = []
all_labels = []

for i in range(num_images_to_generate):
    word = generate_random_word().lower() # Generate lowercase words
    filename = f"{word}_{i}.png" # Format like MJSynth for consistency
    create_synthetic_image(word, filename)
    all_image_paths.append(os.path.join(image_dir, filename))
    all_labels.append(word)

print(f"Generated {len(all_image_paths)} synthetic images in {image_dir}")

# Shuffle the generated images and labels for good measure
combined = list(zip(all_image_paths, all_labels))
random.shuffle(combined)
all_image_paths, all_labels = zip(*combined)
all_image_paths = list(all_image_paths)
all_labels = list(all_labels)

# Define character set dynamically from the generated labels
all_chars_in_labels = sorted(list(set(''.join(all_labels))))
# It's good practice to ensure common characters are included, especially if random generation is limited
# For a robust OCR, you'd typically include 'abcdefghijklmnopqrstuvwxyz0123456789' and possibly punctuation.
# For this synthetic data, we'll rely on what's generated.
characters = all_chars_in_labels

char_to_num = tf.keras.layers.StringLookup(vocabulary=list(characters), mask_token=None)
num_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

# Add 1 for the blank token used in CTC
num_classes = char_to_num.vocabulary_size() + 1 # +1 for CTC blank token
print(f"Number of unique characters (classes including blank): {num_classes}")
print(f"Character vocabulary (excluding blank): {char_to_num.get_vocabulary()}")

# --- End of Code Cell 4: download_dataset ---

Generating a synthetic dataset of 200 images...
Generated 200 synthetic images in ./synthetic_ocr_dataset/images
Number of unique characters (classes including blank): 38
Character vocabulary (excluding blank): ['[UNK]', np.str_('0'), np.str_('1'), np.str_('2'), np.str_('3'), np.str_('4'), np.str_('5'), np.str_('6'), np.str_('7'), np.str_('8'), np.str_('9'), np.str_('a'), np.str_('b'), np.str_('c'), np.str_('d'), np.str_('e'), np.str_('f'), np.str_('g'), np.str_('h'), np.str_('i'), np.str_('j'), np.str_('k'), np.str_('l'), np.str_('m'), np.str_('n'), np.str_('o'), np.str_('p'), np.str_('q'), np.str_('r'), np.str_('s'), np.str_('t'), np.str_('u'), np.str_('v'), np.str_('w'), np.str_('x'), np.str_('y'), np.str_('z')]


In [11]:
# --- Start of Code Cell 5: preprocessing_functions ---
# Preprocessing Function: Resize images, normalize pixel values, and prepare labels

IMAGE_HEIGHT = 32
IMAGE_WIDTH = 128

def encode_single_sample(img_path, label):
    # 1. Read image
    img = tf.io.read_file(img_path)
    # 2. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1) # Or tf.io.decode_jpeg if your synthetic images are JPG
    # 3. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Resize to the desired size
    img = tf.image.resize(img, [IMAGE_HEIGHT, IMAGE_WIDTH])
    # 5. Transpose the image because the CRNN expects (width, height, channels)
    # This is a common practice for CRNNs with CTC loss, as it aligns timesteps with width.
    img = tf.transpose(img, perm=[1, 0, 2]) # (width, height, channels)

    # 6. Encode label to numbers
    label = char_to_num(tf.strings.unicode_split(label, input_encoding='UTF-8'))

    return img, label

print("Preprocessing functions defined.")

# --- End of Code Cell 5: preprocessing_functions ---

Preprocessing functions defined.


In [10]:
#Preprocessing Function: Resize images, normalize pixel values, and prepare labels
def preprocess_image(image_path, target_size=(128, 32)):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.resize(image, target_size)
    image = np.expand_dims(image, axis=-1) / 255.0  # Normalize
    return image


In [12]:
# --- Start of Code Cell 6: extract_labels ---
# Extract Labels for OCR Training:
def extract_labels(image_path):
    # Assuming filenames follow 'img_label_id.jpg' or similar, as in the dummy data.
    # For MJSynth, labels are typically part of the directory structure or a separate annotation file.
    try:
        label = os.path.basename(image_path).split("_")[0].lower() # Corrected to [0] for synthetic data
    except IndexError:
        label = "UNKNOWN" # Fallback for incorrect format
    return label

# --- End of Code Cell 6: extract_labels ---

In [13]:
# --- Start of Code Cell 7: data_augmentation ---
# OCR models improve accuracy with augmentation to simulate real-world distortions.
# Data augmentation helps prevent overfitting and improves model robustness.
# Note: Horizontal flip is generally not suitable for text recognition.

data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(factor=0.02), # Small rotation
    tf.keras.layers.RandomBrightness(factor=0.1), # Adjust brightness
    tf.keras.layers.RandomContrast(factor=0.1), # Adjust contrast
    # Consider adding RandomZoom or RandomTranslation if appropriate for your data
])

print("Data augmentation pipeline defined.")

# --- End of Code Cell 7: data_augmentation ---

Data augmentation pipeline defined.


In [14]:
# --- Start of Code Cell 8: build_crnn_model ---
# CRNN combines CNN for feature extraction and RNN for sequential text decoding.
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.models import Model

def build_crnn_model(input_shape, num_classes):
    # Defines the input layer of the model.
    # input_shape is expected to be (width, height, channels) for CRNN.
    inputs = Input(shape=input_shape)

    # --- CNN Feature Extractor (Convolutional Neural Network) ---
    # This part extracts visual features from the input image.
    # It's a common pattern in image processing, often resembling a VGG-like architecture.

    # First Convolutional Block
    # Conv2D: Applies a 2D convolution. 32 filters, 3x3 kernel, ReLU activation, 'same' padding to maintain size.
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    # MaxPooling2D: Reduces spatial dimensions (width and height) by taking the maximum value over a 2x2 window.
    # This helps in downsampling and making the model more robust to small shifts.
    x = MaxPooling2D(pool_size=(2, 2))(x)

    # Second Convolutional Block
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    # Third Convolutional Block
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    # --- Reshape for RNN (Recurrent Neural Network) ---
    # The output of the CNN is a 4D tensor (batch_size, new_width, new_height, num_filters).
    # RNNs (LSTMs) expect a 3D tensor (batch_size, timesteps, features_per_timestep).
    # For CRNN, 'timesteps' typically corresponds to the width dimension of the image,
    # and 'features_per_timestep' combines the height and the number of filters.

    # Calculate the dimensions after the CNN layers and pooling operations.
    # Each MaxPooling2D with pool_size=(2,2) halves the width and height.
    cnn_output_width = input_shape[0] // (2 * 2 * 2) # Input width / 2^3
    cnn_output_height = input_shape[1] // (2 * 2 * 2) # Input height / 2^3
    num_filters_last_conv = 128 # The number of filters in the last Conv2D layer

    # Check to ensure the height hasn't been reduced to zero, which would cause issues.
    if cnn_output_height == 0:
        raise ValueError("Image height is too small for the current CNN architecture. "
                         "Consider reducing pooling layers or increasing input_height.")

    # Reshape the tensor: (batch_size, cnn_output_width, cnn_output_height * num_filters_last_conv)
    x = Reshape((cnn_output_width, cnn_output_height * num_filters_last_conv))(x)

    # --- RNN Layers for Sequence Modeling ---
    # These layers process the sequence of features extracted by the CNN to predict the text.
    # Bidirectional LSTM: Processes the sequence in both forward and backward directions,
    # capturing context from both past and future elements in the sequence.
    # return_sequences=True: Ensures the LSTM outputs a sequence, not just a single vector,
    # which is necessary because the next layer (another LSTM or Dense) needs a sequence.
    # dropout: Applies dropout to the inputs and recurrent connections to prevent overfitting.

    # First Bidirectional LSTM Layer
    x = Bidirectional(LSTM(256, return_sequences=True, dropout=0.25))(x) # Increased LSTM units for more capacity

    # Second Bidirectional LSTM Layer
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)

    # --- Output Layer ---
    # Dense layer: Maps the LSTM outputs to character probabilities for each timestep.
    # num_classes: The total number of unique characters in your vocabulary plus one for the CTC blank token.
    # activation="softmax": Outputs a probability distribution over all possible characters for each timestep.
    outputs = Dense(num_classes, activation="softmax")(x)

    # Create the Keras Model.
    # This model takes the 'inputs' (image) and produces 'outputs' (character probabilities over timesteps).
    model = Model(inputs=inputs, outputs=outputs)
    return model

print("CRNN model architecture defined.")

# --- End of Code Cell 8: build_crnn_model ---

CRNN model architecture defined.


In [50]:
# --- Start of Code Cell 9: ctc_loss_implementation ---
# CTC Loss Implementation

class CTCLayer(tf.keras.layers.Layer):
    """
    Custom Keras layer to compute the CTC loss.
    CTC loss is suitable for sequence-to-sequence problems where the alignment
    between input and output sequences is not known (e.g., OCR).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # Use the backend CTC loss function
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    # This method helps Keras infer the output shape of the layer.
    # Since `call` returns `y_pred`, its output shape is the same as `y_pred`'s input shape.
    # input_shape here is a list/tuple of shapes for all inputs to the call method:
    # (y_true_shape, y_pred_shape, input_length_shape, label_length_shape)
    # We want the shape of y_pred, which is input_shape[1].
    def compute_output_shape(self, input_shape):
        # Return the shape of the second input to the call method (y_pred's shape)
        # This is the most robust way to indicate that the layer passes through its second input's shape.
        # The output shape should be the shape of the predictions (y_pred) as this layer
        # effectively adds a loss and passes the predictions through.
        return input_shape[1]


    def call(self, y_true, y_pred, input_length, label_length):
        """
        Computes the CTC loss and adds it to the model's total loss.

        Args:
            y_true (tf.Tensor): True labels (ground truth sequences).
                                Shape: (batch_size, max_label_length)
            y_pred (tf.Tensor): Predicted logits from the model (output of the RNN).
                                Shape: (batch_size, timesteps, num_classes)
            input_length (tf.Tensor): Lengths of the input sequences (width of image features).
                                      Shape: (batch_size, 1)
            label_length (tf.Tensor): Lengths of the true label sequences.
                                      Shape: (batch_size, 1)
        """
        # Ensure input_length and label_length are 1D tensors of type int32
        input_length = tf.squeeze(input_length, axis=-1)
        label_length = tf.squeeze(label_length, axis=-1)

        # Cast y_true to int32, as ctc_batch_cost expects integer labels.
        y_true = tf.cast(y_true, tf.int32)

        # Compute the CTC loss for the current batch
        # The loss function expects (y_true, y_pred, input_length, label_length)
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)

        # Add the computed loss to the model's total loss.
        # This is the standard way to add a loss calculated inside a custom layer.
        self.add_loss(loss)

        # Return y_pred. This layer primarily adds a loss, but Keras requires layers
        # to have an output. Returning the prediction tensor allows the model to
        # connect subsequent layers (if any) or serve as the final output for a training model
        # where the loss is handled internally.
        return y_pred

print("CTC Layer defined.")

# Define the CRNN model builder function
def build_crnn_model_with_ctc(input_shape, num_classes):
    # Input for images (batch_size, width, height, channels)
    inputs = Input(shape=input_shape, name='image_input')

    # --- CNN feature extractor ---
    # Extracts visual features. VGG-like architecture.
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x) # Output shape (None, W/2, H/2, 32)

    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x) # Output shape (None, W/4, H/4, 64)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x) # Output shape (None, W/8, H/8, 128)

    # Calculate dimensions after pooling
    cnn_output_width = input_shape[0] // (2 * 2 * 2) # Input width / 8
    cnn_output_height = input_shape[1] // (2 * 2 * 2) # Input height / 8
    num_filters_last_conv = 128

    # Add check for zero height after pooling
    if cnn_output_height == 0:
         raise ValueError(f"Image height ({input_shape[1]}) is too small for the current CNN architecture. "
                          f"After 3x MaxPooling(2,2), height becomes {cnn_output_height}. "
                          "Consider reducing pooling layers or increasing input_height.")

    # Reshape for RNN: (batch_size, timesteps, features_per_timestep)
    # Timesteps = cnn_output_width
    # Features_per_timestep = cnn_output_height * num_filters_last_conv
    x = Reshape((cnn_output_width, cnn_output_height * num_filters_last_conv))(x) # Output shape (None, W/8, H/8 * 128)

    # --- RNN Layers for Sequence Modeling ---
    # Bidirectional LSTMs process sequence in both directions.
    # return_sequences=True: Pass sequence output to the next layer.
    # dropout: Prevent overfitting.
    x = Bidirectional(LSTM(256, return_sequences=True, dropout=0.25))(x) # Output shape (None, W/8, 2*256)
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x) # Output shape (None, W/8, 2*128)

    # --- Output Layer ---
    # Dense layer maps LSTM outputs to character probabilities (logits) for each timestep.
    # num_classes: Total unique characters + blank token.
    # activation="softmax": Provides probability distribution over classes.
    output_logits = Dense(num_classes, activation="softmax", name='output_logits')(x) # Output shape (None, W/8, num_classes)

    # --- Inputs for CTC Loss ---
    # These inputs are required by the CTCLayer during training to compute the loss.
    # labels: Ground truth sequences (numerical IDs). Shape (None, max_label_length).
    # input_length: Lengths of the prediction sequences (width after CNN). Shape (None, 1).
    # label_length: Lengths of the ground truth sequences. Shape (None, 1).
    labels = Input(name='labels', shape=(None,), dtype='float32') # Use float32, will cast to int32 in CTCLayer
    input_length = Input(name='input_length', shape=(1,), dtype='int64')
    label_length = Input(name='label_length', shape=(1,), dtype='int64')

    # --- CTC Layer ---
    # Instantiate the custom CTCLayer.
    ctc_layer = CTCLayer(name='ctc_loss')

    # Call the CTCLayer with the necessary inputs.
    # This call computes the CTC loss and adds it to the model's total loss via `add_loss`.
    # The output of the CTCLayer call (`loss_out`) serves as the output of the *training* model,
    # allowing Keras to build the graph correctly and incorporate the added loss.
    loss_out = ctc_layer(labels, output_logits, input_length, label_length)

    # --- Define Models ---
    # Model for training: Takes image, labels, input_length, label_length as inputs.
    # Its output is the result of the CTCLayer call (which effectively passes the logits through
    # but ensures the layer is part of the graph and its loss is considered).
    train_model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out, name='crnn_train_model')

    # Model for inference: Takes only the image input.
    # Its output is the raw character probabilities (logits) from the Dense layer,
    # which will be decoded externally using CTC decoding algorithms.
    inference_model = Model(inputs=inputs, outputs=output_logits, name='crnn_inference_model')

    return train_model, inference_model

print("CRNN model builder with CTC defined.")

# --- End of Code Cell 9: ctc_loss_implementation ---

CTC Layer defined.
CRNN model builder with CTC defined.


In [44]:
# --- Start of Code Cell 9: ctc_loss_implementation ---
# CTC Loss Implementation

class CTCLayer(tf.keras.layers.Layer):
    """
    Custom Keras layer to compute the CTC loss.
    CTC loss is suitable for sequence-to-sequence problems where the alignment
    between input and output sequences is not known (e.g., OCR).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    # This method helps Keras infer the output shape of the layer.
    # Since `call` returns `y_pred`, its output shape is the same as `y_pred`'s input shape.
    # input_shape here is a list/tuple of shapes for all inputs to the call method:
    # (y_true_shape, y_pred_shape, input_length_shape, label_length_shape)
    # We want the shape of y_pred, which is input_shape[1].
    def compute_output_shape(self, input_shape):
        # Return the shape of the second input to the call method (y_pred's shape)
        # This is the most robust way to indicate that the layer passes through its second input's shape.
        return input_shape[1]

    def call(self, y_true, y_pred, input_length, label_length):
        """
        Computes the CTC loss and adds it to the model's total loss.

        Args:
            y_true (tf.Tensor): True labels (ground truth sequences).
                                Shape: (batch_size, max_label_length)
            y_pred (tf.Tensor): Predicted logits from the model (output of the RNN).
                                Shape: (batch_size, timesteps, num_classes)
            input_length (tf.Tensor): Lengths of the input sequences (width of image features).
                                      Shape: (batch_size, 1)
            label_length (tf.Tensor): Lengths of the true label sequences.
                                      Shape: (batch_size, 1)
        """
        # Ensure input_length and label_length are 1D tensors of type int32
        input_length = tf.squeeze(input_length, axis=-1)
        label_length = tf.squeeze(label_length, axis=-1)

        # Cast y_true to int32, as ctc_batch_cost expects integer labels.
        y_true = tf.cast(y_true, tf.int32)

        # Compute the CTC loss for the current batch
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)

        # Add the computed loss to the model's total loss.
        self.add_loss(loss)

        # Return y_pred as a placeholder.
        return y_pred

print("CTC Layer and model builder with CTC defined.")

# Re-define the model to include the CTCLayer for training
def build_crnn_model_with_ctc(input_shape, num_classes):
    # Input for images
    inputs = Input(shape=input_shape, name='image_input')

    # CNN feature extractor
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    cnn_output_width = input_shape[0] // (2 * 2 * 2)
    cnn_output_height = input_shape[1] // (2 * 2 * 2)
    num_filters_last_conv = 128

    x = Reshape((cnn_output_width, cnn_output_height * num_filters_last_conv))(x)

    x = Bidirectional(LSTM(256, return_sequences=True, dropout=0.25))(x)
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)

    # Output layer for character probabilities
    output_logits = Dense(num_classes, activation="softmax", name='output_logits')(x)

    # Inputs for CTC Loss calculation during training
    labels = Input(name='labels', shape=(None,), dtype='float32')
    input_length = Input(name='input_length', shape=(1,), dtype='int64')
    label_length = Input(name='label_length', shape=(1,), dtype='int64')

    # CTC Layer
    loss_out = CTCLayer(name='ctc_loss')(labels, output_logits, input_length, label_length)

    # Model for training
    train_model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)

    # Model for inference (without CTC layer)
    inference_model = Model(inputs=inputs, outputs=output_logits)

    return train_model, inference_model

print("CTC Layer and model builder with CTC defined.")

# --- End of Code Cell 9: ctc_loss_implementation ---

CTC Layer and model builder with CTC defined.
CTC Layer and model builder with CTC defined.


In [54]:
# --- Start of Code Cell 10: create_datasets ---
# Create tf.data.Dataset objects

# Split data (for real dataset, use proper train/val/test split)
# 'all_image_paths' and 'all_labels' are populated by Code Cell 4 (your synthetic data generation).
num_samples = len(all_image_paths)
train_split = int(0.8 * num_samples) # 80% for training
val_split = int(0.9 * num_samples)   # 10% for validation (from 80% to 90%)

# Divide the image paths and labels into training, validation, and test sets.
train_img_paths = all_image_paths[:train_split]
train_labels = all_labels[:train_split]

val_img_paths = all_image_paths[train_split:val_split]
val_labels = all_labels[train_split:val_split]

test_img_paths = all_image_paths[val_split:] # Remaining 10% for testing
test_labels = all_labels[val_split:]

# Define the batch size for training.
# BATCH_SIZE = 4 is used for the small dummy data; for real training, this would typically be larger (e.g., 32 or 64).
BATCH_SIZE = 4

def prepare_dataset(img_paths, labels):
    """
    Prepares a tf.data.Dataset from image paths and labels.

    Args:
        img_paths (list): List of file paths to the images.
        labels (list): List of corresponding text labels.

    Returns:
        tf.data.Dataset: A TensorFlow dataset ready for model consumption.
    """
    # 1. Create a dataset from slices of image paths and labels.
    # Each element in the dataset will initially be (img_path, label_string).
    dataset = tf.data.Dataset.from_tensor_slices((img_paths, labels))

    # 2. Map the preprocessing function to each element.
    # 'encode_single_sample' (from Code Cell 5) reads the image, decodes it,
    # resizes, transposes, and encodes the label into numerical format.
    # num_parallel_calls=tf.data.AUTOTUNE allows TensorFlow to optimize parallel processing.
    dataset = dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)

    # 3. Batch the dataset.
    # Combines consecutive elements into batches of BATCH_SIZE.
    dataset = dataset.batch(BATCH_SIZE)

    # 4. Add input_length and label_length for CTC Loss.
    # The CTCLayer (from Code Cell 9) requires these lengths.
    # x: image tensor (batch_size, width, height, channels)
    # y: label tensor (batch_size, max_label_length)
    # tf.shape(x)[1]: gets the width of the image features, which is the sequence length for CTC.
    # tf.shape(y)[1]: gets the length of the encoded label sequence.
    dataset = dataset.map(lambda x, y: (x, y, tf.shape(x)[1], tf.shape(y)[1]))

    # 5. Prefetch data.
    # Prefetching overlaps data preprocessing and model execution,
    # ensuring that the next batch of data is ready when the model finishes the current step.
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

# Create the training, validation, and test datasets using the prepare_dataset function.
train_dataset = prepare_dataset(train_img_paths, train_labels)
val_dataset = prepare_dataset(val_img_paths, val_labels)
test_dataset = prepare_dataset(test_img_paths, test_labels)

print("tf.data.Dataset objects created.")

# --- End of Code Cell 10: create_datasets ---

tf.data.Dataset objects created.


In [56]:
# --- Start of Code Cell 11: train_model ---
# Use CTC (Connectionist Temporal Classification) Loss, Adam optimizer, and Early Stopping

# Define the input shape for the CRNN model.
# IMAGE_WIDTH and IMAGE_HEIGHT are defined in Code Cell 5.
input_shape = (128, 21, 1) # CRNN expects (width, height, channels)

# Build the training and inference models using the function from Code Cell 9.
# This function returns two models: one for training (with CTCLayer) and one for inference (without CTCLayer).
train_model, inference_model = build_crnn_model_with_ctc(input_shape, num_classes)

# Print a summary of the training model's architecture.
# This is useful for reviewing the layers, output shapes, and parameter counts.
train_model.summary()

# Define the learning rate schedule.
# CosineDecay: A learning rate schedule that decays the learning rate following a cosine curve.
# This often helps models converge better than a fixed learning rate.
# initial_learning_rate: The starting learning rate.
# decay_steps: The number of steps over which the learning rate will decay.
lr_schedule = tf.keras.optimizers.schedules.CosineDecay(initial_learning_rate=0.0001, decay_steps=1000)

# Define the optimizer.
# Adam: A popular optimization algorithm known for its efficiency.
# learning_rate: Uses the defined cosine decay schedule.
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile the training model.
# Note: The loss is handled internally by the CTCLayer (from Code Cell 9).
# Therefore, we do NOT specify a 'loss' argument in model.compile.
# The `train_model`'s output (which is `loss_out` from CTCLayer) implicitly provides the loss.
# Metrics like 'accuracy' for CTC are complex and are typically handled in a custom evaluation step
# (as seen in Code Cell 12) rather than directly as a Keras metric during compilation.
train_model.compile(optimizer=optimizer)

# Define Early Stopping callback.
# EarlyStopping: A callback that monitors a specified metric (here, 'val_loss')
# and stops training if the metric stops improving for a certain number of epochs ('patience').
# monitor="val_loss": Monitors the validation loss.
# patience=5: Training will stop if validation loss does not improve for 5 consecutive epochs.
# restore_best_weights=True: After stopping, the model's weights will be reset to the epoch
# that had the best monitored value (lowest 'val_loss').
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

print("Starting model training...")

# Train the model.
# x=train_dataset: The dataset used for training.
# validation_data=val_dataset: The dataset used to evaluate validation loss and metrics.
# epochs=20: The maximum number of training iterations over the entire dataset.
# callbacks=[early_stopping]: The list of callbacks to apply during training.
# Ensure that Code Cell 10 defining train_dataset and val_dataset has been executed.
history = train_model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=20, # Adjust epochs based on dataset size and convergence
    callbacks=[early_stopping]
)

print("Model training complete.")

# --- End of Code Cell 11: train_model ---

ValueError: Exception encountered when calling CTCLayer.call().

[1mMethod `compute_output_shape()` of layer CTCLayer is returning a type that cannot be interpreted as a shape. It should return a shape tuple. Received: None[0m

Arguments received by CTCLayer.call():
  • args=('<KerasTensor shape=(None, None), dtype=float32, sparse=False, name=labels>', '<KerasTensor shape=(None, 16, 38), dtype=float32, sparse=False, name=keras_tensor_152>', '<KerasTensor shape=(None, 1), dtype=int64, sparse=False, name=input_length>', '<KerasTensor shape=(None, 1), dtype=int64, sparse=False, name=label_length>')
  • kwargs=<class 'inspect._empty'>

In [18]:
# --- Start of Code Cell 12: evaluate_and_plot ---
# Compare results with other OCR solutions & Plot accuracy trends

# Function to decode CTC output (for evaluation)
def decode_batch_predictions(pred, num_to_char_layer):
    """
    Decodes the raw predictions (logits) from the CRNN model into readable text.
    Uses CTC greedy decoding.

    Args:
        pred (tf.Tensor): The raw output probabilities from the inference model.
                          Shape: (batch_size, timesteps, num_classes)
        num_to_char_layer (tf.keras.layers.StringLookup): The layer to convert
                                                          numerical IDs back to characters.

    Returns:
        list: A list of decoded text strings for the batch.
    """
    # Create an array of input lengths, which is the number of timesteps (width)
    # for each prediction in the batch.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]

    # Use greedy search. For a production system, beam search might be better.
    # tf.keras.backend.ctc_decode performs the decoding.
    # greedy=True: Selects the most probable character at each timestep.
    # The output is a tuple; we take the first element (decoded sequences)
    # and then the first item from that (the actual sequences).
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]

    # Iterate over the decoded numerical results and convert them back to text.
    output_texts = []
    for res in results.numpy():
        # Remove padding tokens (typically -1 from ctc_decode)
        res = res[res != -1]
        # Join the numerical character IDs into a string using num_to_char_layer
        # and decode from bytes to utf-8.
        output_texts.append(tf.strings.reduce_join(num_to_char_layer(res)).numpy().decode('utf-8'))
    return output_texts

# Evaluate on test data (requires a custom evaluation loop for CTC accuracy)
print("Evaluating model on test dataset...")
correct_predictions = 0
total_samples = 0

# Iterate through each batch in the test_dataset.
for batch in test_dataset:
    # Unpack the batch: images, true labels (numerical), input lengths, label lengths.
    images, labels, input_length, label_length = batch

    # Get predictions from the inference model (without CTC loss layer).
    preds = inference_model.predict(images)
    # Decode these raw predictions into human-readable text.
    decoded_preds = decode_batch_predictions(preds, num_to_char)

    # Decode the true labels (numerical) into human-readable text for comparison.
    true_labels = []
    for label_seq in labels.numpy():
        label_seq = label_seq[label_seq != -1] # Remove padding
        true_labels.append(tf.strings.reduce_join(num_to_char(label_seq)).numpy().decode('utf-8'))

    # Compare decoded predictions with true labels.
    for i in range(len(decoded_preds)):
        total_samples += 1
        # Check if the predicted text matches the true text (case-insensitive).
        if decoded_preds[i].lower() == true_labels[i].lower():
            correct_predictions += 1
        # Print the true and predicted labels for visual inspection.
        print(f"True: {true_labels[i]}, Predicted: {decoded_preds[i]}")

# Calculate the overall test accuracy.
test_accuracy = correct_predictions / total_samples if total_samples > 0 else 0
print(f"\nTest Accuracy: {test_accuracy:.2f}") # Print accuracy formatted to two decimal places

# Plot accuracy trends from history (loss is typically monitored for CTC)
# Check if the 'history' object from model.fit is available and contains loss data.
if history is not None and 'loss' in history.history:
    plt.figure(figsize=(12, 6)) # Create a figure for the plots

    # Plot Training and Validation Loss
    plt.subplot(1, 2, 1) # 1 row, 2 columns, first plot
    plt.plot(history.history['loss'], label='Train Loss')
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.legend()
    plt.title('CRNN OCR Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    # Note: 'accuracy' metric for CTC is complex and often requires custom implementation.
    # For this basic setup, we're relying on the manual test accuracy calculation above.
    # The commented-out section below shows how you *would* plot accuracy if it were
    # a directly logged metric in history.
    # plt.subplot(1, 2, 2) # 1 row, 2 columns, second plot
    # plt.

Evaluating model on test dataset...


NameError: name 'test_dataset' is not defined

In [None]:
#Extract Labels for OCR Training:
def extract_labels(image_path):
    label = os.path.basename(image_path).split("_")[1]  # Assuming filenames follow 'img_label_id.jpg'
    return label


In [None]:
#OCR models improve accuracy with augmentation to simulate real-world distortions.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.05),
    tf.keras.layers.RandomBrightness(0.03),
    tf.keras.layers.RandomContrast(0.03),
])


In [None]:
#CRNN combines CNN for feature extraction and RNN for sequential text decoding.
def build_crnn_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)

    # CNN feature extractor
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    x = Flatten()(x)

    # RNN layers for sequence modeling
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.3)(x)

    outputs = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs, outputs)
    return model


In [None]:
#use CTC (Connectionist Temporal Classification) Loss, Adam optimizer, and Early Stopping
num_classes = 37  # 26 letters + 10 digits + space
model = build_crnn_model((128, 32, 1), num_classes)

optimizer = Adam(learning_rate=0.0001)
model.compile(loss=tfa.losses.CTCLoss(), optimizer=optimizer, metrics=["accuracy"])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = model.fit(train_dataset, validation_data=val_dataset, epochs=20, callbacks=[early_stopping])


NameError: name 'Input' is not defined

In [None]:
#CRNN combines CNN for feature extraction and RNN for sequential text decoding.
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.models import Model

def build_crnn_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)

    # CNN feature extractor
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    x = Flatten()(x)

    # RNN layers for sequence modeling
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.3)(x)

    outputs = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs, outputs)
    return model

In [None]:
#Compare results with other OCR solutions.
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Plot accuracy trends
plt.plot(history.history["accuracy"], label="Train Accuracy")
plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
plt.legend()
plt.title("CRNN OCR Model Accuracy")
plt.show()


NameError: name 'model' is not defined