# Text Recognition System for Distorted Images

**Purpose:** Building a robust text recognition system using deep learning  
**Framework:** TensorFlow/Keras  
**Approach:** CNN-RNN hybrid architecture with CTC loss  

This notebook demonstrates how to build a text recognition system that can handle distorted or challenging text images using a combination of convolutional and recurrent neural networks.

## Environment Configuration

In [None]:
import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import tensorflow as tf
import keras
from keras import ops
from keras import layers
from keras import models

# Print the TensorFlow and Keras versions and the GPU devices TensorFlow lists.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

## Data Acquisition and Preparation

We'll work with a dataset of text images. Each image contains a sequence of characters that we need to recognize; the expected text for each image is taken from its file name.

In [None]:
# Download sample dataset
!curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
!unzip -qq captcha_images_v2.zip

In [None]:
class DatasetConfig:
    """Dataset and preprocessing parameters for the text-image pipeline.

    Every parameter defaults to the value this notebook was written for, so
    ``DatasetConfig()`` behaves as before; pass keyword arguments to point the
    pipeline at another directory or image size.

    Args:
        data_path: Directory that holds the ``*.png`` samples.
        batch_size: Number of samples per batch.
        image_width: Width in pixels that images are resized to.
        image_height: Height in pixels that images are resized to.
        pool_factor: Downsampling factor attributed to the pooling layers.
        train_split: Fraction of the samples used for training.
        validation_split: Fraction of the samples held out for validation
            (stored as given; it is not derived from ``train_split``).

    Raises:
        ValueError: If a size parameter is not a positive number.
    """

    def __init__(
        self,
        data_path="./captcha_images_v2/",
        batch_size=16,
        image_width=200,
        image_height=50,
        pool_factor=4,
        train_split=0.9,
        validation_split=0.1,
    ):
        sizes = {
            "batch_size": batch_size,
            "image_width": image_width,
            "image_height": image_height,
            "pool_factor": pool_factor,
        }
        for size_name, size_value in sizes.items():
            if size_value <= 0:
                raise ValueError(f"{size_name} must be positive, got {size_value!r}")

        self.data_path = Path(data_path)
        self.batch_size = batch_size
        self.image_width = image_width
        self.image_height = image_height
        self.pool_factor = pool_factor  # Downsampling through pooling layers
        self.train_split = train_split
        self.validation_split = validation_split

config = DatasetConfig()

# Load and analyze dataset
# Sorting the path strings keeps the sample order stable between runs.
image_files = sorted(str(path) for path in config.data_path.glob("*.png"))
if not image_files:
    # Fail here with a clear message instead of later on an empty max()/min().
    raise FileNotFoundError(
        f"No .png files found in {config.data_path.resolve()}; "
        "download and unzip the dataset first."
    )

# The label of each sample is its file name without the .png extension.
text_labels = [Path(image_file).stem for image_file in image_files]

# Extract unique character set (sorted list of distinct label characters)
unique_chars = sorted({char for text in text_labels for char in text})

# Dataset statistics
label_lengths = [len(text) for text in text_labels]
print("Dataset Statistics:")
print("==================")
print(f"Total samples: {len(image_files)}")
print(f"Unique characters: {len(unique_chars)}")
print(f"Character set: {unique_chars}")
print(f"Max sequence length: {max(label_lengths)}")
print(f"Min sequence length: {min(label_lengths)}")

## Character Encoding System

In [None]:
class CharacterEncoder:
    """Two-way mapping between characters and integer ids.

    Wraps a forward ``StringLookup`` layer and an inverted one built over the
    same vocabulary.
    """

    def __init__(self, characters):
        forward_lookup = layers.StringLookup(
            vocabulary=list(characters),
            mask_token=None,
        )
        vocabulary = forward_lookup.get_vocabulary()

        self.char_to_int = forward_lookup
        self.int_to_char = layers.StringLookup(
            vocabulary=vocabulary,
            mask_token=None,
            invert=True,
        )
        # One more than the lookup vocabulary size.
        # NOTE(review): the extra slot is presumably the CTC blank class - confirm.
        self.vocab_size = len(vocabulary) + 1

    def encode(self, text):
        """Split ``text`` into UTF-8 characters and map them to integer ids."""
        characters = tf.strings.unicode_split(text, input_encoding="UTF-8")
        return self.char_to_int(characters)

    def decode(self, integers):
        """Map integer ids back to characters and join them into one string tensor."""
        characters = self.int_to_char(integers)
        return tf.strings.reduce_join(characters)

# Encoder built over the dataset's character set
encoder = CharacterEncoder(unique_chars)
# Length of the longest label in the dataset
max_text_len = max(map(len, text_labels))

## Data Pipeline Creation

In [None]:
def create_train_val_split(images, labels, split_ratio=0.9, shuffle=True, seed=None):
    """Split paired samples into training and validation subsets.

    The index arrays are plain NumPy, so the NumPy inputs are never indexed
    with backend tensors, and the shuffle can be reproduced through ``seed``.

    Args:
        images: Array-like of samples (here: image file paths).
        labels: Array-like of labels, aligned with ``images``.
        split_ratio: Fraction of the samples placed in the training subset;
            the first ``int(len(images) * split_ratio)`` entries of the
            (optionally shuffled) order are used for training.
        shuffle: Whether to permute the samples before splitting.
        seed: Optional seed for the NumPy random generator; ``None`` gives a
            different permutation on every call.

    Returns:
        Tuple ``(train_images, val_images, train_labels, val_labels)`` of
        NumPy arrays.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """
    images = np.asarray(images)
    labels = np.asarray(labels)

    dataset_size = len(images)
    if len(labels) != dataset_size:
        raise ValueError(
            f"images and labels must have the same length, "
            f"got {dataset_size} and {len(labels)}"
        )

    if shuffle:
        indices = np.random.default_rng(seed).permutation(dataset_size)
    else:
        indices = np.arange(dataset_size)

    split_point = int(dataset_size * split_ratio)
    train_idx = indices[:split_point]
    val_idx = indices[split_point:]

    return images[train_idx], images[val_idx], labels[train_idx], labels[val_idx]

# Create splits
# The path and label lists are converted to NumPy arrays before splitting;
# the training fraction comes from the dataset configuration.
train_imgs, val_imgs, train_lbls, val_lbls = create_train_val_split(
    np.array(image_files), 
    np.array(text_labels),
    split_ratio=config.train_split
)

# Report how many samples ended up in each subset.
print(f"Training samples: {len(train_imgs)}")
print(f"Validation samples: {len(val_imgs)}")

In [None]:
def preprocess_image(image_path, label):
    """Load one image file and encode its label.

    The PNG is decoded as a single-channel image, converted to float32,
    resized to ``config.image_height`` x ``config.image_width`` and transposed
    so that the width axis comes first. The label is encoded with
    ``encoder.encode``.

    Returns:
        Dict with the keys ``"image"`` and ``"label"``.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.io.decode_png(raw_bytes, channels=1)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)

    target_size = [config.image_height, config.image_width]
    pixels = ops.image.resize(pixels, target_size)

    # Swap height and width: the width axis becomes the sequence (time) axis
    pixels = ops.transpose(pixels, axes=[1, 0, 2])

    return {"image": pixels, "label": encoder.encode(label)}

def _build_dataset(paths, labels):
    """Build the preprocessed, batched and prefetched dataset for one split."""
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(config.batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)


# Create TensorFlow datasets (same pipeline for both splits)
train_data = _build_dataset(train_imgs, train_lbls)
val_data = _build_dataset(val_imgs, val_lbls)

## Data Visualization

In [None]:
def visualize_samples(dataset, num_samples=16):
    """Show images from the first batch of ``dataset`` with their labels.

    Args:
        dataset: Dataset yielding dicts with ``"image"`` and ``"label"``
            entries, as produced by the data pipeline.
        num_samples: Maximum number of images to draw. The count is capped by
            the batch size and by the 16 cells of the 4x4 grid, so a larger
            value can no longer index past the grid.
    """
    fig, axes = plt.subplots(4, 4, figsize=(12, 8))
    axes = axes.flatten()

    shown = 0
    for batch in dataset.take(1):
        images = batch["image"]
        labels = batch["label"]

        shown = max(0, min(num_samples, len(images), len(axes)))
        for idx in range(shown):
            img_array = (images[idx] * 255).numpy().astype("uint8")
            label_text = encoder.decode(labels[idx]).numpy().decode("utf-8")

            # Images are stored width-first; transpose back for display
            axes[idx].imshow(img_array[:, :, 0].T, cmap='gray')
            axes[idx].set_title(f"Text: {label_text}", fontsize=10)
            axes[idx].axis('off')

    # Blank out the cells that received no image
    for unused_ax in axes[shown:]:
        unused_ax.axis('off')

    plt.suptitle("Sample Images from Dataset", fontsize=14)
    plt.tight_layout()
    plt.show()

visualize_samples(train_data)

## Custom Loss Implementation

We implement Connectionist Temporal Classification (CTC) loss for sequence recognition.

In [None]:
def compute_ctc_loss(y_true, y_pred, input_length, label_length):
    """Compute the per-sample CTC loss for one batch.

    Args:
        y_true: Dense label tensor, converted to sparse form with
            ``dense_to_sparse``.
        y_pred: Class probabilities; assumed to be (batch, time, classes),
            since the first two axes are swapped before the loss call.
        input_length: Prediction lengths with a trailing axis of size 1.
        label_length: Label lengths with a trailing axis of size 1.

    Returns:
        The loss values with an extra axis of size 1 appended at position 1.
    """
    # Drop the trailing axis of both length tensors and make them int32
    label_steps = ops.squeeze(label_length, axis=-1)
    label_steps = ops.cast(label_steps, dtype="int32")
    frame_counts = ops.squeeze(input_length, axis=-1)
    frame_counts = ops.cast(frame_counts, dtype="int32")

    targets = dense_to_sparse(y_true, label_steps)

    # Swap the first two axes, then take the log; epsilon keeps log() finite
    time_major = ops.transpose(y_pred, axes=[1, 0, 2])
    log_probs = ops.log(time_major + keras.backend.epsilon())

    per_sample = tf.compat.v1.nn.ctc_loss(
        inputs=log_probs,
        labels=targets,
        sequence_length=frame_counts,
    )

    return ops.expand_dims(per_sample, 1)

def dense_to_sparse(labels, label_lengths):
    """Convert a dense label tensor to the sparse format expected by CTC.

    The index set is built with tensor ops instead of Python ``range()``
    loops, so the function also works when the batch size and the label
    lengths are only known at run time (for example while Keras traces the
    model graph), where they are tensors rather than Python integers.

    Args:
        labels: Dense 2-D label tensor, one row per sample.
        label_lengths: Number of valid leading entries in each row.

    Returns:
        ``tf.SparseTensor`` with int64 indices in row-major order, int32
        values and the dense shape of ``labels``.
    """
    labels = tf.convert_to_tensor(labels)
    # Flatten so both (batch,) and (batch, 1) length tensors are accepted
    label_lengths = tf.cast(tf.reshape(label_lengths, [-1]), tf.int32)

    # True for every position that lies inside its row's valid length
    valid_positions = tf.sequence_mask(label_lengths, maxlen=tf.shape(labels)[1])

    # tf.where returns the coordinates of the True entries as int64
    indices = tf.where(valid_positions)
    values = tf.cast(tf.gather_nd(labels, indices), tf.int32)
    shape = tf.shape(labels, out_type=tf.int64)

    return tf.SparseTensor(indices, values, shape)

In [None]:
class CTCLossLayer(layers.Layer):
    """Layer that adds the CTC loss to the model and passes predictions through.

    Args:
        name: Layer name.
        **kwargs: Standard base-layer arguments (such as ``trainable`` or
            ``dtype``). Accepting them lets Keras rebuild the layer from a
            saved config, which includes those keys.
    """

    def __init__(self, name="ctc_loss_layer", **kwargs):
        super().__init__(name=name, **kwargs)

    def call(self, y_true, y_pred):
        """Register the CTC loss for the batch and return ``y_pred`` unchanged.

        Args:
            y_true: Dense label batch; axis 1 is used as the label length.
            y_pred: Model predictions; axis 1 is used as the number of time steps.
        """
        batch_size = ops.shape(y_true)[0]
        max_time = ops.shape(y_pred)[1]
        label_length = ops.shape(y_true)[1]

        # Every sample gets the same lengths: the full prediction length and
        # the full width of the label tensor, each shaped (batch, 1)
        input_length = max_time * ops.ones(shape=(batch_size, 1), dtype="int32")
        label_length = label_length * ops.ones(shape=(batch_size, 1), dtype="int32")

        # Compute and add loss
        loss = compute_ctc_loss(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred

## Model Architecture

We design a hybrid CNN-RNN architecture:
- CNN layers extract visual features
- RNN layers process the sequence
- CTC loss handles alignment

In [None]:
def create_recognition_model():
    """Assemble and compile the CNN + bidirectional-LSTM recognition network.

    Three Conv2D/BatchNormalization stages (the first two followed by 2x2 max
    pooling) feed a reshape into a sequence, a dense projection with dropout,
    two bidirectional LSTM layers and a softmax over ``encoder.vocab_size``
    classes. A ``CTCLossLayer`` attaches the training loss, so the model is
    compiled with an optimizer only.

    Returns:
        The compiled Keras model with the inputs ``"image"`` and ``"label"``.
    """
    image_input = layers.Input(
        shape=(config.image_width, config.image_height, 1),
        name="image",
        dtype="float32",
    )
    label_input = layers.Input(name="label", shape=(None,), dtype="float32")

    # (conv name, batch-norm name, pool name or None, filters) per stage
    conv_stages = (
        ("conv_block1", "bn1", "pool1", 32),
        ("conv_block2", "bn2", "pool2", 64),
        ("conv_block3", "bn3", None, 128),
    )
    features = image_input
    for conv_name, bn_name, pool_name, n_filters in conv_stages:
        features = layers.Conv2D(
            filters=n_filters,
            kernel_size=(3, 3),
            activation="relu",
            kernel_initializer="he_normal",
            padding="same",
            name=conv_name,
        )(features)
        features = layers.BatchNormalization(name=bn_name)(features)
        if pool_name is not None:
            features = layers.MaxPooling2D(pool_size=(2, 2), name=pool_name)(features)

    # Flatten the pooled height and the 128 channels into one feature vector
    # per step along the pooled width axis
    sequence_length = config.image_width // config.pool_factor
    features_per_step = (config.image_height // config.pool_factor) * 128
    features = layers.Reshape(
        target_shape=(sequence_length, features_per_step),
        name="reshape_features",
    )(features)

    features = layers.Dense(128, activation="relu", name="feature_dense")(features)
    features = layers.Dropout(0.3)(features)

    # Two stacked bidirectional LSTMs, both returning full sequences
    for lstm_name, lstm_units in (("bi_lstm1", 256), ("bi_lstm2", 128)):
        features = layers.Bidirectional(
            layers.LSTM(lstm_units, return_sequences=True, dropout=0.2),
            name=lstm_name,
        )(features)

    char_probs = layers.Dense(
        encoder.vocab_size,
        activation="softmax",
        name="character_output",
    )(features)

    # The loss layer registers the CTC loss and passes the predictions through
    loss_output = CTCLossLayer()(label_input, char_probs)

    model = keras.models.Model(
        inputs=[image_input, label_input],
        outputs=loss_output,
        name="text_recognition_model",
    )

    # Adam with an initial learning rate of 0.001; no loss argument is needed
    # because the loss is added inside the model
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001))

    return model

# Build model
recognition_model = create_recognition_model()
recognition_model.summary()

## Training Configuration

In [None]:
class TrainingConfig:
    """Hyperparameters and checkpoint path for the training run."""

    def __init__(self):
        settings = {
            "epochs": 50,
            "patience": 10,
            "reduce_lr_patience": 5,
            "checkpoint_path": "best_model.keras",
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

train_config = TrainingConfig()

# Define callbacks (all three watch the validation loss)
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=train_config.patience,
    restore_best_weights=True,
    verbose=1,
)
lr_reducer = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=train_config.reduce_lr_patience,
    min_lr=1e-6,
    verbose=1,
)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=train_config.checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)
callbacks = [early_stopping, lr_reducer, checkpointer]

In [None]:
# Train the model
# Fits on the training dataset, validates on the validation dataset after
# each epoch and applies the callbacks defined above; the returned history
# object is kept for plotting.
training_history = recognition_model.fit(
    train_data,
    validation_data=val_data,
    epochs=train_config.epochs,
    callbacks=callbacks,
    verbose=1
)

## Training Visualization

In [None]:
def plot_training_history(history):
    """Plot the loss curves and, when recorded, the learning rate.

    Args:
        history: Object with a ``history`` dict of per-epoch metric lists, as
            returned by ``model.fit``.
    """
    metrics = history.history
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    loss_ax, lr_ax = axes

    # Plot loss; the validation curve is drawn only if it was recorded
    loss_ax.plot(metrics['loss'], label='Training Loss', linewidth=2)
    if 'val_loss' in metrics:
        loss_ax.plot(metrics['val_loss'], label='Validation Loss', linewidth=2)
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Model Loss Over Time')
    loss_ax.legend()
    loss_ax.grid(True, alpha=0.3)

    # Keras 3 logs the rate as 'learning_rate'; older releases used 'lr'
    lr_key = next((key for key in ('learning_rate', 'lr') if key in metrics), None)
    if lr_key is not None:
        lr_ax.plot(metrics[lr_key], label='Learning Rate', linewidth=2, color='green')
        lr_ax.set_xlabel('Epoch')
        lr_ax.set_ylabel('Learning Rate')
        lr_ax.set_title('Learning Rate Schedule')
        lr_ax.legend()
        lr_ax.grid(True, alpha=0.3)
    else:
        # Nothing to show: hide the empty panel instead of drawing blank axes
        lr_ax.axis('off')

    plt.tight_layout()
    plt.show()

plot_training_history(training_history)

## Inference Pipeline

In [None]:
def decode_ctc_output(y_pred, input_length, use_beam_search=False, beam_width=100):
    """Decode CTC output probabilities into label sequences.

    Args:
        y_pred: Class probabilities; assumed to be (batch, time, classes),
            since the first two axes are swapped before decoding.
        input_length: Number of valid time steps per sample.
        use_beam_search: Use beam search instead of greedy decoding.
        beam_width: Beam width for beam search.

    Returns:
        Tuple ``(decoded_dense, log_prob)``: the best path as a dense tensor
        shaped (batch, time) and padded with ``-1``, and the decoder's
        log-probability output.
    """
    batch_size = ops.shape(y_pred)[0]
    max_length = ops.shape(y_pred)[1]

    # Swap the first two axes and take the log; epsilon keeps log() finite
    y_pred = ops.log(ops.transpose(y_pred, axes=[1, 0, 2]) + keras.backend.epsilon())
    input_length = ops.cast(input_length, dtype="int32")

    if use_beam_search:
        # merge_repeated=False: the v1 default (True) also collapses repeated
        # labels in the emitted sequence ("A B B * B" would come back as
        # "A B"), which would drop genuine double characters.
        decoded, log_prob = tf.compat.v1.nn.ctc_beam_search_decoder(
            inputs=y_pred,
            sequence_length=input_length,
            beam_width=beam_width,
            top_paths=1,
            merge_repeated=False
        )
    else:
        decoded, log_prob = tf.nn.ctc_greedy_decoder(
            inputs=y_pred,
            sequence_length=input_length
        )

    # Convert sparse to dense, widening the shape to (batch, time)
    sparse_tensor = decoded[0]
    dense_tensor = tf.SparseTensor(
        sparse_tensor.indices,
        sparse_tensor.values,
        (batch_size, max_length)
    )
    decoded_dense = tf.sparse.to_dense(sp_input=dense_tensor, default_value=-1)

    return decoded_dense, log_prob

# Create prediction model (without CTC loss layer)
# It maps the first input of the training model to the output of the
# "character_output" layer, so no label input is needed.
prediction_model = keras.models.Model(
    recognition_model.input[0], 
    recognition_model.get_layer(name="character_output").output
)

def predict_text(predictions):
    """Convert a batch of model predictions to a list of strings.

    Args:
        predictions: Array of class probabilities; axis 0 is the batch and
            axis 1 is used as the sequence length of every sample.

    Returns:
        One decoded Python string per sample.
    """
    num_samples, num_steps = predictions.shape[0], predictions.shape[1]
    # Every sample uses the full prediction length
    input_lengths = np.full(num_samples, num_steps, dtype="int32")

    # Decode predictions
    decoded_outputs, _ = decode_ctc_output(
        predictions,
        input_length=input_lengths,
        use_beam_search=False
    )

    # Convert to text
    predicted_texts = []
    for sequence in decoded_outputs[:, :max_text_len]:
        # -1 is the padding written by decode_ctc_output, not a character id;
        # drop it so short predictions are not decoded with spurious
        # out-of-vocabulary tokens.
        sequence = tf.boolean_mask(sequence, tf.not_equal(sequence, -1))
        text = encoder.decode(sequence).numpy().decode("utf-8")
        predicted_texts.append(text)

    return predicted_texts

## Model Evaluation

In [None]:
def evaluate_model(dataset, num_batches=1):
    """Plot predictions against ground truth for up to 16 samples.

    Args:
        dataset: Dataset yielding dicts with ``"image"`` and ``"label"``.
        num_batches: Maximum number of batches to draw samples from.
            Prediction stops as soon as the 4x4 grid is full.
    """
    fig, axes = plt.subplots(4, 4, figsize=(15, 10))
    axes = axes.flatten()
    grid_size = len(axes)

    sample_idx = 0

    for batch in dataset.take(num_batches):
        if sample_idx >= grid_size:
            # Grid is full: skip the remaining batches
            break

        batch_images = batch["image"]
        batch_labels = batch["label"]

        # Get predictions
        predictions = prediction_model.predict(batch_images, verbose=0)
        predicted_texts = predict_text(predictions)

        # Get ground truth
        true_texts = [
            encoder.decode(label).numpy().decode("utf-8")
            for label in batch_labels
        ]

        # Visualize results, limited to the cells still free in the grid
        free_cells = grid_size - sample_idx
        for i in range(min(len(predicted_texts), free_cells)):
            img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
            # Images are stored width-first; transpose back for display
            img = img.T

            # Color code based on accuracy
            color = 'green' if predicted_texts[i] == true_texts[i] else 'red'

            axes[sample_idx].imshow(img, cmap='gray')
            axes[sample_idx].set_title(
                f"True: {true_texts[i]}\nPred: {predicted_texts[i]}",
                fontsize=9,
                color=color
            )
            axes[sample_idx].axis('off')
            sample_idx += 1

    # Blank out the cells that received no image
    for unused_ax in axes[sample_idx:]:
        unused_ax.axis('off')

    plt.suptitle("Model Predictions (Green=Correct, Red=Incorrect)", fontsize=14)
    plt.tight_layout()
    plt.show()

# Evaluate on validation set
print("Evaluating on validation set...")
evaluate_model(val_data)

## Performance Metrics

In [None]:
def calculate_accuracy(dataset, max_batches=None):
    """Calculate character and sequence accuracy over ``dataset``.

    Character accuracy compares each predicted text with its ground truth
    position by position. The denominator for a pair is the longer of the two
    lengths, so missing or extra characters count as errors.

    Args:
        dataset: Dataset yielding dicts with ``"image"`` and ``"label"``.
        max_batches: Maximum number of batches to evaluate. ``None`` evaluates
            the whole dataset; ``0`` evaluates nothing (it is no longer
            treated as "no limit").

    Returns:
        Dict with ``sequence_accuracy``, ``character_accuracy``,
        ``total_sequences`` and ``correct_sequences``. Both accuracies are 0
        when nothing was evaluated.
    """
    total_sequences = 0
    correct_sequences = 0
    total_chars = 0
    correct_chars = 0

    for batch_count, batch in enumerate(dataset):
        if max_batches is not None and batch_count >= max_batches:
            break

        batch_images = batch["image"]
        batch_labels = batch["label"]

        # Get predictions
        predictions = prediction_model.predict(batch_images, verbose=0)
        predicted_texts = predict_text(predictions)

        # Compare against ground truth
        for label, pred_text in zip(batch_labels, predicted_texts):
            true_text = encoder.decode(label).numpy().decode("utf-8")

            # Sequence accuracy
            total_sequences += 1
            if pred_text == true_text:
                correct_sequences += 1

            # Character accuracy: matches on the overlap, longer length as total
            correct_chars += sum(
                true_char == pred_char
                for true_char, pred_char in zip(true_text, pred_text)
            )
            total_chars += max(len(true_text), len(pred_text))

    seq_accuracy = correct_sequences / total_sequences if total_sequences > 0 else 0
    char_accuracy = correct_chars / total_chars if total_chars > 0 else 0

    return {
        'sequence_accuracy': seq_accuracy,
        'character_accuracy': char_accuracy,
        'total_sequences': total_sequences,
        'correct_sequences': correct_sequences
    }

# Calculate metrics: ten batches of the training set, all of the validation set
print("Calculating performance metrics...")
train_metrics = calculate_accuracy(train_data, max_batches=10)
val_metrics = calculate_accuracy(val_data)

# Same two lines for each split, followed by the validation sequence counts
for heading, split_metrics in (
    ("\nTraining Set Performance:", train_metrics),
    ("\nValidation Set Performance:", val_metrics),
):
    print(heading)
    print(f"  Sequence Accuracy: {split_metrics['sequence_accuracy']:.2%}")
    print(f"  Character Accuracy: {split_metrics['character_accuracy']:.2%}")
print(f"  Correct Sequences: {val_metrics['correct_sequences']}/{val_metrics['total_sequences']}")

## Model Export and Deployment

In [None]:
# Save the model (the prediction model, i.e. without the loss layer)
model_save_path = "text_recognition_model.keras"
prediction_model.save(model_save_path)
print(f"Model saved to: {model_save_path}")

# Save configuration
import json

# Values needed to preprocess inputs and decode outputs at inference time
config_dict = dict(
    image_width=config.image_width,
    image_height=config.image_height,
    vocab_size=encoder.vocab_size,
    characters=unique_chars,
    max_text_length=max_text_len,
)

with open('model_config.json', 'w') as f:
    f.write(json.dumps(config_dict, indent=2))
print("Configuration saved to: model_config.json")

## Inference Example

In [None]:
def predict_single_image(image_path):
    """Predict the text shown in one image file.

    The image goes through the same steps as the training pipeline (decode as
    single-channel PNG, convert to float32, resize, transpose) and is then
    wrapped in a batch of one for the prediction model.

    Returns:
        The predicted text as a Python string.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.io.decode_png(raw_bytes, channels=1)
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)

    target_size = [config.image_height, config.image_width]
    pixels = ops.image.resize(pixels, target_size)
    pixels = ops.transpose(pixels, axes=[1, 0, 2])

    # The model expects a batch, so add a leading axis of size 1
    single_batch = ops.expand_dims(pixels, axis=0)

    probabilities = prediction_model.predict(single_batch, verbose=0)
    return predict_text(probabilities)[0]

# Test on a sample image: the first file and its label
sample_image_path = image_files[0]
actual = text_labels[0]
predicted = predict_single_image(sample_image_path)

print(f"Sample Image: {sample_image_path}")
print(f"Actual Text: {actual}")
print(f"Predicted Text: {predicted}")
print(f"Match: {'✓' if predicted == actual else '✗'}")