In [None]:
!pip install opendatasets -q
!pip install -q keras-cv

In [None]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import opendatasets as od
import keras_cv

In [None]:
# Fetch the OCR synthetic dataset from Kaggle via opendatasets.
# NOTE(review): presumably this writes to ./ocr-synthetic-dataset and may prompt
# for Kaggle credentials on first use — confirm.
od.download("https://www.kaggle.com/datasets/allahhitler/ocr-synthetic-dataset")

In [None]:
# The dataset folder is resolved against the current working directory instead
# of a hard-coded "/content/..." path, so the notebook also runs outside Colab
# (inside Colab the working directory is /content, giving the same location).
DATA_ROOT = os.path.abspath("ocr-synthetic-dataset")
IMG_DIR = os.path.join(DATA_ROOT, "images")         # directory holding the image files
LABEL_FILE = os.path.join(DATA_ROOT, "labels.txt")  # text file read by parse_labels

In [None]:
# Quick sanity check: number of files in the image directory ...
print("Images:", len(os.listdir(IMG_DIR)))

# ... and the first few label lines. zip() stops at whichever runs out first,
# so a label file with fewer than 5 lines no longer raises StopIteration.
with open(LABEL_FILE, encoding="utf-8") as f:
    for _, line in zip(range(5), f):
        print(line.strip())

In [None]:
# Target size every image is resized to by load_image, and the input shape
# that build_crnn declares for the network.
IMG_HEIGHT = 32
IMG_WIDTH = 256
# Number of samples per padded batch in make_dataset.
BATCH_SIZE = 32
# Number of epochs passed to model.fit.
EPOCHS = 20

In [None]:
# Vocabulary

CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

# Forward lookup: character -> integer id. With mask_token=None and the default
# single OOV bucket, index 0 is the out-of-vocabulary slot and the characters
# in CHARS occupy ids 1..len(CHARS). The default oov_token is kept on purpose:
# passing oov_token=None does not remove that slot, it only places a None entry
# in the vocabulary, which get_vocabulary() may not be able to render as text.
char_to_num = tf.keras.layers.StringLookup(vocabulary=CHARS, mask_token=None)

# Inverse lookup built from the same character list, so id k maps back to the
# character that char_to_num encodes as k (id 0 decodes to the OOV token).
num_to_char = tf.keras.layers.StringLookup(vocabulary=CHARS, mask_token=None, invert=True)

# OOV slot + characters, plus one extra class used as the CTC blank.
NUM_CLASSES = len(char_to_num.get_vocabulary()) + 1

In [None]:
# Image & Label Processing

def load_image(path):
    """Read an image file and return it as a normalised float32 tensor.

    The file is decoded as a 3-channel image, resized to
    (IMG_HEIGHT, IMG_WIDTH) and divided by 255.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (IMG_HEIGHT, IMG_WIDTH))
    return tf.cast(resized, tf.float32) / 255.0

def encode_label(text):
    """Split a UTF-8 string into characters and map each through char_to_num."""
    return char_to_num(tf.strings.unicode_split(text, "UTF-8"))

# Consolidated Dataset Function
def make_dataset(samples, training=True):
    """Build a batched tf.data pipeline from (image_path, label_text) pairs.

    Args:
        samples: sequence of (path, text) tuples.
        training: when True, samples are shuffled and photometric plus
            rotation augmentations are applied.

    Returns:
        A prefetched dataset yielding dicts with keys "image"
        (batch, IMG_HEIGHT, IMG_WIDTH, 3) and "label" (batch, max_len),
        where labels are padded to the longest label in the batch.
    """
    # Explicit string dtype keeps an empty sample list from becoming float32.
    paths = tf.constant([s[0] for s in samples], dtype=tf.string)
    labels = tf.constant([s[1] for s in samples], dtype=tf.string)
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    if training:
        # Shuffle individual samples *before* decoding and batching: the buffer
        # then holds only short strings, and every epoch sees differently
        # composed batches. Shuffling after padded_batch would merely reorder
        # fixed batches while buffering whole decoded image batches in memory.
        ds = ds.shuffle(max(len(samples), 1), reshuffle_each_iteration=True)

    def process(p, l):
        img = load_image(p)
        if training:
            img = tf.image.random_brightness(img, 0.2)
            img = tf.image.random_contrast(img, 0.8, 1.2)
            # The jitter above can push pixels outside the [0, 1] range
            # produced by load_image; clamp back into it.
            img = tf.clip_by_value(img, 0.0, 1.0)
        return {"image": img, "label": encode_label(l)}

    ds = ds.map(process, num_parallel_calls=tf.data.AUTOTUNE)

    # Labels differ in length, so pad each batch to its longest label.
    ds = ds.padded_batch(
        BATCH_SIZE,
        padded_shapes={"image": [IMG_HEIGHT, IMG_WIDTH, 3], "label": [None]}
    )

    if training:
        # Apply geometric augmentations via KerasCV on whole batches
        augmenter = keras_cv.layers.RandomRotation(factor=0.02, fill_mode="nearest")
        ds = ds.map(lambda x: {"image": augmenter(x["image"]), "label": x["label"]},
                    num_parallel_calls=tf.data.AUTOTUNE)

    return ds.prefetch(tf.data.AUTOTUNE)

In [None]:
# Prepare Data

def parse_labels(label_file, img_dir=None):
    """Parse a label file into (image_path, text) pairs.

    Each useful line has exactly two whitespace-separated fields: an image
    file name and its transcription. Lines with any other number of fields
    (including blank lines) are skipped.

    Args:
        label_file: path to the label text file.
        img_dir: directory the image names are joined onto; defaults to the
            notebook-level IMG_DIR.

    Returns:
        List of (full_image_path, text) tuples in file order.
    """
    if img_dir is None:
        img_dir = IMG_DIR

    samples = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(label_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                name, text = parts
                samples.append((os.path.join(img_dir, name), text))
    return samples

SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible

samples = parse_labels(LABEL_FILE)

# Characters outside CHARS are encoded as 0, the same value used to pad labels
# in a batch; the CTC label length (count of non-zero ids) would then be wrong.
# Drop such samples up front and report how many were removed.
vocab = set(CHARS)
n_parsed = len(samples)
samples = [s for s in samples if set(s[1]) <= vocab]
if len(samples) != n_parsed:
    print(f"Dropped {n_parsed - len(samples)} samples with out-of-vocabulary characters")

# Seeded in-place shuffle, then a 90/10 train/validation split.
np.random.default_rng(SPLIT_SEED).shuffle(samples)
split = int(0.9 * len(samples))
train_ds = make_dataset(samples[:split], True)
val_ds = make_dataset(samples[split:], False)

In [None]:
# Transfer Learning CRNN

def build_crnn():
    """Build the CRNN: VGG16 features -> per-column sequence -> 2x BiLSTM -> softmax.

    Returns:
        A tf.keras.Model mapping (batch, IMG_HEIGHT, IMG_WIDTH, 3) images to
        (batch, time_steps, NUM_CLASSES) per-step class probabilities, where
        each time step corresponds to one column of the VGG feature map.
    """
    inputs = tf.keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))

    # NOTE(review): load_image feeds pixels scaled to [0, 1], whereas the
    # ImageNet VGG16 weights were presumably trained with
    # vgg16.preprocess_input — confirm whether to switch preprocessing.
    base_model = tf.keras.applications.VGG16(
        input_tensor=inputs,
        include_top=False,
        weights="imagenet"
    )

    # Feature map after the third pooling stage: (batch, H', W', C).
    x = base_model.get_layer("block3_pool").output

    # Move width to the front so the sequence runs left to right and each time
    # step holds the full height x channel slice of ONE column. Reshaping the
    # (H', W', C) map directly would instead pack several neighbouring columns
    # of a single row into each step and order the steps row by row.
    x = tf.keras.layers.Permute((2, 1, 3))(x)  # (batch, W', H', C)

    feature_dim = x.shape[2] * x.shape[3]
    x = tf.keras.layers.Reshape(target_shape=(-1, feature_dim))(x)  # (batch, W', H'*C)

    x = tf.keras.layers.Dense(256, activation="relu")(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))(x)

    # Softmax over the label ids plus the CTC blank, as ctc_batch_cost expects.
    outputs = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)
    return tf.keras.Model(inputs, outputs, name="vgg_crnn")

In [None]:
# CTC Training Logic

class CTCModel(tf.keras.Model):
    """Wrap a sequence-prediction backbone and train it with CTC loss.

    Batches are dicts with "image" and zero-padded integer "label" entries.
    The reported "loss" is a running mean over all samples seen so far in the
    current epoch (or evaluation run), not the value of the last batch.
    """

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        # Running average of the per-sample CTC loss.
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it at the start of every
        # epoch and every evaluation run.
        return [self.loss_tracker]

    def call(self, inputs, training=False):
        return self.backbone(inputs, training=training)

    def _ctc_loss(self, images, labels, training):
        """Return the per-sample CTC loss, shape (batch, 1), for one batch."""
        preds = self.backbone(images, training=training)
        batch_size, time_steps = tf.shape(preds)[0], tf.shape(preds)[1]
        # Every sample uses the full output sequence length.
        input_len = tf.fill([batch_size, 1], time_steps)
        # Labels are padded with 0, so the true length is the non-zero count.
        label_len = tf.math.count_nonzero(labels, axis=1, keepdims=True)
        return tf.keras.backend.ctc_batch_cost(labels, preds, input_len, label_len)

    def train_step(self, data):
        images, labels = data["image"], data["label"]
        with tf.GradientTape() as tape:
            per_sample = self._ctc_loss(images, labels, training=True)
            # Differentiate the batch mean rather than the unreduced vector
            # (which tape.gradient would implicitly sum), so the gradient
            # scale does not depend on the batch size.
            loss = tf.reduce_mean(per_sample)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.loss_tracker.update_state(per_sample)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        images, labels = data["image"], data["label"]
        per_sample = self._ctc_loss(images, labels, training=False)
        self.loss_tracker.update_state(per_sample)
        return {"loss": self.loss_tracker.result()}

In [None]:
# Compile and Train

backbone = build_crnn()
model = CTCModel(backbone)
# No loss is passed to compile(): CTCModel computes the CTC loss in its own
# train_step / test_step.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4))

# Keep the History object so per-epoch losses can be inspected or plotted
# afterwards (and so its repr is not echoed as the cell output).
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

In [None]:
# Decoding (GREEDY)

def decode(pred):
    """Greedy-CTC-decode a batch of softmax outputs into Python strings.

    Args:
        pred: tensor of shape (batch, time_steps, num_classes).

    Returns:
        List with one decoded string per batch element.
    """
    dims = tf.shape(pred)
    # Every sequence is decoded over its full number of time steps.
    seq_lengths = tf.fill([dims[0]], dims[1])
    best_paths, _ = tf.keras.backend.ctc_decode(pred, seq_lengths, greedy=True)
    # Decoded rows are padded with -1; strip it before mapping ids to characters.
    return [
        tf.strings.reduce_join(
            num_to_char(tf.boolean_mask(row, row != -1))
        ).numpy().decode()
        for row in best_paths[0]
    ]

# Show ground truth next to the prediction for a few validation samples.
for batch in val_ds.take(1):
    preds = backbone(batch["image"], training=False)
    texts = decode(preds)
    # Guard against a batch with fewer than 5 samples.
    for i in range(min(5, len(texts))):
        label_ids = batch["label"][i]
        # Labels are zero-padded to the longest label in the batch; drop the
        # padding so it is not rendered as OOV tokens in the ground truth.
        label_ids = tf.boolean_mask(label_ids, label_ids != 0)
        gt = tf.strings.reduce_join(num_to_char(label_ids)).numpy().decode()
        print(f"GT: {gt} | PRED: {texts[i]}")

In [None]:
# Placeholder location — point this at the real output directory.
MODEL_PATH = "File_Path_For_Model_Saving/synth90k_crnn.keras"
# Create the target directory first so saving does not fail when it is missing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
backbone.save(MODEL_PATH)