In [None]:
!pip install opendatasets -q

In [None]:
import tensorflow as tf
import numpy as np
import os
import matplotlib.pyplot as plt
import opendatasets as od

In [None]:
# Fetch the OCR dataset from Kaggle through opendatasets.
# NOTE(review): the paths configured further down expect the files under
# /content/ocr-synthetic-dataset — confirm the notebook's working directory.
DATASET_URL = "https://www.kaggle.com/datasets/allahhitler/ocr-synthetic-dataset"
od.download(DATASET_URL)

In [None]:
# Location of the downloaded dataset: one folder of images plus one label file.
DATA_ROOT = "/content/ocr-synthetic-dataset"
IMG_DIR, LABEL_FILE = (
    os.path.join(DATA_ROOT, entry) for entry in ("images", "labels.txt")
)

In [None]:
# Sanity-check the download: count the image files and peek at the label format.
print("Images:", len(os.listdir(IMG_DIR)))

with open(LABEL_FILE) as f:
    # zip() stops as soon as either side is exhausted, so a label file with
    # fewer than five lines is printed in full instead of raising StopIteration.
    for _, line in zip(range(5), f):
        print(line.strip())

In [None]:
# Height in pixels that every image is resized to; also the model's input height.
IMG_HEIGHT = 32
# Number of samples per padded batch.
BATCH_SIZE = 32
# Number of epochs passed to model.fit().
EPOCHS = 20

In [None]:
# Vocabulary (CTC-SAFE)

# Characters the recognizer knows about: lowercase, uppercase, then digits.
CHARS = [*"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"]

# Forward lookup: character -> integer id. No mask token is reserved.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=CHARS, mask_token=None, oov_token=None
)

# Reverse lookup: integer id -> character. The first entry of the forward
# vocabulary is dropped so that only the remaining entries are handed to the
# inverse layer, which uses its own default special tokens.
_forward_vocab = char_to_num.get_vocabulary()
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=_forward_vocab[1:], invert=True
)

# One more class than the forward vocabulary has entries
# (presumably the CTC blank — confirm against the loss function in use).
NUM_CLASSES = 1 + len(_forward_vocab)

In [None]:
# Dataset Loader (CRITICAL PART)

def parse_labels(label_file, img_dir=None):
    """Read the label file into a list of ``(image_path, text)`` pairs.

    Every non-blank line must hold an image file name, whitespace, and then the
    transcription. Blank lines are skipped. Only the first run of whitespace
    separates name from text, so a transcription that itself contains spaces is
    kept intact instead of breaking the parse.

    Args:
        label_file: Path of the text file to parse (read as UTF-8).
        img_dir: Directory the image names are joined onto. Defaults to the
            module-level ``IMG_DIR``.

    Returns:
        List of ``(image_path, text)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line contains a file name but no text.
    """
    base_dir = IMG_DIR if img_dir is None else img_dir

    samples = []
    with open(label_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate empty / trailing lines rather than failing to unpack.
                continue

            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"{label_file}:{line_no}: expected '<image> <text>', got {line!r}"
                )

            img, text = parts
            samples.append((os.path.join(base_dir, img), text))
    return samples

samples = parse_labels(LABEL_FILE)

# Shuffle in place with a fixed seed so that the train/validation split is the
# same on every Restart & Run All (an unseeded shuffle changes it each run).
SPLIT_SEED = 42
np.random.default_rng(SPLIT_SEED).shuffle(samples)

# First 90% of the shuffled samples train the model; the rest validate it.
split = int(0.9 * len(samples))
train_samples = samples[:split]
val_samples = samples[split:]

In [None]:
# Image & Label Processing

def load_image(path):
    """Load one image file as a grayscale float tensor of height ``IMG_HEIGHT``.

    The file is decoded as a single-channel JPEG, resized to ``IMG_HEIGHT`` rows
    with the width scaled by the same factor, and divided by 255.

    Args:
        path: Path of the image file (string or scalar string tensor).

    Returns:
        ``float32`` tensor of shape ``(IMG_HEIGHT, new_width, 1)``.
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=1)
    h, w = tf.shape(img)[0], tf.shape(img)[1]

    # Scale the width by the same factor as the height. The cast truncates, so
    # a very tall and narrow image could end up with width 0, which
    # tf.image.resize rejects — clamp to at least one column.
    new_w = tf.maximum(tf.cast(w * IMG_HEIGHT / h, tf.int32), 1)

    img = tf.image.resize(img, (IMG_HEIGHT, new_w))
    img = tf.cast(img, tf.float32) / 255.0
    return img

def encode_label(text):
    """Convert a transcription into a 1-D tensor of character ids.

    The text is split into its UTF-8 characters, and each character is looked
    up in ``char_to_num``.
    """
    return char_to_num(tf.strings.unicode_split(text, "UTF-8"))

In [None]:
# tf.data Pipeline (PADDED WIDTH)

def make_dataset(samples, training=True):
    """Build a batched ``tf.data`` pipeline from ``(image_path, text)`` pairs.

    Each element of the returned dataset is a dict with two keys:
    ``"image"`` (images padded to the widest one in the batch) and ``"label"``
    (character ids padded to the longest label in the batch).

    Args:
        samples: Non-empty sequence of ``(image_path, text)`` tuples.
        training: When true, the samples are reshuffled on every pass.

    Returns:
        A prefetched ``tf.data.Dataset`` of batches of size ``BATCH_SIZE``.

    Raises:
        ValueError: If ``samples`` is empty.
    """
    if not samples:
        # An empty list would otherwise become a float tensor and fail later
        # inside the file-reading op with an unrelated-looking type error.
        raise ValueError("make_dataset() needs at least one (path, text) sample")

    paths = [s[0] for s in samples]
    labels = [s[1] for s in samples]

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    if training:
        # Shuffle the lightweight (path, text) strings *before* decoding. The
        # buffer can then span the whole dataset, giving a uniform shuffle each
        # epoch instead of a local one over 1000 already-decoded images.
        ds = ds.shuffle(len(paths))

    def process(p, l):
        return {
            "image": load_image(p),
            "label": encode_label(l)
        }

    ds = ds.map(process, num_parallel_calls=tf.data.AUTOTUNE)

    # Widths and label lengths vary per sample; pad both to the batch maximum.
    ds = ds.padded_batch(
        BATCH_SIZE,
        padded_shapes={
            "image": [IMG_HEIGHT, None, 1],
            "label": [None]
        }
    )

    return ds.prefetch(tf.data.AUTOTUNE)

# Build both pipelines; only the training one is created with training=True.
train_ds, val_ds = (
    make_dataset(train_samples, training=True),
    make_dataset(val_samples, training=False),
)

In [None]:
# CRNN Model

def build_crnn():
    """Assemble the convolutional-recurrent recognizer.

    Layout, in order:
      * input of shape ``(IMG_HEIGHT, width, 1)`` with a free width;
      * two Conv2D + 2x2 max-pool stages (64 and 128 filters);
      * a 256-filter Conv2D followed by batch normalization;
      * permute so the width axis comes first, then flatten height x channels
        into one feature vector per width position;
      * two bidirectional LSTM layers with 256 units per direction;
      * a softmax Dense layer over ``NUM_CLASSES`` for every position.

    Returns:
        A ``tf.keras.Model`` named ``"crnn"``.
    """
    layers = tf.keras.layers

    image = tf.keras.Input(shape=(IMG_HEIGHT, None, 1))

    # Convolutional stages that each halve height and width.
    feats = image
    for n_filters in (64, 128):
        feats = layers.Conv2D(n_filters, 3, padding="same", activation="relu")(feats)
        feats = layers.MaxPooling2D((2, 2))(feats)

    feats = layers.Conv2D(256, 3, padding="same", activation="relu")(feats)
    feats = layers.BatchNormalization()(feats)

    # (height, width, channels) -> (width, height, channels), then merge the
    # last two axes so each width position becomes one sequence step.
    feats = layers.Permute((2, 1, 3))(feats)
    step_features = feats.shape[2] * feats.shape[3]
    sequence = layers.Reshape((-1, step_features))(feats)

    for _ in range(2):
        sequence = layers.Bidirectional(
            layers.LSTM(256, return_sequences=True)
        )(sequence)

    class_probs = layers.Dense(NUM_CLASSES, activation="softmax")(sequence)

    return tf.keras.Model(image, class_probs, name="crnn")

In [None]:
# Correct Loss Handling

class CTCModel(tf.keras.Model):
    """Training wrapper that optimizes a backbone with CTC loss.

    The wrapper consumes batches shaped like ``{"image": ..., "label": ...}``
    and computes the CTC loss itself, so ``compile()`` needs only an optimizer.
    The reported ``loss`` is a running mean over all samples seen so far in the
    current epoch (or evaluation run), not the value of the last batch.

    Args:
        backbone: Model mapping an image batch to per-timestep class
            probabilities of shape ``(batch, time_steps, num_classes)``.
    """

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        # Accumulates per-sample losses so that logs show an epoch average.
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it at the start of every
        # epoch and before every evaluation run.
        return [self.loss_tracker]

    def call(self, inputs, training=False):
        """Forward pass: delegate to the wrapped backbone."""
        return self.backbone(inputs, training=training)

    def _ctc_loss(self, labels, preds):
        """Return the per-sample CTC loss, shape ``(batch, 1)``."""
        batch_size = tf.shape(preds)[0]
        time_steps = tf.shape(preds)[1]

        # Every sample is given the full (padded) number of time steps.
        # NOTE(review): images are width-padded per batch, so the true sequence
        # length of narrower images is shorter — consider tracking real widths.
        input_len = tf.fill([batch_size, 1], time_steps)

        # Labels are padded with 0 by the input pipeline, so the number of
        # non-zero ids is taken as the label length.
        label_len = tf.math.count_nonzero(labels, axis=1, keepdims=True)

        # NOTE(review): ctc_batch_cost is a Keras backend helper — confirm it is
        # available in the Keras version installed in the runtime.
        return tf.keras.backend.ctc_batch_cost(
            labels, preds, input_len, label_len
        )

    def train_step(self, data):
        """Run one optimization step and return the running mean loss."""
        images = data["image"]
        labels = data["label"]

        with tf.GradientTape() as tape:
            preds = self.backbone(images, training=True)
            per_sample = self._ctc_loss(labels, preds)
            # Differentiate the batch mean: this is the quantity being
            # reported, and it keeps the gradient scale independent of how
            # many samples the (possibly partial) batch contains.
            loss = tf.reduce_mean(per_sample)

        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        self.loss_tracker.update_state(per_sample)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate one batch and return the running mean loss."""
        images = data["image"]
        labels = data["label"]

        preds = self.backbone(images, training=False)

        self.loss_tracker.update_state(self._ctc_loss(labels, preds))
        return {"loss": self.loss_tracker.result()}

In [None]:
# Compile and Train

# Wrap the CRNN in the CTC training model. compile() receives no loss because
# CTCModel computes its own loss inside train_step / test_step.
backbone = build_crnn()
model = CTCModel(backbone)

adam = tf.keras.optimizers.Adam(1e-4)
model.compile(optimizer=adam)

model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

In [None]:
# Decoding (GREEDY)

def decode(pred):
    """Greedy-decode a batch of per-timestep class probabilities into strings.

    Args:
        pred: Model output; axis 0 is the batch and axis 1 the time steps.

    Returns:
        List with one decoded Python ``str`` per batch element.
    """
    n_samples, n_steps = tf.shape(pred)[0], tf.shape(pred)[1]

    # Every sequence is decoded over the full time axis.
    seq_lengths = tf.fill([n_samples], n_steps)
    paths, _ = tf.keras.backend.ctc_decode(pred, seq_lengths, greedy=True)

    def to_text(ids):
        # Entries equal to -1 are dropped before mapping ids back to characters.
        kept = tf.boolean_mask(ids, ids != -1)
        return tf.strings.reduce_join(num_to_char(kept)).numpy().decode()

    return [to_text(ids) for ids in paths[0]]

In [None]:
# Print ground truth next to the greedy prediction for a few validation samples.
for batch in val_ds.take(1):
    preds = backbone(batch["image"], training=False)
    texts = decode(preds)

    # The batch can hold fewer than five samples, so cap the loop.
    for i in range(min(5, len(texts))):
        label = batch["label"][i]
        # Labels are right-padded with id 0 by padded_batch, and the inverse
        # lookup renders id 0 as "[UNK]". Drop those ids so the ground truth is
        # printed as the plain word (this also hides characters that were
        # outside the vocabulary, which share id 0).
        label = tf.boolean_mask(label, label != 0)
        gt = tf.strings.reduce_join(num_to_char(label)).numpy().decode()
        print("GT:", gt, "| PRED:", texts[i])

In [None]:
# Mount Google Drive (Colab only) so the model can be saved under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Persist the CRNN backbone (not the CTCModel training wrapper) to Drive.
MODEL_PATH = "/content/drive/MyDrive/Machine_Learning/Image_Based/Word_Recognition/synth90k_crnn.keras"

# Create the target folder first: if it does not exist on Drive yet, the save
# could otherwise fail only after the whole training run has finished.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
backbone.save(MODEL_PATH)