In [2]:
import ultralytics

ultralytics.checks()

Ultralytics 8.3.191 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
Setup complete ✅ (4 CPUs, 31.4 GB RAM, 6411.4/8062.4 GB disk)


# Detection with ICDAR2003 dataset

In [4]:
import xml
import xml.etree.ElementTree as ET
import os

def convert_data_from_xml(root_dir):
    xml_path = os.path.join(root_dir, "words.xml")
    tree = ET.parse(xml_path) 
    root = tree.getroot()

    img_paths = []
    img_sizes = []
    img_labels = []
    bboxes = []

    for img in root:
        bbs_of_img = []
        labels_of_img = []

        for bbs in img.findall('taggedRectangles'):
            for bb in bbs:
                if not bb[0].text.isalnum():
                    continue
                
                bbs_of_img.append(
                    [
                        float(bb.attrib['x']), 
                        float(bb.attrib['y']),
                        float(bb.attrib['width']),
                        float(bb.attrib['height'])
                    ]
                )
                labels_of_img.append(bb[0].text.lower())
        
        bboxes.append(bbs_of_img)
        img_paths.append(os.path.join(root_dir, img[0].text))
        img_sizes.append((int(img[1].attrib['x']), int(img[1].attrib['y'])))
        img_labels.append(labels_of_img)

    return img_paths, img_sizes, img_labels, bboxes

dataset_dir = r"/kaggle/input/icdar2003/SceneTrialTrain"
img_paths, img_sizes, img_labels, bboxes = convert_data_from_xml(
    dataset_dir
    )


In [5]:
def convert_yolo_format(img_paths, img_sizes, set_bboxes): 

    yolo_data = []

    for path, size, bboxes in zip(img_paths, img_sizes, set_bboxes):
        img_width, img_height = size

        yolo_labels = []

        for bbox in bboxes:
            x, y, w, h = bbox
            # Calculate nomalized box 
            center_x = ( x + w / 2 ) / img_width
            center_y = ( y + h / 2 ) / img_height
            normalized_w = w / img_width
            normalized_h = h / img_height
            class_id = 0

            yolo_label = f"{class_id} {center_x} {center_y} {normalized_w} {normalized_h}"
            yolo_labels.append(yolo_label)
        
        yolo_data.append(yolo_labels)

    return yolo_data

yolo_data = convert_yolo_format(img_paths, img_sizes, bboxes)

In [6]:
len(yolo_data)

250

In [7]:
import shutil
from sklearn.model_selection import train_test_split

def save_data(img_paths, yolo_data, src_img_dir, save_dir, prefix=""):
    os.makedirs(os.path.join(save_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(save_dir, "labels"), exist_ok=True)

    for idx, (path, yolo_labels) in enumerate(zip(img_paths, yolo_data)):
        # đặt lại tên ảnh
        new_name = f"{prefix}_{idx:05d}"

        # copy ảnh
        dst_img = os.path.join(save_dir, "images", f"{new_name}.jpg")
        shutil.copy(os.path.join(src_img_dir, path), dst_img)

        dst_lbl = os.path.join(save_dir, "labels", f"{new_name}.txt")
        with open(dst_lbl, "w") as f:
            for label in yolo_labels:
                f.write(f"{label}\n")

seed = 0
val_size = 0.2
test_size = 0.125
is_shuffle = True

train_paths, temp_paths, train_yolo, temp_yolo = train_test_split(
    img_paths, yolo_data,
    test_size=val_size + test_size,
    random_state=seed,
    shuffle=is_shuffle,
)

val_paths, test_paths, val_yolo, test_yolo = train_test_split(
    temp_paths, temp_yolo,
    test_size=test_size / (val_size + test_size),
    random_state=seed,
    shuffle=is_shuffle,
)

save_yolo_data_dir = "datasets/yolo_data"
dataset_dir =r"/kaggle/input/icdar2003/SceneTrialTrain"
os.makedirs(save_yolo_data_dir, exist_ok=True)

save_train_dir = os.path.join(save_yolo_data_dir, "train")
save_val_dir = os.path.join(save_yolo_data_dir, "val")
save_test_dir = os.path.join(save_yolo_data_dir, "test")

save_data(train_paths, train_yolo, dataset_dir, save_train_dir, prefix="train")
save_data(val_paths, val_yolo, dataset_dir, save_val_dir, prefix="val")
save_data(test_paths, test_yolo, dataset_dir, save_test_dir, prefix="test")

In [8]:
import yaml

class_labels = ['text']

data_yaml = {
    "path"  : "./datasets/yolo_data",
    "train" : "./train/images",
    "val"   : "./val/images",
    "test"  : "./test/images",
    "nc"    : 1,
    "names" : class_labels
}

yolo_yaml_path = os.path.join(save_yolo_data_dir , "data.yml")
with open(yolo_yaml_path, 'w') as f:
    yaml.dump(data_yaml, f, default_flow_style=False)

In [10]:
from ultralytics import YOLO

model = YOLO('yolo11s.pt')

results = model.train(
    data= yolo_yaml_path,
    epochs= 100, 
    imgsz = 640,
    cache= True,
    patience= 20, 
    plots= True
)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11s.pt to 'yolo11s.pt': 100% ━━━━━━━━━━━━ 18.4/18.4MB 68.3MB/s 0.3s0.2s
Ultralytics 8.3.191 🚀 Python-3.11.13 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=True, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=datasets/yolo_data/data.yml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11s.pt, momentum=0.937, mosaic=1.0,

  xa[xa < 0] = -1
  xa[xa < 0] = -1


                   all         50        194      0.854      0.856      0.874      0.694
Speed: 0.1ms preprocess, 4.8ms inference, 0.0ms loss, 1.1ms postprocess per image
Results saved to [1mruns/detect/train2[0m


In [11]:
torch.save(model, "text_detection_model.pt")

NameError: name 'torch' is not defined

# Text Recognition with CRNN


In [2]:
import os 
import time
from PIL import Image
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split

import cv2
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision
from torchvision import transforms

In [8]:
save_dir = r"/home/khanhxoe/PersonalProjects/SceneTextRecognition/datasets/CRNN_data"
root_dir = save_dir

img_paths = []
labels = []

with open(os.path.join(root_dir, 'label.txt'), 'r') as f:
    for line in f:
        old_path, label = line.strip().split('\t')
        filename = os.path.basename(old_path)

        new_path = os.path.join(root_dir, filename)

        img_paths.append(new_path)
        labels.append(label)

print(f"Total images: {len(labels)}")
print(img_paths[1])


Total images: 1093
/home/khanhxoe/PersonalProjects/SceneTextRecognition/datasets/CRNN_data/000001.jpg


In [9]:
letters = [char.split(".")[0].lower() for char in labels]
letters = "".join(letters)
letters = sorted(list(set(letters)))

vocab = "".join(letters)
blank_char = "-"
vocab += blank_char

vocab_size = len(vocab)

print(f"Vocab: {vocab}")
print(f"Vocab size: {vocab_size}")

Vocab: 0123456789abcdefghijklmnopqrstuvwxyzéñ-
Vocab size: 39


In [10]:
char_to_idx = {char: idx + 1 for idx , char in enumerate(sorted(vocab))}
idx_to_char = {index: char for char , index in char_to_idx.items ()}

In [11]:
max_label_len = max([len(obj) for obj in labels])

def encode_tokenize(label, char_to_idx, max_label_len):
    encoded_labels = torch.tensor([
        char_to_idx[token] for token in label],
        dtype=torch.int32
        )
    label_len = len(encoded_labels)
    lengths = torch.tensor(
        label_len, 
        dtype= torch.int32
    )
    padded_labels = F.pad(
        encoded_labels, 
        (0, max_label_len-label_len),
        value= 0
        )
    return padded_labels, lengths

In [12]:
def decode(encoded_seqs, idx_to_char, blank_char= '-'):
    decoded_sequences = []

    for seq in encoded_seqs:
        decoded_label = []
        prev_char = None

        for token in seq:
            if token != 0:
                char = idx_to_char[token.item()]
                if char != blank_char:
                    if char != prev_char or prev_char == blank_char:
                        decoded_label.append(char)

                prev_char = char
        decoded_sequences.append(decoded_label)
    
    print(f"From {encoded_seqs} to {decoded_sequences}")
    
    return decoded_sequences

In [13]:
data_transform = {
    'train': transforms.Compose([
        transforms.Resize((100, 420)),
        transforms.ColorJitter(
            brightness= 0.5,
            contrast= 0.5,
            saturation= 0.5),
        transforms.Grayscale(1),
        transforms.GaussianBlur(3),
        transforms.RandomAffine(
            degrees= 1,
            shear= 1
        ),
        transforms.RandomPerspective(
            distortion_scale= 0.3,
            p= 0.5, 
            interpolation=3
        ),
        transforms.RandomRotation(degrees=20),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ]),
    'val': transforms.Compose([
        transforms.Resize((100, 420)),
        transforms.Grayscale(1),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
}

In [14]:
seed = 42
val_size = 0.1
test_size = 0.1
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    img_paths, labels,
    test_size= val_size,
    random_state= seed,
    shuffle= is_shuffle
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size= val_size,
    random_state= seed,
    shuffle= is_shuffle
)

In [15]:
class SceneTextDataset(Dataset):
    def __init__(
        self, X, y, 
        char_to_idx, max_label_len, 
        label_encoder= None, transforms= None) -> None:

        self.img_paths = X
        self.labels = y
        self.char_to_idx = char_to_idx
        self.max_len = max_label_len
        self.encoder = label_encoder
        self.transform = transforms

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        label = self.labels[idx]
        path = self.img_paths[idx]
        img = Image.open(path).convert('RGB')
        
        if self.transform:
            img = self.transform(img)
        
        if self.encoder:
            label, label_len  = self.encoder(label, self.char_to_idx, self.max_len)

        return img, label, label_len

In [16]:
train_dataset = SceneTextDataset(
    X_train,
    y_train,
    char_to_idx=char_to_idx,
    max_label_len=max_label_len,
    label_encoder=encode_tokenize,
    transforms=data_transform["train"],
)
val_dataset = SceneTextDataset(
    X_val,
    y_val,
    char_to_idx=char_to_idx,
    max_label_len=max_label_len,
    label_encoder=encode_tokenize,
    transforms=data_transform["val"],
)
test_dataset = SceneTextDataset(
    X_test,
    y_test,
    char_to_idx=char_to_idx,
    max_label_len=max_label_len,
    label_encoder=encode_tokenize,
    transforms=data_transform["val"],
)

train_batch_size = 64
test_batch_size = 64 * 2

train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers= 4
)
val_loader = DataLoader(
    val_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers= 4
)
test_loader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers= 4
)

In [17]:
import timm

class CRNN(nn.Module):
    def __init__(
        self, vocab_size, hidden_size, n_layers, dropout=0.2, unfreeze_layers=3
    ):
        super(CRNN, self).__init__()

        backbone = timm.create_model("resnet152", in_chans=1, pretrained=True)
        modules = list(backbone.children())[:-2]
        modules.append(nn.AdaptiveAvgPool2d((1, None)))
        self.backbone = nn.Sequential(*modules)

        for parameter in self.backbone[-unfreeze_layers:].parameters():
            parameter.requires_grad = True

        self.mapSeq = nn.Sequential(
            nn.Linear(2048, 512), nn.ReLU(), nn.Dropout(dropout)
        )

        self.gru = nn.GRU(
            512,
            hidden_size,
            n_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if n_layers > 1 else 0,
        )

        self.layer_norm = nn.LayerNorm(hidden_size * 2)

        self.out = nn.Sequential(
            nn.Linear(hidden_size * 2, vocab_size), nn.LogSoftmax(dim=2)
        )

    @torch.autocast(device_type="cuda")
    def forward(self, x):
        x = self.backbone(x)
        x = x.permute(0, 3, 1, 2)
        x = x.view(x.size(0), x.size(1), -1)
        x = self.mapSeq(x)
        x, _ = self.gru(x)
        x = self.layer_norm(x)
        x = self.out(x)
        x = x.permute(1, 0, 2)

        return x

In [34]:
def evaluate(model, data_loader, criterion, device):
    model.eval()
    val_losses = []
    with torch.no_grad():
        for img, label, text_len in data_loader:
            img = img.to(device)
            label = label.to(device)
            text_len = text_len.to(device)

            outputs = model(img)
            logits_lens = torch.full(
                size=(outputs.size(1),),
                fill_value=outputs.size(0),
                dtype=torch.long
            ).to(device)

            loss = criterion(outputs, label, logits_lens, text_len)
            val_losses.append(loss.item())

    loss = sum(val_losses) / len(val_losses)
    return loss


def fit(
    model, train_loader, val_loader, criterion, optimizer, scheduler, 
    device, epochs, patience=5
):
    train_losses = []
    val_losses = []

    best_val_loss = float("inf")
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        start = time.time()
        batch_train_losses = []

        model.train()
        for idx, (inputs, labels, labels_len) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            labels_len = labels_len.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            logits_lens = torch.full(
                size=(outputs.size(1),),
                fill_value=outputs.size(0),
                dtype=torch.long
            ).to(device)

            loss = criterion(outputs, labels, logits_lens, labels_len)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            batch_train_losses.append(loss.item())

        train_loss = sum(batch_train_losses) / len(batch_train_losses)
        train_losses.append(train_loss)

        val_loss = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        print(
            f"EPOCH {epoch + 1}: "
            f"\tTrain loss: {train_loss:.4f}"
            f"\tVal loss: {val_loss:.4f}"
            f"\tTime: {time.time() - start:.2f} seconds"
        )

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = model.state_dict()  # save best model
        else:
            patience_counter += 1
            print(f"  Patience counter: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("⚠️ Early stopping triggered!")
            break

        scheduler.step()
        
    if best_model_state:
        model.load_state_dict(best_model_state)

    return train_losses, val_losses, model

In [18]:
hidden_size = 256
n_layers = 3
dropout = 0.2
unfreeze_layers = 12
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CRNN(
    vocab_size= vocab_size,
    hidden_size= hidden_size,
    n_layers= n_layers,
    dropout=0.2, 
    unfreeze_layers=3
).to(device)

In [19]:
model.load_state_dict(torch.load(r'/home/khanhxoe/PersonalProjects/SceneTextRecognition/Solution/models/text_recognition_model.pth', weights_only=True))

<All keys matched successfully>

In [20]:
model

CRNN(
  (backbone): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (drop_block): Identity()
        (act2): ReLU(inplace=True)
        (aa): Identity()
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=T

In [30]:
epochs = 100
lr = 5e-4
weight_decay = 1e-6
scheduler_step_size = epochs*0.5
criterion = nn.CTCLoss(
    blank= char_to_idx['-'],
    reduction= 'mean',
    zero_infinity= True
)
optimizer = optim.Adam(
    model.parameters(),
    lr = lr,
    weight_decay= weight_decay,
)
scheduler = optim.lr_scheduler.StepLR(
    optimizer, step_size= scheduler_step_size
)

In [35]:
train_losses, val_losses, best_model = fit(
    model, train_loader, val_loader, criterion, optimizer, scheduler, 
    device, epochs=50, patience=3
)

EPOCH 1: 	Train loss: 0.1627	Val loss: 0.9541	Time: 10.48 seconds
EPOCH 2: 	Train loss: 0.1656	Val loss: 1.0776	Time: 10.38 seconds
  Patience counter: 1/3
EPOCH 3: 	Train loss: 0.1654	Val loss: 1.0505	Time: 10.43 seconds
  Patience counter: 2/3
EPOCH 4: 	Train loss: 0.1552	Val loss: 0.9910	Time: 10.49 seconds
  Patience counter: 3/3
⚠️ Early stopping triggered!


In [36]:
torch.save(best_model, "text_recognition_model.pt")