# Imports

In [1]:
import os
import subprocess
import sys
import xml.etree.ElementTree as ET

import cv2
from sklearn.model_selection import train_test_split

# If any libraries are missing, install them with the following command:
# pip install -r src/requirements.txt

# Constants

In [2]:
# Root of the dataset; every other path below is derived from it.
DATASET_BASE_DIR = "./dataset/cursive"

# Inputs: annotation XML files (plus the generated word_labels.txt) and the
# images that the labels refer to.
GROUND_TRUTH_DIR = DATASET_BASE_DIR + "/ground_truth"
LABELS_PATH = GROUND_TRUTH_DIR + "/word_labels.txt"
IMAGES_DIR = DATASET_BASE_DIR + "/images"

# Outputs: cropped word images and their index file.
LABELS_W_DIR = DATASET_BASE_DIR + "/labels_w"
LABELS_W_WORD_DIR = LABELS_W_DIR + "/word"
LABELS_W_WORD_TXT = LABELS_W_DIR + "/word.txt"

In [3]:
# Toggle for the last cell: when True it runs src/main.py to create the HDF5 dataset.
CREATE_HDF5 = True

# Image and label preparation

In [16]:
def read_content(xml_file: str):
    """Parse one annotation XML file into a list of per-word rows.

    Every ``<object>`` element found in the document yields one row:
    ``[filename, word_idx, img_width, img_height, xmin, ymin, xmax, ymax, name]``
    where ``word_idx`` is the object's position in iteration order, the sizes
    and coordinates are converted to ``int`` and ``name`` is upper-cased.
    """
    root = ET.parse(xml_file).getroot()

    filename = root.find("filename").text

    size = root.find("size")
    img_h = int(size.find("height").text)
    img_w = int(size.find("width").text)

    def _coord(obj, tag):
        # Bounding-box values are stored as text under <bndbox>/<tag>.
        return int(obj.find(f"bndbox/{tag}").text)

    rows = []
    for idx, obj in enumerate(root.iter("object")):
        top = _coord(obj, "ymin")
        left = _coord(obj, "xmin")
        bottom = _coord(obj, "ymax")
        right = _coord(obj, "xmax")
        word = str(obj.find("name").text).upper()
        rows.append([filename, idx, img_w, img_h, left, top, right, bottom, word])

    return rows

In [17]:
def generate_word_labels(path, labels):
    """Write ``{path}/word_labels.txt`` with one line per annotated word.

    Args:
        path: Directory containing the annotation XML files; the output file
            is created (or overwritten) in that same directory.
        labels: Iterable of XML file names, relative to ``path``.

    Each output line is ``|``-separated, in the order returned by
    ``read_content``:
    filename|word_idx|img_width|img_height|xmin|ymin|xmax|ymax|word
    """
    with open(f"{path}/word_labels.txt", "w", encoding="utf8") as file:
        for label in labels:
            for box in read_content(f"{path}/{label}"):
                file.write("|".join(str(field) for field in box) + "\n")

In [18]:
# Gather every annotation XML in the ground-truth directory and flatten them
# into a single word_labels.txt file.
print(GROUND_TRUTH_DIR)
labels = [entry for entry in os.listdir(GROUND_TRUTH_DIR) if entry.endswith(".xml")]

generate_word_labels(GROUND_TRUTH_DIR, labels)

./dataset/cursive/ground_truth


In [19]:
def extract_rectangle(image, top_left, bottom_right, wiggle_room=None):
    """Crop the rectangle spanned by ``top_left`` and ``bottom_right`` from ``image``.

    Args:
        image: Array indexed as ``image[row, column, ...]``; both 3-D
            (with a channel axis) and 2-D arrays are accepted.
        top_left: ``(x, y)`` of the upper-left corner.
        bottom_right: ``(x, y)`` of the lower-right corner (exclusive, as in
            slicing).
        wiggle_room: Optional padding in pixels. Each edge is pushed outwards
            by this amount only when the padded coordinate still lies within
            the image; otherwise that edge is left unchanged.

    Returns:
        The cropped region ``image[y1:y2, x1:x2]``.
    """
    if wiggle_room is not None:
        # Array shape is (rows, cols[, channels]), i.e. height comes first.
        # Slicing to [:2] also keeps 2-D grayscale images working.
        h, w = image.shape[:2]
        x1p, y1p = top_left
        x2p, y2p = bottom_right

        top_left = [
            x1p - wiggle_room if 0 <= x1p - wiggle_room <= w else x1p,
            y1p - wiggle_room if 0 <= y1p - wiggle_room <= h else y1p,
        ]
        bottom_right = [
            x2p + wiggle_room if 0 <= x2p + wiggle_room <= w else x2p,
            y2p + wiggle_room if 0 <= y2p + wiggle_room <= h else y2p,
        ]
    x1, y1 = top_left
    x2, y2 = bottom_right
    return image[y1:y2, x1:x2]

In [21]:
# NOTE(review): this list does not appear to be used by the code that follows,
# and it filters the images directory for ".xml" names - confirm whether it is
# still needed.
labels = [entry for entry in os.listdir(IMAGES_DIR) if entry.endswith(".xml")]

def generate_word_images(word_labels, src, dest):
    """Crop every labelled word out of its source image and index the crops.

    Reads ``word_labels``, whose lines have the form
    ``filename|word_idx|img_width|img_height|xmin|ymin|xmax|ymax|word``.
    For each line the bounding box is cut from ``{src}/{filename}`` and saved
    as ``{dest}/word/{filename_no_ext}_{word_idx}.jpg``; the same line, with
    the filename replaced by the crop's file name, is written to
    ``{dest}/word.txt``.

    Rows whose source image cannot be read, or whose crop cannot be written,
    are reported with ``print`` and left out of ``word.txt``.

    Args:
        word_labels: Path of the ``|``-separated label file to read.
        src: Directory containing the source images.
        dest: Output directory; ``{dest}/word`` must already exist.
    """
    # Rows of the same source image are normally adjacent, so remember the
    # last decoded image instead of re-reading it for every word.
    cached_name, cached_img = None, None

    with open(word_labels, "r", encoding="utf8") as read, \
            open(f"{dest}/word.txt", "w", encoding="utf8") as write:
        for raw_line in read:
            line = raw_line.removesuffix("\n").split("|")
            filename, word_idx, _w, _h, xmin, ymin, xmax, ymax, _word = line

            if filename != cached_name:
                cached_name, cached_img = filename, cv2.imread(f"{src}/{filename}")
            img = cached_img
            if img is None:
                # cv2.imread signals an unreadable file by returning None;
                # report the path and skip the row instead of crashing below.
                print(f"{src}/{filename}")
                continue

            word_img = extract_rectangle(img, [int(xmin), int(ymin)], [int(xmax), int(ymax)])
            filename_no_ext = filename.removesuffix(".jpg")
            crop_name = f"{filename_no_ext}_{word_idx}.jpg"
            img_path = f"{dest}/word/{crop_name}"
            try:
                written = cv2.imwrite(img_path, word_img)
            except Exception as e:
                # e.g. an empty crop; keep going with the remaining rows.
                print(e)
                continue
            if not written:
                # imwrite can also fail by returning False without raising.
                print(f"Failed to write image: {img_path}")
                continue

            print(f"Created image: {img_path}")
            line[0] = crop_name
            write.write(f"{'|'.join(line)}\n")

# Create the output directories (parent first, then the word-image folder
# inside it) unless they are already present, then cut out the word crops.
for _needed_dir in (LABELS_W_DIR, LABELS_W_WORD_DIR):
    if os.path.isdir(_needed_dir):
        continue
    os.mkdir(_needed_dir)

generate_word_images(LABELS_PATH, IMAGES_DIR, LABELS_W_DIR)

Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_0.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_1.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_2.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_3.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_4.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_5.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_6.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_7.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_8.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_9.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_10.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_11.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_12.jpg
Created image: ./dataset/cursive/labels_w/word/EM_cursive_000_13.jpg
Created image: ./dataset/cursive/labels_w/wo

In [None]:
def split_train_set(src_txt: str, out: str, train_size=0.7, val_size=0.2, test_size=0.1, random_state=None):
    """Split the lines of ``src_txt`` into train, validation and test files.

    The three fractions must add up to 1.0. The lines are first divided into
    (train + validation) and test; the first part is then divided again with
    the validation fraction rescaled to that remainder. The results are
    written to ``trainset.txt``, ``testset.txt`` and ``validset.txt`` inside
    ``out``.

    Returns:
        ``(n_train, n_val, n_test)``. When ``src_txt`` contains no lines,
        nothing is written and ``(0, 0, 0)`` is returned.
    """
    total = train_size + val_size + test_size
    assert abs(total - 1.0) < 1e-5, "Train, validation, and test sizes must sum to 1.0"

    with open(src_txt, "r", encoding="utf8") as handle:
        lines = handle.readlines()

    if len(lines) == 0:
        return 0, 0, 0

    remainder, test_part = train_test_split(lines, test_size=test_size, random_state=random_state)
    # The test rows are already removed, so express the validation share
    # relative to what is left.
    val_fraction = val_size / (train_size + val_size)
    train_part, val_part = train_test_split(remainder, test_size=val_fraction, random_state=random_state)

    outputs = (
        (f"{out}/trainset.txt", train_part),
        (f"{out}/testset.txt", test_part),
        (f"{out}/validset.txt", val_part),
    )
    for target, part in outputs:
        with open(target, "w", encoding="utf8") as sink:
            sink.writelines(part)

    return len(train_part), len(val_part), len(test_part)

# Fixed seed so that re-running this cell on the same word.txt produces the
# same train/validation/test partition instead of a fresh random one.
SPLIT_SEED = 42

_out = LABELS_W_DIR
_in = LABELS_W_WORD_TXT
if not os.path.isdir(_out):
    os.mkdir(_out)

split_train_set(_in, _out, random_state=SPLIT_SEED)

(8116, 2319, 1160)

In [None]:
import shutil

# Publish the generated labels_w folder under ./raw.
src = LABELS_W_DIR
dst = "./raw/labels_w"

os.makedirs("./raw", exist_ok=True)

if not os.path.exists(src):
    print(f"Source directory '{src}' does not exist.")
elif os.path.exists(dst):
    # shutil.move() into an existing directory would place the source inside
    # it (./raw/labels_w/labels_w); refuse instead of silently nesting.
    print(f"Destination '{dst}' already exists; remove it before moving '{src}'.")
else:
    shutil.move(src, dst)
    print(f"Moved '{src}' to '{dst}'")

Source directory './dataset/cursive/labels_w' does not exist.


In [6]:
if CREATE_HDF5:
    print("Creating HDF5 dataset...")
    # Run main.py from inside src/ with the kernel's own interpreter (rather
    # than whichever "python" is first on PATH) and raise on a non-zero exit
    # status instead of silently ignoring it.
    subprocess.run(
        [sys.executable, "main.py", "--source=labels_w", "--transform"],
        cwd="src",
        check=True,
    )

Creating HDF5 dataset...


E0000 00:00:1750261076.904814   52523 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750261076.910891   52523 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750261076.926791   52523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750261076.926841   52523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750261076.926845   52523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750261076.926847   52523 computation_placer.cc:177] computation placer already registered. Please check linka

labels_w dataset will be transformed...


13312it [01:13, 181.18it/s]                           
