In [1]:
import os
import numpy as np
import h5py
from PIL import Image, ImageDraw, ImageFont
import random
from tqdm import tqdm

# Set folders
words_file = "words.txt"
fonts_folder = "fonts"
backgrounds_folder = "backgrounds"

# Output folder for temporary images (optional)
os.makedirs("SynthText-EN/images", exist_ok=True)

# Output H5 file
output_h5 = "SynthText-EN.h5"

In [2]:
# Load words
with open(words_file, "r") as f:
    words = [line.strip() for line in f if line.strip()]

# Load fonts
fonts = [os.path.join(fonts_folder, f) for f in os.listdir(fonts_folder) if f.endswith(".ttf")]

# Load backgrounds
bg_images = [os.path.join(backgrounds_folder, f) for f in os.listdir(backgrounds_folder) 
             if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

print(f"Loaded {len(words)} words, {len(fonts)} fonts, {len(bg_images)} backgrounds")


Loaded 15 words, 4 fonts, 7 backgrounds


In [5]:
def generate_word_image(word, img_size=(64, 256)):
    """
    Generate an image with the given word on a random background
    using a random font.
    """
    # Select random background
    bg_path = random.choice(bg_images)
    bg = Image.open(bg_path).convert("RGB")
    bg = bg.resize(img_size)  # resize background to target image size
    
    # Create Draw object
    draw = ImageDraw.Draw(bg)
    
    # Random font and size
    font_path = random.choice(fonts)
    font_size = random.randint(20, 40)
    font = ImageFont.truetype(font_path, font_size)
    
    # Get text size
    # text_w, text_h = draw.textsize(word, font=font)

    bbox = draw.textbbox((0, 0), word, font=font)
    text_w, text_h = bbox[2] - bbox[0], bbox[3] - bbox[1]
    
    # Random position within the image
    x = random.randint(0, max(0, img_size[1] - text_w))
    y = random.randint(0, max(0, img_size[0] - text_h))
    
    # Draw text
    draw.text((x, y), word, font=font, fill=(0,0,0))
    
    # Convert to numpy array
    img_np = np.array(bg)
    
    # Bounding box: shape (2,2,1) like SynthText
    wordBB = np.array([[x, x + text_w],
                       [y, y + text_h]])[:, :, np.newaxis]
    
    # Text as UTF-8
    txt = [word.encode('utf-8')]
    
    return img_np, wordBB, txt


In [6]:
num_images = 5000  # adjust for dataset size
img_h, img_w = 64, 256

data_dict = {}

for i in tqdm(range(num_images)):
    word = random.choice(words)
    img_np, wordBB, txt = generate_word_image(word, img_size=(img_h, img_w))
    
    # Store in dict
    data_dict[f'image_{i}'] = {
        'rgb': img_np,
        'wordBB': wordBB,
        'txt': txt
    }


100%|██████████| 5000/5000 [03:23<00:00, 24.56it/s]


In [7]:
with h5py.File(output_h5, "w") as f:
    data_grp = f.create_group("data")
    
    for key, val in data_dict.items():
        ds = data_grp.create_dataset(key, data=val['rgb'], compression="gzip")
        ds.attrs['wordBB'] = val['wordBB']
        ds.attrs['txt'] = val['txt']

print(f"H5 file saved: {output_h5}")


H5 file saved: SynthText-EN.h5


In [9]:
db = h5py.File(output_h5, "r")
print("Total images:", len(db['data'].keys()))

first_key = list(db['data'].keys())[0]
img = db['data'][first_key][...]
wordBB = db['data'][first_key].attrs['wordBB']

# txt = [t.decode('utf-8') for t in db['data'][first_key].attrs['txt']]

txt = [t for t in db['data'][first_key].attrs['txt']]


print("Image shape:", img.shape)
print("Bounding box:", wordBB)
print("Text label:", txt)


Total images: 5000
Image shape: (256, 64, 3)
Bounding box: [[[56]
  [80]]

 [[41]
  [56]]]
Text label: ['sat']
