In [None]:
def extract_letters_from_word_image(image_path, min_size=5):
    """Split a word image into candidate letter crops, ordered left to right.

    The image is loaded as grayscale and binarised with an inverted Otsu
    threshold. The bounding box of every external contour of the binarised
    image is then cropped out of the grayscale image.

    Args:
        image_path: Path of the word image to segment.
        min_size: Bounding boxes whose width or height is not strictly
            greater than this value are discarded as noise.

    Returns:
        A list of grayscale crops (slices of the loaded image) sorted by the
        x coordinate of their bounding box. The list is empty when the image
        cannot be read.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports a missing or undecodable file by returning None
    # instead of raising; bail out here rather than failing inside threshold().
    if image is None:
        return []

    # With THRESH_OTSU the threshold is derived from the image histogram, so
    # the explicit threshold argument is ignored; 0 is the conventional value.
    _, thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Compute each bounding box once, then order the boxes by their left edge.
    boxes = sorted((cv2.boundingRect(contour) for contour in contours), key=lambda box: box[0])

    letter_images = []
    for x, y, w, h in boxes:
        if w > min_size and h > min_size:  # Ignore very small contours
            letter_images.append(image[y:y+h, x:x+w])

    return letter_images

In [None]:
import random

def display_random_images(dataset, num_images=10):
    """Plot a random selection of (image, label) pairs in a grid.

    Args:
        dataset: Sequence of ``(image, label)`` pairs.
        num_images: Maximum number of pairs to draw. When the dataset holds
            fewer items, all of them are shown.

    The grid is five columns wide and grows by one row per five images, so
    the default of ten images yields the 2x5 layout on a 15x5 inch figure.
    """
    # random.sample raises ValueError when asked for more items than exist.
    count = max(0, min(num_images, len(dataset)))
    if count == 0:
        print("No images to display.")
        return

    random_samples = random.sample(dataset, count)

    cols = 5
    rows = (count + cols - 1) // cols  # ceiling division without importing math
    plt.figure(figsize=(15, 2.5 * rows))

    for i, (img, char) in enumerate(random_samples):
        plt.subplot(rows, cols, i + 1)
        plt.imshow(img, cmap='gray')
        plt.title(f"Label: {char}")

    plt.tight_layout()
    plt.show()

# NOTE(review): `letters_dataset` is assigned in the dataset-building cell that
# appears below this one, so on a fresh kernel that cell must run before this call.
display_random_images(letters_dataset)

In [None]:
# Build two datasets from the word annotation file:
#   * `dataset`         - [image_path, label] for every listed word image found on disk
#   * `letters_dataset` - (letter_crop, char) pairs, only for words whose number
#                         of extracted crops equals the label length
sentences_txt_path = os.path.join("Datasets", "words_new.txt")
sentences_folder_path = os.path.join("Datasets", "iam_words", "words")

# Only the first MAX_WORD_LINES lines of the annotation file are processed.
MAX_WORD_LINES = 8000

dataset, vocab, max_len = [], set(), 0
letters_dataset = []
# Read inside a context manager so the file handle is closed once loaded.
with open(sentences_txt_path, "r") as words_file:
    words = words_file.readlines()
ok = 0
nok = 0

for line in tqdm(words[:MAX_WORD_LINES]):
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    # Blank or truncated lines have no second field; skip them instead of
    # raising IndexError. Entries flagged 'err' are skipped as well.
    if len(line_split) < 2 or line_split[1] == 'err':
        continue

    # The image path is derived from the entry id (first field):
    # <id[:3]>/<first two dash-separated parts of id>/<id>.png
    folder1 = line_split[0][:3]
    folder2 = "-".join(line_split[0].split("-")[:2])
    file_name = line_split[0] + ".png"
    label = line_split[-1].rstrip("\n")

    rel_path = os.path.join(sentences_folder_path, folder1, folder2, file_name)
    if not os.path.exists(rel_path):
        print(f"File not found: {rel_path}")
        continue

    # The word-level entry is recorded even if letter segmentation fails below.
    dataset.append([rel_path, label])
    vocab.update(list(label))
    max_len = max(max_len, len(label))

    letter_images = extract_letters_from_word_image(rel_path)

    # Crops are paired with characters by position, so the pairing is only
    # attempted when the counts agree.
    if len(letter_images) != len(label):
        print(f"Mismatch for {rel_path} - SKIPPED")
        nok += 1
        continue

    print(f"Success for {rel_path}")
    ok += 1

    for img, char in zip(letter_images, label):
        letters_dataset.append((img, char))

print(f"OK: {ok} and NOK: {nok}")