In [2]:
import os
import glob
import numpy as np

# Define the path to your training data.
# The '../' goes up one level from 'notebooks/' to the project root.
train_data_path = "../data/main/train/"

# Collect every captcha image path. glob makes no ordering guarantee, so the
# result is sorted to keep the image order reproducible between runs.
image_paths = sorted(glob.glob(os.path.join(train_data_path, "*.png")))
print(f"Found {len(image_paths)} images in the training folder.")

# --- Task: Extract all labels and create the character vocabulary ---

# 1. Get all labels from the filenames.
# A label is the part of the file name that precedes the first '-'. The
# extension is stripped first so that a file name without any '-' (for example
# "ab12.png") yields "ab12" instead of leaking ".png" into the label and, from
# there, '.' into the vocabulary. File names that do contain '-' give exactly
# the same label as before.
labels = [
    os.path.splitext(os.path.basename(path))[0].split('-')[0]
    for path in image_paths
]

# 2. Find all unique characters present in the dataset.
# Joining the labels and building a set collects each distinct character once.
all_characters = set("".join(labels))

# Sort the characters so the character <-> index mapping is deterministic.
vocabulary = sorted(all_characters)

print(f"\nNumber of unique characters found: {len(vocabulary)}")
print(f"Vocabulary: {''.join(vocabulary)}")

# 3. Create the character-to-number and number-to-character dictionaries.
# Indices start at 1: index 0 is deliberately left unassigned so that it stays
# free for the CTC 'blank' token. No placeholder entry for 0 is stored in
# either dictionary.
# NOTE(review): the loss/decoder used later must also treat 0 as the blank
# index - confirm against the model code.
char_to_num = {char: index for index, char in enumerate(vocabulary, start=1)}
# Derived from char_to_num so the two mappings are always exact inverses.
num_to_char = {index: char for char, index in char_to_num.items()}

print("\nCharacter to Number Mapping (char_to_num):")
print(char_to_num)

print("\nNumber to Character Mapping (num_to_char):")
print(num_to_char)

Found 8010 images in the training folder.

Number of unique characters found: 36
Vocabulary: 0123456789abcdefghijklmnopqrstuvwxyz

Character to Number Mapping (char_to_num):
{'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'a': 11, 'b': 12, 'c': 13, 'd': 14, 'e': 15, 'f': 16, 'g': 17, 'h': 18, 'i': 19, 'j': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'q': 27, 'r': 28, 's': 29, 't': 30, 'u': 31, 'v': 32, 'w': 33, 'x': 34, 'y': 35, 'z': 36}

Number to Character Mapping (num_to_char):
{1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: 'a', 12: 'b', 13: 'c', 14: 'd', 15: 'e', 16: 'f', 17: 'g', 18: 'h', 19: 'i', 20: 'j', 21: 'k', 22: 'l', 23: 'm', 24: 'n', 25: 'o', 26: 'p', 27: 'q', 28: 'r', 29: 's', 30: 't', 31: 'u', 32: 'v', 33: 'w', 34: 'x', 35: 'y', 36: 'z'}
