# This notebook prepares the data for training the CRNN model (word recognition in OCR).

In [11]:
import os, sys
import json
import random
import torch
from torch.utils.data import Dataset
from PIL import Image

# Put the sibling ../src and ../data directories on the module search path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends duplicate entries to sys.path.
for _rel_dir in ('../src', '../data'):
    _abs_dir = os.path.abspath(_rel_dir)
    if _abs_dir not in sys.path:
        sys.path.append(_abs_dir)

In [5]:
# Count the entries in the cropped_words directory (one file per cropped word).
cropped_words_dir = "../data/cropped_words/"
word_files = os.listdir(cropped_words_dir)
num_word_files = len(word_files)
print(f"Number of cropped word images: {num_word_files}")

Number of cropped word images: 2186


In [8]:
# Extract character set from JSON annotations
def extract_charset(json_path):
    """Collect every distinct character used in the annotated words.

    The annotation file is read as a JSON object whose values are lists of
    entries; the characters of each entry's ``"word"`` value are gathered
    into one set.

    Args:
        json_path: Path to the JSON annotation file.

    Returns:
        A sorted list of the unique characters found across all words.

    Raises:
        KeyError: If an entry has no ``"word"`` key.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform's locale encoding, which can mis-decode or reject non-ASCII text.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    unique_chars = {
        ch
        for words in data.values()
        for entry in words
        for ch in entry["word"]
    }
    return sorted(unique_chars)

# Build vocabulary mappings from character set
def build_vocab(chars):
    """Create character <-> index lookup tables with index 0 kept for blank.

    Args:
        chars: Sequence of characters; position ``i`` is assigned index ``i + 1``.

    Returns:
        A tuple ``(char2idx, idx2char, blank_idx, num_classes)`` where
        ``blank_idx`` is 0 and ``num_classes`` is ``len(chars) + 1``.
    """
    blank_idx = 0  # index 0 is reserved for the blank symbol

    char2idx = {}
    idx2char = {}
    # Character indices start at 1 so they never collide with the blank index.
    for idx, ch in enumerate(chars, start=1):
        char2idx[ch] = idx
        idx2char[idx] = ch

    num_classes = 1 + len(chars)
    return char2idx, idx2char, blank_idx, num_classes

# Derive the character set from the annotation file, then the vocabulary from it.
json_annotations_path = "../data/filename_to_word_files.json"

chars = extract_charset(json_annotations_path)
print(f"Extracted {len(chars)} unique characters from annotations.")
print(chars)

vocab = build_vocab(chars)
char2idx, idx2char, blank_idx, num_classes = vocab
print(f"Number of classes (including blank): {num_classes}")

Extracted 78 unique characters from annotations.
[' ', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '>', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Number of classes (including blank): 79


In [9]:
# Split receipts into training and validation sets - we do this at the receipt level to avoid data leakage
def split_receipts(json_path, train_ratio=0.8, seed=42):
    """Shuffle the receipt IDs and split them into train and validation lists.

    The receipt IDs are the top-level keys of the JSON annotation file. They
    are shuffled with a seeded generator and the first
    ``int(len(receipts) * train_ratio)`` of them become the training set.

    Args:
        json_path: Path to the JSON annotation file.
        train_ratio: Fraction of receipts assigned to training, in [0, 1].
        seed: Seed for the shuffle; the same seed and key order give the
            same split.

    Returns:
        A tuple ``(train_receipts, val_receipts)`` of receipt-ID lists.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    receipts = list(data.keys())

    # A private generator yields the same order as random.seed(seed) followed
    # by random.shuffle, but leaves the global random state untouched.
    rng = random.Random(seed)
    rng.shuffle(receipts)

    n_train = int(len(receipts) * train_ratio)
    return receipts[:n_train], receipts[n_train:]

# Build samples list from receipt IDs
def build_samples(json_path, receipt_ids):
    """Flatten the annotations of the given receipts into a list of samples.

    Args:
        json_path: Path to the JSON annotation file, keyed by receipt ID.
        receipt_ids: Iterable of receipt IDs whose entries should be included.

    Returns:
        A list of ``(word_file, word)`` tuples, in the order of
        ``receipt_ids`` and then of each receipt's entries. The values are
        returned exactly as stored in the annotation file.

    Raises:
        KeyError: If a receipt ID is not in the file, or an entry lacks the
            ``"word_file"`` or ``"word"`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [
        (entry["word_file"], entry["word"])
        for rid in receipt_ids
        for entry in data[rid]
    ]


# Split at the receipt level first, then expand each split into word samples.
train_receipts, val_receipts = split_receipts(
    json_annotations_path,
    train_ratio=0.8,
    seed=42,
)
print(f"Number of training receipts: {len(train_receipts)}")
print(f"Number of validation receipts: {len(val_receipts)}")

train_samples = build_samples(json_annotations_path, train_receipts)
val_samples = build_samples(json_annotations_path, val_receipts)
print(f"Number of training samples (words): {len(train_samples)}")
print(f"Number of validation samples (words): {len(val_samples)}")

Number of training receipts: 80
Number of validation receipts: 20
Number of training samples (words): 1754
Number of validation samples (words): 432


In [10]:
# Show the first training sample as a (word_file, word) pair.
# NOTE(review): the recorded output shows a path starting with
# 'data/cropped_words/', while this notebook reads from '../data/cropped_words/'.
# Presumably the stored paths are relative to the project root - confirm before
# opening images from this notebook's working directory.
first_train_sample = train_samples[0]
print("Example training sample:", first_train_sample)

Example training sample: ('data/cropped_words/dev_receipt_00026_word_0.png', 'Rp.')
