In [1]:
import pathlib
import pandas as pd
from PIL import Image
from tqdm import tqdm
from datasets import Dataset, DatasetDict, Features, Value, Image as ImageFeature

In [2]:
# Per-split folders: each one holds the source images plus an "ann_file.jsonl"
# annotation file that the cells below check for and read.
TRAIN_IMAGE_FOLDER = "./train"
TEST_IMAGE_FOLDER = "./test"

In [3]:
# Fail fast when either split is missing its annotation file.
for _image_folder in (TRAIN_IMAGE_FOLDER, TEST_IMAGE_FOLDER):
    assert pathlib.Path(_image_folder, "ann_file.jsonl").exists(), "Please generate the data first."

In [4]:
# Characters accepted in a "numbers only" transcription: the ten decimal
# digits plus the punctuation marks listed in `symbols`.
symbols = {",", ".", ";", ":"}
digits = set("0123456789")
vocab = symbols | digits


def isdigit(char):
    """Return True when ``char`` is a member of ``vocab``.

    Despite the name, the punctuation in ``symbols`` is accepted as well as
    the decimal digits.
    """
    return char in vocab


def disclose_data(df: pd.DataFrame, image_folder: str) -> pd.DataFrame:
    """Expand per-image annotations into one row per cropped text region.

    Parameters
    ----------
    df : pd.DataFrame
        One row per source image. Each row must provide a ``"filename"`` entry
        and an ``"annotations"`` iterable whose items carry a ``"bbox"``
        (unpacked as ``x, y, w, h``) and a ``"text"`` string.
    image_folder : str
        Directory that ``"filename"`` is resolved against.

    Returns
    -------
    pd.DataFrame
        Columns ``"image"`` (the crop passed through
        ``ImageFeature().encode_example``), ``"text"`` and ``"numbers_only"``
        (True when every character of the text passes ``isdigit``).
        Annotations whose text is empty or whitespace-only are skipped. The
        columns are present even when no annotation qualifies, so selecting
        them downstream never raises ``KeyError``.
    """
    columns = ["image", "text", "numbers_only"]

    # The feature encoders do not depend on the row, so build them once
    # instead of once per annotation.
    image_encoder = ImageFeature()
    text_encoder = Value("string")

    metadata = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Opened lazily: rows without any usable annotation never touch disk.
        image = None

        try:
            for ann in row["annotations"]:
                x, y, w, h = ann["bbox"]
                text = ann["text"]

                if text.strip() == "":
                    continue

                numbers_only = all(isdigit(char) for char in text)

                if image is None:
                    image = Image.open(pathlib.Path(image_folder) / row["filename"])

                metadata.append(
                    {
                        "image": image_encoder.encode_example(
                            image.crop((x, y, x + w, y + h)).copy()
                        ),
                        "text": text_encoder.encode_example(text),
                        "numbers_only": numbers_only,
                    }
                )
        finally:
            # Release the source image explicitly; the crops were copied and
            # encoded above, so they do not depend on it any more.
            if image is not None:
                image.close()

    return pd.DataFrame(metadata, columns=columns)

In [None]:
# Read the training annotations (one JSON record per line) and expand them
# into one row per annotated crop.
train_dataframe = pd.read_json(
    pathlib.Path(TRAIN_IMAGE_FOLDER, "ann_file.jsonl"),
    lines=True,
)
train_dataset = disclose_data(df=train_dataframe, image_folder=TRAIN_IMAGE_FOLDER)

In [None]:
# Read the test annotations (one JSON record per line) and expand them into
# one row per annotated crop.
test_dataframe = pd.read_json(
    pathlib.Path(TEST_IMAGE_FOLDER, "ann_file.jsonl"),
    lines=True,
)
test_dataset = disclose_data(df=test_dataframe, image_folder=TEST_IMAGE_FOLDER)

In [9]:
# Schema shared by both splits: the encoded crop and its transcription.
features = Features(
    {
        "image": ImageFeature(),
        "text": Value("string"),
    }
)

# Build each split from its "image"/"text" columns; the pandas index is dropped.
hf_dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(
            split_frame[["image", "text"]],
            preserve_index=False,
            features=features,
        )
        for split_name, split_frame in (
            ("train", train_dataset),
            ("test", test_dataset),
        )
    }
)

In [None]:
# Upload both splits to the "MiXaiLL76/SVHN_OCR" dataset repository.
# NOTE(review): presumably requires an authenticated Hugging Face session with
# write access to that repository — confirm before running.
hf_dataset.push_to_hub("MiXaiLL76/SVHN_OCR")