In [None]:
import pathlib
import pandas as pd
from PIL import Image
from tqdm import tqdm
from datasets import Dataset, DatasetDict, Features, Value, Image as ImageFeature

In [2]:
# Root folder of the generated dataset: the cells below read "ann_file.jsonl"
# from it and resolve each annotated image filename against it.
IMAGE_FOLDER = "./7_seg_dataset"

In [3]:
# The annotation index must already be on disk; stop here with a clear
# message if the dataset has not been generated yet.
assert pathlib.Path(IMAGE_FOLDER, "ann_file.jsonl").exists(), (
    "Please generate the data first."
)

In [4]:
def disclose_data(df: pd.DataFrame, image_folder: str) -> pd.DataFrame:
    """Expand per-image annotations into one encoded crop per text label.

    Args:
        df: One row per image. Each row is read for ``filename`` (resolved
            against ``image_folder``) and ``annotations``, an iterable of
            mappings providing ``bbox`` as ``(x, y, w, h)`` and ``text``.
        image_folder: Directory that the ``filename`` values are relative to.

    Returns:
        A DataFrame with exactly the columns ``image`` and ``text``, one row
        per annotation whose text is not blank. The columns are present even
        when no annotation is kept, so callers can always select them.
    """
    root = pathlib.Path(image_folder)
    # Loop-invariant: both encoders are built with the same arguments for
    # every crop, so construct them once instead of once per annotation.
    image_feature = ImageFeature()
    text_feature = Value("string")
    records = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        with Image.open(root / row["filename"]) as image:
            for ann in row["annotations"]:
                x, y, w, h = ann["bbox"]
                text = ann["text"]

                # Whitespace-only annotations carry no label to learn from.
                if text.strip() == "":
                    continue

                records.append(
                    {
                        # PIL expects (left, upper, right, lower), while the
                        # annotation stores the box as origin + size.
                        "image": image_feature.encode_example(
                            image.crop((x, y, x + w, y + h))
                        ),
                        "text": text_feature.encode_example(text),
                    }
                )

    # Explicit columns keep the schema stable when `records` is empty;
    # otherwise the result would be a column-less frame and a later
    # `[["image", "text"]]` selection would raise KeyError.
    return pd.DataFrame(records, columns=["image", "text"])

In [None]:
# ann_file.jsonl is parsed as JSON Lines (one record per line), hence lines=True.
train_dataframe = pd.read_json(
    pathlib.Path(IMAGE_FOLDER).joinpath("ann_file.jsonl"),
    lines=True,
)
# Expand the per-image records into one encoded crop per non-blank annotation.
train_dataset = disclose_data(df=train_dataframe, image_folder=IMAGE_FOLDER)

In [6]:
# Schema of the published dataset: an image column plus its string label.
features = Features({"image": ImageFeature(), "text": Value("string")})

# Only a "train" split is built. preserve_index=False keeps the pandas index
# out of the resulting dataset.
hf_dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(
            train_dataset.loc[:, ["image", "text"]],
            features=features,
            preserve_index=False,
        )
    }
)

In [None]:
# Echo the DatasetDict as the cell output to inspect it before uploading.
hf_dataset

In [None]:
# Upload the dataset to the Hub repository "MiXaiLL76/7SEG_OCR".
# NOTE(review): presumably requires an authenticated session with write
# access to that repository — confirm before running.
hf_dataset.push_to_hub("MiXaiLL76/7SEG_OCR")