In [1]:
# coding=utf-8
# Copyright 2024 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""EMNIST: Extending MNIST to handwritten letters."""

import os

import numpy as np
from six.moves import urllib
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
import tensorflow_datasets.public_api as tfds

# EMNIST constants

# Zip archive that the builder downloads and extracts; the per-config `.gz`
# files are then looked up inside its `gzip` subdirectory.
_EMNIST_URL = "https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip"

# BibTeX entry passed as the `citation` field of the dataset info.
_EMNIST_CITATION = """\
@article{cohen_afshar_tapson_schaik_2017,
    title={EMNIST: Extending MNIST to handwritten letters},
    DOI={10.1109/ijcnn.2017.7966217},
    journal={2017 International Joint Conference on Neural Networks (IJCNN)},
    author={Cohen, Gregory and Afshar, Saeed and Tapson, Jonathan and Schaik, Andre Van},
    year={2017}
}
"""


class EMNISTConfig(tfds.core.BuilderConfig):
    """Builder configuration describing a single EMNIST variant."""

    def __init__(self, *, class_number, train_examples, test_examples, **kwargs):
        """Record the per-variant sizes on top of the base config fields.

        Args:
            class_number: Number of label classes used by this variant. The
                dataset ships six variants, each with its own class count.
            train_examples: Number of examples in the train split.
            test_examples: Number of examples in the test split.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(**kwargs)
        self.train_examples = train_examples
        self.test_examples = test_examples
        self.class_number = class_number


class EMNIST(tfds.core.GeneratorBasedBuilder):
    """EMNIST dataset builder with one builder config per EMNIST variant."""

    URL = _EMNIST_URL
    VERSION = tfds.core.Version("3.1.0")
    RELEASE_NOTES = {
        "3.0.0": "New split API (https://tensorflow.org/datasets/splits)",
        "3.1.0": "Updated broken download URL",
    }
    BUILDER_CONFIGS = [
        EMNISTConfig(
            name="byclass",
            class_number=62,
            train_examples=697932,
            test_examples=116323,
            description="EMNIST ByClass",
        ),
        EMNISTConfig(
            name="bymerge",
            class_number=47,
            train_examples=697932,
            test_examples=116323,
            description="EMNIST ByMerge",
        ),
        EMNISTConfig(
            name="balanced",
            class_number=47,
            train_examples=112800,
            test_examples=18800,
            description="EMNIST Balanced",
        ),
        EMNISTConfig(
            name="letters",
            class_number=37,
            train_examples=88800,
            test_examples=14800,
            description="EMNIST Letters",
        ),
        EMNISTConfig(
            name="digits",
            class_number=10,
            train_examples=240000,
            test_examples=40000,
            description="EMNIST Digits",
        ),
        EMNISTConfig(
            name="mnist",
            class_number=10,
            train_examples=60000,
            test_examples=10000,
            description="EMNIST MNIST",
        ),
    ]

    def _info(self):
        """Return the dataset metadata for the selected builder config.

        The feature spec is a 28x28x1 image plus a class label whose number
        of classes comes from the active config's `class_number`.
        """
        return tfds.core.DatasetInfo(
            builder=self,
            description=(
                "The EMNIST dataset is a set of handwritten character digits "
                "derived from the NIST Special Database 19 and converted to "
                "a 28x28 pixel image format and dataset structure that directly "
                "matches the MNIST dataset.\n\n"
                "Note: Like the original EMNIST data, images provided here are "
                "inverted horizontally and rotated 90 anti-clockwise. You can use "
                "`tf.transpose` within `ds.map` to convert the images to a "
                "human-friendlier format."
            ),
            features=tfds.features.FeaturesDict({
                "image": tfds.features.Image(shape=(28, 28, 1)),
                "label": tfds.features.ClassLabel(
                    num_classes=self.builder_config.class_number
                ),
            }),
            supervised_keys=("image", "label"),
            homepage=(
                "https://www.nist.gov/itl/products-and-services/emnist-dataset"
            ),
            citation=_EMNIST_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and describe the train and test splits.

        Args:
            dl_manager: Download manager used to fetch and extract `URL` and
                then to extract the four per-config `.gz` files.

        Returns:
            A list with the TRAIN and TEST `SplitGenerator`s. Their
            `gen_kwargs` are the arguments later passed to
            `_generate_examples`.
        """
        filenames = {
            "train_data": "emnist-{}-train-images-idx3-ubyte.gz".format(
                self.builder_config.name
            ),
            "train_labels": "emnist-{}-train-labels-idx1-ubyte.gz".format(
                self.builder_config.name
            ),
            "test_data": "emnist-{}-test-images-idx3-ubyte.gz".format(
                self.builder_config.name
            ),
            "test_labels": "emnist-{}-test-labels-idx1-ubyte.gz".format(
                self.builder_config.name
            ),
        }

        # The per-config files live in the "gzip" folder of the extracted zip
        # and are themselves gzip-compressed, hence the second extraction.
        dir_name = os.path.join(dl_manager.download_and_extract(self.URL), "gzip")
        extracted = dl_manager.extract(
            {k: os.path.join(dir_name, fname) for k, fname in filenames.items()}
        )

        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs=dict(
                    num_examples=self.builder_config.train_examples,
                    data_path=extracted["train_data"],
                    label_path=extracted["train_labels"],
                ),
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs=dict(
                    num_examples=self.builder_config.test_examples,
                    data_path=extracted["test_data"],
                    label_path=extracted["test_labels"],
                ),
            ),
        ]

    def _generate_examples(self, num_examples, data_path, label_path):
        """Yield `(key, example)` pairs for one split.

        Without this method the builder stays abstract and cannot be
        instantiated; it consumes the `gen_kwargs` built in
        `_split_generators`.

        Args:
            num_examples: Number of images/labels to read from the files.
            data_path: Path to the extracted image file of the split.
            label_path: Path to the extracted label file of the split.

        Yields:
            Tuples of the example index and a dict with the keys "image"
            and "label".
        """
        images = _extract_mnist_images(data_path, num_examples)
        labels = _extract_mnist_labels(label_path, num_examples)

        # Using index as key since data is always loaded in same order.
        for index, (image, label) in enumerate(zip(images, labels)):
            yield index, {"image": image, "label": label}


def _extract_mnist_images(image_filepath, num_images):
    """Read `num_images` 28x28 images from an image file.

    Args:
        image_filepath: Path of the (already decompressed) image file.
        num_images: Number of images to read after the header.

    Returns:
        A `uint8` array of shape `(num_images, 28, 28, 1)`.
    """
    side = 28
    with tf.io.gfile.GFile(image_filepath, "rb") as reader:
        reader.read(16)  # Skip the header.
        raw_pixels = reader.read(side * side * num_images)
    pixels = np.frombuffer(raw_pixels, dtype=np.uint8)
    return pixels.reshape(num_images, side, side, 1)


def _extract_mnist_labels(labels_filepath, num_labels):
    """Read `num_labels` one-byte labels from a label file.

    Args:
        labels_filepath: Path of the (already decompressed) label file.
        num_labels: Number of labels to read after the header.

    Returns:
        A one-dimensional `int64` array with one entry per byte read.
    """
    with tf.io.gfile.GFile(labels_filepath, "rb") as reader:
        reader.read(8)  # Skip the header.
        raw_labels = reader.read(num_labels)
    return np.frombuffer(raw_labels, dtype=np.uint8).astype(np.int64)

In [6]:
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt

class EMNIST(tfds.core.GeneratorBasedBuilder):
    """EMNIST dataset builder, re-declared in this cell with all its methods.

    Each entry of `BUILDER_CONFIGS` selects one EMNIST variant together with
    its class count and train/test split sizes.
    """

    URL = _EMNIST_URL
    VERSION = tfds.core.Version("3.1.0")
    RELEASE_NOTES = {
        "3.0.0": "New split API (https://tensorflow.org/datasets/splits)",
        "3.1.0": "Updated broken download URL",
    }
    BUILDER_CONFIGS = [
        EMNISTConfig(
            name="byclass",
            class_number=62,
            train_examples=697932,
            test_examples=116323,
            description="EMNIST ByClass",
        ),
        EMNISTConfig(
            name="bymerge",
            class_number=47,
            train_examples=697932,
            test_examples=116323,
            description="EMNIST ByMerge",
        ),
        EMNISTConfig(
            name="balanced",
            class_number=47,
            train_examples=112800,
            test_examples=18800,
            description="EMNIST Balanced",
        ),
        EMNISTConfig(
            name="letters",
            class_number=37,
            train_examples=88800,
            test_examples=14800,
            description="EMNIST Letters",
        ),
        EMNISTConfig(
            name="digits",
            class_number=10,
            train_examples=240000,
            test_examples=40000,
            description="EMNIST Digits",
        ),
        EMNISTConfig(
            name="mnist",
            class_number=10,
            train_examples=60000,
            test_examples=10000,
            description="EMNIST MNIST",
        ),
    ]

    def _info(self):
        """Build the `DatasetInfo` for the active builder config.

        Features are a 28x28x1 image and a class label sized by the config's
        `class_number`; `("image", "label")` are the supervised keys.
        """
        return tfds.core.DatasetInfo(
            builder=self,
            description=(
                "The EMNIST dataset is a set of handwritten character digits "
                "derived from the NIST Special Database 19 and converted to "
                "a 28x28 pixel image format and dataset structure that directly "
                "matches the MNIST dataset.\n\n"
                "Note: Like the original EMNIST data, images provided here are "
                "inverted horizontally and rotated 90 anti-clockwise. You can use "
                "`tf.transpose` within `ds.map` to convert the images to a "
                "human-friendlier format."
            ),
            features=tfds.features.FeaturesDict({
                "image": tfds.features.Image(shape=(28, 28, 1)),
                "label": tfds.features.ClassLabel(
                    num_classes=self.builder_config.class_number
                ),
            }),
            supervised_keys=("image", "label"),
            homepage=(
                "https://www.nist.gov/itl/products-and-services/emnist-dataset"
            ),
            citation=_EMNIST_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the data files and declare the train and test splits.

        Args:
            dl_manager: Download manager used to download/extract `URL` and
                to extract the per-config `.gz` files found inside it.

        Returns:
            A two-element list of `SplitGenerator`s (TRAIN, TEST) whose
            `gen_kwargs` match the parameters of `_generate_examples`.
        """
        filenames = {
            "train_data": "emnist-{}-train-images-idx3-ubyte.gz".format(
                self.builder_config.name
            ),
            "train_labels": "emnist-{}-train-labels-idx1-ubyte.gz".format(
                self.builder_config.name
            ),
            "test_data": "emnist-{}-test-images-idx3-ubyte.gz".format(
                self.builder_config.name
            ),
            "test_labels": "emnist-{}-test-labels-idx1-ubyte.gz".format(
                self.builder_config.name
            ),
        }

        # Files sit in the "gzip" directory of the extracted archive and are
        # individually gzip-compressed, so they need their own extraction.
        dir_name = os.path.join(dl_manager.download_and_extract(self.URL), "gzip")
        extracted = dl_manager.extract(
            {k: os.path.join(dir_name, fname) for k, fname in filenames.items()}
        )

        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs=dict(
                    num_examples=self.builder_config.train_examples,
                    data_path=extracted["train_data"],
                    label_path=extracted["train_labels"],
                ),
            ),
            tfds.core.SplitGenerator(
                name=tfds.Split.TEST,
                gen_kwargs=dict(
                    num_examples=self.builder_config.test_examples,
                    data_path=extracted["test_data"],
                    label_path=extracted["test_labels"],
                ),
            ),
        ]

    def _generate_examples(self, num_examples, data_path, label_path):
        """Yield `(key, example)` pairs for one split.

        Args:
            num_examples: Number of images/labels to read from the files.
            data_path: Path to the extracted image file of the split.
            label_path: Path to the extracted label file of the split.

        Yields:
            Tuples of the example index and a dict with the keys "image"
            and "label".
        """
        images = _extract_mnist_images(data_path, num_examples)
        labels = _extract_mnist_labels(label_path, num_examples)

        # Using index as key since data is always loaded in same order.
        # `zip` is consumed lazily; materialising it into a list would only
        # allocate one throwaway tuple per example.
        for index, (image, label) in enumerate(zip(images, labels)):
            yield index, {"image": image, "label": label}

# Load the EMNIST "byclass" train split together with its metadata.
dataset, info = tfds.load('emnist/byclass', split='train', with_info=True)

# Print the dataset information.
print(info)

# Fetch sample images and labels from the dataset and display them.
for example in dataset.take(5):  # Only the first 5 samples are used.
    # Without `as_supervised=True`, each example is a dict keyed by feature
    # name, so the image and label must be pulled out before use.
    image, label = example["image"], example["label"]

    # NOTE: the dataset description says EMNIST images are stored
    # flipped/rotated; they are shown here as stored.
    plt.imshow(image.numpy().squeeze(), cmap="gray")  # (28, 28, 1) -> (28, 28)
    plt.title("Label: {}".format(int(label.numpy())))
    plt.axis("off")
    plt.show()


TypeError: Can't instantiate abstract class EMNIST with abstract methods _info, _split_generators