In [9]:
# Besides scikit-learn, TensorFlow is another tool that 
# we can use for machine learning projects. For similar reasons, 
# there is also a dataset API for TensorFlow that gives you 
# the dataset in a format that works best with TensorFlow. 
# Unlike scikit-learn, the API is not part of the standard 
# TensorFlow package. You need to install it using the command:

# pip install tensorflow-datasets



In [10]:
import tensorflow_datasets as tfds 
print(tfds.list_builders()) # prints more than 1,000 names.

['abstract_reasoning', 'accentdb', 'aeslc', 'aflw2k3d', 'ag_news_subset', 'ai2_arc', 'ai2_arc_with_ir', 'amazon_us_reviews', 'anli', 'answer_equivalence', 'arc', 'asqa', 'asset', 'assin2', 'bair_robot_pushing_small', 'bccd', 'beans', 'bee_dataset', 'beir', 'big_patent', 'bigearthnet', 'billsum', 'binarized_mnist', 'binary_alpha_digits', 'ble_wind_field', 'blimp', 'booksum', 'bool_q', 'bucc', 'c4', 'c4_wsrs', 'caltech101', 'caltech_birds2010', 'caltech_birds2011', 'cardiotox', 'cars196', 'cassava', 'cats_vs_dogs', 'celeb_a', 'celeb_a_hq', 'cfq', 'cherry_blossoms', 'chexpert', 'cifar10', 'cifar100', 'cifar100_n', 'cifar10_1', 'cifar10_corrupted', 'cifar10_n', 'citrus_leaves', 'cityscapes', 'civil_comments', 'clevr', 'clic', 'clinc_oos', 'cmaterdb', 'cnn_dailymail', 'coco', 'coco_captions', 'coil100', 'colorectal_histology', 'colorectal_histology_large', 'common_voice', 'conll2002', 'conll2003', 'controlled_noisy_web_labels', 'coqa', 'cos_e', 'cosmos_qa', 'covid19', 'covid19sum', 'crema_d

In [11]:
import tensorflow_datasets as tfds
ds = tfds.load("mnist", split="train", shuffle_files=True)
print(ds)

<PrefetchDataset element_spec={'image': TensorSpec(shape=(28, 28, 1), dtype=tf.uint8, name=None), 'label': TensorSpec(shape=(), dtype=tf.int64, name=None)}>


In [12]:
# In particular, this dataset has the data instances (images)
# in a numpy array of shapes (28,28,1), and the targets (labels) are scalars.
# With minor polishing, the data is ready for use in the Keras fit() function. An example is as follows:



import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, AveragePooling2D, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping

# Read data with train-test split
ds_train, ds_test = tfds.load("mnist", split=['train', 'test'],
                              shuffle_files=True, as_supervised=True)

# Set up BatchDataset from the OptionsDataset object
ds_train = ds_train.batch(32)
ds_test = ds_test.batch(32)

# Build LeNet5 model and fit
model = Sequential([
    Conv2D(6, (5,5), input_shape=(28,28,1), padding="same", activation="tanh"),
    AveragePooling2D((2,2), strides=2),
    Conv2D(16, (5,5), activation="tanh"),
    AveragePooling2D((2,2), strides=2),
    Conv2D(120, (5,5), activation="tanh"),
    Flatten(),
    Dense(84, activation="tanh"),
    Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["sparse_categorical_accuracy"])
earlystopping = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
model.fit(ds_train, validation_data=ds_test, epochs=100, callbacks=[earlystopping])


# If we provided as_supervised=True, the dataset would be
# records of tuples (features, targets) instead of the dictionary. 
# It is required for Keras. Moreover, to use the dataset in the fit() function, 
# we need to create an iterable of batches. This is done by setting up the 
# batch size of the dataset to convert it from OptionsDataset object into BatchDataset object.
# We applied the LeNet5 model for the image classification. But since the target in the dataset 
# is a numerical value (0 to 9) rather than a Boolean vector, we ask Keras to convert the softmax
# output vector into a number before computing accuracy and loss by specifying sparse_categorical_accuracy
# and sparse_categorical_crossentropy in the compile() function.
# The key here is to understand every dataset is in a different shape.
# When you use it with your TensorFlow model, you need to adapt your model to fit the dataset.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


<keras.callbacks.History at 0x243cad061c8>