<a href="https://colab.research.google.com/github/https-deeplearning-ai/tensorflow-2-public/blob/adding_C3/C3/W1/ungraded_labs/C3_W1_Lab_1_tfds_hello_world.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TFDS Data Pipelines

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In this notebook we will look at a simple data-pipeline scenario using TensorFlow Datasets (TFDS). We'll use TFDS to perform the extract, transform, and load (ETL) steps for the MNIST dataset.

## Setup

We'll start by importing TensorFlow, TensorFlow Datasets, and Matplotlib.

In [1]:
%%bash
# Upgrade the packaging tools first so the installs below run with a current pip/wheel.
pip install -qU pip wheel
# Install (or upgrade) TensorFlow and TensorFlow Datasets.
# NOTE(review): the `tensorflow-gpu` distribution appears to have been retired on PyPI in
# favour of plain `tensorflow` -- confirm this line still installs on a fresh environment.
pip install -qU tensorflow-gpu tensorflow-datasets
# Install (or upgrade) the numeric and plotting libraries.
pip install -qU numpy pandas matplotlib seaborn
# Report installed packages whose dependencies are missing or incompatible.
pip check

No broken requirements found.


In [None]:
# Request TensorFlow 2.x through the `%tensorflow_version` line magic.
# NOTE(review): this magic is presumably only provided by Google Colab -- confirm.
try:
    %tensorflow_version 2.x
except Exception:
    # Deliberate best effort: any failure of the magic is ignored so the cell still runs.
    pass

In [None]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('whitegrid')
sns.set(font='DejaVu Sans')

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

print("\u2022 Using TensorFlow Version:", tf.__version__)

In [None]:
# Enable memory growth on every visible GPU and report how many devices were found.
gpus = tf.config.list_physical_devices('GPU')
try:
    # The memory-growth setting has to be identical on all GPUs, so apply it to each one.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    # Only report device counts when at least one GPU is present.
    if gpus:
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Raised when memory growth is configured after the GPUs were already initialized.
    print(e)

## Extract - Transform - Load (ETL)

Now we'll run the **ETL** code. First, to perform the **Extract** process we use `tfds.load`. This handles everything from downloading the raw data to parsing and splitting it, giving us a dataset. Next, we perform the **Transform** process. In this simple example, our transform process consists of scaling the pixel values to the range [0, 1] and shuffling the dataset. Finally, we **Load** a few records by using the `take(10)` method. In this case, each record consists of an image and its corresponding label. After loading the records we print them; the commented-out loop in the ETL cell can be used instead to plot each image and print its corresponding label.

In [None]:
# Display the names of the datasets available through TFDS.
tfds.list_builders()

In [None]:
# Pick dataset: look up the MNIST builder by name
mnist_builder = tfds.builder('mnist')
# Download the raw data and prepare it for reading
mnist_builder.download_and_prepare()
# Extract dataset: build the training split; being the cell's last expression,
# the resulting dataset object is displayed as the cell output
mnist_builder.as_dataset(split=tfds.Split.TRAIN)

In [None]:
# Load MNIST together with its metadata: with_info=True makes tfds.load
# return a (dataset, info) pair instead of the dataset alone
dataset, info = tfds.load(name='mnist', with_info=True)
# Display the dataset metadata
info

In [None]:
# Read the size of the 'train' split from the metadata object
# (`info` must already have been defined by an earlier cell)
print('Number of training examples:', info.splits['train'].num_examples)

In [None]:
# Construct MNIST without naming a split
# (presumably this returns every split, keyed by split name -- TODO confirm)
dataset = tfds.load(name='mnist')
# Inspecting shapes and datatypes: the object's repr is shown as the cell output
dataset

In [None]:
# EXTRACT
# Construct a tf.data.Dataset by downloading and extracting the MNIST 'test' split
dataset = tfds.load(name='mnist', split='test')
# Checking if the dataset is an instance of tf.data.Dataset
assert isinstance(dataset, tf.data.Dataset)

def scale(elem):
    """Return a copy of a dataset element with its image scaled by 1/255.

    Args:
        elem: A dataset element: a dict that contains at least an 'image' entry.

    Returns:
        A new dict with the same keys as `elem`. The 'image' entry is cast to
        tf.float32 and divided by 255.0; every other entry is passed through
        unchanged. The input dict itself is left unmodified.
    """
    image = tf.cast(elem['image'], tf.float32) / 255.0
    # Build a fresh dict instead of assigning into the argument in place.
    return {**elem, 'image': image}

# TRANSFORM
dataset = dataset.map(scale)
dataset = dataset.shuffle(100) # shuffle buffer size (not the number of samples)
# dataset = dataset.repeat(1) # number epochs
# dataset = dataset.batch(1) # batch size

# LOAD
# Fetch 10 samples from the dataset; repeat(1) means a single pass over them
iterator = dataset.take(10).cache().repeat(1)
# Alternative: uncomment to plot each image and print its label instead
# for data in iterator:
#     image = data['image'].numpy().squeeze()
#     label = data['label'].numpy()
#
#     print("Label: {}".format(label))
#     plt.imshow(image, cmap=plt.cm.binary)
#     plt.show()
for data in iterator:
    print(data)

In [None]:
# Load MNIST with as_supervised=True so that each element unpacks as (image, label)
dataset = tfds.load(name='mnist', as_supervised=True)
# Inspecting the shapes of one element of the 'train' split
# (no .batch() is applied here, so this is a single image and label, not a batch)
for image, label in dataset['train'].take(1):
    print(image.shape, label.shape)

## Fashion MNIST

In [None]:
# Load Fashion MNIST through the Keras datasets API; the result unpacks into
# a (train, test) pair, each of which is an (images, labels) pair
(x_train, y_train), (x_test, y_test) = \
    tf.keras.datasets.fashion_mnist.load_data()

# Show the shapes of the training images and training labels
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# --- Data -------------------------------------------------------------------
# Load both Fashion MNIST splits through TFDS and convert them to NumPy;
# each split unpacks as an (images, labels) pair.
train_split, test_split = tfds.as_numpy(
    tfds.load(name='fashion_mnist', split=['train', 'test'],
              batch_size=-1, as_supervised=True))
x_train, y_train = train_split
x_test, y_test = test_split

# Scale the pixel values by 1/255
x_train, x_test = x_train / 255.0, x_test / 255.0

# Reserve the last 10,000 training samples for validation
n_valid = 10000
x_train, x_valid = x_train[:-n_valid], x_train[-n_valid:]
y_train, y_valid = y_train[:-n_valid], y_train[-n_valid:]

print("x_train shape:", x_train.shape)
print("x_valid shape:", x_valid.shape)
print("x_test shape:", x_test.shape)

# --- Model ------------------------------------------------------------------
# Flatten -> Dense(128, relu) -> Dropout(0.2) -> Dense(10, softmax)
inputs = tf.keras.Input(shape=(28, 28, 1))
hidden_layers = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
]
h = inputs
for layer in hidden_layers:
    h = layer(h)
outputs = tf.keras.layers.Dense(10, activation=tf.nn.softmax)(h)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

# Integer class labels -> sparse categorical loss and accuracy
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()
metric_fns = [tf.keras.metrics.SparseCategoricalAccuracy()]
model.compile(loss=loss_fn, optimizer=optimizer, metrics=metric_fns)

# --- Training ---------------------------------------------------------------
history = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    epochs=5, batch_size=64, verbose=0)

# --- Learning curves --------------------------------------------------------
# One panel per metric, each with a training and a validation curve
history_df = pd.DataFrame(history.history, index=history.epoch)
_, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for metric, ax in zip(['loss', 'sparse_categorical_accuracy'], axs):
    for column, curve_label in ((metric, 'Train'), ('val_' + metric, 'Valid')):
        sns.lineplot(ax=ax, data=history_df, x=history_df.index,
                     y=column, label=curve_label)
    ax.set_xlabel('epoch')
    ax.set_title(metric)
plt.show()

# --- Evaluation -------------------------------------------------------------
# `result` holds [loss, accuracy] for the test split
result = model.evaluate(x_test, y_test,
                        batch_size=128, verbose=0)
print(f"Test loss: {result[0]:.4f} \n"
    f"Test accuracy: {result[1]:.4f}")

# Predicted class and its confidence (in percent) for the first three test images
predict = model.predict(x_test[:3])
print(f"Predict: {np.argmax(predict, axis=1)}\n"
    f"Confidence: {100*np.max(predict, axis=1)}")

## Horses or Humans

In [None]:
# Load 'horses_or_humans' only for its metadata: the dataset half of the
# (dataset, info) pair is discarded into `_`
_, info = tfds.load(name='horses_or_humans', with_info=True)
# Display the dataset metadata
info

In [None]:
# --- Model ------------------------------------------------------------------
# Five Conv2D + MaxPooling2D stages followed by a dense head with a single
# sigmoid unit, i.e. a binary classifier over 300x300 RGB images.
inputs = tf.keras.Input(shape=(300, 300, 3))
h = tf.keras.layers.Conv2D(16, (3, 3), activation=tf.nn.relu)(inputs)
h = tf.keras.layers.MaxPooling2D(2, 2)(h)
h = tf.keras.layers.Conv2D(32, (3, 3), activation=tf.nn.relu)(h)
h = tf.keras.layers.MaxPooling2D(2, 2)(h)
h = tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu)(h)
h = tf.keras.layers.MaxPooling2D(2, 2)(h)
h = tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu)(h)
h = tf.keras.layers.MaxPooling2D(2, 2)(h)
h = tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu)(h)
h = tf.keras.layers.MaxPooling2D(2, 2)(h)
h = tf.keras.layers.Flatten()(h)
h = tf.keras.layers.Dense(512, activation=tf.nn.relu)(h)
outputs = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(h)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    # BinaryAccuracy thresholds the sigmoid output before comparing it with the
    # 0/1 label. The generic `Accuracy` metric checks exact equality between the
    # predicted probability and the label, which is almost never true.
    # name='accuracy' keeps the history keys ('accuracy', 'val_accuracy') that
    # the plotting code in this cell relies on.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])
model.summary()

# --- Data -------------------------------------------------------------------
train_data = tfds.load(name='horses_or_humans', split='train',
    as_supervised=True)
valid_data = tfds.load(name='horses_or_humans', split='test',
    as_supervised=True)

def scale_image(image, label):
    """Cast an image to float32 and scale it by 1/255; pass the label through.

    Args:
        image: Image tensor of one (image, label) dataset element.
        label: The element's label, returned unchanged.

    Returns:
        A (scaled_image, label) tuple.
    """
    return tf.cast(image, tf.float32) / 255.0, label

# Order matters: cache the raw examples FIRST, then shuffle, so each epoch gets
# a fresh order and fresh batch composition. Caching after shuffle/batch would
# replay the very same batches every epoch.
train_batch = train_data.cache().shuffle(100).map(scale_image).batch(32)
valid_batch = valid_data.cache().map(scale_image).batch(32)

# --- Training ---------------------------------------------------------------
# No `validation_steps` limit: validation runs over the whole validation set each
# epoch, so the validation curve is measured on the same data as the final
# evaluation below, and the cached validation set is always read completely.
history = model.fit(train_batch, validation_data=valid_batch,
    epochs=10, verbose=0)

# --- Learning curves --------------------------------------------------------
history_df = pd.DataFrame(history.history, index=history.epoch)
_, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for metric, ax in zip(['loss', 'accuracy'], axs):
    sns.lineplot(ax=ax, data=history_df, x=history_df.index,
        y=metric, label='Train')
    sns.lineplot(ax=ax, data=history_df, x=history_df.index,
        y='val_' + metric, label='Valid')
    ax.set_xlabel('epoch')
    ax.set_title(metric)
plt.show()

# --- Evaluation -------------------------------------------------------------
# `result` holds [loss, accuracy], computed on the dataset's 'test' split
result = model.evaluate(valid_batch, verbose=0)
print(f"Test loss: {result[0]:.4f} \n"
    f"Test accuracy: {result[1]:.4f}")