# **Chapter 13 – Loading and Preprocessing Data with TensorFlow**

# The tf.data API

In [1]:
import tensorflow as tf

In [2]:
X = tf.range(10) # any data tensor
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [3]:
# or 
tf.data.Dataset.range(10)

<_RangeDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [4]:
# can iterate
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [5]:
X_nested = {"a": ([1, 2, 3], [4, 5, 6]), "b": [7, 8, 9]}
dataset = tf.data.Dataset.from_tensor_slices(X_nested)
for item in dataset:
    print(item)

{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=4>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=7>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=5>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=8>}
{'a': (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=6>), 'b': <tf.Tensor: shape=(), dtype=int32, numpy=9>}


## Chaining Transformations

In [6]:
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [7]:
# tranforming items by calling map()
dataset = dataset.map(lambda x: x * 2) # x is a batch, can speed things up by setting num_parallel_calls argument to number of threads to run or to tf.data.AUTOTUNE.
for item in dataset:
    print(item)

tf.Tensor([ 0  2  4  6  8 10 12], shape=(7,), dtype=int32)
tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)
tf.Tensor([16 18], shape=(2,), dtype=int32)


In [8]:
# filter batches whose sum is less than or equal to 50
dataset = dataset.filter(lambda x: tf.reduce_sum(x) > 50)
for item in dataset:
    print(item)

tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)
tf.Tensor([ 2  4  6  8 10 12 14], shape=(7,), dtype=int32)


In [9]:
# look at first n items in dataset 
for item in dataset.take(2):
    print(item)

tf.Tensor([14 16 18  0  2  4  6], shape=(7,), dtype=int32)
tf.Tensor([ 8 10 12 14 16 18  0], shape=(7,), dtype=int32)


### Shuffling the Data

In [10]:
dataset = tf.data.Dataset.range(10).repeat(2)
dataset = dataset.shuffle(buffer_size=4, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([1 4 2 3 5 0 6], shape=(7,), dtype=int64)
tf.Tensor([9 8 2 0 3 1 4], shape=(7,), dtype=int64)
tf.Tensor([5 7 9 6 7 8], shape=(6,), dtype=int64)


### Interleaving Lines from Multiple Files

_suppose we have many csv files for the train set and all their paths are store in an array called train_filepaths_

train_filepaths = ["path1.csv", "path2.csv", "path3.csv", ...] or just "path*.csv"

filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

_next, call the interleave() method to read from 5 files at a time and interleave their lines_

n_reader = 5

dataset = filepath_dataset.interleave(
lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
cycle_length=n_readers)

Can use interleave's num_parallel_call to use more threads

### Preprocessing the Data

In [11]:
import numpy as np

In [12]:
# For the california housing dataset
n_inputs = 8
X_mean, X_std = [np.array([.5] * n_inputs), np.array([2.] * n_inputs)]

def parse_csv_line(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    return tf.stack(fields[:-1]), tf.stack(fields[-1:])

def preprocess(line):
    x, y = parse_csv_line(line)
    return (x - X_mean) / X_std, y

In [13]:
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 1.8541501e+00,  2.1750000e+01,  2.4116001e+00,  2.0855001e-01,
         4.2275000e+02,  9.1849995e-01,  1.8485001e+01, -6.1349998e+01],
       dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

Use dataset's map() method to apply the preprocess() function to each sample in the dataset

### Putting Everything Together

In [14]:
def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None,
                       n_parse_threads=5, shuffle_buffer_size=10_000, seed=42,
                       batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
    
    dataset = dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers, num_parallel_calls=n_read_threads)
    
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    
    return dataset.batch(batch_size).prefetch(1)

### Using the Dataset with Keras

train_set = csv_reader_dataset(train_filepaths)

valid_set = csv_reader_dataset(valid_filepaths)

test_set = csv_reader_dataset(test_filepaths)

model = tf.keras.Sequential([...])

model.compile(loss="mse", optimizer="sgd")

model.fit(train_set, validation_data=valid_set, epochs=5)

test_mse = model.evaluate(test_set)

new_set = test_set.take(3) # pretend we have 3 new samples

y_pred = model.predict(new_set) # or you could just pass a NumPy array

_If you want to build a custom training loop:_

n_epochs = 5

for epoch in range(epochs):

    for X_batch, y_batch in train_set:
        [...] # perform one gradient descent step

In [15]:
# Can even create a TF function that trains the model for a whole epoch:
@tf.function
def train_one_epoch(model, optimizer, loss_fn, train_set):
    for X_batch, y_batch in train_set:
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main__loss] + model.losses)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
# loss_fn = tf.keras.losses.MeanSquaredError
# for epoch in range(n_epochs):
#     print("\rEpoch {}/{}".format(epoch + 1, n_epochs), end="")
#     traine_one_epoch(model, optimzier, loss_fn, train_set)

In Keras, the _steps_per_execution_ argument of the compile() method lets you define the number of batches that the fit() method will process during each call to the tf.function it uses for training. The default is just 1, so if you set it ot 50 you will often see a significant performance improvement. However, the on_batch_*() methods of Keras callbacks will only be called every 50 batches.

## The TFRecord Format

In [16]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

In [17]:
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


By default, a TFRecordDataset will read files one by one, but you make it read multiple files in parallel and interleave their records by passing the constructor a list of filepaths and setting num_parallel_reads to a number greater than one. Alternatively, you could obtain the same result by using list_files() and interleave() as we did earlier to read multiple CSV files.

### Compresssed TFRecord Files

In [18]:
# compress
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write("Compress, compress, compress!")

In [19]:
# decompress
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
                                 compression_type="GZIP")

for item in dataset:
    print(item)

tf.Tensor(b'Compress, compress, compress!', shape=(), dtype=string)


### A Brief Introduction to Protocol Buffers

see page 454

### TensorFlow Protobufs

In [20]:
 # Create a tf.train.Example representing a person
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com",
                                                          b"c@d.com"]))
        }))

In [21]:
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    for _ in range(5):
        f.write(person_example.SerializeToString())

### Loading and Parsing Examples

In [22]:
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}

def parse(serialized_example):
    return tf.io.parse_single_example(serialized_example, feature_description)

dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).map(parse)
for parsed_example in dataset:
    print(parsed_example)

{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tenso

In [23]:
# variable-length features are parsed as sparse tensors. Can convert to a dense tensor using tf.sparse.to_dense()
tf.sparse.to_dense(parsed_example["emails"], default_value=b"")

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [24]:
# but in this case it is simpler to just access its values:
parsed_example["emails"].values

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [25]:
# can parse examples batch by batch using tf.io.parse_example():
def parse(serialized_examples):
    return tf.io.parse_example(serialized_examples, feature_description)

dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(2).map(parse)
for parsed_examples in dataset:
    print(parsed_examples) # two examples at a time

{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com' b'a@b.com' b'c@d.com'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com' b'a@b.com' b'c@d.com'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]], shape=(2, 2), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Ten

See page 458 about storing images and tf.tensors

### Handling Lists of Lists Using the SequenceExample Protobuf

If features lists contain sequences of varying sizes, you may want to convert them to ragged tensors using tf.RaggedTensor.from_sparse() (see page 459)

### Normalization Layer

It is possible to pass a tf.data.Dataset to a preprocessing layer's adapt() method. It is also possible to apply a Keras preprocessing layer to a tf.data.Dataset using the dataset's map() method. Here is how you can could apply an adapted Normalization layer to the input features of each batch in a dataset:

### The discretezation layer

3 categories: less than 18, 18 to 50 (not included), and 50 or over:

In [26]:
age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])
discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18., 50.])
age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(6, 1), dtype=int32, numpy=
array([[0],
       [2],
       [2],
       [1],
       [1],
       [0]], dtype=int32)>

We can instead provide the number of bins you want, then call the layer's adapt() method to let it find the appropriate bin boundaries based on the value percentiles. For example, if we set num_bins=3, then the bind boundaries will be located at the values just bellow the 33rd and 66th percentiles (in this example, at the values 10 and 37):

In [27]:
discretize_layer = tf.keras.layers.Discretization(num_bins=3)
discretize_layer.adapt(age)
age_categories = discretize_layer(age)
age_categories

<tf.Tensor: shape=(6, 1), dtype=int32, numpy=
array([[1],
       [2],
       [2],
       [1],
       [2],
       [0]], dtype=int32)>

### The CategoryEncoding Layer

In [28]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)
onehot_layer(age_categories)

<tf.Tensor: shape=(6, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)>

In [29]:
# multi-hot encoding:
two_age_categories = np.array([[1, 0], [2, 2], [2, 0]])
onehot_layer(two_age_categories)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 1., 0.],
       [0., 0., 1.],
       [1., 0., 1.]], dtype=float32)>

if you want to count how mnay times each category occurred, set output_mode="count", when creating the layer. The output would be the same except for the second row above, which would become [0, 0, 2]

both multi-hot encoding and count encoding lose information, since it is not possible to know which feature each active category came from. If you want to avoid this, then you need to one-hot encode each feature separately and concatenate the outputs. You can get the same result by tweaking the category identifiers so they don't overlap:

In [30]:
onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3 + 3)
onehot_layer(two_age_categories + [0, 3]) # adds 3 to the second feature

<tf.Tensor: shape=(3, 6), dtype=float32, numpy=
array([[0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.]], dtype=float32)>

In this output, the first 3 columns corresponde to the first feature, and the last three correspond to the second feature.

### The StringLookup Layer

In [31]:
# one-hot encode a cities feature:
cities = ["Auckland", "Paris", "Paris", "San Francisco"]
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(cities)
str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[1],
       [3],
       [3],
       [0]])>

In [32]:
# using output_mode="one_hot"
str_lookup_layer = tf.keras.layers.StringLookup(output_mode="one_hot")
str_lookup_layer.adapt(cities)
str_lookup_layer([["Paris"], ["Auckland"], ["Auckland"], ["Montreal"]])

<tf.Tensor: shape=(4, 4), dtype=int64, numpy=
array([[0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [1, 0, 0, 0]])>

Keras also includes IntegerLookup layer which is just like StringLookup but for integers

If the training set is large, it may be convenient to adapt the layer to just a random subste of the training set. In this case, the layer's adapt() method may miss some of the rarer categories. By default, it would then map them all to category 0, making them indistinguishable. To reduce this risk, you can set num_oov_indies to an integer greater than 1.

In [33]:
str_lookup_layer = tf.keras.layers.StringLookup(num_oov_indices=5)
str_lookup_layer.adapt(cities)
str_lookup_layer([["Paris"], ["Auckland"], ["Foo"], ["Bar"], ["Baz"]])

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[5],
       [7],
       [4],
       [3],
       [4]])>

### The Hashing Layer

In [34]:
hashing_layer = tf.keras.layers.Hashing(num_bins=10)
hashing_layer([["Paris"], ["Tokyo"], ["Auckland"], ["Montreal"]])

<tf.Tensor: shape=(4, 1), dtype=int64, numpy=
array([[0],
       [1],
       [9],
       [1]])>

The benefit of this layer is that it does not need to be adapted at all.

### Encoding Categorical Features Using Embeddings

Initialize an embedding layer with five rows and 2d embeddings, and use it to encode some categories:

In [35]:
tf.random.set_seed(42)
embedding_layer = tf.keras.layers.Embedding(input_dim=5, output_dim=2)
embedding_layer(np.array([2, 4, 2]))

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[-0.00498476,  0.0461193 ],
       [ 0.04001949,  0.03139852],
       [-0.00498476,  0.0461193 ]], dtype=float32)>

If you want to embed a categorical text attribute, you can simply chain a String Lookup layer and an Embedding layer, like this:

In [36]:
tf.random.set_seed(42)
ocean_prox = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
str_lookup_layer = tf.keras.layers.StringLookup()
str_lookup_layer.adapt(ocean_prox)
lookup_and_embed = tf.keras.Sequential([
    str_lookup_layer,
    tf.keras.layers.Embedding(input_dim=str_lookup_layer.vocabulary_size(),
                              output_dim=2)
])

lookup_and_embed(np.array([["<1H OCEAN"], ["ISLAND"], ["<1H OCEAN"]]))

<tf.Tensor: shape=(3, 1, 2), dtype=float32, numpy=
array([[[0.00639011, 0.01911124]],

       [[0.00778233, 0.00470138]],

       [[0.00639011, 0.01911124]]], dtype=float32)>

Putting everything together, we can now create a Keras model that can process a categorical text feature along with regular numerical features and learn and learn an embedding for each category:

We could also have passed the training data to the fit() method using a dictionary instead of a tuple: {"num": X_train_num, "cat": X_train_cat}. Alternatively, we could have passed a tf.data.Dataset containing batches, each represenetd as ((X_batch_num, X_batch_cat), y_batch) or as ({"num": X_batch_num, "cat": X_batch_cat}, y_batch)

### Text Preprocessing

In [37]:
train_data = ["To be", "!(to be)", "That's the question", "Be, be, be."]
text_vec_layer = tf.keras.layers.TextVectorization()
text_vec_layer.adapt(train_data)
text_vec_layer(["Be good!", "Question: be or be?"])

<tf.Tensor: shape=(2, 4), dtype=int64, numpy=
array([[2, 1, 0, 0],
       [6, 2, 1, 2]])>

In [38]:
# setting output_mode to TF-IDF
text_vec_layer = tf.keras.layers.TextVectorization(output_mode="tf_idf")
text_vec_layer.adapt(train_data)
text_vec_layer(["Be good!", "Question: be or be?"])

<tf.Tensor: shape=(2, 6), dtype=float32, numpy=
array([[0.96725637, 0.6931472 , 0.        , 0.        , 0.        ,
        0.        ],
       [0.96725637, 1.3862944 , 0.        , 0.        , 0.        ,
        1.0986123 ]], dtype=float32)>

### Using Pretrained Language Model Components

In [39]:
import tensorflow_hub as hub

hub_layer = hub.KerasLayer("https://www.kaggle.com/models/google/nnlm/TensorFlow2/en-dim50/1")

sentence_embeddings = hub_layer(tf.constant(["To be", "Not to be"]))
sentence_embeddings.numpy().round(2)




















array([[-0.25,  0.28,  0.01,  0.1 ,  0.14,  0.16,  0.25,  0.02,  0.07,
         0.13, -0.19,  0.06, -0.04, -0.07,  0.  , -0.08, -0.14, -0.16,
         0.02, -0.24,  0.16, -0.16, -0.03,  0.03, -0.14,  0.03, -0.09,
        -0.04, -0.14, -0.19,  0.07,  0.15,  0.18, -0.23, -0.07, -0.08,
         0.01, -0.01,  0.09,  0.14, -0.03,  0.03,  0.08,  0.1 , -0.01,
        -0.03, -0.07, -0.1 ,  0.05,  0.31],
       [-0.2 ,  0.2 , -0.08,  0.02,  0.19,  0.05,  0.22, -0.09,  0.02,
         0.19, -0.02, -0.14, -0.2 , -0.04,  0.01, -0.07, -0.22, -0.1 ,
         0.16, -0.44,  0.31, -0.1 ,  0.23,  0.15, -0.05,  0.15, -0.13,
        -0.04, -0.08, -0.16, -0.1 ,  0.13,  0.13, -0.18, -0.04,  0.03,
        -0.1 , -0.07,  0.07,  0.03, -0.08,  0.02,  0.05,  0.07, -0.14,
        -0.1 , -0.18, -0.13, -0.04,  0.15]], dtype=float32)

In [40]:
sentence_embeddings.numpy().shape

(2, 50)

### Image Preprocessing Layers

It includes 3 preprocessing layers:

* tf.keras.layers.Resizing
* tf.keras.layers.Rescaling
* tf.keras.layers.CenterCrop

In [41]:
from sklearn.datasets import load_sample_images

images = load_sample_images()["images"]
crop_images_layer = tf.keras.layers.CenterCrop(height=100, width=100)
cropped_images = crop_images_layer(images)

Keras also includes several layers for data augmentation.

## The TensorFlow Datasets Project

In [42]:
import tensorflow_datasets as tfds

datasets = tfds.load(name="mnist")
mnist_train, mnist_test = datasets["train"], datasets["test"]

In [43]:
for batch in mnist_train.shuffle(10_000, seed=42).batch(32).prefetch(1):
    images = batch["image"]
    labels = batch["label"]
    # [...] do something with the images and labels

Note that each item in the datset is a dictionary containing both the features and the labels. But Keras expects each item to be a tuple containing two elements(again, the features and the labels). You could transform the dataset using the map() method like:

In [44]:
mnist_train = mnist_train.shuffle(buffer_size=10_000, seed=42).batch(32)
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

But it is simpler to ask the load() function to do this for you by setting as_supervised=True.

Can split dataset using split parameter: split=["train[:90%]", "train[90%:]", "test"]

In [45]:
# Complete example:
train_set, valid_set, test_set = tfds.load(
    name="mnist",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True)

train_set = train_set.shuffle(buffer_size=10_000, seed=42).batch(32).prefetch(1)
valid_set = valid_set.batch(32).cache()
test_set = test_set.batch(32).cache()
tf.random.set_seed(42)
model = tf.keras.Sequential([
    tf.keras.layers.Input([28, 28]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
history = model.fit(train_set, validation_data=valid_set, epochs=5)
test_loss, test_accuracy = model.evaluate(test_set)

Epoch 1/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.8356 - loss: 10.1244 - val_accuracy: 0.8730 - val_loss: 6.6363
Epoch 2/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8783 - loss: 5.7680 - val_accuracy: 0.8670 - val_loss: 6.1885
Epoch 3/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8826 - loss: 5.2739 - val_accuracy: 0.8782 - val_loss: 5.9981
Epoch 4/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8854 - loss: 5.1385 - val_accuracy: 0.8935 - val_loss: 5.5771
Epoch 5/5
[1m1688/1688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.8876 - loss: 4.9490 - val_accuracy: 0.8853 - val_loss: 5.9133
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8889 - loss: 5.5199


# Open-ended questions

1. Because it can efficiently load data on the fly or too big to fit on memory, and it can do preprocessing
2. Better shuffling
3. Time to start training is long. We could do prefetching and store data in a more efficient format.
4. Yes, we can save any binary data to TFRecord file, but TF's prefered one is protobufs.
5. The Example protobuf is already compiled in TensorFlow. To use our out protobuf definition, we would have to write it and compile it ourselves.
6. We would activate compression when we need to send the data over the internet. I believe it would slowdown training.
7. Preprocess data when writing the file: it makes training faster, but we would not have the preprocessing parameters when deploying the model for production. Within the tf.data pipeline: slower training as we would do the same computation many times, and we would have to code the preprocessing again for production which is more error-prone. Preprocessing layers within model: slower training, but model would be more portable.
8. One-hot encoding, multi-hot encoding, count, one-hot encode features separetely then concatenate. Text: same as numerical, but with TF-IDF, word embeddings, and sentence embeddings.

## Solutions

# Exercises

## 1. to 8.
1. Ingesting a large dataset and preprocessing it efficiently can be a complex engineering challenge. The Data API makes it fairly simple. It offers many features, including loading data from various sources (such as text or binary files), reading data in parallel from multiple sources, transforming it, interleaving the records, shuffling the data, batching it, and prefetching it.
2. Splitting a large dataset into multiple files makes it possible to shuffle it at a coarse level before shuffling it at a finer level using a shuffling buffer. It also makes it possible to handle huge datasets that do not fit on a single machine. It's also simpler to manipulate thousands of small files rather than one huge file; for example, it's easier to split the data into multiple subsets. Lastly, if the data is split across multiple files spread across multiple servers, it is possible to download several files from different servers simultaneously, which improves the bandwidth usage.
3. You can use TensorBoard to visualize profiling data: if the GPU is not fully utilized then your input pipeline is likely to be the bottleneck. You can fix it by making sure it reads and preprocesses the data in multiple threads in parallel, and ensuring it prefetches a few batches. If this is insufficient to get your GPU to 100% usage during training, make sure your preprocessing code is optimized. You can also try saving the dataset into multiple TFRecord files, and if necessary perform some of the preprocessing ahead of time so that it does not need to be done on the fly during training (TF Transform can help with this). If necessary, use a machine with more CPU and RAM, and ensure that the GPU bandwidth is large enough.
4. A TFRecord file is composed of a sequence of arbitrary binary records: you can store absolutely any binary data you want in each record. However, in practice most TFRecord files contain sequences of serialized protocol buffers. This makes it possible to benefit from the advantages of protocol buffers, such as the fact that they can be read easily across multiple platforms and languages and their definition can be updated later in a backward-compatible way.
5. The `Example` protobuf format has the advantage that TensorFlow provides some operations to parse it (the `tf.io.parse`*`example()` functions) without you having to define your own format. It is sufficiently flexible to represent instances in most datasets. However, if it does not cover your use case, you can define your own protocol buffer, compile it using `protoc` (setting the `--descriptor_set_out` and `--include_imports` arguments to export the protobuf descriptor), and use the `tf.io.decode_proto()` function to parse the serialized protobufs (see the "Custom protobuf" section of the notebook for an example). It's more complicated, and it requires deploying the descriptor along with the model, but it can be done.
6. When using TFRecords, you will generally want to activate compression if the TFRecord files will need to be downloaded by the training script, as compression will make files smaller and thus reduce download time. But if the files are located on the same machine as the training script, it's usually preferable to leave compression off, to avoid wasting CPU for decompression.
7. Let's look at the pros and cons of each preprocessing option:
    * If you preprocess the data when creating the data files, the training script will run faster, since it will not have to perform preprocessing on the fly. In some cases, the preprocessed data will also be much smaller than the original data, so you can save some space and speed up downloads. It may also be helpful to materialize the preprocessed data, for example to inspect it or archive it. However, this approach has a few cons. First, it's not easy to experiment with various preprocessing logics if you need to generate a preprocessed dataset for each variant. Second, if you want to perform data augmentation, you have to materialize many variants of your dataset, which will use a large amount of disk space and take a lot of time to generate. Lastly, the trained model will expect preprocessed data, so you will have to add preprocessing code in your application before it calls the model. There's a risk of code duplication and preprocessing mismatch in this case.
    * If the data is preprocessed with the tf.data pipeline, it's much easier to tweak the preprocessing logic and apply data augmentation. Also, tf.data makes it easy to build highly efficient preprocessing pipelines (e.g., with multithreading and prefetching). However, preprocessing the data this way will slow down training. Moreover, each training instance will be preprocessed once per epoch rather than just once if the data was preprocessed when creating the data files. Well, unless the dataset fits in RAM and you can cache it using the dataset's `cache()` method. Lastly, the trained model will still expect preprocessed data. But if you use preprocessing layers in your tf.data pipeline to handle the preprocessing step, then you can just reuse these layers in your final model (adding them after training), to avoid code duplication and preprocessing mismatch.
    * If you add preprocessing layers to your model, you will only have to write the preprocessing code once for both training and inference. If your model needs to be deployed to many different platforms, you will not need to write the preprocessing code multiple times. Plus, you will not run the risk of using the wrong preprocessing logic for your model, since it will be part of the model. On the downside, preprocessing the data on the fly during training will slow things down, and each instance will be preprocessed once per epoch.
8. Let's look at how to encode categorical text features and text:
    * To encode a categorical feature that has a natural order, such as a movie rating (e.g., "bad," "average," "good"), the simplest option is to use ordinal encoding: sort the categories in their natural order and map each category to its rank (e.g., "bad" maps to 0, "average" maps to 1, and "good" maps to 2). However, most categorical features don't have such a natural order. For example, there's no natural order for professions or countries. In this case, you can use one-hot encoding, or embeddings if there are many categories. With Keras, the `StringLookup` layer can be used for ordinal encoding (using the default `output_mode="int"`), or one-hot encoding (using `output_mode="one_hot"`). It can also perform multi-hot encoding (using `output_mode="multi_hot"`) if you want to encode multiple categorical text features together, assuming they share the same categories and it doesn't matter which feature contributed which category. For trainable embeddings, you must first use the `StringLookup` layer to produce an ordinal encoding, then use the `Embedding` layer.
    * For text, the `TextVectorization` layer is easy to use and it can work well for simple tasks, or you can use TF Text for more advanced features. However, you'll often want to use pretrained language models, which you can obtain using tools like TF Hub or Hugging Face's Transformers library. These last two options are discussed in Chapter 16.

# Coding Questions

9. Load the Fashion MNIST dataset (introduced in Chapter 10); split it into a training set, a validation set, and a test set; shuffle the training set; and save each dataset to multiple TFRecord files. Each record should be a serialized `Example` protobuf with two features: the serialized image (use `tf.io.serialize_tensor()` to serialize each image), and the label. Note: for large images, you could use `tf.io.encode_jpeg()` instead. This would save a lot of space, but it would lose a bit of image quality. Then use tf.data to create an efficient dataset for each set. Finally, use a Keras model to train these datasets, including a preprocessing layer to standardize each input feature. Try to make the input pipeline as efficient as possible, using TensorBoard to visualize profiling data.

In [132]:
import tensorflow_datasets as tfds

datasets = tfds.load("mnist", as_supervised=True, split=["train[:90%]", "train[90%:]", "test"])

In [135]:
from tensorflow.train import Features, Feature, Example, BytesList

def create_example(X_y):
    protobuf_example = Example(
        features=Features(
            feature={
                "image": Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(X_y[0]).numpy()])),
                "target": Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(X_y[1]).numpy()]))
            }
        )
    )
    return protobuf_example

In [140]:
import numpy as np
import os

names = ("train", "valid", "test")
buffer_sizes = (10_000, 3_000, 5_000)
dataset_sizes = (54_000, 6_000, 10_000)
rng = np.random.default_rng()
number_files = 20
files_index = np.arange(number_files)
rng.shuffle(files_index)

for i, name in enumerate(names):
    name_dir = "mnist_" + names[i]
    os.makedirs(name_dir, exist_ok=True)
    dataset = datasets[i].shuffle(buffer_size=buffer_sizes[i])
    count = 0
    files = [tf.io.TFRecordWriter(os.path.join(name_dir, "mnist_" + name + str(j) + ".tfrecord")) for j in range(number_files)]
    
    for instance in dataset:
        files[files_index[count]].write(create_example(instance).SerializeToString())
        count += 1
        
        if count == number_files:
            rng.shuffle(files_index)
            count = 0

    for file in files:
        file.close()
    
    

Now load datasets and train model

In [166]:
feature_description = {
    "image": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "target": tf.io.FixedLenFeature([], tf.string, default_value="")
}

def parse_batch(serialized_examples):
    return tf.io.parse_example(serialized_examples, feature_description)

def to_tensor(features_string):
    image = tf.io.parse_tensor(features_string["image"], out_type=tf.uint8)
    label = tf.io.parse_tensor(features_string["target"], out_type=tf.int64)

    return image, label

In [167]:
loaded_datasets = {}
for name in names:
    files_paths = [os.path.join("mnist_" + name, filename) for filename in os.listdir("mnist_" + name)]
    dataset = tf.data.TFRecordDataset(files_paths).batch(32).map(parse_batch).map(to_tensor)
    loaded_datasets[name] = dataset

In [169]:
norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(loaded_datasets["train"])

AttributeError: 'tuple' object has no attribute 'shape'

In [159]:
loaded_datasets["train"]  = loaded_datasets["train"].interleave(
    

loaded_datasets["valid"]  = loaded_datasets["valid"].batch(32).cache()
loaded_datasets["test"]  = loaded_datasets["test"].batch(32).cache()

dict_items([('train', <TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>), ('valid', <TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>), ('test', <TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>)])