#  Custom Models and Training with Tensorflow

## Using TensorFlow like NumPy

 Tensor and Operations

In [1]:
import tensorflow as tf

t = tf.constant([[1., 2., 3.], [4., 5., 6.]]) # matrix
t

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [2]:
t.shape

TensorShape([2, 3])

In [3]:
t.dtype

tf.float32

Indexing

In [4]:
t[:, 1:]

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 3.],
       [5., 6.]], dtype=float32)>

In [5]:
t[..., 1]

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 5.], dtype=float32)>

In [6]:
t[..., 1, tf.newaxis]

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.],
       [5.]], dtype=float32)>

Ops

In [7]:
t + 10

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[11., 12., 13.],
       [14., 15., 16.]], dtype=float32)>

In [8]:
tf.square(t)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)>

In [9]:
t @ tf.transpose(t)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[14., 32.],
       [32., 77.]], dtype=float32)>

Scalar

In [10]:
tf.constant(42)

<tf.Tensor: shape=(), dtype=int32, numpy=42>

### Tensors and NumPy

`TensorFlow uses float32 by default and NumPy uses float64 by default. Remember to adjust dtype before conversion from numpy to tensorflow.`

In [11]:
import numpy as np

a = np.array([2., 4., 5.])
tf.constant(a)

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([2., 4., 5.])>

In [12]:
t.numpy() # or np.array(t)

array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)

In [13]:
tf.square(a)

<tf.Tensor: shape=(3,), dtype=float64, numpy=array([ 4., 16., 25.])>

In [14]:
np.square(t)

array([[ 1.,  4.,  9.],
       [16., 25., 36.]], dtype=float32)

### Type Conversions

In [15]:
try:
    tf.constant(2.) + tf.constant(40)
except Exception as e:
    print(e)

cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:AddV2] name: 


In [16]:
try:
    tf.constant(2.) + tf.constant(40., dtype=tf.float64)
except Exception as e:
    print(e)

cannot compute AddV2 as input #1(zero-based) was expected to be a float tensor but is a double tensor [Op:AddV2] name: 


In [17]:
t2 = tf.constant(40., dtype=tf.float64)
tf.constant(2.0) + tf.cast(t2, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=42.0>

### Variables

In [18]:
v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])
v

<tf.Variable 'Variable:0' shape=(2, 3) dtype=float32, numpy=
array([[1., 2., 3.],
       [4., 5., 6.]], dtype=float32)>

In [19]:
v.assign(2 * v)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2.,  4.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [20]:
v[0, 1].assign(42)

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  6.],
       [ 8., 10., 12.]], dtype=float32)>

In [21]:
v[:, 2].assign([0., 1.])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[ 2., 42.,  0.],
       [ 8., 10.,  1.]], dtype=float32)>

In [22]:
v.scatter_nd_update(
    indices=[[0, 0], [1, 2]], updates=[100., 200.])

<tf.Variable 'UnreadVariable' shape=(2, 3) dtype=float32, numpy=
array([[100.,  42.,   0.],
       [  8.,  10., 200.]], dtype=float32)>

In [23]:
# Direct assignment does not work
try:
    v[1] = [7., 8., 9.]
except Exception as e:
    print(e)

'ResourceVariable' object does not support item assignment


## Customizing Models and Training Algorithms

### Custom Loss Functions

In [24]:
def huber_fn(y_true, y_pred_):
    error = y_true - y_pred
    is_small_error = tf.abs(error) < 1
    squared_loss = tf.square(error) / 2
    linear_loss = tf.abs(error) - 0.5
    return tf.where(is_small_error, squared_loss, linear_loss)
             

In [25]:
# Can use this Huber loss function when you compile the keras model
# model.compile(loss=huber_fn, optimizer="nadam")
# model.fit(X_train, y_train, [...])

### Saving Loading Models That Contain Custom Components

In [26]:
# model = tf.keras.models.load_model("my_model_with_a_custom_loss",
                                   # custom_objects={"huber_fn": huber_fn})

Do not need to include it in custom_objects if huber_fn() function is decorated with @keras.utils.register_keras_serializable()

Create a function that creates a configured loss function:

In [27]:
def create_huber(threshold=1.0):
    def huber_fn(y_true, y_pred):
        error = y_true - y_pred
        is_small_error = tf.abs(error) < threshold
        squared_loss = tf.square(error) / 2
        linear_loss = tf.abs(error) - threshold ** 2 / 2
        return tf.where(is_small_error, squared_loss, linear_loss)
    return huber_fn

# model.compile(loss=create_huber(2.0), optimzer="nadam")

When you save the model, the threshold will not be saved. Have to specify threshold value when loading hte model:

In [28]:
# model = tf.keras.models.load_model(
#     "my_model_with_a_custom_loss_threshold_2",
#     custom_objects={"huber_fn": create_huber(2.0)}
# )

Can solve this by creating a subclass of the tf.keras.losses.Loss class, and then implementing its get_config() method:

In [29]:
class HuberLoss(tf.keras.losses.Loss):
    def __init__(self, threshold=1.0, **kwargs):
        self.threshold = threshold
        super.__init__(**kwargs)

    def call(self, y_true, y_pred):
        error = y_true - y_pred
        is_small_error = tf.abs(error) < self.threshold
        squared_loss = tf.square(error) / 2
        linear_loss = self.threshold * tf.abs(error) - self.threshold**2 / 2
        return tf.where(is_small_error, squared_loss, linear_loss)

    def get_config(self):
        base_config = super.get_config()
        return {**base_config, "threshold" : self.threshold} # can do base_config | {"threshold" : self.threshold}

### Custom Activation Functions, Initializers, Regularizers, and Constraints

In [30]:
def my_softplus(z):
    return tf.math.log(1.0 + tf.exp(z))

def my_glorot_initializer(shape, dtype=tf.float32):
    stddev = tf.sqrt(2. / (shape[0] + shape[1]))
    return tf.random.normal(shape, stddev=stddev, dtype=dtype)

def my_l1_regularizer(weights):
    return tf.reduce_sum(tf.abs(0.01 * weights))

def my_positive_weights(weights): # return values is just tf.nn.relu(weights)
    return tf.where(weights < 0., tf.zeros_like(weights), weights)

In [31]:
# Arguments depend on the type of custom function. These custom functions can then be used normally, as shown here:
layer = tf.keras.layers.Dense(1, activation=my_softplus,
                              kernel_initializer=my_glorot_initializer,
                              kernel_regularizer=my_l1_regularizer,
                              kernel_constraint=my_positive_weights
                             )

If function has hyperparameters that need to be saved along with the model, then subclass appropriate class such as: tf.keras.regularizers.Regularizer, tf.keras.constraints.Constraint, tf.keras.initializers.Initializer, or tf.keras.layers.Layer (for any layer, including activation functions). Example:

In [32]:
class MyL1Regularizer(tf.keras.regularizers.Regularizer):
    def __init__(self, factor):
        self.factor = factor

    def __call__(self, weights):
        return tf.reduce_sum(tf.abs(self.factor * weights))

    def get_config(self):
        return {"factor" : self.factor}

Note that you must implement the call() method for losses, layers (including activation functions), and models, or the \_\_call__() method for regularizer, initializers, and constraints

### Custom metrics

In [33]:
# model.compile(loss="mse", optimizer="nadam", metrics=[create_huber(2.0)])

In [34]:
precision = tf.keras.metrics.Precision() # streaming metric (or stateful metric)
precision([0, 1, 1, 1, 0, 1, 0, 1], [1, 1, 0, 1, 0, 1, 0, 1])

<tf.Tensor: shape=(), dtype=float32, numpy=0.800000011920929>

In [35]:
precision([0, 1, 0, 0, 1, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0])

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [36]:
precision.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.5>

In [37]:
precision.variables

[<Variable path=precision/true_positives, shape=(1,), dtype=float32, value=[4.]>,
 <Variable path=precision/false_positives, shape=(1,), dtype=float32, value=[4.]>]

In [38]:
precision.reset_state() # both variables get reset to 0.0

If you need to define your own custom streaming metric, create a subclass of the tf.keras.metrics.Metric class.

In [39]:
# This is for illustration purposes
class HuberMetric(tf.keras.metrics.Metric):
    def __init__(self, threshold=1.0, **kwargs):
        super().__init__(**kwargs) # handles base args (e.g., dtype)
        self.threshold = threshold
        self.huber_fn = create_huber(threshold)
        self.total = self.add_weight("total", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        sample_metrics = self.huber_fn(y_true, y_pred)
        self.total.assign_add(tf.reduce_sum(sample_metrics))
        self.count.assign_add(tf.cast(tf.size(y_true), tf.float32))

    def result(self):
        return self.total / self.count

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "threshold" : self.threshold}

In [40]:
# better implementation
class HuberMetric(tf.keras.metrics.Mean):
    def __init__(self, threshold=1.0, name="HuberMetric", dtype=None):
        self.threshold = threshold
        self.huber_fn = create_huber(threshold)
        super().__init__(name=name, dtype=dtype)

    def update_state(self, y_true, y_pred, sample_weight=None):
        metric = self.huber_fn(y_true, y_pred)
        super().update_state(metric, sample_weight)

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "threshold": self.threshold}

### Custom Layers

In [41]:
# To create custom layer without any weights, the simplest option is to write a function and wrap it in a tf.keras.layers.Lambda layer
exponential_layer = tf.keras.layers.Lambda(lambda x: tf.expo(x))

In [42]:
# build a custom stateful layer (i.e, a layer with weights)
class MyDense(tf.keras.layers.Layer):
    def __init__(self, units, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activation.get(activation)

    def build(self, batch_input_shape):
        self.kernel = self.add_weight(
            name="kernel", shape=[batch_input_shape[-1], self.units],
            initializer="glorot_normal")
        self.bias = self.add_weight(
            name="bias", shape=[self.units], initializer="zeros")

    def call(self, X):
        return self.activation(X @ self.kernel + self.bias)

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "units" : self.units,
                "activation" : tf.keras.activations.serialize(self.activation)}

    

In [43]:
# layer with multiple inputs
class MyMultiLayer(tf.keras.layers.Layer):
    def call(self, X):
        X1, X2 = X
        return X1 + X2,  X1 * X2, X1 / X2

In [44]:
# Layer that has a different behavior during training
class MyGaussianNoise(tf.keras.layers.Layer):
    def __init__(self, stddev, **kwargs):
        super().__init__(**kwargs)
        self.stddev = stddev

    def call(self, X, training=False):
        if training:
            noise = tf.random.normal(tf.shape(X), sddev=self.stddev)
            return X + noise
        return X

## Custom Models

In [45]:
# first create this residual layer
class ResidualBlock(tf.keras.layers.Layer):
    def __init__(self, n_layers, n_neurons, **kwargs):
        super().__init__(**kwargs)
        self.hidden = [tf.keras.layers.Dense(n_neurons, activation="relu",
                                            kernel_initializer="he_normal")
                        for _ in range(n_layers)]

    def call(self, inputs):
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        return inputs + Z

In [46]:
# using subclassing API to define model
class ResidualRegressor(tf.keras.Model):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden1 = tf.keras.layers.Dense(30, activation="relu",
                                            kernel_initializer="he_normal")
        self.block1 = ResidualBlock(2, 30)
        self.block2 = ResidualBlock(2, 30)
        self.out = tf.keras.layers.Dense(output_dim)

    def call(self, inputs):
        Z = self.hidden1(inputs)
        for _ in range(1 + 3):
            Z = self.block1(Z)
        Z = self.block2(Z)
        return self.out(Z)

### Losses and Metrics Based on Model Internals

In [47]:
# code for this custom model with a custom reconstruction loss and a corresponding metric
class ReconstructingRegressor(tf.keras.Model):
    def __init__(self, ouput_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden = [tf.keras.layer.Dense(30, activation="relu",
                                            kernel_initializer="he_normal")
                        for _ in range(5)]
        self.out = tf.keras.layers.Dense(output_dim)
        self.reconstruction_mean = tf.keras.metrics.Mean(
            name="reconstruction_error")

    def build(self, batch_input_shape):
        n_inputs = batch_input_shape[-1]
        self.reconstruct = tf.keras.layers.Dense(n_inputs)

    def call(self, inputs, training=False):
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
        reconstruction = self.reconstruct(Z)
        recon_loss = tf.reduce_mean(tf.square(reconstruction - inputs))
        self.add_loss(0.05 * recon_loss)
        if training:
            result = self.reconstruction_mean(recon_loss)
            self.add_metric(result)
        return sel.out(Z)

### Computing Gradients Using Autodiff

In [48]:
def f(w1, w2):
    return 3 * w1 ** 2 + 2 * w1 * w2

In [49]:
w1, w2 = 5, 3
eps = 1e-6
(f(w1 + eps, w2) - f(w1, w2)) / eps

36.000003007075065

In [50]:
(f(w1, w2 + eps) - f(w1, w2)) / eps

10.000000003174137

In [51]:
# using reverse-mode autodiff.
w1, w2 = tf.Variable(5.), tf.Variable(3.)
with tf.GradientTape() as tape:
    z = f(w1, w2)

gradients = tape.gradient(z, [w1, w2])

gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In order to save memory, only put the strict minimum inside the tf.GradientTape() block. Alternatively, pause recording by creating a with tape.stop_recording() block inside the tf.GradientTape() block

In [52]:
# tape is automatically erased after calling its gradient() once:
with tf.GradientTape() as tape:
    z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1) # returns tensor 36.0

try:
    dz_dw2 = tape.gradient(z, w2) # raises a RuntimeError!
except Exception as e:
    print(e)

A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)


In [53]:
# if you need to call gradient() more than once, you must make the tape persistent and delete it each time you are done with it to free resources
with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

dz_dw1 = tape.gradient(z, w1)
dz_dw2 = tape.gradient(z, w2)
del tape

In [54]:
# tape only keeps track of variables so if use constant tensors:
c1, c2 = tf.constant(5.), tf.constant(3.)
with tf.GradientTape() as tape:
    z = f(c1, c2)
gradients = tape.gradient(z, [c1, c2])
gradients

[None, None]

In [55]:
# you may ocasionally run into some numerical issues when computing gradients
x = tf.Variable(1e-50)
with tf.GradientTape() as tape:
    z = tf.sqrt(x)

tape.gradient(z, [x])

[<tf.Tensor: shape=(), dtype=float32, numpy=inf>]

In [56]:
# better implementation of softplus
def my_softplus(z):
    return tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0., z)

In [57]:
# in some rare cases, a numerically stable function may still have numerically unstable gradients. Have to tell TensorFlow which equation to use for the gradients
@tf.custom_gradient
def my_softplus(z):
    def my_softplus_gradients(grads): # grads = backprop'ed from upper layers
        return grads * (1 - 1 / (1 + tf.exp(z))) # stable grads of softplus

    result = tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0., z)
    return result, my_softplus_gradients
        

### Custom Training Loops

In [58]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

input_shape = X_train.shape[1:]

In [59]:
# build a simple model. There is not need to compile it
l2_reg = tf.keras.regularizers.l2(0.05)
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(input_shape),
    tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal",
                          kernel_regularizer=l2_reg),
    tf.keras.layers.Dense(1, kernel_regularizer=l2_reg)
])

In [60]:
# create a tiny function that will randomly sample a batch of instances from the training set
def random_batch(X, y, batch_size=32):
    idx = np.random.randint(len(X), size=batch_size)
    return X[idx], y[idx]

In [61]:
def print_status_bar(step, total, loss, metrics=None):
    metrics = " - ".join([f"{m.name}: {m.result():.4f}"
                          for m in [loss] + (metrics or[])])
    end = "" if step < total else "\n"
    print(f"\r{step}/{total} - " + metrics, end=end)

In [72]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
mean_loss = tf.keras.metrics.Mean(name="mean_loss")
metrics = [tf.keras.metrics.MeanAbsoluteError()]

In [73]:
for epoch in range(1, n_epochs + 1):
    print("Epoch {}/{}".format(epoch, n_epochs))
    for step in range(1, n_steps + 1):
        X_batch, y_batch = random_batch(X_train_scaled, y_train)
        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            loss = tf.add_n([main_loss] + model.losses)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        mean_loss(loss)
        for metric in metrics:
            metric(y_batch, y_pred)

        print_status_bar(step, n_steps, mean_loss, metrics)

        for metric in [mean_loss] + metrics:
            metric.reset_state()

Epoch 1/5
362/362 - mean_loss: 0.3882 - mean_absolute_error: 0.3808
Epoch 2/5
362/362 - mean_loss: 0.5888 - mean_absolute_error: 0.5676
Epoch 3/5
362/362 - mean_loss: 0.6170 - mean_absolute_error: 0.5002
Epoch 4/5
362/362 - mean_loss: 0.6372 - mean_absolute_error: 0.4944
Epoch 5/5
362/362 - mean_loss: 0.4913 - mean_absolute_error: 0.4201


In [74]:
# pasted code from Author's notebook. Shows how to use the tqdm package to display nice progress bars
# extra code – shows how to use the tqdm package to display nice progress bars

from tqdm.notebook import trange
from collections import OrderedDict

with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
            for step in steps:
                X_batch, y_batch = random_batch(X_train_scaled, y_train)
                with tf.GradientTape() as tape:
                    y_pred = model(X_batch)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    loss = tf.add_n([main_loss] + model.losses)

                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))

                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))

                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()

                steps.set_postfix(status)

        for metric in [mean_loss] + metrics:
            metric.reset_state()

All epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/362 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/362 [00:00<?, ?it/s]

## TensorFlow Functions and Graphs

In [65]:
def cube(x):
    return x ** 3

In [66]:
cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

In [67]:
tf_cube = tf.function(cube)
tf_cube

<tensorflow.python.eager.polymorphic_function.polymorphic_function.Function at 0x189a2c67ed0>

In [68]:
tf_cube(2)

<tf.Tensor: shape=(), dtype=int32, numpy=8>

In [69]:
tf_cube(tf.constant(2.0))

<tf.Tensor: shape=(), dtype=float32, numpy=8.0>

In [70]:
@tf.function
def tf_cube(x):
    return x ** 3

In [71]:
tf_cube.python_function(2)

8

## Exercises

1. a library for computational graphs. Efficient computation and ease of use. Jax and Pytorch.
2. No, TensorFlow is not a replacement for NumPy. The main difference is the TensorFlow main data structure is tensors and, for NumPy, is the arrays. TensorFlow also tries to perform computations efficiently automatically with whatever function we create.
3. Yes, as NumPy and TensorFlow make conversion of tensors and arrays very easy.
4. Queue, sparse tensors, ragged tensors, tensor arrays, sets
5. Function is for when we do not have a hyperparameter, subclassing is for when we want to save the hyperparameter when saving the model
6. Same answer as question 5's
7. Create a custom layer when we can do what we want in a single layer. Create a custom model when we want different layers to interact in a different way.
8. More control over the training loop, debugging, training in a different way than provided by TensorFlow.
9. Must be convertible to TF functions.
10. only use tf operations, if creating variables or datasets do so in the first line of code or outside function, do not use other libraries and not even pythond standard libraries.
11. When we want to use functions from other libraries. When compiling model, set eager execution to True. If all models are dynamic, they could become much less efficient.

12. Implement a custom layer that performs layer normalization:

    a. The build() method should define two trainable weights $\alpha$ and $\beta$, both of shape input_shape [-1:] and data type tf.float32. $\alpha$ should be initialized with 1s, and $\beta$ with 0s

    b. The call() method should compute the mean $\mu$ and standard deviation $\sigma$ of each instance's features. For this, you can use `tf.nn.moments(inputs, axes=-1, keepdims=True)`, which returns the mean $\mu$ and the variance $\sigma^2$ of all instances (compute the square root of the variance to get the standard deviation). Then the function should compute and return *α*⊗(*X* - μ)/(σ + ε) + *β*, where ⊗ represents itemwise multiplication (`*`) and ε is a smoothing term (small constant to avoid division by zero, e.g., 0.001).

    c. _Ensure that your custom layer produces the same (or very nearly the same) output as the `tf.keras.layers.LayerNormalization` layer._

In [85]:
class Layer_norm(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        
    def build(self, input_shape):
        self.alpha = self.add_weight(
            name="alpha", shape=input_shape[-1:],
            initializer="ones")
        self.beta = self.add_weight(
            name="beta", shape=input_shape[-1:],
            initializer="zeros")

    def call(self, X):
        mean, variance = tf.nn.moments(X, axes=-1, keepdims=True)

        return self.alpha * (X - mean) / (tf.sqrt(variance) + 0.0001) + self.beta
    

In [150]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

In [151]:
# Training on the California housing dataset
model_tf = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=X_train.shape[-1:]),
    tf.keras.layers.LayerNormalization(),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1)])

my_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=X_train.shape[-1:]),
    Layer_norm(),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1)])

In [152]:
model_tf.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=["RootMeanSquaredError"],
    optimizer="SGD")

my_model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=["RootMeanSquaredError"],
    optimizer="SGD")

In [153]:
history_tf = model_tf.fit(X_train, y_train, 
                       epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.2124 - loss: 1.4700 - val_RootMeanSquaredError: 1.1429 - val_loss: 1.3063
Epoch 2/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1557 - loss: 1.3356 - val_RootMeanSquaredError: 1.1873 - val_loss: 1.4097
Epoch 3/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1547 - loss: 1.3332 - val_RootMeanSquaredError: 1.1422 - val_loss: 1.3047
Epoch 4/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1518 - loss: 1.3266 - val_RootMeanSquaredError: 1.1618 - val_loss: 1.3499
Epoch 5/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1499 - loss: 1.3224 - val_RootMeanSquaredError: 1.1349 - val_loss: 1.2880
Epoch 6/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [154]:
my_history = my_model.fit(X_train, y_train, 
                       epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1869 - loss: 1.4087 - val_RootMeanSquaredError: 1.1514 - val_loss: 1.3257
Epoch 2/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1652 - loss: 1.3578 - val_RootMeanSquaredError: 1.1407 - val_loss: 1.3013
Epoch 3/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1594 - loss: 1.3443 - val_RootMeanSquaredError: 1.1454 - val_loss: 1.3120
Epoch 4/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1577 - loss: 1.3402 - val_RootMeanSquaredError: 1.1386 - val_loss: 1.2965
Epoch 5/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - RootMeanSquaredError: 1.1537 - loss: 1.3310 - val_RootMeanSquaredError: 1.1859 - val_loss: 1.4064
Epoch 6/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [155]:
model_tf.layers[0](X_train[:4])

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[-0.29537582, -0.16629328, -0.24113165, -0.2219448 ,  0.11663638,
        -0.11351932, -0.4765796 , -0.23885104],
       [-0.32881823, -0.21005034, -0.25207087, -0.2365596 ,  0.11811031,
        -0.1262686 , -0.58191866, -0.11767381],
       [-0.30603322, -0.13173658, -0.23529656, -0.22523968,  0.11615811,
        -0.11617221, -0.46847165, -0.2633689 ],
       [-0.251659  , -0.15667798, -0.22586706, -0.21513076,  0.11538444,
        -0.10295809, -0.43709666, -0.3030616 ]], dtype=float32)>

In [156]:
my_model.layers[0](X_train[:4])

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[-0.18491364, -0.18655874, -0.15223306, -0.06520963,  0.57598996,
        -0.22528985, -0.43214753, -0.30542195],
       [-0.21388632, -0.23079644, -0.16345519, -0.07829991,  0.5789715 ,
        -0.24051759, -0.52364355, -0.18486273],
       [-0.19414663, -0.15162255, -0.14624715, -0.06816083,  0.5750226 ,
        -0.22845843, -0.42510512, -0.32981485],
       [-0.14703983, -0.17683779, -0.13657376, -0.05910635,  0.5734575 ,
        -0.21267551, -0.39785323, -0.369305  ]], dtype=float32)>

In [157]:
model_tf.layers[0](X_train[:4]) - my_model.layers[0](X_train[:4])

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[-0.11046219,  0.02026546, -0.08889858, -0.15673517, -0.45935357,
         0.11177053, -0.04443207,  0.06657091],
       [-0.11493191,  0.0207461 , -0.08861569, -0.15825969, -0.4608612 ,
         0.11424899, -0.0582751 ,  0.06718893],
       [-0.11188659,  0.01988597, -0.08904941, -0.15707885, -0.45886445,
         0.11228622, -0.04336652,  0.06644595],
       [-0.10461918,  0.02015981, -0.0892933 , -0.15602441, -0.45807302,
         0.10971742, -0.03924343,  0.06624341]], dtype=float32)>

The difference looks pretty small to me

## 13. Train a model using a custom training loop to tackle the Fashion MNIST dataset
_The Fashion MNIST dataset was introduced in Chapter 10._

### a.
_Exercise: Display the epoch, iteration, mean training loss, and mean accuracy over each epoch (updated at each iteration), as well as the validation loss and accuracy at the end of each epoch._

### b.
_Exercise: Try using a different optimizer with a different learning rate for the upper layers and the lower layers._

In [179]:
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

In [185]:
len(model.trainable_variables)

12

In [186]:
def get_batch(X, y, batch_size):
    indexes = np.random.default_rng().integers(X.shape[0], size=batch_size)

    return X[indexes], y[indexes]

In [239]:
def print_status(batch, batches, loss, metric):
    print(f"\rBatch {batch}/{batches} - loss: {loss.result()} - Accuracy: {metric.result()} ", end="")

def print_val(loss, metric):
        print(f"- val_loss: {loss} - val_accuracy: {metric.result()}")

In [252]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=[28, 28]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.LayerNormalization(),
    tf.keras.layers.Dense(50, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(50, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(50, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(50, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(10, activation="softmax")
])

In [253]:
epochs = 10
batch_size = 32
batches = int(np.ceil(X_train.shape[0] / batch_size))
optimizer_upper = tf.keras.optimizers.SGD(1e-4)
optimizer_lower = tf.keras.optimizers.Adam(1e-3)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metric = tf.keras.metrics.Accuracy()
num_variables = len(model.trainable_variables)

In [254]:
for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}/{epochs}")
    
    for batch in range(1, batches + 1):
        X_batch, y_true = get_batch(X_train, y_train, batch_size)
        
        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)
            main_loss = tf.reduce_mean(loss_fn(y_true, y_pred))
            loss = tf.reduce_sum([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_weights)

        optimizer_lower.apply_gradients(zip(gradients[:num_variables//2], model.trainable_variables[:num_variables//2]))
        optimizer_upper.apply_gradients(zip(gradients[num_variables//2:], model.trainable_variables[num_variables//2:]))
        
        mean_loss(loss)
        accuracy = metric(y_true, tf.argmax(y_pred, axis=1))
        print_status(batch, batches, mean_loss, metric)
        
    metric.reset_state()
    y_pred_valid = model(X_valid)
    val_loss = tf.reduce_mean(loss_fn(y_valid, y_pred_valid))
    metric(y_valid, tf.argmax(y_pred_valid, axis=1))
    print_val(val_loss, metric)
    metric.reset_state()
    mean_loss.reset_state()
    

Epoch 1/10
Batch 1719/1719 - loss: 0.5173409581184387 - Accuracy: 0.8196625709533691 - val_loss: 0.43423324823379517 - val_accuracy: 0.843999981880188
Epoch 2/10
Batch 1719/1719 - loss: 0.4438561499118805 - Accuracy: 0.8689281344413757 - val_loss: 0.3812102973461151 - val_accuracy: 0.8611999750137329
Epoch 3/10
Batch 1719/1719 - loss: 0.40463095903396606 - Accuracy: 0.884289562702179 - val_loss: 0.4023357033729553 - val_accuracy: 0.8529999852180481
Epoch 4/10
Batch 1719/1719 - loss: 0.38221275806427 - Accuracy: 0.8871254920959473 - val_loss: 0.37169259786605835 - val_accuracy: 0.8669999837875366
Epoch 5/10
Batch 1719/1719 - loss: 0.3636491894721985 - Accuracy: 0.8953606486320496 - val_loss: 0.357223778963089 - val_accuracy: 0.8700000047683716
Epoch 6/10
Batch 1719/1719 - loss: 0.34774789214134216 - Accuracy: 0.9021960496902466 - val_loss: 0.3755651116371155 - val_accuracy: 0.8715999722480774
Epoch 7/10
Batch 1719/1719 - loss: 0.3351919949054718 - Accuracy: 0.9061227440834045 - val_loss

### Author's solutions:

1. TensorFlow is an open-source library for numerical computation, particularly well suited and fine-tuned for large-scale Machine Learning. Its core is similar to NumPy, but it also features GPU support, support for distributed computing, computation graph analysis and optimization capabilities (with a portable graph format that allows you to train a TensorFlow model in one environment and run it in another), an optimization API based on reverse-mode autodiff, and several powerful APIs such as tf.keras, tf.data, tf.image, tf.signal, and more. Other popular Deep Learning libraries include PyTorch, MXNet, Microsoft Cognitive Toolkit, Theano, Caffe2, and Chainer.
2. Although TensorFlow offers most of the functionalities provided by NumPy, it is not a drop-in replacement, for a few reasons. First, the names of the functions are not always the same (for example, `tf.reduce_sum()` versus `np.sum()`). Second, some functions do not behave in exactly the same way (for example, `tf.transpose()` creates a transposed copy of a tensor, while NumPy's `T` attribute creates a transposed view, without actually copying any data). Lastly, NumPy arrays are mutable, while TensorFlow tensors are not (but you can use a `tf.Variable` if you need a mutable object).
3. Both `tf.range(10)` and `tf.constant(np.arange(10))` return a one-dimensional tensor containing the integers 0 to 9. However, the former uses 32-bit integers while the latter uses 64-bit integers. Indeed, TensorFlow defaults to 32 bits, while NumPy defaults to 64 bits.
4. Beyond regular tensors, TensorFlow offers several other data structures, including sparse tensors, tensor arrays, ragged tensors, queues, string tensors, and sets. The last two are actually represented as regular tensors, but TensorFlow provides special functions to manipulate them (in `tf.strings` and `tf.sets`).
5. When you want to define a custom loss function, in general you can just implement it as a regular Python function. However, if your custom loss function must support some hyperparameters (or any other state), then you should subclass the `keras.losses.Loss` class and implement the `__init__()` and `call()` methods. If you want the loss function's hyperparameters to be saved along with the model, then you must also implement the `get_config()` method.
6. Much like custom loss functions, most metrics can be defined as regular Python functions. But if you want your custom metric to support some hyperparameters (or any other state), then you should subclass the `keras.metrics.Metric` class. Moreover, if computing the metric over a whole epoch is not equivalent to computing the mean metric over all batches in that epoch (e.g., as for the precision and recall metrics), then you should subclass the `keras.metrics.Metric` class and implement the `__init__()`, `update_state()`, and `result()` methods to keep track of a running metric during each epoch. You should also implement the `reset_state()` method unless all it needs to do is reset all variables to 0.0. If you want the state to be saved along with the model, then you should implement the `get_config()` method as well.
7. You should distinguish the internal components of your model (i.e., layers or reusable blocks of layers) from the model itself (i.e., the object you will train). The former should subclass the `keras.layers.Layer` class, while the latter should subclass the `keras.models.Model` class.
8. Writing your own custom training loop is fairly advanced, so you should only do it if you really need to. Keras provides several tools to customize training without having to write a custom training loop: callbacks, custom regularizers, custom constraints, custom losses, and so on. You should use these instead of writing a custom training loop whenever possible: writing a custom training loop is more error-prone, and it will be harder to reuse the custom code you write. However, in some cases writing a custom training loop is necessary⁠—for example, if you want to use different optimizers for different parts of your neural network, like in the [Wide & Deep paper](https://homl.info/widedeep). A custom training loop can also be useful when debugging, or when trying to understand exactly how training works.
9. Custom Keras components should be convertible to TF Functions, which means they should stick to TF operations as much as possible and respect all the rules listed in Chapter 12 (in the _TF Function Rules_ section). If you absolutely need to include arbitrary Python code in a custom component, you can either wrap it in a `tf.py_function()` operation (but this will reduce performance and limit your model's portability) or set `dynamic=True` when creating the custom layer or model (or set `run_eagerly=True` when calling the model's `compile()` method).
10. Please refer to Chapter 12 for the list of rules to respect when creating a TF Function (in the _TF Function Rules_ section).
11. Creating a dynamic Keras model can be useful for debugging, as it will not compile any custom component to a TF Function, and you can use any Python debugger to debug your code. It can also be useful if you want to include arbitrary Python code in your model (or in your training code), including calls to external libraries. To make a model dynamic, you must set `dynamic=True` when creating it. Alternatively, you can set `run_eagerly=True` when calling the model's `compile()` method. Making a model dynamic prevents Keras from using any of TensorFlow's graph features, so it will slow down training and inference, and you will not have the possibility to export the computation graph, which will limit your model's portability.

### 12.

In [255]:
class LayerNormalization(tf.keras.layers.Layer):
    def __init__(self, eps=0.001, **kwargs):
        super().__init__(**kwargs)
        self.eps = eps

    def build(self, batch_input_shape):
        self.alpha = self.add_weight(
            name="alpha", shape=batch_input_shape[-1:],
            initializer="ones")
        self.beta = self.add_weight(
            name="beta", shape=batch_input_shape[-1:],
            initializer="zeros")

    def call(self, X):
        mean, variance = tf.nn.moments(X, axes=-1, keepdims=True)
        return self.alpha * (X - mean) / (tf.sqrt(variance + self.eps)) + self.beta

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "eps": self.eps}

Note that making _ε_ a hyperparameter (`eps`) was not compulsory. Also note that it's preferable to compute `tf.sqrt(variance + self.eps)` rather than `tf.sqrt(variance) + self.eps`. Indeed, the derivative of sqrt(z) is undefined when z=0, so training will bomb whenever the variance vector has at least one component equal to 0. Adding _ε_ within the square root guarantees that this will never happen.

Let's create one instance of each class, apply them to some data (e.g., the training set), and ensure that the difference is negligeable.

In [256]:
X = X_train.astype(np.float32)

custom_layer_norm = LayerNormalization()
keras_layer_norm = tf.keras.layers.LayerNormalization()

tf.reduce_mean(tf.keras.losses.MeanAbsoluteError()(
    keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=4.682965837332631e-08>

Yep, that's close enough. To be extra sure, let's make alpha and beta completely random and compare again:

In [257]:
tf.keras.utils.set_random_seed(42)
random_alpha = np.random.rand(X.shape[-1])
random_beta = np.random.rand(X.shape[-1])

custom_layer_norm.set_weights([random_alpha, random_beta])
keras_layer_norm.set_weights([random_alpha, random_beta])

tf.reduce_mean(tf.keras.losses.MeanAbsoluteError()(
    keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=2.438514279390347e-08>

Still a negligeable difference! Our custom layer works fine.

## 13.

In [258]:
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32) / 255.
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test.astype(np.float32) / 255.

In [259]:
tf.keras.utils.set_random_seed(42)

In [260]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])

  super().__init__(**kwargs)


In [261]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [262]:
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
            for step in steps:
                X_batch, y_batch = random_batch(X_train, y_train)
                with tf.GradientTape() as tape:
                    y_pred = model(X_batch)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    loss = tf.add_n([main_loss] + model.losses)
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))                    
                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            y_pred = model(X_valid)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(tf.keras.metrics.sparse_categorical_accuracy(
                tf.constant(y_valid, dtype=np.float32), y_pred))
            steps.set_postfix(status)
        for metric in [mean_loss] + metrics:
            metric.reset_state()


All epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/1718 [00:00<?, ?it/s]

In [263]:
lower_layers = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(100, activation="relu"),
])
upper_layers = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="softmax"),
])
model = tf.keras.Sequential([
    lower_layers, upper_layers
])

In [264]:
lower_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4)
upper_optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-3)

In [265]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [266]:
with trange(1, n_epochs + 1, desc="All epochs") as epochs:
    for epoch in epochs:
        with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
            for step in steps:
                X_batch, y_batch = random_batch(X_train, y_train)
                with tf.GradientTape(persistent=True) as tape:
                    y_pred = model(X_batch)
                    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
                    loss = tf.add_n([main_loss] + model.losses)
                for layers, optimizer in ((lower_layers, lower_optimizer),
                                          (upper_layers, upper_optimizer)):
                    gradients = tape.gradient(loss, layers.trainable_variables)
                    optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
                del tape
                for variable in model.variables:
                    if variable.constraint is not None:
                        variable.assign(variable.constraint(variable))                    
                status = OrderedDict()
                mean_loss(loss)
                status["loss"] = mean_loss.result().numpy()
                for metric in metrics:
                    metric(y_batch, y_pred)
                    status[metric.name] = metric.result().numpy()
                steps.set_postfix(status)
            y_pred = model(X_valid)
            status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
            status["val_accuracy"] = np.mean(tf.keras.metrics.sparse_categorical_accuracy(
                tf.constant(y_valid, dtype=np.float32), y_pred))
            steps.set_postfix(status)
        for metric in [mean_loss] + metrics:
            metric.reset_state()

All epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/1718 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/1718 [00:00<?, ?it/s]

`The error above happens because the output is not saved when using the library for pretty prints.`