# Automatic Differentiation and Gradients
Automatic differentiation is useful for implementing machine learning algorithms such as backpropagation for training neural networks.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

## Gradient Tapes

In [3]:
# Example with a scalar
x = tf.Variable(3.0)

# Operations executed inside the `with` block are recorded on the tape.
with tf.GradientTape() as tape:
    y = x**2

# Calculate the gradient: dy/dx = 2x, evaluated at x = 3.0
dy_dx = tape.gradient(y, x)
dy_dx.numpy()

6.0

In [4]:
# Example with a tensor
# w: 3x2 weights drawn from a normal distribution; b: length-2 bias of zeros.
w = tf.Variable(tf.random.normal((3, 2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
# A single input row (1x3), given as a nested Python list.
x = [[1., 2., 3.]]

# persistent=True allows `tape.gradient` to be called more than once on this tape.
with tf.GradientTape(persistent=True) as tape:
    y = x @ w + b
    loss = tf.reduce_mean(y**2)

# Passing a list of sources returns a list of gradients in the same order.
[dl_dw, dl_db] = tape.gradient(loss, [w, b])

# The gradient with respect to w has the same shape as w.
print(w.shape)
print(dl_dw.shape)

(3, 2)
(3, 2)


In [6]:
# Example with a simple model: a Dense layer with 2 units and ReLU activation
layer = tf.keras.layers.Dense(2, activation='relu')
x = tf.constant([[1., 2., 3.]])

with tf.GradientTape() as tape:
    # Forward pass
    y = layer(x)
    loss = tf.reduce_mean(y**2)

# Calculate gradients with respect to every trainable variable
grad = tape.gradient(loss, layer.trainable_variables)

# Print each variable's name next to the shape of its gradient
for var, g in zip(layer.trainable_variables, grad):
    print(f'{var.name}, shape: {g.shape}')

dense_1/kernel:0, shape: (3, 2)
dense_1/bias:0, shape: (2,)


## What Do Tapes Watch?

The default behavior is to record all operations after accessing a <u>trainable</u> <u>`tf.Variable`</u>. In the following example, a gradient is calculated only for `x0`: the non-trainable `tf.Variable` (`x1`) is not watched, the `tf.Tensor` (`x2`) is not "watched" by default, and the constant `x3` is never used in the calculation of `y`:

In [7]:
# A trainable variable
x0 = tf.Variable(3.0, name='x0')
# Not trainable
x1 = tf.Variable(3.0, name='x1', trainable=False)
# Not a Variable: A variable + tensor returns a tensor.
x2 = tf.Variable(2.0, name='x2') + 1.0 # a scalar tensor
# Not a Variable
x3 = tf.constant(3.0, name='x3')

# Note: x3 does not appear in y at all.
with tf.GradientTape() as tape:
    y = (x0**2) + (x1**2) + (x2**2)

# Request the gradient of y with respect to all four sources.
grad = tape.gradient(y, [x0, x1, x2, x3])

# Only the first entry (x0) is a tensor; the others print as None.
for g in grad:
    print(g)

tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


In [8]:
# List the variables being watched by the tape
# (`tape` here is the one from the previous cell; only x0 is listed).
[var.name for var in tape.watched_variables()]

['x0:0']

To record gradients with respect to a `tf.Tensor`, you need to call `GradientTape.watch(x)`:

In [9]:
# x is a constant tensor, so the tape must be told explicitly to watch it.
x = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)
    y = x**2

# dy = 2x * dx, so dy/dx = 6.0 at x = 3.0
dy_dx = tape.gradient(y, x)
print(dy_dx.numpy())

6.0


Conversely, to disable the default behavior of watching all `tf.Variables`, set `watch_accessed_variables=False` when creating the gradient tape.

In [10]:
# Uses two variables, but only connects the gradient for one of the variables
x0 = tf.Variable(0.0)
x1 = tf.Variable(10.0)

# With automatic watching disabled, only x1 is watched (explicitly, below).
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(x1)
    y0 = tf.math.sin(x0)
    y1 = tf.nn.softplus(x1)
    y = y0 + y1
    ys = tf.reduce_sum(y)

# Since GradientTape.watch was not called on x0, no gradient is computed with respect to it:
# dys/dx1 = exp(x1) / (1 + exp(x1)) = sigmoid(x1)
# The sources are passed as a dict, so the result is a dict with the same keys.
grad = tape.gradient(ys, {'x0': x0, 'x1': x1})

print('dy/dx0:', grad['x0'])
print('dy/dx1:', grad['x1'].numpy())

dy/dx0: None
dy/dx1: 0.9999546


## Gradients of Non-scalar Targets
A gradient is fundamentally an operation on a scalar. Thus, if you ask for the gradient of multiple targets, the result for each source is:
- The gradient of the sum of the targets, or equivalently
- The sum of the gradients of each target.



In [11]:
x = tf.Variable(2.0)
with tf.GradientTape() as tape:
    y0 = x**2   # gradient = 2x
    y1 = 1 / x  # gradient = -(1/(x^2))

# Two targets: the result is the sum of their gradients.
# At x = 2.0 this is 2*2 + (-1/4) = 3.75.
print(tape.gradient({'y0': y0, 'y1': y1}, x).numpy())

3.75


Similarly, if the target(s) are not scalar, the gradient of the sum is calculated:

In [12]:
x = tf.Variable(2.)

with tf.GradientTape() as tape:
    y = x * [3., 4.]  # gradient = 3.0 + 4.0 since y = [3.0 * x, 4.0 * x]

# y has two elements; the gradient returned is that of their sum.
print(tape.gradient(y, x).numpy())

7.0


## Cases where `gradient` returns `None`
When a target is not connected to a source, `gradient` will return `None`.


In [14]:
x = tf.Variable(2.)
y = tf.Variable(3.)

with tf.GradientTape() as tape:
    z = y * y   # z is not connected to x
# z depends only on y, so the gradient with respect to x is None.
print(tape.gradient(z, x))

None


### Replaced a `Variable` with a `Tensor`
The tape will automatically watch a `tf.Variable` but not a `tf.Tensor`. Hence, one common error is to inadvertently replace a `tf.Variable` with a `tf.Tensor`, instead of using `Variable.assign` to update the `tf.Variable`.

In [15]:
x = tf.Variable(2.0)

# Deliberately buggy loop: after the first iteration `x` is no longer a variable.
for epoch in range(2):
    with tf.GradientTape() as tape:
        y = x + 1

    # Show the current type of x next to the gradient the tape returns.
    print(type(x).__name__, ":", tape.gradient(y, x))
    x = x + 1   # This should be `x.assign_add(1)`; `x + 1` rebinds x to a tensor

ResourceVariable : tf.Tensor(1.0, shape=(), dtype=float32)
EagerTensor : None


### Did calculations outside of TensorFlow
The tape can't record the gradient path if the calculation exits TensorFlow.

In [16]:
x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
    x2 = x**2

    # This step is calculated with NumPy rather than TensorFlow,
    # so the tape has no record of how y was derived from x2.
    y = np.mean(x2, axis=0)

    # Like most ops, reduce_mean will cast the NumPy array
    # to a constant tensor by using `tf.convert_to_tensor`.
    y = tf.reduce_mean(y, axis=0)

# The path from x to y was broken by the NumPy step, so this prints None.
print(tape.gradient(y, x))

None


### Took Gradients Through an Integer or String
Integers and strings are not differentiable. If a calculation path uses these data types there will be no gradient.

Nobody expects `strings` to be differentiable, but it's easy to accidentally create an `int` constant or variable if you don't specify the `dtype`.


In [17]:
# No dtype is specified and the value is a Python int, so x is an integer constant.
x = tf.constant(10)

with tf.GradientTape() as g:
    g.watch(x)
    y = x * x

# Integers are not differentiable, so the gradient is None even though x is watched.
print(g.gradient(y, x))



None


### Took gradients through a stateful object
State stops gradients. When you read from a stateful object, the tape can only observe the current state, not the history that led to it.

A `tf.Tensor` is immutable. You can't change a tensor once it's created. It has a value, but no state. All the operations discussed so far are also stateless: the output of a `tf.matmul` only depends on its inputs.

A `tf.Variable` has internal state—its value. When you use the variable, the state is read. It's normal to calculate a gradient with respect to a variable, but the variable's state blocks gradient calculations from going farther back. For example:

In [18]:
x0 = tf.Variable(3.0)
x1 = tf.Variable(0.0)

with tf.GradientTape() as tape:
    # Update x1 = x1 + x0.
    x1.assign_add(x0)
    # The tape starts recording from x1.
    y = x1**2   # y = (x1 + x0)**2

# This doesn't work: x0 only contributed to x1's state, which blocks the gradient.
# Mathematically dy/dx0 = 2*(x1 + x0), but the tape returns None.
print(tape.gradient(y, x0))

None


# Reference
- [Automatic Differentiation and Gradients](https://www.tensorflow.org/guide/autodiff)