# Computing gradients with autodiff

# Importing libraries

In [1]:
import tensorflow as tf

- Defining a demo func

In [2]:
def f(w1, w2):
    return 3 * w1 ** 2 + 2 * w1 * w2

In [3]:
w1, w2 = 5, 3
eps = 1e-6 # Small difference

In [4]:
(f(w1 + eps, w2) - f(w1, w2)) / eps

36.000003007075065

In [5]:
(f(w1, w2 + eps) - f(w1, w2)) / eps

10.000000003174137

- Calling f() for each parameter in a NN is hard to control. Instead we will use autodiff.

In [6]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)

In [7]:
with tf.GradientTape() as tape: # Creating a tape that records the computations
    z = f(w1, w2)

In [8]:
gradients = tape.gradient(z, [w1, w2])

In [9]:
gradients

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [10]:
with tf.GradientTape() as tape:
    z = f(w1, w2)

In [11]:
dz_dw1 = tape.gradient(z, w1)
dz_dw1

<tf.Tensor: shape=(), dtype=float32, numpy=36.0>

In [12]:
dz_dw2 = tape.gradient(z, w1)
dz_dw2 # Gives error as the tape can be used only once.

RuntimeError: GradientTape.gradient can only be called once on non-persistent tapes.

In [13]:
with tf.GradientTape(persistent = True) as tape: # Setting the tape as persistent for multiple usage
    z = f(w1, w2)

In [14]:
tape.gradient(z, w1)

<tf.Tensor: shape=(), dtype=float32, numpy=36.0>

In [15]:
tape.gradient(z, w2)

<tf.Tensor: shape=(), dtype=float32, numpy=10.0>

In [16]:
del tape

In [17]:
c1, c2 = tf.constant(5.), tf.constant(3.)

In [18]:
with tf.GradientTape() as tape:
    z = f(c1, c2)

In [19]:
tape.gradient(z, [c1, c2]) # No gradients as tape by default track only variables.

[None, None]

In [20]:
with tf.GradientTape() as tape:
    tape.watch(c1) # Forcing tape to track tensors
    tape.watch(c2)
    z = f(c1, c2)

In [21]:
tape.gradient(z, [c1, c2])

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [22]:
with tf.GradientTape() as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)
tape.gradient([z1, z2, z3], [w1, w2]) # Gradients for multiple outputs. 

[<tf.Tensor: shape=(), dtype=float32, numpy=136.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=30.0>]

In [23]:
with tf.GradientTape(persistent = True) as tape:
    z1 = f(w1, w2 + 2.)
    z2 = f(w1, w2 + 5.)
    z3 = f(w1, w2 + 7.)
tf.reduce_sum(tf.stack([tape.gradient(z, [w1, w2]) for z in (z1, z2, z3)]), axis = 0) # The above cell is same as this one

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([136.,  30.], dtype=float32)>

In [24]:
with tf.GradientTape(persistent = True) as hessian_tape:
    with tf.GradientTape() as tape:
        z = f(w1, w2)
    jacobians = tape.gradient(z, [w1, w2])
hessians = [hessian_tape.gradient(jacobian, [w1, w2]) for jacobian in jacobians]

In [25]:
jacobians

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

In [26]:
hessians

[[<tf.Tensor: shape=(), dtype=float32, numpy=6.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>],
 [<tf.Tensor: shape=(), dtype=float32, numpy=2.0>, None]]

In [27]:
def f(w1, w2):
    return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2)

In [28]:
with tf.GradientTape() as tape:
    z = f(w1, w2)

In [29]:
tape.gradient(z, [w1, w2])

[<tf.Tensor: shape=(), dtype=float32, numpy=30.0>, None]

In [37]:
x = tf.Variable([100.])

In [38]:
# Softplus activation func
def softplus(z):
    return tf.math.log(tf.exp(z) + 1)

In [39]:
with tf.GradientTape() as tape:
    z = softplus(x)

In [40]:
tape.gradient(z, [x])

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>]

In [41]:
@tf.custom_gradient
def softplus_v2(z):
    exp = tf.exp(z)
    def softplus_v2_gradient(grad):
        return grad / (1 + 1 / exp)
    return tf.math.log(exp + 1), softplus_v2_gradient

In [42]:
def softplus_v2(z):
    return tf.where(z > 30., z, tf.math.log(tf.exp(z) + 1.))

In [43]:
x = tf.Variable([1000.])

In [47]:
with tf.GradientTape() as tape:
    z = softplus_v2(x)

In [48]:
z, tape.gradient(z, [x])

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1000.], dtype=float32)>,
 [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>])