# **AutoDiff**
Automatic differentiation is useful for implementing machine learning algorithms such as backpropagation for training neural networks.

#### **Setup**

In [13]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as pl

#### **Computing Gradients**

#### **Gradient Tapes**

In [24]:
# Define a function for comoutation
def fn(x_ten):
    return x_ten**2

x = tf.Variable(3.0)

with tf.GradientTape() as tape:
    # Use defined function to transform the input.
    y = fn(x)

# Use GradientTape() to compute gradient... 
tape.gradient(y, x).numpy()

6.0

In [None]:
### Gradients with nd-Variables
v1 = tf.Variable(tf.random.normal([3,3], 3, 9,seed= 3, name= 'var1'))
v2 = tf.Variable(tf.eye(3), name='var2')

w = [[1. ,2. ,3. ]]

# Recording operations onto `tape`
with tf.GradientTape() as tape:
    y = v1 * w + v2
    loss = tf.reduce_mean(y**2)
    
# Computing gradients w.r.t...


> **Using a dictionary of Variables with the same tape as above**

In [None]:
(dl_dv1, dl_dv2) = tape.gradient(loss, [v1, v2])
  
# np.argsort organizes the index bsaed on the sorted order of the tensor.
np.argsort(tf.nn.softmax(dl_dv1.numpy()))
dl_dv2.numpy()



In [42]:
my_vars = {
    'dl_dw': v1,
    'dl_db': v2
}

grad = tape.gradient(loss, my_vars)

for i in grad:
    print(f'The name of the tensor-Variable is: {i}')
    print(grad[i].numpy())
    print()

np.argsort(tf.nn.softmax(grad['dl_db']).numpy())

# We discovered a new problem here...  
# If this problem occurs 
# 'A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)'
# It means the tape has previously been used to compute the gradients, thus it can not be reused.




{'dl_dw': <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[  0.5811825 , -10.723121  ,   6.628027  ],
       [  2.0538814 ,   8.9651165 ,  38.633232  ],
       [  3.0487926 ,   0.34223557, -26.612106  ]], dtype=float32)>, 'dl_db': <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 0.5811825 , -5.3615603 ,  2.2093422 ],
       [ 2.0538814 ,  4.4825583 , 12.877744  ],
       [ 3.0487926 ,  0.17111778, -8.870702  ]], dtype=float32)>}


<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 0.5811825 , -5.3615603 ,  2.2093422 ],
       [ 2.0538814 ,  4.4825583 , 12.877744  ],
       [ 3.0487926 ,  0.17111778, -8.870702  ]], dtype=float32)>

##### **Model Gradients**

In [79]:
import tensorflow.keras.layers as lyrs
import tensorflow.keras.activations as act

D_layer = lyrs.Dense(32, activation= act.relu)
x_ten = tf.cast(tf.linspace([1,5], [10, -4], 10), tf.float16)
x_ten

with tf.GradientTape() as tape2:
    y = D_layer(x_ten)
    loss = tf.reduce_mean(y ** 2)
# Computing the gradient    
grad = tape2.gradient(loss, D_layer.trainable_variables)

# Displaying the Output
tf.nn.softmax(grad[0]).numpy()
np.argmax(tf.nn.softmax(grad[0]).numpy())


8

In [80]:
for var, g in zip(D_layer.trainable_variables, grad):
  print(f'{var.name}, shape: {g.shape}')


dense_23/kernel:0, shape: (2, 32)
dense_23/bias:0, shape: (32,)


##### **Controlling what the tape watches**
> The default behavior is to record all operations after accessing a **`trainable tf.Variable`**.

> Tape cannot compute gradients for tesnors as thet're immutables

In [86]:
# The only trainable Variable
var1 = tf.Variable(3.0)
# Non-trainanle
var2 = tf.Variable(5.0, trainable= False)

# A constant tensor (no grads for tensors)
ten1 = tf.constant(5.0)
# Variable + constant = constant, thus below is also a tensor
ten2 = tf.Variable(10.0) + 3.0

with tf.GradientTape() as tape3:
    y = (var1**2) + (var2**2) + (ten1**2) + ten2

grads = tape3.gradient(y, [var1, var2, ten1, ten2])

for gr in grads:
    print(gr)


tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


List Variables watched by the tape

In [93]:
print([var for var in tape3.watched_variables()])
[var.name for var in tape3.watched_variables()]


[<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=3.0>]


['Variable:0']

> **To record gradients with respect to a tf.Tensor, you need to call `GradientTape.watch(x)`**

In [120]:
rten = tf.random.uniform([3, 3], 0, 20, seed= 3)

tf.transpose(rten)

with tf.GradientTape() as ten_tape:
    ten_tape.watch(rten)
    y = rten @ (tf.transpose(rten) *  tf.random.normal([3, 3], 10, 5))

dy_dx = ten_tape.gradient(y, rten)
print(dy_dx)
print(tf.nn.softmax(dy_dx))
print(np.argmax(tf.nn.softmax(dy_dx)))


tf.Tensor(
[[842.26544 154.64954 706.1339 ]
 [926.0429  298.74033 326.2389 ]
 [845.9513  426.66037 588.9402 ]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]], shape=(3, 3), dtype=float32)
0


> **To disable default behaviuor of Gradient-Tape, use `watch_accessed_variables=False` paramater**

In [134]:
v1 = tf.Variable(31.4, name= 'var1')
v2 = tf.Variable(21.7, name= 'var2')

with tf.GradientTape(watch_accessed_variables= False) as gt:
    gt.watch(v2)
    y = v1**2 + v2**2

vars = {
    'Variable1' : v1,
    'Variable2' : v2
}

grz = gt.gradient(y, vars)

print("Printing Gradients for each variable")
for gr in grz:
    print(gr)
    print(grz[gr])
    print()


# Printing all watched variables
print([var for var in gt.watched_variables()])
print("This shows that only var2 is being watched")

Printing Gradients for each variable
Variable1
None

Variable2
tf.Tensor(43.4, shape=(), dtype=float32)

[<tf.Variable 'var2:0' shape=() dtype=float32, numpy=21.7>]
This shows that only var2 is being watched


##### **Intermediate Results**

In [139]:
xten = tf.constant([1, 3.2, 5.4])

with tf.GradientTape() as gt:
    gt.watch(xten)
    y = x ** 2
    z = y * x

gt.gradient(z, y).numpy() 

3.0

##### **Note on Performance**

##### **Gradients of non-scalar targets**

##### **Control Flow**

##### **Getting Gradient of None**

##### **No Gradient registered**

##### **Zeros instead of None**