In [107]:
# Gradient descent using manually approximated derivatives
# Derivatives are approximated numerically with finite differences

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [108]:
# Numerical (approximate) differentiation
def f(x):
    """Evaluate the quadratic 3x^2 + 2x - 1.

    Its analytical derivative is 6x + 2, which the numerical estimates in
    this notebook can be checked against.
    """
    quadratic_term = 3. * x ** 2
    linear_term = 2. * x
    return quadratic_term + linear_term - 1


def approximate_derivative(f, x, eps=1e-4):
    """Estimate df/dx at ``x`` with a central finite difference.

    Args:
        f: Callable of one argument.
        x: Point at which the derivative is estimated.
        eps: Half-width of the symmetric interval around ``x``.

    Returns:
        (f(x + eps) - f(x - eps)) / (2 * eps)
    """
    value_ahead = f(x + eps)
    value_behind = f(x - eps)
    interval_width = 2. * eps
    return (value_ahead - value_behind) / interval_width


# f'(x) = 6x + 2, so the estimate at x = 1 should be close to 8.
print(approximate_derivative(f, 1.))

7.999999999994678


In [109]:
# Partial derivatives
def g(x, y):
    """Evaluate the two-variable function (x + 5) * y^2."""
    shifted_x = x + 5
    y_squared = y ** 2
    return shifted_x * y_squared


def approximate_gradient(g, x, y, eps=1e-3):
    """Estimate both partial derivatives of a two-argument function.

    Each partial is obtained by holding the other argument fixed and handing
    the resulting one-argument function to ``approximate_derivative``.

    Args:
        g: Callable taking two arguments ``(x, y)``.
        x: First coordinate of the evaluation point.
        y: Second coordinate of the evaluation point.
        eps: Step passed through to ``approximate_derivative``.

    Returns:
        Tuple ``(dg_dx, dg_dy)``.
    """
    def vary_first(value):
        # y is held fixed while the first argument varies.
        return g(value, y)

    def vary_second(value):
        # x is held fixed while the second argument varies.
        return g(x, value)

    partials = (
        approximate_derivative(vary_first, x, eps),
        approximate_derivative(vary_second, y, eps),
    )
    return partials


# Analytically dg/dx = y**2 = 9 and dg/dy = 2 * (x + 5) * y = 42 at (2, 3).
print(approximate_gradient(g, 2, 3))

(8.999999999993236, 41.999999999994486)


In [110]:
# Differentiation with TensorFlow's tf.GradientTape. This is automatic
# differentiation, not a finite-difference approximation like the cells above.
x = tf.Variable(2.)
y = tf.Variable(3.)
# A non-persistent tape can only be used for one gradient computation
with tf.GradientTape() as tape:
    z = g(x, y)
# Partial derivatives with respect to both variables
dz_dx = tape.gradient(z, [x, y])  # NOTE: despite its name, this holds [dz/dx, dz/dy]
print(dz_dx)
print('-' * 50)
print(x)
print('-' * 50)
try:
    # Second gradient call on the same non-persistent tape: raises RuntimeError
    dz_dy = tape.gradient(z, y)
except RuntimeError as ex:
    print(ex)

[<tf.Tensor: shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: shape=(), dtype=float32, numpy=42.0>]
--------------------------------------------------
<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=2.0>
--------------------------------------------------
A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)


In [111]:
# Computing several gradients from the same tape
x = tf.Variable(2.)
y = tf.Variable(3.)
# persistent=True lets gradient() be called more than once; the tape is then
# not released automatically and is dropped manually (see `del tape` below)
with tf.GradientTape(persistent=True) as tape:
    z = g(x, y)
dz_x = tape.gradient(z, x)
dz_y = tape.gradient(z, y)
print(dz_x, dz_y)
print('-' * 50)
dz_dx_dy = tape.gradient(z, [x, y])  # convenient for differentiating a loss w.r.t. many weights
print(dz_dx_dy)
print('-' * 50)
print(type(dz_dx_dy))
del tape

tf.Tensor(9.0, shape=(), dtype=float32) tf.Tensor(42.0, shape=(), dtype=float32)
--------------------------------------------------
[<tf.Tensor: shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: shape=(), dtype=float32, numpy=42.0>]
--------------------------------------------------
<class 'list'>


In [112]:
# Several expressions can be recorded inside a single tape context; the
# gradient of the final result is back-propagated through all of them.
# (Passing a list of sources to tape.gradient returns several partials at once.)
x1 = tf.Variable(2.0)
x2 = tf.Variable(3.0)
with tf.GradientTape() as tape:
    # Use the variables defined in this cell rather than `x`/`y` left over
    # from an earlier cell, so the cell does not depend on hidden kernel state.
    z = g(x1, x2)
    z1 = z * z
# d(z**2)/dx1 = 2 * z * dz/dx1 = 2 * 63 * 9 = 1134
dz1_x1 = tape.gradient(z1, x1)
print(dz1_x1)

tf.Tensor(1134.0, shape=(), dtype=float32)


In [113]:
# Gradients with respect to constants: without an explicit watch the tape
# does not track tf.constant values, so both gradients come back as None
x = tf.constant(2.0)
y = tf.constant(3.0)
with tf.GradientTape() as tape:
    z = g(x, y)
dz_dx_dy = tape.gradient(z, [x, y])
print(dz_dx_dy)

[None, None]


In [114]:
# Gradients with respect to constants, made available by watching them explicitly
x = tf.constant(2.0)
y = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)  # ask the tape to track this constant
    tape.watch(y)
    z = g(x, y)
dz_dx_dy = tape.gradient(z, [x, y])
print(dz_dx_dy)

[<tf.Tensor: shape=(), dtype=float32, numpy=9.0>, <tf.Tensor: shape=(), dtype=float32, numpy=42.0>]


In [115]:
# Passing several targets to tape.gradient: the printed result (88) is the
# sum of the individual derivatives at x = 5
x = tf.Variable(5.)
with tf.GradientTape() as tape:
    z1 = 3 * x
    z2 = x ** 2
    z3 = x ** 3
print(tape.gradient([z1, z2, z3], x))  # 3 + 2*5 + 3*5**2 = 3 + 10 + 75

tf.Tensor(88.0, shape=(), dtype=float32)


In [116]:
# Second-order derivatives via nested tapes
x = tf.Variable(2.0)
y = tf.Variable(3.0)
with tf.GradientTape(persistent=True) as outer_tape:
    with tf.GradientTape(persistent=True) as inner_tape:
        z = g(x, y)
    # First-order gradients are computed inside outer_tape so that it records them
    inner_grads = inner_tape.gradient(z, [x, y])
outer_grads = [outer_tape.gradient(inner_grad, [x, y]) for inner_grad in inner_grads]
# Result order: d²z/dx², d²z/dxdy, d²z/dydx, d²z/dy².
# d²z/dx² is reported as None because dz/dx = y**2 does not depend on x.
print(outer_grads)
del inner_tape
del outer_tape

[[None, <tf.Tensor: shape=(), dtype=float32, numpy=6.0>], [<tf.Tensor: shape=(), dtype=float32, numpy=6.0>, <tf.Tensor: shape=(), dtype=float32, numpy=14.0>]]


In [117]:
# Simulating gradient descent (SGD) by hand
learning_rate = 0.1
# Arbitrary starting point, e.g. like a weight after Glorot-uniform initialisation
x = tf.Variable(-1.)
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    x.assign_sub(learning_rate * dz_dx)  # x <- x - learning_rate * dz/dx
# f'(x) = 6x + 2 vanishes at x = -1/3, which is where x ends up
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.33333334>


In [118]:
# Combining GradientTape with a Keras optimizer
learning_rate = 0.1
x = tf.Variable(2.0)
# Plain stochastic gradient descent. `learning_rate` is the supported keyword;
# the old `lr` alias is deprecated and rejected by newer Keras releases.
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    dz_dx = tape.gradient(z, x)
    # apply_gradients takes (gradient, variable) pairs and can update many
    # variables in one call; here it has the same effect as the manual
    # x.assign_sub(learning_rate * dz_dx) update.
    optimizer.apply_gradients([(dz_dx, x)])
print(x)
# Optimizers differ mainly in how the effective learning rate evolves during training

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.3333333>
