TF的eager execution是一个命令式编程环境，可以立即执行操作，无需先构建计算图（graph）。

操作返回具体的值，而不是构建一个计算图以便以后运行。

这使得初学TensorFlow、调试模型、减少模板代码变得很容易，而且很有趣！

Eager execution支持大多数的TensorFlow操作和GPU加速。

自动微分使用动态构建的tape代替静态图来计算梯度。

Eager execution是一个灵活的机器学习平台，用于研究和实验，提供：

- 直观的界面：自然地构造代码并使用Python数据结构。快速迭代小模型和小数据。
- 更容易调试：直接调用ops来检查运行的模型和测试更改。使用标准的Python调试工具进行即时错误报告。
- 自然的控制流：使用Python控制流而不是图控制流，包括对动态模型的支持。

# 安装和基本用法

##### 更新到TF 1.7版本

In [1]:
from __future__ import absolute_import, division, print_function
import tensorflow as tf
tf.enable_eager_execution()

In [2]:
x = [[2.]]
m = tf.matmul(x, x)
print("hello, {}".format(m))  # => "hello, [[4.]]"

hello, [[ 4.]]


Eager execution works nicely with NumPy.

In [4]:
a = tf.constant([[1, 2],[3, 4]])
print(a)
# => tf.Tensor([[1 2][3 4]], shape=(2, 2), dtype=int32)

# Broadcasting support
b = tf.add(a, 1)
print(b)
# => tf.Tensor([[2 3][4 5]], shape=(2, 2), dtype=int32)

# Operator overloading is supported
print(a * b)
# => tf.Tensor([[ 2  6][12 20]], shape=(2, 2), dtype=int32)

# Use NumPy values
import numpy as np

c = np.multiply(a, b)
print(c)
# => [[ 2  6]
#     [12 20]]

# Obtain numpy value from a tensor:
print(a.numpy())
# => [[1 2]
#     [3 4]]

tf.Tensor(
[[1 2]
 [3 4]], shape=(2, 2), dtype=int32)
tf.Tensor(
[[2 3]
 [4 5]], shape=(2, 2), dtype=int32)
tf.Tensor(
[[ 2  6]
 [12 20]], shape=(2, 2), dtype=int32)
[[ 2  6]
 [12 20]]
[[1 2]
 [3 4]]


In [6]:
import tensorflow.contrib.eager as tfe

## Eager training

#### Automatic differentiation

In [7]:
# A 1x1 variable; the tape below records the operations applied to it.
w = tfe.Variable([[1.0]])
with tfe.GradientTape() as tape:
  # loss = w^2, computed while the tape is active
  loss = w * w

# Ask the tape for d(loss)/dw; the sources are passed as a list, so a list
# of gradients comes back.
grad = tape.gradient(loss, [w])
print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]

[<tf.Tensor: id=40, shape=(1, 1), dtype=float32, numpy=array([[ 2.]], dtype=float32)>]


`tfe.GradientTape` records forward-pass operations so that gradients can be computed afterwards; the example below uses it to train a simple model:

In [8]:
# A toy dataset of points around 3 * x + 2
NUM_EXAMPLES = 1000
training_inputs = tf.random_normal([NUM_EXAMPLES])
noise = tf.random_normal([NUM_EXAMPLES])
training_outputs = training_inputs * 3 + 2 + noise

def prediction(input, weight, bias):
  """Evaluate the linear model ``input * weight + bias``."""
  scaled = input * weight
  return scaled + bias

# A loss function using mean-squared error
def loss(weights, biases):
  """Return the mean squared residual of the linear model on the toy data.

  Reads the module-level ``training_inputs`` and ``training_outputs``.
  """
  predicted = prediction(training_inputs, weights, biases)
  residual = predicted - training_outputs
  return tf.reduce_mean(tf.square(residual))

# Return the derivative of loss with respect to weight and bias
def grad(weights, biases):
  """Differentiate ``loss`` with respect to ``weights`` and ``biases``.

  Returns the result of ``tape.gradient`` for the two sources, in the
  order ``[weights, biases]``.
  """
  sources = [weights, biases]
  with tfe.GradientTape() as tape:
    current_loss = loss(weights, biases)
  return tape.gradient(current_loss, sources)

# Hyper-parameters for the hand-written gradient-descent loop below.
train_steps = 200
learning_rate = 0.01
# Start with arbitrary values for W and B on the same batch of data
W = tfe.Variable(5.)
B = tfe.Variable(10.)

print("Initial loss: {:.3f}".format(loss(W, B)))

for i in range(train_steps):
  # Gradients of the loss with respect to W and B at their current values.
  dW, dB = grad(W, B)
  # Plain gradient descent: subtract the scaled gradient in place.
  W.assign_sub(dW * learning_rate)
  B.assign_sub(dB * learning_rate)
  # Report progress every 20 steps.
  if i % 20 == 0:
    print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))

print("Final loss: {:.3f}".format(loss(W, B)))
print("W = {}, B = {}".format(W.numpy(), B.numpy()))

Initial loss: 68.371
Loss at step 000: 65.739
Loss at step 020: 30.182
Loss at step 040: 14.154
Loss at step 060: 6.922
Loss at step 080: 3.656
Loss at step 100: 2.180
Loss at step 120: 1.513
Loss at step 140: 1.210
Loss at step 160: 1.073
Loss at step 180: 1.011
Final loss: 0.983
W = 3.06265926361084, B = 2.1161582469940186


```
dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
                                              data.train.labels))
...
for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
  ...
  with tfe.GradientTape() as tape:
    logits = model(images, training=True)
    loss_value = loss(logits, labels)
  ...
  grads = tape.gradient(loss_value, model.variables)
  optimizer.apply_gradients(zip(grads, model.variables),
                            global_step=tf.train.get_or_create_global_step())
```

### Dynamic models

In [9]:
def line_search_step(fn, init_x, rate=1.0):
  """Take one backtracking step from ``init_x`` along the negative gradient.

  Candidate points ``init_x - rate * grad`` are tried, halving ``rate`` after
  each attempt, until ``fn`` at the candidate no longer exceeds
  ``fn(init_x) - rate * grad_norm`` (checked with the already-halved rate),
  where ``grad_norm`` is the sum of squared gradient entries.

  Args:
    fn: Callable evaluated at a point. Its result is used in a ``while``
      condition, so it is assumed to be a scalar.
    init_x: Starting point. It is watched on the tape explicitly.
    rate: Initial step size.

  Returns:
    A pair ``(x, value)``: the accepted point and ``fn`` evaluated there.
    When the loop never runs (e.g. the gradient is zero), ``init_x`` and
    ``fn(init_x)`` are returned.
  """
  with tfe.GradientTape() as tape:
    # Variables are automatically recorded, but manually watch a tensor
    tape.watch(init_x)
    value = fn(init_x)
  grad, = tape.gradient(value, [init_x])
  grad_norm = tf.reduce_sum(grad * grad)
  init_value = value
  # Bind x up front: with a zero gradient the loop condition is false on the
  # first check, and the original code then hit an unbound ``x`` at return.
  x = init_x
  while value > init_value - rate * grad_norm:
    x = init_x - rate * grad
    value = fn(x)
    rate /= 2.0
  return x, value

### Additional functions to compute gradients

In [10]:
def square(x):
  """Return ``x`` multiplied by itself via ``tf.multiply``."""
  squared = tf.multiply(x, x)
  return squared

grad = tfe.gradients_function(square)

square(3.)  # => 9.0
grad(3.)    # => [6.0]

# The second-order derivative of square:
gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
gradgrad(3.)  # => [2.0]

# The third-order derivative is None:
gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
gradgradgrad(3.)  # => [None]

# With flow control:
def abs(x):
  """Return ``x`` when it is strictly positive, otherwise its negation."""
  if x > 0.:
    return x
  return -x

grad = tfe.gradients_function(abs)

grad(3.)   # => [1.0]
grad(-3.)  # => [-1.0]

[<tf.Tensor: id=7430, shape=(), dtype=float32, numpy=-1.0>]

### Custom gradients

In [11]:
@tf.custom_gradient
def clip_gradient_by_norm(x, norm):
  """Pass ``x`` through unchanged while clipping the gradient flowing back.

  The forward result is ``tf.identity(x)``. On the backward pass the incoming
  gradient is clipped with ``tf.clip_by_norm(..., norm)``, and ``None`` is
  returned as the gradient for ``norm``.
  """
  def _clipped_backward(upstream):
    clipped = tf.clip_by_norm(upstream, norm)
    return [clipped, None]

  forward = tf.identity(x)
  return forward, _clipped_backward

In [12]:
def log1pexp(x):
  """Compute ``log(1 + exp(x))`` directly from ``tf.exp`` and ``tf.log``."""
  exp_x = tf.exp(x)
  return tf.log(1 + exp_x)
grad_log1pexp = tfe.gradients_function(log1pexp)

# The gradient computation works fine at x = 0.
grad_log1pexp(0.)  # => [0.5]

# However, x = 100 fails because of numerical instability.
grad_log1pexp(100.)  # => [nan]

[<tf.Tensor: id=7447, shape=(), dtype=float32, numpy=nan>]

In [13]:
@tf.custom_gradient
def log1pexp(x):
  """Compute ``log(1 + exp(x))`` together with a hand-written gradient.

  The backward function reuses the ``exp(x)`` computed in the forward pass
  and returns ``dy * (1 - 1 / (1 + exp(x)))``.
  """
  exp_x = tf.exp(x)

  def _backward(dy):
    return dy * (1 - 1 / (1 + exp_x))

  forward = tf.log(1 + exp_x)
  return forward, _backward

grad_log1pexp = tfe.gradients_function(log1pexp)

# As before, the gradient computation works fine at x = 0.
grad_log1pexp(0.)  # => [0.5]

# And the gradient computation also works at x = 100.
grad_log1pexp(100.)  # => [1.0]

[<tf.Tensor: id=7466, shape=(), dtype=float32, numpy=1.0>]

# Build and train models

## Build a model

In [14]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
  tf.keras.layers.Dense(10)
])

In [15]:
class MNISTModel(tf.keras.Model):
  """Keras model built from two 10-unit dense layers, the second used twice."""

  def __init__(self):
    super(MNISTModel, self).__init__()
    make_dense = tf.keras.layers.Dense
    self.dense1 = make_dense(units=10)
    self.dense2 = make_dense(units=10)

  def call(self, input):
    """Feed ``input`` through ``dense1`` and then through ``dense2`` twice."""
    hidden = self.dense1(input)
    # The second dense2 call reuses the variables from the dense2 layer.
    return self.dense2(self.dense2(hidden))

model = MNISTModel()

In [16]:
model

<__main__.MNISTModel at 0x1fc8d6169b0>

## Train a model

In [17]:
# Create a tensor representing a blank image
batch = tf.zeros([1, 1, 784])
print(batch.shape)  # => (1, 1, 784)

result = model(batch)
# => tf.Tensor([[[ 0.  0., ..., 0.]]], shape=(1, 1, 10), dtype=float32)

(1, 1, 784)


In [18]:
result

<tf.Tensor: id=7600, shape=(1, 1, 10), dtype=float32, numpy=array([[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]], dtype=float32)>

In [24]:
import dataset  # download dataset.py file

In [25]:
dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)

In [26]:
def loss(model, x, y):
  """Sparse softmax cross-entropy between labels ``y`` and ``model(x)``."""
  logits = model(x)
  return tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=y)

def grad(model, inputs, targets):
  """Return the gradients of ``loss`` with respect to ``model.variables``."""
  with tfe.GradientTape() as tape:
    current_loss = loss(model, inputs, targets)
  gradients = tape.gradient(current_loss, model.variables)
  return gradients

# Plain SGD; the optimizer applies the gradients computed by grad().
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)

# Pull a single batch to report the loss before any training happens.
x, y = tfe.Iterator(dataset_train).next()
print("Initial loss: {:.3f}".format(loss(model, x, y)))

# Training loop
for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
  # Calculate derivatives of the input function with respect to its parameters.
  grads = grad(model, x, y)
  # Apply the gradient to the model
  optimizer.apply_gradients(zip(grads, model.variables),
                            global_step=tf.train.get_or_create_global_step())
  # Report the loss on the current batch every 200 steps.
  if i % 200 == 0:
    print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))

# x and y still hold the last batch yielded by the loop above.
print("Final loss: {:.3f}".format(loss(model, x, y)))

Initial loss: 2.585
Loss at step 0000: 2.430
Loss at step 0200: 2.095
Loss at step 0400: 2.108
Loss at step 0600: 1.926
Loss at step 0800: 1.673
Loss at step 1000: 1.800
Loss at step 1200: 1.729
Loss at step 1400: 1.375
Loss at step 1600: 1.357
Loss at step 1800: 1.168
Loss at step 2000: 1.133
Loss at step 2200: 1.171
Loss at step 2400: 1.241
Loss at step 2600: 0.935
Loss at step 2800: 0.889
Loss at step 3000: 0.761
Loss at step 3200: 0.836
Loss at step 3400: 0.667
Loss at step 3600: 0.639
Loss at step 3800: 0.864
Loss at step 4000: 0.785
Loss at step 4200: 0.734
Loss at step 4400: 0.901
Loss at step 4600: 0.661
Loss at step 4800: 0.725
Loss at step 5000: 0.466
Loss at step 5200: 0.469
Loss at step 5400: 0.402
Loss at step 5600: 0.695
Loss at step 5800: 0.553
Loss at step 6000: 0.465
Loss at step 6200: 0.559
Loss at step 6400: 0.725
Loss at step 6600: 0.344
Loss at step 6800: 0.605
Loss at step 7000: 0.345
Loss at step 7200: 0.386
Loss at step 7400: 0.466
Final loss: 0.374


```
with tf.device("/gpu:0"):
  for (i, (x, y)) in enumerate(tfe.Iterator(dataset_train)):
    # minimize() is equivalent to the grad() and apply_gradients() calls.
    optimizer.minimize(lambda: loss(model, x, y),
                       global_step=tf.train.get_or_create_global_step())
```

## Variables and optimizers

In [27]:
class Model(tf.keras.Model):
  """Linear model holding a scalar weight ``W`` and a scalar bias ``B``."""

  def __init__(self):
    super(Model, self).__init__()
    # Initial values 5 and 10 for the weight and the bias respectively.
    self.W = tfe.Variable(5., name='weight')
    self.B = tfe.Variable(10., name='bias')

  def predict(self, inputs):
    """Return ``inputs * W + B``."""
    scaled = inputs * self.W
    return scaled + self.B

# A toy dataset of points around 3 * x + 2
NUM_EXAMPLES = 2000
training_inputs = tf.random_normal([NUM_EXAMPLES])
noise = tf.random_normal([NUM_EXAMPLES])
training_outputs = training_inputs * 3 + 2 + noise

# The loss function to be optimized
def loss(model, inputs, targets):
  """Mean squared difference between ``model.predict(inputs)`` and ``targets``."""
  residual = model.predict(inputs) - targets
  squared = tf.square(residual)
  return tf.reduce_mean(squared)

def grad(model, inputs, targets):
  """Return the gradients of ``loss`` with respect to ``model.W`` and ``model.B``."""
  sources = [model.W, model.B]
  with tfe.GradientTape() as tape:
    current_loss = loss(model, inputs, targets)
  return tape.gradient(current_loss, sources)

# Define:
# 1. A model.
# 2. Derivatives of a loss function with respect to model parameters.
# 3. A strategy for updating the variables based on the derivatives.
# Fresh model instance plus a plain SGD optimizer.
model = Model()
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)

print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))

# Training loop
for i in range(300):
  # Gradients come back in the order [W, B], matching the zip() below.
  grads = grad(model, training_inputs, training_outputs)
  optimizer.apply_gradients(zip(grads, [model.W, model.B]),
                            global_step=tf.train.get_or_create_global_step())
  # Report progress every 20 steps.
  if i % 20 == 0:
    print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))

print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))

Initial loss: 69.830
Loss at step 000: 67.067
Loss at step 020: 30.128
Loss at step 040: 13.857
Loss at step 060: 6.688
Loss at step 080: 3.527
Loss at step 100: 2.133
Loss at step 120: 1.518
Loss at step 140: 1.246
Loss at step 160: 1.126
Loss at step 180: 1.073
Loss at step 200: 1.050
Loss at step 220: 1.040
Loss at step 240: 1.035
Loss at step 260: 1.033
Loss at step 280: 1.032
Final loss: 1.032
W = 2.9924566745758057, B = 2.028473138809204


# Use objects for state during eager execution

In [29]:
# A metric object keeps its state between calls; each call adds values.
m = tfe.metrics.Mean("loss")
m(0)
m(5)
# Mean of the two values recorded so far.
m.result()  # => 2.5
# A list adds several values in one call.
m([8, 9])
# Mean of all four recorded values: 0, 5, 8, 9.
m.result()  # => 5.5

<tf.Tensor: id=672174, shape=(), dtype=float64, numpy=5.5>

## Performance

In [30]:
import time

def measure(x, steps):
  """Time ``steps`` chained ``tf.matmul`` self-products starting from ``x``.

  Returns the elapsed wall-clock time in seconds, as measured by
  ``time.time()``.
  """
  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
  tf.matmul(x, x)
  current = x
  began = time.time()
  for _ in range(steps):
    current = tf.matmul(current, current)
    # Make sure to execute op and not just enqueue it
    current.numpy()
  return time.time() - began

# Benchmark configuration: matrix shape and number of chained multiplications.
shape = (1000, 1000)
steps = 200
print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))

# Run on CPU:
with tf.device("/cpu:0"):
  # The random input is created inside the device scope as well.
  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))

# Run on GPU, if available:
if tfe.num_gpus() > 0:
  with tf.device("/gpu:0"):
    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
else:
  print("GPU: not found")

Time to multiply a (1000, 1000) matrix by itself 200 times:
CPU: 6.314788103103638 secs
GPU: not found


## Use eager execution in a graph environment

In [33]:
import tensorflow as tf
def my_py_func(x):
  """Multiply ``x`` by itself with ``tf.matmul``, print the result, return it."""
  product = tf.matmul(x, x)  # You can use tf ops
  print(product)  # but it's eager!
  return product

# NOTE(review): this cell is meant for a graph (non-eager) environment. The
# output recorded below is a RuntimeError from tf.placeholder(), because
# tf.enable_eager_execution() was already called earlier in this notebook.
# Presumably it has to run in a fresh process without eager mode - confirm.
with tf.Session() as sess:
  # Graph-mode input that is fed when the session runs.
  x = tf.placeholder(dtype=tf.float32)
  # Call eager function in graph!
  pf = tfe.py_func(my_py_func, [x], tf.float32)
  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]

RuntimeError: tf.placeholder() is not compatible with eager execution.