In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets

In [18]:
# Load the MNIST train/test splits through keras.datasets.
# x_train: [60k, 28, 28] images
# y_train: [60k] labels
(x_train, y_train), (x_test, y_test) = datasets.mnist.load_data()

In [19]:
# Labels become an int32 tensor; images become a float32 tensor scaled by 1/255.
# NOTE: x_train is rebound in place, so running this cell a second time would
# divide the pixels by 255 again.
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
x_train = x_train / 255.
# Report shapes first, then dtypes, for both tensors.
print(x_train.shape, y_train.shape, x_train.dtype, y_train.dtype)

(60000, 28, 28) (60000,) <dtype: 'float32'> <dtype: 'int32'>


In [20]:
# Sanity check: print the (min, max) of the images, then of the labels.
for _split in (x_train, y_train):
    print(tf.reduce_min(_split), tf.reduce_max(_split))

tf.Tensor(0.0, shape=(), dtype=float32) tf.Tensor(1.0, shape=(), dtype=float32)
tf.Tensor(0, shape=(), dtype=int32) tf.Tensor(9, shape=(), dtype=int32)


In [21]:
# Pair each image with its label, then group the pairs into batches of 128.
train_db = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_db = train_db.batch(128)
# An iterator over the dataset yields one batch per next() call; pull a single
# batch to confirm the batched shapes.
train_iter = iter(train_db)
sample = next(train_iter)
print('batch: ', *(part.shape for part in sample))

batch:  (128, 28, 28) (128,)


In [44]:
# Three-layer fully connected network trained with hand-written SGD:
# [b, 784] => [b, 256] => [b, 128] => [b, 10]
# Each weight is [dim_in, dim_out]; each bias is [dim_out].
# stddev=0.1 keeps the initial weights small; per the original author's note,
# the loss became NaN when the initial variance was not controlled.
w1 = tf.Variable(tf.random.truncated_normal([784, 256], stddev=0.1))
b1 = tf.Variable(tf.zeros([256]))
w2 = tf.Variable(tf.random.truncated_normal([256, 128], stddev=0.1))
b2 = tf.Variable(tf.zeros([128]))
w3 = tf.Variable(tf.random.truncated_normal([128, 10], stddev=0.1))
b3 = tf.Variable(tf.zeros([10]))

# learning rate
lr = 1e-3

# Every trainable variable, in the order the gradients are requested below.
params = [w1, b1, w2, b2, w3, b3]

for epoch in range(10):  # 10 passes over train_db
    # The batch variables are deliberately NOT called x_train / y_train: reusing
    # those names would overwrite the full dataset tensors defined by the
    # earlier cells and leave only the last batch behind after training.
    for step, (x_batch, y_batch) in enumerate(train_db):
        # x_batch: [b, 28, 28], y_batch: [b]
        # Flatten each image: [b, 28, 28] => [b, 28*28]
        x_batch = tf.reshape(x_batch, [-1, 28 * 28])

        # tf.Variable objects are tracked by the tape automatically.
        with tf.GradientTape() as tape:
            # [b, 784] @ [784, 256] + [256] => [b, 256]
            # The bias broadcasts over the batch axis, exactly as in the two
            # layers below, so no explicit tf.broadcast_to is needed.
            h1 = tf.nn.relu(x_batch @ w1 + b1)
            # [b, 256] => [b, 128]
            h2 = tf.nn.relu(h1 @ w2 + b2)
            # [b, 128] => [b, 10] (no activation on the output layer)
            out = h2 @ w3 + b3

            # Labels: [b] => one-hot [b, 10]
            y_onehot = tf.one_hot(y_batch, depth=10)
            # MSE: mean of (y_onehot - out)^2 over all b*10 entries => scalar
            loss = tf.reduce_mean(tf.square(y_onehot - out))

        # Gradients of the loss w.r.t. every parameter, in `params` order.
        grads = tape.gradient(loss, params)
        # Plain SGD step, updating each variable in place: p = p - lr * grad
        for param, grad in zip(params, grads):
            param.assign_sub(lr * grad)

        if step % 100 == 0:
            print(epoch, step, 'loss:', float(loss))

0 0 loss: 0.32913342118263245
0 100 loss: 0.21052272617816925
0 200 loss: 0.18602678179740906
0 300 loss: 0.17267438769340515
0 400 loss: 0.16706109046936035
1 0 loss: 0.15964654088020325
1 100 loss: 0.1611134111881256
1 200 loss: 0.14721903204917908
1 300 loss: 0.14018478989601135
1 400 loss: 0.13996370136737823
2 0 loss: 0.13158008456230164
2 100 loss: 0.13802003860473633
2 200 loss: 0.12625950574874878
2 300 loss: 0.12181083858013153
2 400 loss: 0.12363128364086151
3 0 loss: 0.11458437144756317
3 100 loss: 0.12323091179132462
3 200 loss: 0.11275351047515869
3 300 loss: 0.11002743244171143
3 400 loss: 0.11279760301113129
4 0 loss: 0.1032678633928299
4 100 loss: 0.11280091851949692
4 200 loss: 0.1033032163977623
4 300 loss: 0.10175305604934692
4 400 loss: 0.10502896457910538
5 0 loss: 0.09511105716228485
5 100 loss: 0.10500464588403702
5 200 loss: 0.0962817370891571
5 300 loss: 0.0956498384475708
5 400 loss: 0.09923167526721954
6 0 loss: 0.08898349106311798
6 100 loss: 0.0989358276128