## ResNet18 or 34

总结：对于残差网络ResNet18层，学习率设置为0.001，对CIFAR100经过近100epoch的训练，由于梯度离散和过拟合的发生，loss值很小，准确率也没有明显提升；然后把学习率设置为0.01，结果出出现梯度爆炸现象；把训练网络换成了RetNet34，学习率设置为0.0001，学习效率很快。

etNet34，学习率设置为(1e-5)

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential, datasets, optimizers
from tensorflow import keras
import os
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

In [2]:
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.random.set_seed(2345)

In [3]:
# 定义 BasicBlock类（残差基本单元）
class BasicBlock(layers.Layer):
    # 其中 filter_num 是该卷积核通道的数量，是把输入进该卷积核的channel转换为数量为filter_num大小
    def __init__(self, filter_num, stride=1):
        super(BasicBlock, self).__init__()
        # 定义BasicBlock中的第一个卷积单元核，包括3*3的卷积、BachNom、ReLU。
        # 这个卷积核接受一个输入数据 [b, h, w, c],
        self.conv1 = layers.Conv2D(filter_num, kernel_size=[3, 3], strides=stride, padding='same')
        self.bn1 = layers.BatchNormalization()
        self.relu1 = layers.Activation('relu')

        # 定义BasicBlock中的第二个卷积单元变量
        # 其中strides 设置为1，表示为不间隔进行卷积操作，不减少featureMap
        self.conv2 = layers.Conv2D(filter_num, kernel_size=[3, 3], strides=1, padding='same')
        self.bn2 = layers.BatchNormalization()
        self.relu2 = layers.Activation('relu')

        # 定义残差网络中，短接的那一部分；分为stride为1和不为1两种情况，满足最后加和条件
        if stride != 1:
            self.downsample = Sequential()
            self.downsample.add(layers.Conv2D(filter_num, kernel_size=[1, 1], strides=stride))
        else:
            self.downsample = lambda x : x
            
    # call() 在调用该类时调用这个call() 方法，实现前向传播，在调用BasicBlock类对象时，执行该方法，
    # 第一个参数是输入该神经网络的数据 [b, h, w, c]
    def call(self, inputs, training=None):
        # inputs [b, h, w, c]
        out1 = self.conv1(inputs)
        out1 = self.bn1(out1)
        out1 = self.relu1(out1)
        
        out2 = self.conv2(out1)
        out2 = self.bn2(out2)
        
        identity_out = self.downsample(inputs)
        output = layers.add([out2, identity_out])    # 调用layers.add() 实现对应元素相加
        output = self.relu2(output)
        
        return output
    
# 定义RetNet 类（包括定义残差块build_RestBlock，包括两个基本的残差基本单元）
class RetNet(keras.Model):
    # 定义初始化函数
    def __init__(self, layer_dims, num_classes=100):
        super(RetNet, self).__init__()
        # 定义预处理的一个卷积层
        self.stem = Sequential([
            layers.Conv2D(64, kernel_size=[3, 3], strides=1),
            layers.BatchNormalization(),
            layers.Activation('relu'),
            layers.MaxPool2D(pool_size=(2, 2), strides=1, padding='same')
        ])
        # 创建4个RestBlock，指定每个的channel（filter数量），
        # 其中从第二个起设置stride=2，以达到减小featureMap的目的
        self.layer1 = self.build_RestBlock(64, layer_dims[0])
        self.layer2 = self.build_RestBlock(128, layer_dims[1], stride=2)
        self.layer3 = self.build_RestBlock(256, layer_dims[2], stride=2)
        self.layer4 = self.build_RestBlock(512, layer_dims[3], stride=2)
        # 考虑到全连接层输入决定于前面层的输出 [b, h, w, 512]，需要降维打平操作
        # 调用layers.GlobalAveragePooling2D()方法，简称GAP，全局池化层
        # 具体可参考：https://www.cnblogs.com/hutao722/p/10008581.html
        self.avgpool = layers.GlobalAveragePooling2D()
        # 创建全连接层，用来分类,参数是输出类别数
        self.fc = layers.Dense(num_classes)
    # 前向传播    
    def call(self, inputs, training=None):
        # 预处理卷积层
        x = self.stem(inputs)
        # 四个残差块，包括8个残差单元，一个经过16个卷积核
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        # 全池化层 [b, c]
        x = self.avgpool(x)
        # 全连接层 [b, 100]
        x = self.fc(x)
        
        return x
    
    # 定义残差块，包括两个残差基本单元，filter_num该残差块中通道的数量，blocks表示包含的残差基本单元数
    def build_RestBlock(self, filter_num, blocks, stride=1):
        # 定义神经网络容器Sequential
        res_blocks = Sequential()
        # 在res_blocks添加层（残差基本单元BasicBlock），可能发生下采样
        res_blocks.add(BasicBlock(filter_num, stride))
        # 
        for _ in range(1, blocks):    # 设置为从1到 blocks，其中包括1，但不包括 blocks
            # 不允许下采样，因为在残差单元中要保存featureMap不变，便于与identity进行加和操作
            res_blocks.add(BasicBlock(filter_num, stride=1))
            
        return res_blocks
    
# 定义ResNet18 [2, 2, 2, 2]
def resnet18():
    return RetNet([2, 2, 2, 2])
    
# 定义ResNet34 [3, 4, 6, 3]  
def resnet34():
    return RetNet([3, 4, 6, 3])

In [4]:
# 定义预处理函数
def preprocess(x, y):
    # [-0.5 ~ 0.5]
    x = tf.cast(x, dtype=tf.float32) / 255. - 0.5
    y = tf.cast(y, dtype=tf.int32)
    return x, y

# 加载数据集和数据集预处理
(x, y), (x_test, y_test) = datasets.cifar100.load_data()
y = tf.squeeze(y, axis=1)
y_test = tf.squeeze(y_test, axis=1)

train_db = tf.data.Dataset.from_tensor_slices((x, y))
train_db = train_db.shuffle(10000).map(preprocess).batch(512)
test_db = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_db = test_db.map(preprocess).batch(256)

# 查看数据集切片
sample = next(iter(train_db))
print('sample_train: ', sample[0].shape, sample[1].shape, tf.reduce_min(sample[0]),
     tf.reduce_max(sample[1]))


sample_train:  (512, 32, 32, 3) (512,) tf.Tensor(-0.5, shape=(), dtype=float32) tf.Tensor(99, shape=(), dtype=int32)


In [5]:
# 生成ResNet18网络
# model = resnet18()
# model.build(input_shape=(None, 32, 32, 3))
# model.summary()

# 生成ResNet34网络
model = resnet34()
model.build(input_shape=(None, 32, 32, 3))
model.summary()

# 定义优化器
optimizer = optimizers.Adam(lr=1e-5)

Model: "ret_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      multiple                  2048      
_________________________________________________________________
sequential_1 (Sequential)    multiple                  223104    
_________________________________________________________________
sequential_2 (Sequential)    multiple                  1119360   
_________________________________________________________________
sequential_4 (Sequential)    multiple                  6831360   
_________________________________________________________________
sequential_6 (Sequential)    multiple                  13123072  
_________________________________________________________________
global_average_pooling2d (Gl multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  5130

In [6]:

for epoch in range(500):
    for step, (x, y) in enumerate(train_db):
        with tf.GradientTape() as tape:   
            logits = model(x)   
            y_onehot = tf.one_hot(y, depth=100)
            # compute loss
            loss = tf.losses.categorical_crossentropy(y_onehot, logits, from_logits=True)
            loss = tf.reduce_mean(loss)
        
        # compute grades    
        grads = tape.gradient(loss, model.trainable_variables)
        # update variables 
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        # show loss
        if step % 100 == 0:
            print(epoch, step, 'loss: ', float(loss))
            
    # test    
    total_num = 0
    total_correct = 0
    for x, y in test_db:
        logits = model(x)
        prob = tf.nn.softmax(logits, axis=1)
        preb = tf.argmax(prob, axis=1)    # int64
        preb = tf.cast(preb, dtype=tf.int32)
        
        correct = tf.cast(tf.equal(preb, y), dtype=tf.int32)
        correct = tf.reduce_sum(correct)
        
        total_num += x.shape[0]
        total_correct += int(correct)
        
    acc = total_correct / total_num
    print(epoch, 'acc: ', acc)
    
    if epoch % 100 ==0:
        # 保存模型
        
        pass
    

0 0 loss:  4.612636089324951
0 acc:  0.0513
1 0 loss:  4.215523719787598
1 acc:  0.0881
2 0 loss:  3.9350132942199707
2 acc:  0.1167
3 0 loss:  3.7696704864501953
3 acc:  0.1314
4 0 loss:  3.6653172969818115
4 acc:  0.1488
5 0 loss:  3.5529751777648926
5 acc:  0.1618
6 0 loss:  3.4723737239837646
6 acc:  0.1783
7 0 loss:  3.384908676147461
7 acc:  0.1922
8 0 loss:  3.306673526763916
8 acc:  0.2046
9 0 loss:  3.2464451789855957
9 acc:  0.2138
10 0 loss:  3.1760494709014893
10 acc:  0.2186
11 0 loss:  3.1178526878356934
11 acc:  0.225
12 0 loss:  3.0586466789245605
12 acc:  0.2359
13 0 loss:  3.0066609382629395
13 acc:  0.2437
14 0 loss:  2.953023672103882
14 acc:  0.2483
15 0 loss:  2.9149017333984375
15 acc:  0.2482
16 0 loss:  2.8877527713775635
16 acc:  0.2571
17 0 loss:  2.8234970569610596
17 acc:  0.2604
18 0 loss:  2.784043550491333
18 acc:  0.2658
19 0 loss:  2.745532751083374
19 acc:  0.271
20 0 loss:  2.705897569656372
20 acc:  0.2774
21 0 loss:  2.6695454120635986
21 acc:  0.2

KeyboardInterrupt: 