In [19]:
%matplotlib inline
from mxnet import autograd, contrib, init, nd
from mxnet.gluon import loss as gloss, nn, Trainer
import time
import mxnet as mx
from chapter9 import *

def cls_predictor(num_anchors, num_classes):
    """Build the class-prediction layer for one feature-map scale.

    Each anchor gets ``num_classes + 1`` outputs; class 0 stands for the
    background. The layer is a 3x3 convolution with padding 1, chosen so the
    output keeps the input's height and width. The prediction layout is
    (batch size, channels, height, width), where the channel axis holds the
    predictions of all anchors that share the same centre.
    """
    out_channels = num_anchors * (num_classes + 1)
    return nn.Conv2D(out_channels, kernel_size=3, padding=1)
    
def bbox_predictor(num_anchors):
    """Build the bounding-box prediction layer for one feature-map scale.

    Each anchor gets four offset outputs. The layer is a 3x3 convolution with
    padding 1, chosen so the output keeps the input's height and width. The
    prediction layout is (batch size, channels, height, width), where the
    channel axis holds the predictions of all anchors that share the same
    centre.
    """
    offsets_per_anchor = 4
    return nn.Conv2D(num_anchors * offsets_per_anchor, kernel_size=3, padding=1)

def forward(x, block):
    """Initialize ``block`` and return the result of calling it on ``x``."""
    block.initialize()
    output = block(x)
    return output


In [2]:
# Run class predictors on two all-zero feature maps of different scales:
# 8 channels at 20x20 with 5 anchors, and 16 channels at 10x10 with 3 anchors
# (10 classes each).
Y1 = forward(nd.zeros((2, 8, 20, 20)), cls_predictor(5, 10))
Y2 = forward(nd.zeros((2, 16, 10, 10)), cls_predictor(3, 10))
Y1.shape, Y2.shape
## Apart from the batch size, every other dimension differs between the two.
## They have to be reshaped to a common format and concatenated across scales
## so that later computation is simpler.



((2, 55, 20, 20), (2, 33, 10, 10))

In [3]:
def flatten_pred(pred):
    """Flatten one prediction map to (batch size, height * width * channels).

    The channel axis is moved to the last position first, because the batch
    size is the only dimension that stays the same across scales; the result
    is then flattened to two dimensions.
    """
    channels_last = pred.transpose((0, 2, 3, 1))
    return channels_last.flatten()

def concat_preds(preds):
    """Flatten every prediction in ``preds`` and join them along axis 1."""
    flattened = [flatten_pred(single_scale) for single_scale in preds]
    return nd.concat(*flattened, dim=1)


# Shape check: predictions from both scales joined into one 2-D array.
concat_preds([Y1, Y2]).shape



(2, 25300)

In [4]:
def down_sample_blk(num_channels):
    """Build a block that halves the height and width of its input.

    The block stacks two (3x3 convolution with padding 1, batch norm, ReLU)
    groups with ``num_channels`` output channels, followed by a 2x2 max
    pooling with stride 2 that performs the halving.
    """
    blk = nn.Sequential()
    layers = []
    for _ in range(2):
        layers.append(nn.Conv2D(num_channels, kernel_size=3, padding=1))
        layers.append(nn.BatchNorm(in_channels=num_channels))
        layers.append(nn.Activation('relu'))
    # The pooling layer is what halves the height and width.
    layers.append(nn.MaxPool2D(pool_size=2, strides=2))
    blk.add(*layers)
    return blk

# Shape check: a 6x10 input with 3 channels through a 10-channel down-sampling block.
forward(nd.zeros((2, 3, 6, 10)), down_sample_blk(10)).shape


(2, 10, 3, 5)

In [5]:
def base_net():
    """Build the base network from three stacked down-sampling blocks.

    The blocks use 16, 32 and 64 channels, so each stage halves the spatial
    size and doubles the channel count of the previous one.
    """
    blk = nn.Sequential()
    blk.add(*[down_sample_blk(width) for width in (16, 32, 64)])
    return blk
    
# Shape check: a 256x256 RGB-sized input through the base network.
forward(nd.zeros((2, 3, 256, 256)), base_net()).shape



(2, 64, 32, 32)

In [6]:
def get_blk(i):
    """Return the feature-extraction block for stage ``i``.

    Stage 0 is the base network, stage 4 is a global max pooling layer, and
    every other stage is a 128-channel down-sampling block.
    """
    if i == 0:
        return base_net()
    if i == 4:
        return nn.GlobalMaxPool2D()
    return down_sample_blk(128)
        

def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one stage: extract features, generate anchors, and predict.

    Returns a tuple ``(feature_map, anchors, cls_preds, bbox_preds)`` where
    the anchors come from ``MultiBoxPrior`` applied to the feature map with
    the given ``size`` and ``ratio`` lists, and the two predictions come from
    calling ``cls_predictor`` and ``bbox_predictor`` on the same feature map.
    """
    feature_map = blk(X)
    return (feature_map,
            contrib.nd.MultiBoxPrior(feature_map, sizes=size, ratios=ratio),
            cls_predictor(feature_map),
            bbox_predictor(feature_map))



In [7]:
# Anchor sizes, one pair per stage of the network (five stages).
sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
         [0.88, 0.961]]
# The same three aspect ratios are used at every stage.
ratios = [[1, 2, 0.5]] * 5
# Number of anchors passed to the predictors: sizes + ratios - 1.
# NOTE(review): presumably this matches the number of anchors MultiBoxPrior
# generates per position -- confirm against the MXNet documentation.
num_anchors = len(sizes[0]) + len(ratios[0]) - 1

class TinySSD(nn.Block):
    """Five-stage single-shot detector built from the blocks defined above.

    Stage ``i`` owns three layers stored as attributes ``blk_i`` (feature
    extraction), ``cls_i`` (class prediction) and ``bbox_i`` (bounding-box
    prediction). The module-level ``sizes``, ``ratios`` and ``num_anchors``
    configure the anchors.
    """

    def __init__(self, num_classes, **kwargs):
        super(TinySSD, self).__init__(**kwargs)
        self.num_classes = num_classes
        for i in range(5):
            stage_layers = (
                ('blk_%d', get_blk(i)),
                ('cls_%d', cls_predictor(num_anchors, num_classes)),
                ('bbox_%d', bbox_predictor(num_anchors)),
            )
            for pattern, layer in stage_layers:
                setattr(self, pattern % i, layer)

    def forward(self, X):
        """Return the anchors, class predictions and box predictions.

        The feature map produced by each stage is fed to the next stage.
        Anchors from all stages are concatenated along axis 1; class
        predictions are concatenated and reshaped so the last axis has
        ``num_classes + 1`` entries; box predictions are concatenated into a
        2-D array.
        """
        anchors, cls_preds, bbox_preds = [], [], []
        for i in range(5):
            blk, cls_layer, bbox_layer = (
                getattr(self, pattern % i)
                for pattern in ('blk_%d', 'cls_%d', 'bbox_%d'))
            X, stage_anchors, stage_cls, stage_bbox = blk_forward(
                X, blk, sizes[i], ratios[i], cls_layer, bbox_layer)
            anchors.append(stage_anchors)
            cls_preds.append(stage_cls)
            bbox_preds.append(stage_bbox)
        all_anchors = nd.concat(*anchors, dim=1)
        class_scores = concat_preds(cls_preds).reshape(
            (0, -1, self.num_classes + 1))
        box_offsets = concat_preds(bbox_preds)
        return (all_anchors, class_scores, box_offsets)


In [8]:
# Build a single-class detector and push an all-zero batch of 32 images
# (3 x 256 x 256) through it to inspect the three output shapes.
net = TinySSD(num_classes=1)
net.initialize()
X = nd.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output bbox preds:', bbox_preds.shape)


output anchors: (1, 5444, 4)
output class preds: (32, 5444, 2)
output bbox preds: (32, 21776)


In [20]:
# Training setup: data iterator, a fresh single-class model, and an SGD trainer.
batch_size = 32
# Only the first value returned by the loader (the training iterator) is kept.
train_iter, _ = load_data_pikachu(batch_size)
net = TinySSD(num_classes=1)
ctx = mx.cpu()
# NOTE(review): ctx is not passed to initialize(); the training loop moves the
# data to ctx explicitly, so confirm the parameters live on the same context
# if ctx is ever changed away from the CPU.
net.initialize(init=init.Xavier())
trainer = Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.2, 'wd': 5e-4})



In [15]:
## Define the loss functions.
# Softmax cross-entropy for the anchor class predictions.
cls_loss = gloss.SoftmaxCrossEntropyLoss()
## An L1-norm loss is used for the bounding-box offsets.
bbox_loss = gloss.L1Loss()

def calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels, bbox_masks):
    """Return the class loss plus the masked bounding-box loss.

    Both the predicted and the labelled offsets are multiplied by
    ``bbox_masks`` before the box loss is computed, so that negative anchors
    and padding anchors do not contribute to it. The two loss outputs are
    added as returned by ``cls_loss`` and ``bbox_loss``; no division by the
    number of samples is applied here.
    """
    class_term = cls_loss(cls_preds, cls_labels)
    masked_preds = bbox_preds * bbox_masks
    masked_labels = bbox_labels * bbox_masks
    return class_term + bbox_loss(masked_preds, masked_labels)
    

In [None]:
def cls_eval(cls_preds, cls_labels):
    """Count the anchors whose predicted class equals the label.

    The predicted class is the argmax over the last axis of ``cls_preds``.
    The result is a total, not a rate: it is not divided by the number of
    anchors.
    """
    predicted = cls_preds.argmax(axis=-1)
    hits = predicted == cls_labels
    return hits.sum().asscalar()

def bbox_eval(bbox_preds, bbox_labels, bbox_masks):
    """Sum the masked absolute errors of the bounding-box predictions.

    Because an L1-norm loss is used for training, the box predictions are
    evaluated by absolute error. The value returned is the total over all
    entries; the caller divides it to obtain the mean absolute error.
    """
    masked_error = (bbox_preds - bbox_labels) * bbox_masks
    return masked_error.abs().sum().asscalar()


In [None]:
## Train the model.
# NOTE(review): the recorded output of this cell reports `class err -inf`,
# which is not a valid error rate -- check acc_sum / n for non-finite values
# before trusting the reported metrics.
for epoch in range(20):
    # Per-epoch accumulators: correct class predictions and their count (n),
    # summed absolute box error and the number of box-label entries (m).
    acc_sum, mae_sum, n, m = 0.0, 0.0, 0, 0
    train_iter.reset()
    start = time.time()
    for batch in train_iter:
        X = batch.data[0].as_in_context(ctx)
        Y = batch.label[0].as_in_context(ctx)
        with autograd.record():
            ## Generate multi-scale anchors and predict a class and offsets for each anchor.
            anchors, cls_preds, bbox_preds = net(X)
            ## Label each anchor with a class and offsets.
            bbox_labels, bbox_masks, cls_labels = contrib.nd.MultiBoxTarget(
                anchors, Y, cls_preds.transpose((0, 2, 1)))   
            # Compute the loss from the predicted and labelled classes and offsets.
            lo = calc_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
                          bbox_masks)
        lo.backward()
        trainer.step(batch_size)
        acc_sum += cls_eval(cls_preds, cls_labels)
        n += cls_labels.size
        mae_sum += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
        m += bbox_labels.size
    
    # Report the metrics every fifth epoch; the time covers the current epoch only.
    if (epoch + 1) % 5 == 0:
        print('epoch %2d, class err %.2e, bbox mae %.2e, time %.1f sec' 
              %(epoch + 1, 1-  acc_sum / n, mae_sum / m, time.time() - start))
        




epoch  5, class err -inf, bbox mae 3.20e-03, time 276.3 sec


In [None]:
## This cell has not been run yet.
# NOTE(review): `image` is not imported in the visible import cell; presumably
# it is provided by `from chapter9 import *` -- verify before running.
img = image.imread('../img/pikachu.jpg')
# Resize to 256 x 256 and convert to float32 for the network.
feature = image.imresize(img, 256, 256).astype('float32')
# Move the last axis to the front and add a leading batch axis.
X = feature.transpose((2, 0, 1)).expand_dims(axis=0)

def predict(X):
    """Run the detector on ``X`` and return the kept detections of sample 0.

    The class predictions are turned into probabilities with softmax and
    passed, together with the box predictions and anchors, to
    ``MultiBoxDetection``, which removes similar predicted boxes through
    non-maximum suppression. Rows whose first element is -1 are dropped.

    Example of the detection output and of the filtered result:

        output:
        [[[ 0.    0.9   0.1   0.08  0.52  0.92]
          [ 1.    0.9   0.55  0.2   0.9   0.88]
          [-1.    0.8   0.08  0.2   0.56  0.95]
          [-1.    0.7   0.15  0.3   0.62  0.91]]]
        output[0, idx]:
        [[ 0.    0.9   0.1   0.08  0.52  0.92]
         [ 1.    0.9   0.55  0.2   0.9   0.88]]
    """
    anchors, cls_preds, bbox_preds = net(X)
    cls_probs = cls_preds.softmax().transpose((0, 2, 1))
    detections = contrib.nd.MultiBoxDetection(cls_probs, bbox_preds, anchors)
    kept = []
    for position, row in enumerate(detections[0]):
        if row[0].asscalar() != -1:
            kept.append(position)
    return detections[0, kept]
    
# Detections kept for the test image.
output = predict(X)


In [None]:
# Set the figure size used for the detection plot.
set_figsize((5, 5))
def display(img, output, threshold):
    """Show ``img`` and draw every detection whose score reaches ``threshold``.

    Args:
        img: image array; ``img.asnumpy()`` is shown with ``plt.imshow`` and
            ``img.shape[0:2]`` is read as (height, width).
        output: iterable of detection rows. Element 1 of a row is its score
            and the elements from index 2 onwards are box coordinates, which
            are multiplied by ``[w, h, w, h]`` before drawing.
        threshold: rows with a score below this value are skipped.
    """
    fig = plt.imshow(img.asnumpy())
    # The image size and the scaling array are identical for every row, so
    # they are built once instead of once per detection.
    h, w = img.shape[0:2]
    scale = nd.array([w, h, w, h])
    for row in output:
        score = row[1].asscalar()
        if score < threshold:
            continue
        bbox = [row[2:] * scale]
        show_bboxes(fig.axes, bbox, '%.2f' % score, 'w')

# Draw the detections with a score of at least 0.3 on the test image.
display(img, output, threshold=0.3)
