In [1]:
import d2lzh as d2l
from mxnet import gluon, image, init, nd, contrib, autograd
from mxnet.gluon import loss as gloss, nn
import time
import os

In [3]:
def cls_predict(numAnchor, numClass):
    """Build the convolution that predicts class scores for every anchor.

    The layer has ``numAnchor * (numClass + 1)`` output channels and uses a
    3x3 kernel with padding 1.
    """
    out_channels = numAnchor * (numClass + 1)
    return nn.Conv2D(out_channels, kernel_size=3, padding=1)

In [4]:
def bbox_predict(numAnchor):
    """Build the convolution that predicts bounding-box values per anchor.

    The layer has ``numAnchor * 4`` output channels and uses a 3x3 kernel
    with padding 1.
    """
    out_channels = 4 * numAnchor
    return nn.Conv2D(out_channels, kernel_size=3, padding=1)

In [5]:
def forward(vec, level):
    """Initialize ``level`` and return the result of applying it to ``vec``.

    ``level.initialize()`` is called with no arguments before the call, so
    this helper is meant for quick experiments rather than for use inside a
    training loop.
    """
    level.initialize()
    output = level(vec)
    return output

In [6]:
def flat(prediction):
    """Move axis 1 of ``prediction`` to the end, then flatten the result.

    The axes are permuted with ``(0, 2, 3, 1)`` and ``flatten()`` is called
    on the permuted array.
    """
    channels_last = prediction.transpose((0, 2, 3, 1))
    return channels_last.flatten()

In [7]:
def concat(predictions):
    """Flatten every prediction with ``flat`` and join them along dim 1."""
    flattened = []
    for single_prediction in predictions:
        flattened.append(flat(single_prediction))
    return nd.concat(*flattened, dim=1)

In [19]:
def half_size_level(numChannels):
    """Build a block of two conv/batch-norm/ReLU stages and a 2x max-pool.

    Each convolution has ``numChannels`` output channels, a 3x3 kernel and
    padding 1; the block ends with ``nn.MaxPool2D(2)``.
    """
    block = nn.Sequential()
    for _ in range(2):
        conv = nn.Conv2D(numChannels, kernel_size=3, padding=1)
        norm = nn.BatchNorm(in_channels=numChannels)
        relu = nn.Activation('relu')
        block.add(conv, norm, relu)
    block.add(nn.MaxPool2D(2))
    return block

In [9]:
def base_net():
    """Stack three ``half_size_level`` blocks with 16, 32 and 64 channels."""
    backbone = nn.Sequential()
    stages = [half_size_level(channels) for channels in (16, 32, 64)]
    backbone.add(*stages)
    return backbone

In [10]:
def create_level(i):
    """Return the block used at level ``i`` of the detector.

    Level 0 is the base network, level 4 is a global max-pooling layer, and
    every other index gets a ``half_size_level`` with 128 channels.
    """
    if i == 0:
        return base_net()
    if i == 4:
        return nn.GlobalMaxPool2D()
    return half_size_level(128)

In [11]:
def level_forward(X, lev, size, ratio, cls_predictor, bbox_predictor):
    """Run one detector level and predict on its output.

    Args:
        X: input passed to ``lev``.
        lev: block applied to ``X`` to produce this level's output ``Y``.
        size: anchor sizes forwarded to ``MultiBoxPrior`` as ``sizes``.
        ratio: anchor ratios forwarded to ``MultiBoxPrior`` as ``ratios``.
        cls_predictor: block applied to ``Y`` to get class predictions.
        bbox_predictor: block applied to ``Y`` to get bounding-box predictions.

    Returns:
        Tuple ``(Y, anchors, cls_predictions, bbox_predictions)``.
    """
    Y = lev(X)
    anchors = contrib.ndarray.MultiBoxPrior(Y, sizes=size, ratios=ratio)
    cls_predictions = cls_predictor(Y)
    bbox_predictions = bbox_predictor(Y)
    # The original returned the misspelled name `bbox_predictons`, which
    # raised NameError on every call.
    return (Y, anchors, cls_predictions, bbox_predictions)

In [12]:
# Anchor sizes, one pair per detector level (five levels).
sizes = [
    [0.2, 0.272],
    [0.37, 0.447],
    [0.54, 0.619],
    [0.71, 0.79],
    [0.88, 0.961],
]
# The same three aspect ratios are reused for every level.
ratios = [[1, 2, 0.5]] * len(sizes)
# Anchors per position: number of sizes plus number of ratios, minus one.
numAnchors = len(sizes[0]) - 1 + len(ratios[0])

In [13]:
class SSD(nn.Block):
    """Five-level single-shot detector built from ``create_level`` blocks.

    For each level ``i`` in ``range(5)`` the block owns three children
    registered as attributes: ``lev_i`` (feature block), ``cls_i`` (class
    predictor) and ``bbox_i`` (bounding-box predictor). Anchor settings come
    from the module-level ``sizes``, ``ratios`` and ``numAnchors``.
    """

    def __init__(self, numClasses, **kwargs):
        """Create the per-level blocks for ``numClasses`` object classes."""
        super(SSD, self).__init__(**kwargs)
        self.numClasses = numClasses
        for i in range(5):
            # setattr on a Block registers the child under that attribute name.
            setattr(self, 'lev_%d' % i, create_level(i))
            setattr(self, 'cls_%d' % i, cls_predict(numAnchors, numClasses))
            setattr(self, 'bbox_%d' % i, bbox_predict(numAnchors))

    def forward(self, X):
        """Run all five levels on ``X``.

        Returns:
            Tuple ``(anchors, cls_predictions, bbox_predictions)`` where
            anchors are concatenated along dim 1, class predictions are
            flattened, concatenated and reshaped with
            ``(0, -1, numClasses + 1)``, and bounding-box predictions are
            flattened and concatenated.
        """
        anchors = [None] * 5
        cls_predictions = [None] * 5
        bbox_predictions = [None] * 5
        for i in range(5):
            # Each level consumes the previous level's output as its input.
            X, anchors[i], cls_predictions[i], bbox_predictions[i] = level_forward(
                X, getattr(self, 'lev_%d' % i), sizes[i], ratios[i],
                getattr(self, 'cls_%d' % i), getattr(self, 'bbox_%d' % i))

        # The original referenced the misspelled name `cls_predctions` here,
        # which raised NameError.
        all_anchors = nd.concat(*anchors, dim=1)
        all_cls = concat(cls_predictions).reshape((0, -1, self.numClasses + 1))
        all_bbox = concat(bbox_predictions)
        return (all_anchors, all_cls, all_bbox)

In [18]:
batch_size = 32

# NOTE(review): the training loop reads `train_iter`, but no visible cell
# defines it. Loading the d2lzh Pikachu detection set here is an assumption
# (it matches numClasses = 1) -- confirm this is the intended dataset.
train_iter, _ = d2l.load_data_pikachu(batch_size)

ctx = d2l.try_gpu()

# Build the network once and initialize it on the training device. The
# original initialized one SSD instance and then replaced it with a second,
# uninitialized one, so the trainer and the forward pass saw uninitialized
# parameters; it also never placed parameters on `ctx`.
net = SSD(numClasses = 1)
net.initialize(ctx = ctx)

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate' : 0.2, 'wd' : 5e-4})

# Loss for class predictions and loss for bounding-box predictions.
cls_loss = gloss.SoftmaxCrossEntropyLoss()
bbox_loss = gloss.L1Loss()

def compute_loss(cls_predictions, cls_labels, bbox_predictions, bbox_labels, bbox_masks):
    """Return the sum of the class loss and the masked bounding-box loss.

    Both the bounding-box predictions and labels are multiplied by
    ``bbox_masks`` before being passed to ``bbox_loss``.
    """
    masked_predictions = bbox_predictions * bbox_masks
    masked_labels = bbox_labels * bbox_masks
    class_term = cls_loss(cls_predictions, cls_labels)
    box_term = bbox_loss(masked_predictions, masked_labels)
    return class_term + box_term

NameError: name 'half_size_level' is not defined

In [None]:
def cls_eval(cls_predictions, cls_labels):
    """Count how many predicted classes equal their labels."""
    # Class scores sit on the last axis, so argmax must be taken over it.
    predicted_classes = cls_predictions.argmax(axis=-1)
    matches = predicted_classes == cls_labels
    return matches.sum().asscalar()

def bbox_eval(bbox_predictions, bbox_labels, bbox_masks):
    """Return the summed absolute masked difference between labels and predictions."""
    masked_error = (bbox_labels - bbox_predictions) * bbox_masks
    total_abs_error = masked_error.abs().sum()
    return total_abs_error.asscalar()

In [None]:
# Train for 20 epochs, accumulating classification accuracy and bounding-box
# absolute error per epoch, and report every fifth epoch.
for epoch in range(20):
    acc_sum, mae_sum, n, m = 0.0, 0.0, 0, 0
    train_iter.reset()  # read the data from the beginning
    start = time.time()
    for batch in train_iter:
        X = batch.data[0].as_in_context(ctx)
        Y = batch.label[0].as_in_context(ctx)
        with autograd.record():
            # Generate multi-scale anchors and predict a class and an offset
            # for each anchor.
            anchors, cls_preds, bbox_preds = net(X)
            # Label each anchor with a class and an offset. MultiBoxTarget is
            # given the class predictions with their last two axes swapped.
            bbox_labels, bbox_masks, cls_labels = contrib.nd.MultiBoxTarget(
                anchors, Y, cls_preds.transpose((0, 2, 1)))
            # Compute the loss from predicted and labelled classes/offsets.
            # The original called `calc_loss`, which is not defined in this
            # notebook; the loss function defined here is `compute_loss`.
            l = compute_loss(cls_preds, cls_labels, bbox_preds, bbox_labels,
                             bbox_masks)
        l.backward()
        trainer.step(batch_size)
        acc_sum += cls_eval(cls_preds, cls_labels)
        n += cls_labels.size
        mae_sum += bbox_eval(bbox_preds, bbox_labels, bbox_masks)
        m += bbox_labels.size

    if (epoch + 1) % 5 == 0:
        print('epoch %2d, class err %.2e, bbox mae %.2e, time %.1f sec' % (
            epoch + 1, 1 - acc_sum / n, mae_sum / m, time.time() - start))