In [25]:
import os
import sys
sys.path.insert(0,'../')
import numpy as np
import mxnet as mx
from mxnet import nd,autograd,nd,init,contrib,gluon
from mxnet.gluon import nn,data as gdata,loss as gloss
import gluonbook as gb
import time

In [2]:
def cls_predictor(num_anchors,num_classes):
    return nn.Conv2D(channels=num_anchors*(num_classes+1),kernel_size=3,padding=1)

In [3]:
def bbox_predictor(num_anchors):
    return nn.Conv2D(channels=num_anchors*4,kernel_size=3,padding=1)

In [4]:
def down_sample_blk(num_channels):
    blk = nn.Sequential()
    blk.add(nn.BatchNorm(),nn.Activation('relu'),
           nn.Conv2D(channels=num_channels//2,kernel_size=1,strides=1),
            nn.BatchNorm(),nn.Activation('relu'),
           nn.Conv2D(channels=num_channels,kernel_size=3,strides=1,padding=1))
    
    blk.add(nn.MaxPool2D(pool_size=2))
    return blk

In [5]:
def go_forward(net):
    X = nd.uniform(shape=(1,3,256,256),ctx=mx.gpu())
    print('original shape =',X.shape)
    net.initialize(ctx=mx.gpu())
    print('ouput shape =',net(X))

In [6]:
go_forward(down_sample_blk(128))

original shape = (1, 3, 256, 256)
ouput shape = 
[[[[ 0.00630182 -0.00287076  0.00978196 ...  0.01074085 -0.00739565
     0.00426991]
   [ 0.03914755  0.03505199  0.0104829  ...  0.01344586  0.02638256
     0.00646955]
   [-0.00102503  0.01640964  0.00349047 ...  0.0041425   0.01326667
     0.00690887]
   ...
   [ 0.00793571  0.00656327  0.02307148 ... -0.00805367  0.00988335
     0.00761725]
   [-0.00037713  0.02453185 -0.00191945 ...  0.02554989  0.02533143
     0.01285604]
   [ 0.00565997  0.0209941   0.02889473 ...  0.0109336   0.01575508
     0.02148684]]

  [[ 0.02334058  0.02347069  0.02158521 ...  0.03161862  0.02083528
     0.02181804]
   [ 0.02808554  0.0289668   0.01944274 ...  0.0201186   0.03380858
     0.02204946]
   [ 0.02078616  0.01712939  0.01900282 ...  0.0360853   0.01502372
     0.02443051]
   ...
   [ 0.02227004  0.02596958  0.01991541 ...  0.03325571  0.02531973
     0.02893921]
   [ 0.01430886  0.03260048  0.0250055  ...  0.02686636  0.02567566
     0.02485057]


## 下面定义基础网络

In [7]:
def vgg_blk(num_channels):
    blk = nn.Sequential()
    for _ in range(2):
        blk.add(nn.BatchNorm(),
                nn.Activation('relu'),
                nn.Conv2D(channels=num_channels,kernel_size=3,strides=1,padding=1))
    blk.add(nn.BatchNorm(),nn.Activation('relu'),
            nn.Conv2D(channels=num_channels,kernel_size=1,strides=1))
    blk.add(nn.MaxPool2D(pool_size=2,strides=2))
    
    return blk

In [8]:
def vgg_16():
    vgg_16 = nn.Sequential()
    conv = (32,64,128,256,512,512)
    vgg_16.add(nn.BatchNorm(),nn.Activation('relu'),
              nn.Conv2D(channels=conv[0],kernel_size=7,strides=1,padding=3),
              nn.BatchNorm(),nn.Activation('relu'),
              nn.Conv2D(channels=conv[0],kernel_size=3,strides=1,padding=1))   
    #vgg_16.add(nn.MaxPool2D(pool_size=2,strides=2))    
    for i in range(3):
        vgg_16.add(vgg_blk(conv[i+1]))
    
    #最后两层换成卷积
    vgg_16.add(nn.BatchNorm(),nn.Activation('relu'),
               nn.Conv2D(channels=conv[4],kernel_size=3,strides=1,padding=1),
               nn.BatchNorm(),nn.Activation('relu'),
               nn.Conv2D(channels=conv[5],kernel_size=1))
    
    return vgg_16
    

In [9]:
go_forward(vgg_16())

original shape = (1, 3, 256, 256)
ouput shape = 
[[[[-2.75644776e-03 -2.35306984e-03 -2.24168017e-03 ... -2.56139808e-03
    -2.81981169e-03 -1.22101826e-03]
   [-5.48647530e-03 -5.05096698e-03 -4.28996235e-03 ... -4.17764438e-03
    -4.30802256e-03 -6.90575762e-05]
   [-6.43548416e-03 -4.16394323e-03 -4.53535188e-03 ... -4.68625873e-03
    -4.37358208e-03  1.33916023e-04]
   ...
   [-6.14779489e-03 -4.17199451e-03 -4.12868755e-03 ... -4.11138544e-03
    -3.49153206e-03  2.03876101e-04]
   [-5.46118012e-03 -3.93041270e-03 -3.67938401e-03 ... -3.56514822e-03
    -3.08740884e-03  4.02582926e-04]
   [-3.27647454e-03 -2.12497846e-03 -1.43123732e-03 ... -1.60587695e-03
    -1.50453037e-04  2.03067623e-03]]

  [[-1.79876387e-03 -3.07843229e-03 -2.76406342e-03 ... -2.36281753e-03
    -2.50654551e-03 -1.12260714e-04]
   [-2.94301950e-04 -8.19804147e-04 -6.19313738e-04 ... -8.39802611e-04
    -5.12642320e-04 -2.97139370e-04]
   [ 3.78639816e-04 -2.73079204e-04  3.87289649e-04 ...  4.23942547e-0

In [10]:
#把通道数换到最后
def flatten_pred(pred):
    return pred.transpose((0,2,3,1)).flatten()

def concat_pred(preds):
    return nd.concat(*[flatten_pred(pred) for pred in preds],dim=1)

## 定义SSD中前向运算函数

In [11]:
def blk_forward(net,X,sizes,ratios,cls_predictor,bbox_predictor):
    #首先计算这一层的输出
    Y = net(X)
    #生成默认锚框
    anchors = contrib.nd.MultiBoxPrior(Y,sizes=sizes,ratios=ratios)
    #进行预测
    cls_preds = cls_predictor(Y)
    bbox_preds = bbox_predictor(Y)
    #输出生成的默认锚框，类别预测，偏移量预测
    return (Y,anchors,cls_preds,bbox_preds)

In [12]:
def get_blk(i):
    if i==0:
        return vgg_16()
    elif i==4:
        return nn.GlobalMaxPool2D()
    else:
        return down_sample_blk(256)

In [13]:
class SSD(nn.Block):
    def __init__(self,num_classes,**kwargs):
        super(SSD,self).__init__(**kwargs)
        self.num_classes = num_classes           #需要预测的总类别数
        self.sizes = [[0.2, 0.272], [0.37, 0.447], [0.54, 0.619], [0.71, 0.79],
                      [0.88, 0.961]]
        self.ratios = [[0.35, 1.5, 0.75]] * 5                      #超参数，每一层我们需要生成的锚框宽高比和大小比例
        #下面定义每一层的网络
        for i in range(5):
            #这一层的每个像素的锚框个数
            num_anchors = len(self.sizes[i])+len(self.ratios[i])-1
            #卷积层
            setattr(self,'blk_%d' % i,get_blk(i))
            #类别预测
            setattr(self,'cls_pred_%d' % i,cls_predictor(num_anchors,num_classes))
            #锚框偏移量预测
            setattr(self,'bbox_pred_%d' % i ,bbox_predictor(num_anchors))
    
    def forward(self,X):
        #前向运算函数
        anchors,cls_preds,bbox_preds= [],[],[]
        
        for i in range(5):
            #进行一次前向运算
            X,anchor,cls_pred,bbox_pred = blk_forward(getattr(self,'blk_%d' % i),X,self.sizes[i],self.ratios[i],
                                                     getattr(self,'cls_pred_%d' % i),
                                                     getattr(self,'bbox_pred_%d' % i))
            #将输出追加到结果中
            anchors.append(anchor)
            cls_preds.append(cls_pred)
            bbox_preds.append(bbox_pred)
        #最后将结果输出
        return(nd.concat(*anchors,dim=1),
               concat_pred(cls_preds).reshape((0,-1,self.num_classes+1)),
               concat_pred(bbox_preds))

In [14]:
tiny_ssd = SSD(num_classes=1)

In [15]:
class smooth_L1Loss(gloss.Loss):
    def __init__(self,sigma=0.5,weight=None,batch_axis=0,data_axis=-1,**kwargs):
        super(smooth_L1Loss,self).__init__(weight,batch_axis,**kwargs)
        self.axis = data_axis
        self._sigma = sigma
       
    #loss里面有很多函数可以自己来定义loss
    def hybrid_forward(self,F,pred,label,sample_weight=None): 
        loss = F.smooth_l1((pred-label),scalar=self._sigma)
        loss = gluon.loss._apply_weighting(F,loss,self._weight,sample_weight)
        return F.mean(loss,axis = self._batch_axis,exclude =True)

In [16]:
class focal_SoftMaxCrossEntropy(gloss.Loss):
    def __init__(self,gamma=2.0,data_axis=-1,batch_axis=0,
                sparse_label = True,from_logits = False,eps =1e-5,weight=None,**kwargs):
        super(focal_SoftMaxCrossEntropy,self).__init__(weight,batch_axis,**kwargs)
        self._gamma = 2.0
        self._axis = data_axis
        self._sparse_label = sparse_label
        self._from_logits = from_logits
        self._eps = eps
    
    def hybrid_forward(self,F,pred,label,smaple_weight = None):
        if not self._from_logits:
            #计算一次softmax
            pred = F.softmax(pred,axis = self._axis)
        if self._sparse_label:
            #这里keep_dim是为了后面与 smaple_weight相乘
            pred = nd.pick(pred,label,axis=self._axis,keepdims=True)
            loss = -((1-pred)**self._gamma)*F.log(pred+self._eps)
        loss  = gluon.loss._apply_weighting(F,loss,self._weight,smaple_weight)
        return F.mean(loss,axis = self._batch_axis,exclude=True)            

## 定义优化器

In [18]:
tiny_SSD = SSD(num_classes=1)
ctx = gb.try_gpu()
ctx

gpu(0)

In [19]:
tiny_SSD.initialize(init = init.Xavier(),ctx=ctx)
trainer = gluon.Trainer(tiny_SSD.collect_params(),'sgd',{'learning_rate':0.2,'wd':5e-4})

## 定义评价函数

In [20]:
def cls_eval(cls_pred,cls_label):
    return (cls_pred.argmax(axis=-1)==cls_label).mean().asscalar()

def bbox_eval(bbox_pred,bbox_label,bbox_mask):
    return (bbox_pred*bbox_mask-bbox_label*bbox_mask).abs().mean().asscalar()


In [21]:
smooth_L1 = smooth_L1Loss()
focal_loss = focal_SoftMaxCrossEntropy()

## 获取训练数据

In [26]:
batch_size = 32
train_iter,test_iter =gb.load_data_pikachu(batch_size)
train_iter.reshape(label_shape=(3,5))

In [24]:
num_epoches = 5

In [28]:
train_iter.next().data[0]


[[[[109. 106. 106. ... 135. 136. 135.]
   [109. 106. 106. ... 136. 136. 136.]
   [108. 106. 105. ... 136. 136. 137.]
   ...
   [ 79.  56.  74. ... 128. 127. 131.]
   [ 95.  98.  99. ... 139. 139. 135.]
   [125. 121. 140. ... 126. 127. 131.]]

  [[ 98.  95.  95. ... 126. 127. 126.]
   [ 98.  95.  95. ... 128. 128. 128.]
   [ 97.  95.  94. ... 128. 128. 129.]
   ...
   [ 72.  49.  67. ...  76.  75.  79.]
   [ 88.  91.  92. ...  83.  83.  79.]
   [121. 117. 136. ...  78.  79.  81.]]

  [[ 80.  77.  77. ... 109. 110. 109.]
   [ 80.  77.  77. ... 109. 109. 109.]
   [ 79.  77.  76. ... 109. 109. 110.]
   ...
   [ 67.  44.  62. ...  43.  41.  45.]
   [ 80.  83.  84. ...  46.  45.  41.]
   [119. 115. 134. ...  35.  36.  38.]]]


 [[[119. 120. 120. ... 169. 132. 127.]
   [119. 120. 122. ... 172. 164. 156.]
   [119. 120. 122. ... 167. 158. 161.]
   ...
   [ 58.  58.  50. ...  72.  66.  64.]
   [ 56.  69.  66. ...  71.  65.  64.]
   [ 62.  78.  84. ...  71.  67.  68.]]

  [[200. 199. 198. ... 13

In [30]:
def train(num_epoches):
    for epoch in range(num_epoches):
        start = time.time()
        train_cls_acc = 0
        train_bbox_loss = 0
        train_iter.reset()

        for i,batch in enumerate(train_iter):
            #获取小批量数据
            X = batch.data[0].as_in_context(ctx)
            Y = batch.label[0].as_in_context(ctx)
            with autograd.record():
                #前向运算
                anchors,cls_preds,bbox_preds = tiny_SSD(X)
                #标记锚框获得标签,这里还可以设置负采样
                bbox_labels,bbox_masks,cls_labels= contrib.nd.MultiBoxTarget(anchors,Y,cls_preds.transpose((0,2,1)))
                #计算损失
                l_cls = focal_loss(cls_preds,cls_labels)
                l_bbox = smooth_L1(bbox_preds*bbox_masks,bbox_labels*bbox_masks)
                l_total = l_cls+l_bbox
            #反向传播
            l_total.backward()
            #迭代参数
            trainer.step(batch_size)
            train_cls_acc += cls_eval(cls_preds,cls_labels)
            train_bbox_loss += bbox_eval(bbox_preds,bbox_labels,bbox_masks)
        #训练完epoch输出结果
        print('epoch %2d , train_cls_acc %.2f , bbox mae %.2e , time %.1f sec' %
             (epoch+1,train_cls_acc/(i+1),train_bbox_loss/(i+1),time.time()-start))