In [1]:
import gluoncv as gcv
from gluoncv.data import VOCDetection
# Training data: the VOC 2007 and VOC 2012 'trainval' splits combined
# (the usual choice for Pascal VOC training).
train_dataset = VOCDetection(
    splits=[
        (2007, 'trainval'),
        (2012, 'trainval'),
    ],
)
# Validation data: the VOC 2007 'test' split.
val_dataset = VOCDetection(
    splits=[(2007, 'test')],
)



In [19]:
from gluoncv.data.transforms import presets
from gluoncv import utils
from mxnet import nd

# Both presets are configured with the same 416x416 target size; images are
# resized to it after all data augmentation.
width = height = 416
train_transform = presets.yolo.YOLO3DefaultTrainTransform(width=width, height=height)
val_transform = presets.yolo.YOLO3DefaultValTransform(width=width, height=height)

from gluoncv.data.batchify import Tuple, Stack, Pad
from mxnet.gluon.data import DataLoader

# Tutorial-sized settings: a small batch, and data loading in the main process.
# Raise num_workers (if the CPU has spare cores) to speed up data loading.
batch_size = 5
num_workers = 0

# Each sample is an (image, label) pair: images are stacked into one array,
# labels are padded with -1.
batchify_fn = Tuple(Stack(), Pad(pad_val=-1))

train_loader = DataLoader(
    train_dataset.transform(train_transform),
    batch_size=batch_size,
    shuffle=True,
    batchify_fn=batchify_fn,
    last_batch='rollover',
    num_workers=num_workers,
)
val_loader = DataLoader(
    val_dataset.transform(val_transform),
    batch_size=batch_size,
    shuffle=False,
    batchify_fn=batchify_fn,
    last_batch='keep',
    num_workers=num_workers,
)


In [20]:
# Inspect a single training batch.
#
# next(iter(...)) asks the loader for exactly one batch. An enumerate loop
# guarded by `if ib > 0: break` only reaches the break after a second batch
# has already been produced, so that batch is loaded and augmented for
# nothing, and `batch` ends up bound to the unprinted second batch.
batch = next(iter(train_loader))

# batch[0] holds the stacked images, batch[1] the padded labels.
print("batch: \n", batch)
print('\ndata 0:', batch[0][0].shape, 'label 0:', batch[1][0].shape)
print('data 1:', batch[0][1].shape, 'label 1:', batch[1][1].shape)


batch: 
 (
[[[[-2.0781863  -2.0867932  -2.1016858  ... -2.0924532  -2.110679
    -2.1217976 ]
   [-2.082326   -2.0919385  -2.1083825  ... -2.102681   -2.113593
    -2.1203184 ]
   [-2.0907009  -2.099584   -2.1145532  ... -2.117129   -2.1177082
    -2.1182153 ]
   ...
   [ 0.1587951   0.1705039   0.1909194  ... -1.1928699  -1.0560479
    -0.9681952 ]
   [ 0.14921437  0.15044498  0.1518703  ... -1.1236101  -0.9422763
    -0.82680804]
   [ 0.14448634  0.13695976  0.12260129 ... -1.0840803  -0.8678288
    -0.73110235]]

  [[-2.0402267  -2.040413   -2.0414455  ... -2.0219479  -2.0318546
    -2.0382063 ]
   [-2.0295973  -2.0338562  -2.0416274  ... -2.02714    -2.0333095
    -2.0372605 ]
   [-2.0140755  -2.0224178  -2.036789   ... -2.034539   -2.0353832
    -2.0359147 ]
   ...
   [ 0.08760366  0.09957328  0.12044462 ... -1.3115871  -1.2079707
    -1.1408606 ]
   [ 0.0778088   0.07906675  0.08052415 ... -1.2389289  -1.0907786
    -0.9958279 ]
   [ 0.07297537  0.06528092  0.05060168 ... -1.1956

In [21]:
from gluoncv import model_zoo
# Build the 'yolo3_darknet53_voc' model without pretrained base weights and
# initialize its parameters.
net = model_zoo.get_model('yolo3_darknet53_voc', pretrained_base=False)
net.initialize()

import mxnet as mx
# Run one all-zero 1x3x416x416 input through the network; the call returns
# class ids, scores and bounding boxes.
x = mx.nd.zeros(shape=(1, 3, 416, 416))
cids, scores, bboxes = net(x)

# Report the shape of each of the three outputs.
for caption, output in (("ids: ", cids), ("scores: ", scores), ("bboxes: ", bboxes)):
    print(caption, output.shape)


ids:  (1, 100, 1)
scores:  (1, 100, 1)
bboxes:  (1, 100, 4)


In [22]:
from mxnet import autograd
# Rebuild the training transform, this time passing the network to it.
train_transform = presets.yolo.YOLO3DefaultTrainTransform(width, height, net)

# The dataloader now returns 7 components per batch:
#   0-5: images, objectness targets, center targets, scale targets,
#        gradient weights, class targets  -> stacked
#   6:   ground-truth bboxes              -> padded with -1 along axis 0
batchify_fn = Tuple(*[Stack() for _ in range(6)], Pad(axis=0, pad_val=-1))

train_loader = DataLoader(
    train_dataset.transform(train_transform),
    batch_size,
    shuffle=True,
    batchify_fn=batchify_fn,
    last_batch='rollover',
    num_workers=num_workers,
)

# Run two training batches through the network and print the shape of every
# batch component and of every returned loss.

# Order in which the batch components are passed to the network: images
# first, then the padded ground-truth bboxes (component 6), then the
# remaining targets in their original order.
input_order = [0, 6, 1, 2, 3, 4, 5]

for ib, batch in enumerate(train_loader):
    # Shapes of the first sample of each component.
    print('batch[0][0]:', batch[0][0].shape)  # image (3, h, w)
    print('batch[1][0]:', batch[1][0].shape)  # objectness targets (10647, 1)
    print('batch[2][0]:', batch[2][0].shape)  # center targets (10647, 2)
    print('batch[3][0]:', batch[3][0].shape)  # scale targets (10647, 2)
    print('batch[4][0]:', batch[4][0].shape)  # gradient weights (10647, 2)
    print('batch[5][0]:', batch[5][0].shape)  # class targets (10647, 20)
    print('batch[6][0]:', batch[6][0].shape)  # label (num_obj, 4)

    # Only the forward pass needs to be recorded for autograd.
    with autograd.record():
        obj_loss, center_loss, scale_loss, cls_loss = net(*[batch[o] for o in input_order])

    # Each loss has one entry per image in the batch: (5,) with batch_size = 5.
    print("obj_loss: ", obj_loss.shape)  # (batch_size,)
    print("center_loss: ", center_loss.shape)  # (batch_size,)
    print("scale_loss: ", scale_loss.shape)  # (batch_size,)
    print("cls_loss: ", cls_loss.shape)  # (batch_size,)

    # Stop after the second batch. Checking at the end of the body (rather
    # than `if ib > 1: break` at the top) avoids requesting a third batch
    # from the loader only to discard it.
    if ib >= 1:
        break




batch[0][0]: (3, 416, 416)
batch[1][0]: (10647, 1)
batch[2][0]: (10647, 2)
batch[3][0]: (10647, 2)
batch[4][0]: (10647, 2)
batch[5][0]: (10647, 20)
batch[6][0]: (11, 4)
obj_loss:  (5,)
center_loss:  (5,)
scale_loss:  (5,)
cls_loss:  (5,)
batch[0][0]: (3, 416, 416)
batch[1][0]: (10647, 1)
batch[2][0]: (10647, 2)
batch[3][0]: (10647, 2)
batch[4][0]: (10647, 2)
batch[5][0]: (10647, 20)
batch[6][0]: (7, 4)
obj_loss:  (5,)
center_loss:  (5,)
scale_loss:  (5,)
cls_loss:  (5,)
