In [61]:
import os
import os.path
import cv2
import numpy as np
import torch
from torch.utils import data
import xml.etree.ElementTree as ET  # parse xml file
from torch.nn import init
from torch import nn
from torch import optim
import time
import torch.nn.functional as F
from torch.utils.data import DataLoader

## Load VOC Dataset

In [13]:
# Object class names; the position of a name in this tuple is the label index
# assigned by AnnotationTransform's default class lookup.
VOC_CLASSES = (  # 20 names, no explicit background entry
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')

In [11]:
class VOCDetection(data.Dataset):
    """VOC Detection Dataset Object

    input is image, target is annotation

    Arguments:
        root (string): filepath to VOCdevkit folder.
        image_set (list with tuple-string): imageset to use (eg. [('2007', 'train')])
        transform (callable, optional): called as ``transform(img, boxes, labels)``
            and expected to return ``(img, boxes, labels)``
        target_transform (callable, optional): called as
            ``target_transform(annotation_root, width, height)`` on the parsed
            XML annotation
    """

    def __init__(self, root, image_set, transform=None, target_transform=None):
        self.root = root
        self.image_set = image_set
        self.transform = transform
        self.target_transform = target_transform
        # Both templates are filled with a (rootpath, image_name) tuple from self.ids.
        self._annopath = os.path.join('%s', 'Annotations', '%s.xml')
        self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
        self.ids = list()
        for (year, name) in image_set:
            rootpath = os.path.join(self.root, 'VOC' + year)
            list_file = os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')
            # Context manager so the list file is closed deterministically.
            with open(list_file) as handle:
                for line in handle:
                    img_name = line.strip()
                    # A blank (e.g. trailing) line would otherwise become an
                    # empty image id that can never be loaded.
                    if img_name:
                        self.ids.append((rootpath, img_name))

    def __getitem__(self, item):
        """Return ``(image_tensor, target)`` for sample ``item``."""
        img, gt, _, _ = self.pull_item(item)
        return img, gt

    def __len__(self):
        return len(self.ids)

    def pull_item(self, index):
        """Load one sample.

        Returns:
            tuple: ``(image, target, height, width)`` where ``image`` is a
            channel-first tensor and ``height``/``width`` are the dimensions of
            the image as read from disk (before any transform).

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        img_id = self.ids[index]
        target = ET.parse(self._annopath % img_id).getroot()
        img_path = self._imgpath % img_id
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: %s' % img_path)
        height, width = img.shape[:2]

        if self.target_transform is not None:
            target = self.target_transform(target, width, height)

        if self.transform is not None:
            target = np.array(target)
            if target.ndim != 2:
                # No annotated objects left (e.g. all were filtered out): keep a
                # 2-D (0, 5) layout so the column slicing below still works.
                target = target.reshape(-1, 5)
            img, boxes, labels = self.transform(img, target[:, :4], target[:, 4])
            img = img[:, :, (2, 1, 0)]  # bgr->rgb
            target = np.c_[boxes, np.expand_dims(labels, axis=1)]

        return torch.from_numpy(img).permute(2, 0, 1), target, height, width

    def pull_image(self, index):
        """Return the untransformed image for ``index`` as read by OpenCV."""
        img_id = self.ids[index]
        # Note: here use the bgr form (rgb is also do well: remember to change mean)
        return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        """Return ``(image_name, annotation)`` for ``index``.

        ``target_transform`` is called with width = height = 1, so a transform
        that divides by those values leaves coordinates unscaled. Requires
        ``target_transform`` to be set.
        """
        img_id = self.ids[index]
        anno = ET.parse(self._annopath % img_id).getroot()
        gt = self.target_transform(anno, 1, 1)  # back original size
        return img_id[1], gt

In [14]:
class AnnotationTransform(object):
    """Transforms a VOC annotation into a list of bbox coords and label index.

    Initialized with a dictionary lookup of classnames to indexes.

    Arguments:
        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
            (default: alphabetic indexing of VOC's 20 classes)
        keep_difficult (bool, optional): keep difficult instances or not (default: False)
    """

    # Corner tags in output order; even positions are x values, odd are y values.
    _CORNER_TAGS = ('xmin', 'ymin', 'xmax', 'ymax')

    def __init__(self, class_to_ind=None, keep_difficult=False):
        self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """Convert a parsed annotation into a list of box rows.

        Arguments:
            target (xml.etree.ElementTree.Element): annotation root element
            width (int): x coordinates are divided by this value
            height (int): y coordinates are divided by this value

        Returns:
            list: one ``[xmin, ymin, xmax, ymax, label_ind]`` row per kept object.

        Raises:
            KeyError: if an object's class name is not in ``class_to_ind``.
        """
        res = []
        for obj in target.iter('object'):
            # Some annotation files omit <difficult>; treat that as "not difficult"
            # instead of failing on a missing node.
            difficult_node = obj.find('difficult')
            difficult = difficult_node is not None and int(difficult_node.text) == 1
            if not self.keep_difficult and difficult:
                continue
            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')

            bndbox = []
            for i, pt in enumerate(self._CORNER_TAGS):
                # float() accepts both '48' and '48.0'; int() rejected the latter.
                # The -1 presumably shifts 1-based pixel coordinates to 0-based.
                cur_pt = float(bbox.find(pt).text) - 1
                cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
                bndbox.append(cur_pt)
            bndbox.append(self.class_to_ind[name])
            res.append(bndbox)  # each elem: [xmin, ymin, xmax, ymax, label_ind]
        return res

In [15]:
# Baseline preprocessing: resize, subtract a per-channel mean, optionally rescale.
# The default mean is given in BGR order.
# Note: weights from yolo-official not minus mean but with scale
class BaseTransform(object):
    """Resize an image to ``size`` x ``size`` and normalise it.

    Arguments:
        size (int): side length of the square output image
        mean (sequence): per-channel values subtracted from the resized image
        scale (bool): when true, the mean-subtracted image is divided by 255.0
    """

    def __init__(self, size=300, mean=(104, 117, 123), scale=False):
        self.size, self.scale = size, scale
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(processed_image, boxes, labels)``; boxes and labels pass through untouched."""
        side = self.size
        processed = cv2.resize(image, (side, side)).astype(np.float32)
        processed -= self.mean
        if self.scale:
            processed = processed / 255.0
        return processed, boxes, labels


def detection_collate(batch):
    """Collate ``(image, annotation, ...)`` samples into a batch.

    Images are stacked along a new leading dimension; annotations are each
    converted to a FloatTensor and returned as a list, one entry per sample.
    """
    pairs = [(sample[0], torch.FloatTensor(sample[1])) for sample in batch]
    stacked = torch.stack([image for image, _ in pairs], 0)
    return stacked, [annotation for _, annotation in pairs]


def draw_box(image, label, box, c):
    """Draw a labelled rectangle on ``image`` in place.

    Arguments:
        image (np.array): image to draw on
        label (str): text written just above the box's top-left corner
        box (tuple): (left, top, right, bottom)
        c: index into ``cfg.colors`` selecting the drawing colour

    NOTE(review): ``cfg`` is not defined or imported in the visible cells, so
    this function depends on a name that must come from elsewhere.
    """
    img_h, img_w = image.shape[:2]
    line_width = (img_w + img_h) // 300
    x_min, y_min, x_max, y_max = box
    # Round to integer pixels and clip the box to the image bounds.
    y_min = max(0, np.round(y_min).astype('int32'))
    x_min = max(0, np.round(x_min).astype('int32'))
    x_max = min(img_w, np.round(x_max).astype('int32'))
    y_max = min(img_h, np.round(y_max).astype('int32'))
    colour = cfg.colors[c]
    cv2.rectangle(image, (x_min, y_min), (x_max, y_max), colour, line_width)
    cv2.putText(image, label, (x_min, y_min - 5), 0, 0.5, colour, 1)

In [50]:
target_transform = AnnotationTransform()
# NOTE(review): absolute, user-specific path; anyone else running this notebook must edit it.
root = '/scratch/rw2268/VOCdevkit/'
image_set = [('2007', 'trainval')]  # (year, name) pairs; VOCDetection reads VOC<year>/ImageSets/Main/<name>.txt
dataset = VOCDetection(root, image_set, transform=BaseTransform(), target_transform=target_transform)
BATCH_SIZE = 32

In [51]:
# detection_collate returns the targets as a list with one tensor per image
# rather than a single stacked tensor.
# NOTE(review): num_workers is a literal here; the `num_workers` variable set in a
# later cell is not what this call reads.
data_loader = DataLoader(dataset, BATCH_SIZE, num_workers=4,
                         shuffle=True, collate_fn=detection_collate, pin_memory=True)

### Sample Data

In [21]:
# Display one sample (image tensor, annotation rows) as a quick sanity check.
dataset[10]

(tensor([[[  89.,   90.,   90.,  ..., -123., -120., -112.],
          [  89.,   90.,   90.,  ..., -123., -120., -112.],
          [  87.,   88.,   89.,  ..., -123., -121., -106.],
          ...,
          [-103., -108., -108.,  ..., -123., -123., -123.],
          [-104., -105., -110.,  ..., -122., -123., -122.],
          [-107., -103., -108.,  ..., -122., -123., -122.]],
 
         [[  97.,   98.,   99.,  ..., -117., -114., -105.],
          [  98.,   99.,  100.,  ..., -117., -114., -105.],
          [  99.,  100.,  100.,  ..., -117., -115.,  -99.],
          ...,
          [ -85.,  -91.,  -91.,  ..., -117., -117., -117.],
          [ -88.,  -89.,  -94.,  ..., -117., -117., -116.],
          [ -94.,  -90.,  -92.,  ..., -117., -117., -116.]],
 
         [[ 131.,  132.,  132.,  ..., -104.,  -99.,  -88.],
          [ 130.,  130.,  131.,  ..., -104.,  -99.,  -88.],
          [ 127.,  129.,  131.,  ..., -104., -102.,  -85.],
          ...,
          [ -79.,  -85.,  -88.,  ..., -104., -104

In [22]:
print(len(dataset))  # number of image ids collected from the image-set files
img, gt = dataset[0]
print(img.size())  # channel-first after the permute(2, 0, 1) in pull_item
print(gt)  # one row per object: four box coordinates followed by the label index

5011
torch.Size([3, 300, 300])
[[0.524      0.56       0.646      0.90133333 8.        ]
 [0.328      0.70133333 0.504      0.98933333 8.        ]
 [0.48       0.51466667 0.588      0.79466667 8.        ]]


## Build SSD Model

In [23]:
# NOTE(review): build_ssd comes from the project-local `ssd` package, which is not
# shown in this notebook; the meaning of the 'train' argument is defined there.
from ssd.ssd300 import build_ssd
net = build_ssd('train')

  init.constant(self.weight, self.gamma)


In [24]:
# weight initialization
def xavier(param):
    """Fill ``param`` in place with Xavier (Glorot) uniform values.

    Uses ``init.xavier_uniform_``; the un-suffixed ``init.xavier_uniform`` is a
    deprecated alias that emits a warning on every call.
    """
    init.xavier_uniform_(param)


def weights_init(m):
    """Initialiser intended for ``Module.apply``.

    ``nn.Conv2d`` modules get Xavier-uniform weights and, when they have a
    bias, a zeroed bias. Every other module type is left untouched.
    """
    if not isinstance(m, nn.Conv2d):
        return
    xavier(m.weight.data)
    if m.bias is not None:
        m.bias.data.zero_()


def adjust_learning_rate(optimizer, lr, gamma, step):
    """Set every parameter group's learning rate to ``lr * gamma ** step``.

    Arguments:
        optimizer: object exposing ``param_groups`` (a list of dicts)
        lr (float): initial learning rate
        gamma (float): multiplicative decay factor
        step (int): number of decay steps applied so far
    """
    decayed = lr * (gamma ** step)
    for group in optimizer.param_groups:
        group['lr'] = decayed

In [27]:
# Re-initialise the extras, loc and conf sub-modules; weights_init only changes
# nn.Conv2d modules. The last expression's return value is what the cell displays.
net.extras.apply(weights_init)
net.loc.apply(weights_init)
net.conf.apply(weights_init)

  This is separate from the ipykernel package so we can avoid doing imports until


ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)

## Set up Training Process

In [72]:
num_workers = 4  # NOTE(review): the DataLoader cell passes a literal 4 and does not read this
cuda = True  # overwritten by torch.cuda.is_available() in a later cell
lr = 1e-4  # initial learning rate handed to optim.SGD
momentum = 0.9
weight_decay = 5e-4
gamma = 0.1  # NOTE(review): presumably the decay factor for adjust_learning_rate, which no visible cell calls

### Loss Function

In [69]:
num_classes = len(VOC_CLASSES) + 1  # one more than the VOC names; presumably a background class — confirm against match()
overlap_thresh = 0.5  # MultiBoxLoss passes this to match() as its threshold
neg_pos = 3  # negatives kept per positive during hard negative mining
variance = [0.1, 0.2]  # MultiBoxLoss passes this to match()

In [63]:
from ssd.utils_ssd.box_utils import match, log_sum_exp


# evaluate conf_loss and loc_loss
class MultiBoxLoss(nn.Module):
    """SSD training loss: smooth-L1 localisation loss plus cross-entropy confidence loss.

    The constructor takes no arguments; it reads ``num_classes``,
    ``overlap_thresh``, ``neg_pos`` and ``variance`` from the notebook's global
    namespace at construction time.
    """

    def __init__(self):
        super(MultiBoxLoss, self).__init__()
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.negpos_ratio = neg_pos
        self.variance = variance

    def forward(self, preds, targets):
        """Compute the two loss terms for one batch.

        Arguments:
            preds (tuple): ``(loc_data, conf_data, priors)`` as produced by the network
            targets (list): one 2-D tensor per image; the last column holds the
                label and the remaining columns the box coordinates

        Returns:
            tuple: ``(loss_l, loss_c)``, each divided by the number of positive
            (matched) priors in the batch. When the batch has no positives both
            are divided by 1 instead, so the result is finite rather than NaN.
        """
        loc_data, conf_data, priors = preds
        num = loc_data.size(0)
        num_priors = priors.size(0)
        # match priors (priors->nearest target)
        # Buffers are created on the same device as the predictions.
        loc_t = torch.Tensor(num, num_priors, 4).to(loc_data.device)
        conf_t = torch.LongTensor(num, num_priors).to(loc_data.device)
        for idx in range(num):
            truths = targets[idx][:, :-1]
            labels = targets[idx][:, -1]
            defaults = priors
            # match() is a project helper (not shown here); its return value is
            # ignored, so it presumably fills loc_t[idx] / conf_t[idx] in place.
            match(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx)
        pos = conf_t > 0
        # location loss
        pos_idx = pos.unsqueeze(2).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        # reduction='sum' replaces the deprecated size_average=False.
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # evaluate each priors's loss (the same as the paper)
        batch_conf = conf_data
        loss_c = (log_sum_exp(batch_conf) - batch_conf.gather(2, conf_t.unsqueeze(2))).squeeze(2)
        # hard negative mining: note: the batch size of each iteration is not the same
        # find the "max loss" background
        loss_c[pos] = 0  # filter out pos boxes
        _, loss_idx = loss_c.sort(1, descending=True)
        # Sorting the sort indices yields each prior's rank by descending loss.
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)  # size: [num, 1]
        neg = idx_rank < num_neg.expand_as(idx_rank)
        # confidence loss (pos:neg=1:3)
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[pos_idx | neg_idx].view(-1, self.num_classes)
        targets_weightd = conf_t[pos | neg]
        loss_c = F.cross_entropy(conf_p, targets_weightd, reduction='sum')

        # Guard the normaliser: a batch without any matched prior would
        # otherwise compute 0 / 0 and poison the optimiser state with NaN.
        num_matched = torch.clamp(num_pos.sum().float(), min=1)
        return loss_l / num_matched, loss_c / num_matched


In [64]:
# SGD over all network parameters, using lr / momentum / weight_decay from the earlier cell.
optimizer = optim.SGD(net.parameters(), lr=lr,
                      momentum=momentum, weight_decay=weight_decay)
criterion = MultiBoxLoss()  # reads num_classes / overlap_thresh / neg_pos / variance from notebook globals

## Begin Training

In [65]:
net.train()
loc_loss, conf_loss = 0, 0  # NOTE(review): not read by the training cell
epoch_num = 2
step_index = 0  # NOTE(review): not read by the training cell
epoch_size = len(dataset) // BATCH_SIZE  # number of full batches; the training loop breaks at this batch index


In [66]:
# Replaces the hard-coded `cuda = True` from the hyperparameter cell with the actual availability.
cuda = torch.cuda.is_available()

In [70]:
print("Training SSD on VOC")

# Resolve the target device once; `cuda` is set by an earlier cell.
device = torch.device('cuda' if cuda else 'cpu')
net = net.to(device)

for epoch in range(epoch_num):
    # to do: adjust learning rate
    for i, (imgs, targets) in enumerate(data_loader):
        # Stop after the last full batch; a trailing partial batch is skipped,
        # which keeps the number of iterations per epoch at epoch_size.
        if i == epoch_size:
            break
        # Move the batch directly instead of copying it into a fixed-size
        # buffer created with requires_grad=True, which made autograd compute
        # input gradients that nothing used.
        images = imgs.to(device)
        targets = [anno.to(device) for anno in targets]
        t0 = time.time()
        out = net(images)
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_c + loss_l
        loss.backward()
        optimizer.step()
        t1 = time.time()
        if i % 10 == 0:
            # One complete record per line, so each timing sits next to the
            # iteration it belongs to.
            print('epoch %d iter %d || Loss: %.4f || Timer: %.4f sec.'
                  % (epoch, i, loss.item(), t1 - t0))
#     if epoch % 20 == 0 and epoch != 0:
#         print('Saving state, epoch: ', epoch)
#         torch.save(net.state_dict(), '../weights/ssd/ssd300_' + repr(epoch) + '.pth')

Training SSD on VOC
Timer: 0.6752 sec.
epoch 0 iter 0 || Loss: 15.1683 ||  Timer: 0.5978 sec.
epoch 0 iter 10 || Loss: 14.9793 ||  Timer: 0.5953 sec.
epoch 0 iter 20 || Loss: 14.9926 ||  Timer: 0.5838 sec.
epoch 0 iter 30 || Loss: 14.9264 ||  Timer: 0.5968 sec.
epoch 0 iter 40 || Loss: 14.9740 ||  Timer: 0.6005 sec.
epoch 0 iter 50 || Loss: 14.9197 ||  Timer: 0.6007 sec.
epoch 0 iter 60 || Loss: 14.8166 ||  Timer: 0.6016 sec.
epoch 0 iter 70 || Loss: 14.9760 ||  Timer: 0.6043 sec.
epoch 0 iter 80 || Loss: 14.8129 ||  Timer: 0.6051 sec.
epoch 0 iter 90 || Loss: 14.8660 ||  Timer: 0.6017 sec.
epoch 0 iter 100 || Loss: 14.8932 ||  Timer: 0.6196 sec.
epoch 0 iter 110 || Loss: 14.8490 ||  Timer: 0.6034 sec.
epoch 0 iter 120 || Loss: 14.8722 ||  Timer: 0.6003 sec.
epoch 0 iter 130 || Loss: 14.8485 ||  Timer: 0.6187 sec.
epoch 0 iter 140 || Loss: 14.9410 ||  Timer: 0.5913 sec.
epoch 0 iter 150 || Loss: 14.7774 ||  Timer: 0.6128 sec.
epoch 1 iter 0 || Loss: 14.8733 ||  Timer: 0.5939 sec.
epoch

Process Process-33:
Process Process-35:
Process Process-36:
Process Process-34:
Traceback (most recent call last):
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 96, in _worker_loop
    r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/multiprocessing/queues.py", line 104, in get
    if not self._poll(timeout):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/rw2268/.conda/envs/nlp/lib/python3.6/multiprocessing/connection.py", lin

KeyboardInterrupt: 

In [71]:
# NOTE(review): the recorded output of this cell (230) differs from `epoch_num = 2`
# assigned in an earlier cell, so the cells were evidently run out of order.
epoch_num

230