In [33]:
import os
import os.path
import cv2
import numpy as np
import torch
from torch.utils import data
import xml.etree.ElementTree as ET  # parse xml file
from torch.nn import init
from torch import nn
from torch import optim
import time
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pickle

In [2]:
# Record the PyTorch version this notebook was executed with.
print(torch.__version__)

0.4.0


## Load VOC Dataset

In [3]:
# Names of the 20 PASCAL VOC object classes. AnnotationTransform's default
# mapping gives each name its position in this tuple as the label index.
VOC_CLASSES = (  # NOTE(review): no background entry here; num_classes adds 1 separately
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')

In [4]:
class VOCDetection(data.Dataset):
    """VOC Detection Dataset Object

    input is image, target is annotation

    Arguments:
        root (string): filepath to VOCdevkit folder.
        image_set (list with tuple-string): imageset to use (eg. [('2007', 'train')])
        transform (callable, optional): transformation to perform on the input image;
            called as ``transform(img, boxes, labels)`` and must return the same triple
        target_transform (callable, optional): transformation to perform on the target `annotation`;
            called as ``target_transform(xml_root, width, height)``
    """

    def __init__(self, root, image_set, transform=None, target_transform=None):
        self.root = root
        self.image_set = image_set
        self.transform = transform
        self.target_transform = target_transform
        # Both templates are filled with a (rootpath, image_name) tuple.
        self._annopath = os.path.join('%s', 'Annotations', '%s.xml')
        self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
        self.ids = list()
        for (year, name) in image_set:
            rootpath = os.path.join(self.root, 'VOC' + year)
            list_file = os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')
            # Context manager so the list file is closed deterministically.
            with open(list_file) as f:
                for line in f:
                    img_name = line.strip()
                    # Skip blank lines (e.g. a trailing newline) so they do not
                    # become ids that point at non-existent files.
                    if img_name:
                        self.ids.append((rootpath, img_name))

    def __getitem__(self, item):
        """Return ``(image_tensor, target)`` for sample `item`."""
        img, gt, h, w = self.pull_item(item)
        return img, gt

    def __len__(self):
        """Number of images listed in the selected image sets."""
        return len(self.ids)

    def pull_item(self, index):
        """Load one sample.

        Returns:
            tuple: ``(image, target, height, width)`` where `image` is a tensor
            permuted from HxWxC to CxHxW, `target` is whatever the transforms
            produced (an ``N x 5`` array when `transform` is set), and
            `height`/`width` are the original image size.

        Raises:
            IOError: if the image file cannot be read.
        """
        img_id = self.ids[index]
        target = ET.parse(self._annopath % img_id).getroot()
        img_path = self._imgpath % img_id
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('failed to read image: %s' % img_path)
        height, width = img.shape[:2]

        if self.target_transform is not None:
            target = self.target_transform(target, width, height)

        if self.transform is not None:
            target = np.array(target)
            if target.size == 0:
                # No (kept) objects in this image: keep a 2-D shape so the
                # column slicing below still works.
                target = target.reshape(0, 5)
            img, boxes, labels = self.transform(img, target[:, :4], target[:, 4])
            img = img[:, :, (2, 1, 0)]  # bgr->rgb
            target = np.c_[boxes, np.expand_dims(labels, axis=1)]

        return torch.from_numpy(img).permute(2, 0, 1), target, height, width

    def pull_image(self, index):
        """Return the raw image for `index` as read by OpenCV (BGR channel order)."""
        img_id = self.ids[index]
        # Note: here use the bgr form (rgb is also do well: remember to change mean)
        return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)

    def pull_anno(self, index):
        """Return ``(image_name, annotation)`` with the annotation transformed using width=height=1."""
        img_id = self.ids[index]
        anno = ET.parse(self._annopath % img_id).getroot()
        gt = self.target_transform(anno, 1, 1)  # back original size
        return img_id[1], gt

In [5]:
class AnnotationTransform(object):
    """Transforms a VOC annotation into a list of bbox coords and label index
    Initialized with a dictionary lookup of classnames to indexes

    Arguments:
        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
            (default: VOC_CLASSES names mapped to their position in the tuple)
        keep_difficult (bool, optional): keep difficult instances or not (default: False)

    Calling the instance with ``(target, width, height)`` returns a list with
    one ``[xmin, ymin, xmax, ymax, label_ind]`` entry per kept object, where
    the coordinates are shifted by -1 and divided by width/height.
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """Convert the XML element `target` into a list of box rows.

        Arguments:
            target (xml.etree.ElementTree.Element): annotation root element
            width (int): divisor for x coordinates
            height (int): divisor for y coordinates
        """
        res = []
        for obj in target.iter('object'):
            # Some annotation files omit <difficult>; treat a missing tag as
            # "not difficult" instead of crashing on None.text.
            difficult_node = obj.find('difficult')
            difficult = difficult_node is not None and int(difficult_node.text) == 1
            if not self.keep_difficult and difficult:
                continue
            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')

            pts = ['xmin', 'ymin', 'xmax', 'ymax']
            bndbox = []
            for i, pt in enumerate(pts):
                # int(float(...)) also accepts coordinates written as "45.0".
                cur_pt = int(float(bbox.find(pt).text)) - 1
                # even positions are x values, odd positions are y values
                cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
                bndbox.append(cur_pt)
            label_idx = self.class_to_ind[name]
            bndbox.append(label_idx)
            res += [bndbox]  # each elem: [xmin, ymin, xmax, ymax, label_ind]
        return res

In [6]:
# basic transform: norm+scale, mean is bgr form
# Note: weights from yolo-official not minus mean but with scale
class BaseTransform(object):
    """Resize an image to a square, subtract a per-channel mean and optionally divide by 255.

    Arguments:
        size (int): side length of the square output image
        mean (sequence): per-channel values subtracted from the resized image
        scale (bool): when true, the mean-subtracted image is divided by 255.0
    """

    def __init__(self, size=300, mean=(104, 117, 123), scale=False):
        self.size, self.scale = size, scale
        self.mean = np.array(mean, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        """Return ``(transformed_image, boxes, labels)``; boxes and labels pass through unchanged."""
        side = self.size
        resized = cv2.resize(image, (side, side)).astype(np.float32)
        resized -= self.mean
        if self.scale:
            resized = resized / 255.0
        return resized, boxes, labels


def detection_collate(batch):
    """Collate ``(image, target)`` samples into a stacked image batch and a list of targets.

    Arguments:
        batch (iterable): samples whose first element is an image tensor and
            whose second element is that image's annotation rows

    Returns:
        tuple: ``(images, targets)`` where `images` is the samples' images
        stacked along a new dimension 0 and `targets` is a list holding one
        FloatTensor per sample.
    """
    imgs = [sample[0] for sample in batch]
    targets = [torch.FloatTensor(sample[1]) for sample in batch]
    return torch.stack(imgs, 0), targets


# image: np.array, box: tuple (left, top, right, bottom)
def draw_box(image, label, box, c, colors=None):
    """Draw a labelled bounding box on `image` in place.

    Arguments:
        image (np.ndarray): image to draw on; its first two dimensions are (height, width)
        label (str): text written just above the top-left corner of the box
        box (tuple): (left, top, right, bottom) in pixel coordinates
        c: index into the colour palette
        colors (sequence, optional): palette indexed by `c`. When omitted the
            function falls back to `cfg.colors`; `cfg` is not defined in this
            notebook, so pass `colors` explicitly unless `cfg` has been
            provided by the caller's environment.
    """
    if colors is None:
        colors = cfg.colors
    color = colors[c]
    h, w = image.shape[:2]
    thickness = (w + h) // 300
    left, top, right, bottom = box
    # Round to plain ints and clamp the box to the image bounds.
    left = max(0, int(np.round(left)))
    top = max(0, int(np.round(top)))
    right = min(w, int(np.round(right)))
    bottom = min(h, int(np.round(bottom)))
    cv2.rectangle(image, (left, top), (right, bottom), color, thickness)
    cv2.putText(image, label, (left, top - 5), 0, 0.5, color, 1)

In [7]:
# Build the VOC2007 trainval dataset using the default annotation parsing and
# the default BaseTransform (resize to 300x300 and subtract the mean).
target_transform = AnnotationTransform()
# NOTE(review): machine-specific absolute path; adjust for your environment.
root = '/scratch/rw2268/VOCdevkit/'
image_set = [('2007', 'trainval')]
dataset = VOCDetection(root, image_set, transform=BaseTransform(), target_transform=target_transform)
BATCH_SIZE = 32

In [8]:
# Shuffled training loader. detection_collate stacks the images and returns the
# targets as a list with one tensor per image.
data_loader = DataLoader(dataset, BATCH_SIZE, num_workers=4,
                         shuffle=True, collate_fn=detection_collate, pin_memory=True)

### Sample Data

In [9]:
# Display one raw sample (image tensor and target array) from the dataset.
dataset.__getitem__(10)

(tensor([[[  89.,   90.,   90.,  ..., -123., -120., -112.],
          [  89.,   90.,   90.,  ..., -123., -120., -112.],
          [  87.,   88.,   89.,  ..., -123., -121., -106.],
          ...,
          [-103., -108., -108.,  ..., -123., -123., -123.],
          [-104., -105., -110.,  ..., -122., -123., -122.],
          [-107., -103., -108.,  ..., -122., -123., -122.]],
 
         [[  97.,   98.,   99.,  ..., -117., -114., -105.],
          [  98.,   99.,  100.,  ..., -117., -114., -105.],
          [  99.,  100.,  100.,  ..., -117., -115.,  -99.],
          ...,
          [ -85.,  -91.,  -91.,  ..., -117., -117., -117.],
          [ -88.,  -89.,  -94.,  ..., -117., -117., -116.],
          [ -94.,  -90.,  -92.,  ..., -117., -117., -116.]],
 
         [[ 131.,  132.,  132.,  ..., -104.,  -99.,  -88.],
          [ 130.,  130.,  131.,  ..., -104.,  -99.,  -88.],
          [ 127.,  129.,  131.,  ..., -104., -102.,  -85.],
          ...,
          [ -79.,  -85.,  -88.,  ..., -104., -104

In [10]:
# Sanity check: dataset size, image tensor shape and the target rows of the first sample.
print(len(dataset))
img, gt = dataset[0]
print(img.size())
print(gt)

5011
torch.Size([3, 300, 300])
[[0.524      0.56       0.646      0.90133333 8.        ]
 [0.328      0.70133333 0.504      0.98933333 8.        ]
 [0.48       0.51466667 0.588      0.79466667 8.        ]]


## Build SSD Model

In [11]:
# build_ssd comes from the project-local `ssd` package.
# NOTE(review): 'train' presumably selects the training-phase network — confirm in ssd/ssd300.py.
from ssd.ssd300 import build_ssd
net = build_ssd('train')

  init.constant(self.weight, self.gamma)


In [12]:
# weight initialization
def xavier(param):
    """Fill the tensor `param` in place with Xavier (Glorot) uniform values."""
    # Use the in-place `xavier_uniform_`; the un-suffixed `xavier_uniform`
    # is deprecated and emits a warning on every call.
    init.xavier_uniform_(param)


def weights_init(m):
    """Initialise a Conv2d module: Xavier weights and, when a bias exists, a zero bias.

    Modules of any other type are left untouched. Intended to be passed to
    ``Module.apply``.
    """
    if not isinstance(m, nn.Conv2d):
        return
    xavier(m.weight.data)
    if m.bias is not None:
        m.bias.data.zero_()


# Sets the learning rate to the initial LR decayed by 10 at every specified step
def adjust_learning_rate(optimizer, lr, gamma, step):
    """Set every parameter group's learning rate to ``lr * gamma ** step``.

    Arguments:
        optimizer: object exposing `param_groups`, a list of dicts with an 'lr' entry
        lr (float): initial learning rate
        gamma (float): multiplicative decay factor
        step (int): number of decay steps applied so far
    """
    decayed = lr * (gamma ** step)
    for group in optimizer.param_groups:
        group['lr'] = decayed

In [13]:
# Apply weights_init to the `extras`, `loc` and `conf` sub-modules of the network.
net.extras.apply(weights_init)
net.loc.apply(weights_init)
net.conf.apply(weights_init)

  This is separate from the ipykernel package so we can avoid doing imports until


ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)

## Set up Training Process

In [14]:
# Optimisation hyper-parameters.
num_workers = 4  # NOTE(review): not referenced elsewhere; the DataLoader call hard-codes num_workers=4
cuda = True  # overwritten later by torch.cuda.is_available()
lr = 1e-4
momentum = 0.9
weight_decay = 5e-4
gamma = 0.1  # decay factor for adjust_learning_rate, which the training loop does not call yet

### Loss Function

In [15]:
# Settings read by MultiBoxLoss at construction time.
num_classes = len(VOC_CLASSES) + 1  # one more than the 20 VOC names (presumably background — confirm in `match`)
overlap_thresh = 0.5  # threshold passed to `match`
neg_pos = 3  # negatives kept per positive during hard negative mining
variance = [0.1, 0.2]  # passed through to `match`

In [16]:
from ssd.utils_ssd.box_utils import match, log_sum_exp


# evaluate conf_loss and loc_loss
class MultiBoxLoss(nn.Module):
    """SSD multibox loss: a smooth-L1 localisation term plus a cross-entropy confidence term.

    Configuration is taken from the notebook-level globals `num_classes`,
    `overlap_thresh`, `neg_pos` and `variance` when the module is constructed.
    """

    def __init__(self):
        super(MultiBoxLoss, self).__init__()
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.negpos_ratio = neg_pos
        self.variance = variance

    def forward(self, preds, targets):
        """Compute the two loss terms for one batch.

        Arguments:
            preds: triple ``(loc_data, conf_data, priors)``; `loc_data` and
                `conf_data` are indexed as (batch, prior, ...) below
            targets: sequence with one 2-D tensor per image whose last column
                is the label and whose other columns are the box

        Returns:
            tuple: ``(loss_l, loss_c)``, each a summed loss divided by the
            total number of positive priors in the batch.
        """
        loc_data, conf_data, priors = preds
        num = loc_data.size(0)
        num_priors = priors.size(0)
        # match priors (priors->nearest target)
        # loc_t / conf_t are uninitialised buffers; `match` is expected to fill
        # row `idx` of each in place (project helper — TODO confirm).
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        if loc_data.is_cuda:
            loc_t, conf_t = loc_t.cuda(), conf_t.cuda()
        for idx in range(num):
            truths = targets[idx][:, :-1]
            labels = targets[idx][:, -1]
            defaults = priors
            match(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx)
        # priors whose matched label is non-zero are treated as positives
        pos = conf_t > 0
        # location loss
        pos_idx = pos.unsqueeze(2).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        # size_average=False -> summed (not averaged) loss
        loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)

        # evaluate each priors's loss (the same as the paper)
        batch_conf = conf_data
        # per-prior loss: log_sum_exp(scores) minus the score gathered at the target class
        # (log_sum_exp is a project helper — presumably reduces over the class dimension; confirm)
        loss_c = (log_sum_exp(batch_conf) - batch_conf.gather(2, conf_t.unsqueeze(2))).squeeze(2)
        # hard negative mining: note: the batch size of each iteration is not the same
        # find the "max loss" background
        loss_c[pos] = 0  # filter out pos boxes
        # sorting the sort indices yields each prior's rank in descending-loss order
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)  # size: [num, 1]
        # keep the num_neg highest-loss non-positive priors of each image
        neg = idx_rank < num_neg.expand_as(idx_rank)
        # confidence loss (pos:neg=1:3)
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        # (a + b).gt(0) is the element-wise union of the two masks
        conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weightd = conf_t[(pos + neg).gt(0)]
        loss_c = F.cross_entropy(conf_p, targets_weightd, size_average=False)

        # NOTE(review): if no prior in the batch is positive, num_pos.sum() is 0
        # and both divisions below divide by zero.
        return loss_l / num_pos.sum().float(), loss_c / num_pos.sum().float()


In [17]:
# SGD over all network parameters, using the hyper-parameters defined above.
optimizer = optim.SGD(net.parameters(), lr=lr,
                      momentum=momentum, weight_decay=weight_decay)
criterion = MultiBoxLoss()

## Begin Training

In [18]:
net.train()
loc_loss, conf_loss = 0, 0  # NOTE(review): never updated by the training loop
epoch_num = 3
step_index = 0  # NOTE(review): unused; the learning-rate schedule is still a to-do
epoch_size = len(dataset) // BATCH_SIZE  # number of full batches per epoch


In [19]:
# Overrides the hard-coded `cuda = True` set with the hyper-parameters.
cuda = torch.cuda.is_available()

In [20]:
# Training loop: `epoch_num` passes over the loader, printing the time and loss
# of every 10th iteration.
print("Training SSD on VOC")

batch_iterator = None  # NOTE(review): unused
# Fixed-size input buffer that each batch is copied into.
# NOTE(review): requires_grad=True on the input looks unnecessary for training — confirm.
images = torch.randn((BATCH_SIZE, 3, 300, 300), requires_grad=True)

if cuda:
    net = net.cuda()
    images = images.cuda()

for epoch in range(epoch_num):
    # to do: adjust learning rate
    for i, (imgs, targets) in enumerate(data_loader):
        # Stop after the full batches: a smaller final batch would not fit the
        # fixed-size `images` buffer in copy_ below.
        if i == epoch_size:
            break
        images.data.copy_(imgs)
        targets = [anno.cuda() for anno in targets] if cuda else [anno for anno in targets]
        t0 = time.time()
        out = net(images)
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_c + loss_l
        loss.backward()
        optimizer.step()
        t1 = time.time()
        if i % 10 == 0:
            print('Timer: %.4f sec.' % (t1 - t0))
            print('epoch ' + repr(epoch) + ' iter ' + repr(i) + ' || Loss: %.4f || ' % (loss.item()), end=' ')
#     if epoch % 20 == 0 and epoch != 0:
#         print('Saving state, epoch: ', epoch)
        

Training SSD on VOC
Timer: 0.9458 sec.
epoch 0 iter 0 || Loss: 29.2489 ||  Timer: 0.8835 sec.
epoch 0 iter 10 || Loss: 17.5946 ||  Timer: 0.8857 sec.
epoch 0 iter 20 || Loss: 15.4053 ||  Timer: 0.8900 sec.
epoch 0 iter 30 || Loss: 15.7195 ||  Timer: 0.8901 sec.
epoch 0 iter 40 || Loss: 15.0775 ||  Timer: 0.8977 sec.
epoch 0 iter 50 || Loss: 15.1548 ||  Timer: 0.8855 sec.
epoch 0 iter 60 || Loss: 14.6714 ||  Timer: 0.8783 sec.
epoch 0 iter 70 || Loss: 14.7855 ||  Timer: 0.8893 sec.
epoch 0 iter 80 || Loss: 14.7947 ||  Timer: 0.8953 sec.
epoch 0 iter 90 || Loss: 14.9965 ||  Timer: 0.8944 sec.
epoch 0 iter 100 || Loss: 14.7506 ||  Timer: 0.8917 sec.
epoch 0 iter 110 || Loss: 14.6937 ||  Timer: 0.8882 sec.
epoch 0 iter 120 || Loss: 14.6921 ||  Timer: 0.8919 sec.
epoch 0 iter 130 || Loss: 14.8242 ||  Timer: 0.8911 sec.
epoch 0 iter 140 || Loss: 14.7320 ||  Timer: 0.8840 sec.
epoch 0 iter 150 || Loss: 14.7143 ||  Timer: 0.9341 sec.
epoch 1 iter 0 || Loss: 14.6691 ||  Timer: 0.8928 sec.
epoch

In [21]:
# Save the weights once, after training; `epoch` holds the last loop value, so
# the file name carries the final epoch index.
torch.save(net.state_dict(), '/scratch/rw2268/VOCresults/ssd/ssd300_' + repr(epoch) + '.pth')

## Evaluate Model

In [22]:
# If resuming, rebuild the network for testing and restore the trained weights.
net = build_ssd('test', bone='vgg')
# Map the checkpoint onto the CPU while loading so it also opens on a machine
# without CUDA; the network is moved to the GPU below when one is available.
net.load_state_dict(torch.load('/scratch/rw2268/VOCresults/ssd/ssd300_2.pth',
                               map_location=lambda storage, loc: storage))
net.eval()
if cuda:
    net = net.cuda()
testset = VOCDetection(root, [('2007', 'test')], BaseTransform(), AnnotationTransform())

In [23]:
class Timer(object):
    """Wall-clock stopwatch that tracks the last, accumulated and mean elapsed time.

    Attributes are updated by `toc`: `diff` (last interval), `total_time`
    (sum of intervals), `calls` (number of `toc` calls) and `average_time`.
    """

    def __init__(self):
        self.start_time = 0.
        self.diff = 0.
        self.total_time = 0.
        self.calls = 0
        self.average_time = 0.

    def tic(self):
        """Mark the start of an interval."""
        self.start_time = time.time()

    def toc(self, avg=True):
        """Close the interval opened by `tic`.

        Returns the running average when `avg` is true, otherwise the length
        of the interval just measured.
        """
        elapsed = time.time() - self.start_time
        self.diff = elapsed
        self.total_time += elapsed
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if avg else self.diff

In [29]:
# ----save the pred boxes+info to .pkl-----
# all detections are collected into:
# all_boxes[cls][image] = N x 5 array of detections in
# (x1, y1, x2, y2, score)
def generate_boxes(dataset, net, det_file, max_images=None):
    """Run the detector over `dataset` and pickle the per-class detections.

    Arguments:
        dataset: object with ``__len__`` and ``pull_item(i)`` returning
            ``(image_tensor, target, height, width)``
        net: callable detector; its output is indexed as ``y[0, class, :]``
            with rows read as ``(score, x1, y1, x2, y2)`` — assumes box
            coordinates are normalised to [0, 1] (TODO confirm against the
            project's detection layer)
        det_file (str): path of the pickle file to write
        max_images (int, optional): process only the first `max_images` images
            (for quick smoke tests). Default ``None`` processes the whole
            dataset; the previous hard-coded debug cut-off after ~100 images
            has been removed.

    Returns:
        list: ``all_boxes[cls][image]``, an ``N x 5`` array of
        ``(x1, y1, x2, y2, score)`` in pixel units, or an empty list where
        nothing was detected (or the image was not processed).

    Relies on the notebook-level globals `num_classes` and `cuda`.
    """
    img_num = len(dataset)
    num_to_run = img_num if max_images is None else min(img_num, max_images)
    # Always sized for the full dataset so image indices line up with `dataset`.
    all_boxes = [[[] for _ in range(img_num)] for _ in range(num_classes)]

    detect_timer = Timer()

    # Reusable single-image input buffer.
    x = torch.randn((1, 3, 300, 300))
    x = x.cuda() if cuda else x
    for i in range(num_to_run):
        im, gt, h, w = dataset.pull_item(i)
        x.copy_(im.unsqueeze(0))
        detect_timer.tic()
        with torch.no_grad():
            y = net(x)
        detect_time = detect_timer.toc(avg=False)

        # "store" to each class; index 0 is skipped
        for j in range(1, y.size(1)):
            dets = y[0, j, :]
            # keep only the rows whose score (column 0) is positive
            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
            dets = torch.masked_select(dets, mask).view(-1, 5)
            if dets.size(0) == 0:
                continue
            boxes = dets[:, 1:]
            # scale x coordinates by the image width and y coordinates by its height
            boxes[:, 0::2] *= w
            boxes[:, 1::2] *= h
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.c_[boxes.cpu().numpy(), scores]
            all_boxes[j][i] = cls_dets
        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, img_num, detect_time))

    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
    return all_boxes

In [30]:
def get_output_dir(root, name):
    """Return ``os.path.join(root, name)``, creating the directory if it does not exist.

    Arguments:
        root (str): parent directory
        name (str): sub-directory name (may contain nested components)

    Raises:
        OSError: if the directory cannot be created (including when the path
            exists but is not a directory).
    """
    filedir = os.path.join(root, name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(filedir, exist_ok=True)
    return filedir

In [31]:
# Evaluation output location; detections are pickled to <output_folder>/eval/detections.pkl.
# NOTE(review): machine-specific absolute path; adjust for your environment.
output_folder = '/scratch/rw2268/VOCresults/ssd'
output_dir = get_output_dir(output_folder, 'eval')
det_file = os.path.join(output_dir, 'detections.pkl')

In [34]:
# Run detection on the test set and write the results to det_file.
print('predict boxes, and save to .pkl')
box_list = generate_boxes(testset, net, det_file)


predict boxes, and save to .pkl
im_detect: 1/4952 0.759s
im_detect: 2/4952 0.766s
im_detect: 3/4952 0.747s
im_detect: 4/4952 0.746s
im_detect: 5/4952 0.771s
im_detect: 6/4952 0.755s
im_detect: 7/4952 0.759s
im_detect: 8/4952 0.756s
im_detect: 9/4952 0.756s
im_detect: 10/4952 0.759s
im_detect: 11/4952 0.752s
im_detect: 12/4952 0.752s
im_detect: 13/4952 0.753s
im_detect: 14/4952 0.749s
im_detect: 15/4952 0.754s
im_detect: 16/4952 0.763s
im_detect: 17/4952 0.752s
im_detect: 18/4952 0.763s
im_detect: 19/4952 0.750s
im_detect: 20/4952 0.757s
im_detect: 21/4952 0.747s
im_detect: 22/4952 0.750s
im_detect: 23/4952 0.756s
im_detect: 24/4952 0.765s
im_detect: 25/4952 0.761s
im_detect: 26/4952 0.761s
im_detect: 27/4952 0.759s
im_detect: 28/4952 0.757s
im_detect: 29/4952 0.755s
im_detect: 30/4952 0.758s
im_detect: 31/4952 0.758s
im_detect: 32/4952 0.755s
im_detect: 33/4952 0.757s
im_detect: 34/4952 0.756s
im_detect: 35/4952 0.757s
im_detect: 36/4952 0.744s
im_detect: 37/4952 0.764s
im_detect: 38/4

[]

In [30]:
# Scratch cell: a small 2x3 tensor for experimenting with gt()/expand().
a = torch.Tensor(np.array([[1,2,3],[2,1,3]]))

In [43]:
# NOTE(review): this raises RuntimeError. a.gt(2) has shape (2, 3) and cannot be
# expanded to (4, 2) because its last dimension (3) is not 1. generate_boxes
# applies the same pattern to a single column (dets[:, 0]), which is 1-D.
a.gt(2).expand(4, a.size(0)).t()

RuntimeError: The expanded size of the tensor (2) must match the existing size (3) at non-singleton dimension 1

In [39]:
# NOTE(review): scratch cell that raises TypeError — `torch.Tensor.expand()` is
# invoked here without an instance instead of being passed as an object.
import inspect
inspect.getsource(torch.Tensor.expand())

TypeError: descriptor 'expand' of 'torch._C._TensorBase' object needs an argument

In [42]:
# Scratch cell: shows that `expand` is a method descriptor of torch._C._TensorBase.
print(torch.Tensor.expand)

<method 'expand' of 'torch._C._TensorBase' objects>
