In [9]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import argparse
from torch.autograd import Variable
import torch.utils.data as data
from data import v2, v1, AnnotationTransform, VOCDetection, detection_collate, VOCroot, VOC_CLASSES
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
#from ssd import build_ssd
import numpy as np
import time

def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")

parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training')
parser.add_argument('--version', default='v2', help='conv11_2(v2) or pool6(v1) as last layer')
parser.add_argument('--basenet', default='vgg16_reducedfc.pth', help='pretrained base model')
parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='Min Jaccard index for matching')
parser.add_argument('--batch_size', default=16, type=int, help='Batch size for training')
parser.add_argument('--resume', default=None, type=str, help='Resume from checkpoint')
parser.add_argument('--num_workers', default=2, type=int, help='Number of workers used in dataloading')
parser.add_argument('--iterations', default=120000, type=int, help='Number of training iterations')
parser.add_argument('--start_iter', default=0, type=int, help='Begin counting iterations starting from this value (should be used with resume)')
parser.add_argument('--cuda', default=True, type=str2bool, help='Use cuda to train model')
parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD')
parser.add_argument('--log_iters', default=True, type=bool, help='Print the loss at each iteration')
parser.add_argument('--visdom', default=False, type=str2bool, help='Use visdom to for loss visualization')
parser.add_argument('--send_images_to_visdom', type=str2bool, default=False, help='Sample a random image from each 10th batch, send it to visdom after augmentations step')
parser.add_argument('--save_folder', default='weights/', help='Location to save checkpoint models')
parser.add_argument('--voc_root', default=VOCroot, help='Location of VOC root directory')
#args = parser.parse_args(['--voc_root','/home/amir/code/jacky/'])
args = parser.parse_args(['--voc_root','/home/amir/data/voc/VOCdevkit/'])

if args.cuda and torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    torch.set_default_tensor_type('torch.FloatTensor')

cfg = (v1, v2)[args.version == 'v2']

if not os.path.exists(args.save_folder):
    os.mkdir(args.save_folder)

#train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
train_sets = [('2007', 'trainval')]
# train_sets = 'train'
ssd_dim = 300  # only support 300 now
means = (104, 117, 123)  # only support voc now
num_classes = len(VOC_CLASSES) + 1
batch_size = args.batch_size
accum_batch_size = 32
iter_size = accum_batch_size / batch_size
max_iter = 120000
weight_decay = 0.0005
stepvalues = (80000, 100000, 120000)
gamma = 0.1
momentum = 0.9

if args.visdom:
    import visdom
    viz = visdom.Visdom()

ssd_net = build_ssd('train', 300, num_classes)
net = ssd_net

if args.cuda:
    net = torch.nn.DataParallel(ssd_net)
    cudnn.benchmark = True

if args.resume:
    print('Resuming training, loading {}...'.format(args.resume))
    ssd_net.load_weights(args.resume)
else:
    vgg_weights = torch.load(args.save_folder + args.basenet)
    print('Loading base network...')
    ssd_net.vgg.load_state_dict(vgg_weights)

if args.cuda:
    net = net.cuda()


def xavier(param):
    init.xavier_uniform(param)


def weights_init(m):
    if isinstance(m, nn.Conv2d):
        xavier(m.weight.data)
        m.bias.data.zero_()


if not args.resume:
    print('Initializing weights...')
    # initialize newly added layers' weights with xavier method
    ssd_net.extras.apply(weights_init)
    ssd_net.loc.apply(weights_init)
    ssd_net.conf.apply(weights_init)

optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=args.momentum, weight_decay=args.weight_decay)
criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False, args.cuda)


def train():
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0
    epoch = 0
    print('Loading Dataset...')

    dataset = VOCDetection(args.voc_root, train_sets, SSDAugmentation(
        ssd_dim, means), AnnotationTransform())

    epoch_size = len(dataset) // args.batch_size
    print('Training SSD on', dataset.name)
    step_index = 0
    if args.visdom:
        # initialize visdom loss plot
        lot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Iteration',
                ylabel='Loss',
                title='Current SSD Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
        epoch_lot = viz.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Epoch',
                ylabel='Loss',
                title='Epoch SSD Training Loss',
                legend=['Loc Loss', 'Conf Loss', 'Loss']
            )
        )
    batch_iterator = None
    data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate, pin_memory=True)
    for iteration in range(args.start_iter, max_iter):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(data_loader)
        if iteration in stepvalues:
            step_index += 1
            adjust_learning_rate(optimizer, args.gamma, step_index)
            if args.visdom:
                viz.line(
                    X=torch.ones((1, 3)).cpu() * epoch,
                    Y=torch.Tensor([loc_loss, conf_loss,
                        loc_loss + conf_loss]).unsqueeze(0).cpu() / epoch_size,
                    win=epoch_lot,
                    update='append'
                )
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        # load train data
        images, targets = next(batch_iterator)

        if args.cuda:
            images = Variable(images.cuda())
            targets = [Variable(anno.cuda(), volatile=True) for anno in targets]
        else:
            images = Variable(images)
            targets = [Variable(anno, volatile=True) for anno in targets]
        # forward
        t0 = time.time()
        out = net(images)
        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()
        loc_loss += loss_l.data[0]
        conf_loss += loss_c.data[0]
        if iteration % 10 == 0:
            print('Timer: %.4f sec.' % (t1 - t0))
            #print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]))
            if args.visdom and args.send_images_to_visdom:
                random_batch_index = np.random.randint(images.size(0))
                viz.image(images.data[random_batch_index].cpu().numpy())
        if args.visdom:
            viz.line(
                X=torch.ones((1, 3)).cpu() * iteration,
                Y=torch.Tensor([loss_l.data[0], loss_c.data[0],
                    loss_l.data[0] + loss_c.data[0]]).unsqueeze(0).cpu(),
                win=lot,
                update='append'
            )
            # hacky fencepost solution for 0th epoch plot
            if iteration == 0:
                viz.line(
                    X=torch.zeros((1, 3)).cpu(),
                    Y=torch.Tensor([loc_loss, conf_loss,
                        loc_loss + conf_loss]).unsqueeze(0).cpu(),
                    win=epoch_lot,
                    update=True
                )
        if iteration % 5000 == 0:
            print('Saving state, iter:', iteration)
            torch.save(ssd_net.state_dict(), 'weights/ssd300_0712_' +
                       repr(iteration) + '.pth')
    torch.save(ssd_net.state_dict(), args.save_folder + '' + args.version + '.pth')


def adjust_learning_rate(optimizer, gamma, step):
    """Sets the learning rate to the initial LR decayed by 10 at every specified step
    # Adapted from PyTorch Imagenet example:
    # https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    lr = args.lr * (gamma ** (step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


if __name__ == '__main__':
    train()


NameError: name 'build_ssd' is not defined

In [10]:
dataset = VOCDetection(args.voc_root, train_sets, SSDAugmentation(
        ssd_dim, means), AnnotationTransform())

In [11]:
batch_iterator = None
data_loader = data.DataLoader(dataset, batch_size, num_workers=args.num_workers,
                              shuffle=True, collate_fn=detection_collate, pin_memory=True)
for iteration in range(args.start_iter, max_iter):
    if (not batch_iterator) or (iteration % epoch_size == 0):
        # create batch iterator
        batch_iterator = iter(data_loader)
    if iteration in stepvalues:
        step_index += 1
        adjust_learning_rate(optimizer, args.gamma, step_index)
        if args.visdom:
            viz.line(
                X=torch.ones((1, 3)).cpu() * epoch,
                Y=torch.Tensor([loc_loss, conf_loss,
                    loc_loss + conf_loss]).unsqueeze(0).cpu() / epoch_size,
                win=epoch_lot,
                update='append'
            )
        # reset epoch loss counters
        loc_loss = 0
        conf_loss = 0
        epoch += 1

    # load train data
    images, targets = next(batch_iterator)

NameError: name 'epoch_size' is not defined

In [43]:
image_sets = [('2007', 'trainval')]
targetTransform =AnnotationTransform()

In [58]:
#root = '/home/amir/code/jacky/'
root = '/home/amir/data/voc/VOCdevkit/'
image_set = 'trainval.txt'
#self.transform = transform
#self.target_transform = target_transform
name = 'VOC2007'
_annopath = os.path.join('%s', 'Annotations', '%s.xml')
_imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
ids = list()
for (year, name) in image_sets:
    rootpath = os.path.join(root, 'VOC' + year)
    for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
        ids.append((rootpath, line.strip()))

In [59]:
ids[0]

('/home/amir/data/voc/VOCdevkit/VOC2007', '000005')

In [94]:
class AnnotationTransform(object):
    """Transforms a VOC annotation into a Tensor of bbox coords and label index
    Initilized with a dictionary lookup of classnames to indexes

    Arguments:
        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
            (default: alphabetic indexing of VOC's 20 classes)
        keep_difficult (bool, optional): keep difficult instances or not
            (default: False)
        height (int): height
        width (int): width
    """

    def __init__(self, class_to_ind=None, keep_difficult=False):
        self.class_to_ind = class_to_ind or dict(
            zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult

    def __call__(self, target, width, height):
        """
        Arguments:
            target (annotation) : the target annotation to be made usable
                will be an ET.Element
        Returns:
            a list containing lists of bounding boxes  [bbox coords, class name]
        """
        res = []
        for obj in target.iter('object'):
            difficult = int(obj.find('difficult').text) == 1
            if not self.keep_difficult and difficult:
                continue
            name = obj.find('name').text.lower().strip()
            bbox = obj.find('bndbox')
            print width,height
            pts = ['xmin', 'ymin', 'xmax', 'ymax']
            bndbox = []
            for i, pt in enumerate(pts):
                cur_pt = int(bbox.find(pt).text) - 1
                print cur_pt,'-->',
                # scale height or width
                cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
                print cur_pt
                bndbox.append(cur_pt)
            label_idx = self.class_to_ind[name]
            bndbox.append(label_idx)
            res += [bndbox]  # [xmin, ymin, xmax, ymax, label_ind]
            # img_id = target.find('filename').text[:-4]

        return res  # [[xmin, ymin, xmax, ymax, label_ind], ... ]

targetTransform =AnnotationTransform()

img_id = ids[0]
import cv2

root = '/home/amir/code/jacky/'
#root = '/home/amir/data/voc/VOCdevkit/'
image_set = 'trainval.txt'
#self.transform = transform
#self.target_transform = target_transform
name = 'VOC2007'
_annopath = os.path.join('%s', 'Annotations', '%s.xml')
_imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg')
ids = list()
for (year, name) in image_sets:
    rootpath = os.path.join(root, 'VOC' + year)
    for line in open(os.path.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
        ids.append((rootpath, line.strip()))

img_id = ids[0]
import xml.etree.cElementTree as ET
target = ET.parse(_annopath % img_id).getroot()
img = cv2.imread(_imgpath % img_id)
height, width, channels = img.shape
target1= targetTransform(target, width, height)
print _annopath % img_id
print _imgpath % img_id
print target1

300 300
48 --> 0
45 --> 0
143 --> 0
140 --> 0
/home/amir/code/jacky/VOC2007/Annotations/1.xml
/home/amir/code/jacky/VOC2007/JPEGImages/1.jpg
[[0, 0, 0, 0, 0]]


In [95]:
img.shape

(300, 300, 3)

'%s/Annotations/%s.xml'

In [70]:
list(target.getchildren())[-1].getchildren()

[<Element 'name' at 0x7fe763209c90>,
 <Element 'pose' at 0x7fe763209c30>,
 <Element 'truncated' at 0x7fe763209bd0>,
 <Element 'difficult' at 0x7fe763209ba0>,
 <Element 'bndbox' at 0x7fe763209b70>]

In [64]:
_annopath % img_id

'/home/amir/data/voc/VOCdevkit/VOC2007/Annotations/000005.xml'

In [63]:
target1

[[0, 0, 0, 0, 8], [0, 0, 0, 0, 8], [0, 0, 0, 0, 8]]

In [28]:
import sys
sys.version_info[0]

2

In [40]:
img_id

('/home/amir/code/jacky/VOC2007', '25')

In [56]:
height

300

In [45]:
targetTransform(target)

TypeError: __call__() takes exactly 4 arguments (2 given)

In [13]:
[('/home/amir/data/voc/VOCdevkit/VOC2007', '000005'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000007'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000009'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000012'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000016'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000017'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000019'),
 ('/home/amir/data/voc/VOCdevkit/VOC2007', '000020'),

[
  0  0  0  0  2
 [torch.FloatTensor of size 1x5], 
  0  0  0  0  3
 [torch.FloatTensor of size 1x5], 
     0     0     0     0     4
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    10
     0     0     0     0     8
     0     0     0     0     8
 [torch.FloatTensor of size 13x5], 
  1  0  1  0  2
 [torch.FloatTensor of size 1x5], 
   1   0   1   0   6
   1   0   1   0   6
   1   0   1   0   6
   1   0   1   0  14
   1   0   1   0  13
 [torch.FloatTensor of size 5x5], 
     0     0     0     0    15
     0     0     0     0    15
     0     0     0     0    15
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0    14
     0     0     0     0     7
 [torch.FloatTensor of size 7x5], 
   0   0   0  