In [1]:
import torch
import torch.utils.data as data
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import numpy as np
from scipy.io import loadmat

BATCH_SIZE = 1

In [2]:
VOC_CLASSES = (  # always index 0
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor')


In [3]:
def filter_small_boxes(boxes, min_size):
    """Filters out small boxes."""
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    mask = (w >= min_size) & (h >= min_size)
    return mask

In [4]:
class VOCAnnotationAnalyzer():
    """
    deal with annotation data (dict)
    
    Arguments:
        cls_to_idx (dict, optional): dictionary lookup of classnames -> indexes
            (default: alphabetic indexing of VOC's 20 classes)
        keep_difficult (bool, optional): keep difficult instances or not
            (default: False)
        height (int): height
        width (int): width
    """
    def __init__(self, cls_to_idx=None, keep_difficult=False):
        self.cls_to_idx = cls_to_idx or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult
        
    def __call__(self, annotation: dict):
        w = int(annotation['size']['width'])
        h = int(annotation['size']['height'])
        # if img only contains one gt that annotation['object'] is just a dict, not a list
        objects = [annotation['object']] if type(annotation['object']) != list else annotation['object']
        res = [] # [xmin, ymin, xmax, ymax, label]
        for box in objects:
            pts = ['xmin', 'ymin', 'xmax', 'ymax']
            difficult = int(box['difficult'])
            if not self.keep_difficult and difficult:
                continue
            name = box['name']
            bnd = []
            for pt in pts:
                bnd.append(int(box['bndbox'][pt]))
            bnd.append(self.cls_to_idx[name])
            res.append(bnd)
            
        return res

In [19]:
class VOCDectectionDataset(data.Dataset):
    def __init__(self, root, year, image_set,
                 transform=None, 
                 target_transform=VOCAnnotationAnalyzer(),
                 dataset_name='VOC07_12',
                 region_propose='selective_search'):
        super(VOCDectectionDataset, self).__init__()
        self.datas = datasets.VOCDetection(root, str(year), image_set, download=False)
        self.image_set = image_set
        self.transform = transform
        self.name = dataset_name
        self.target_transform = target_transform # use for annotation
        self.longer_sides = [480, 576, 688, 864, 1200]
        if region_propose not in ['selective_search', 'edge_box']:
            raise NotImplementedError(f'{region_propose} not Supported')

        self.region_propose = region_propose
        self.box_mat = self.get_mat(year, image_set, region_propose)
            
            
    def get_box_from_mat(self, index):
        return self.box_mat['boxes'][0][index].tolist()

    def get_boxScore_from_mat(self, index):
        score = None
        if self.region_propose == 'edge_box':
            score = self.box_mat['boxScores'][0][index].tolist()
        return score
    
    def get_mat(self, year, image_set, region_propose):
        """
        load the box generated
        """
        boxes = None
        boxes_score = None
        
        if str(year) == '2007' and image_set == 'trainval' and region_propose == 'selective_search':
            mat = loadmat("../region/SelectiveSearchVOC2007trainval.mat")
        elif str(year) == '2007' and image_set == 'test' and region_propose == 'selective_search':
            mat = loadmat("../region/SelectiveSearchVOC2007test.mat")
        if str(year) == '2007' and image_set == 'trainval' and region_propose == 'edge_box':
            mat = loadmat("../region/EdgeBoxesVOC2007trainval.mat")
        elif str(year) == '2007' and image_set == 'test' and region_propose == 'edge_box':
            mat = loadmat("../region/EdgeBoxesVOC2007test.mat")
        return mat
            
    def __getitem__(self, index):
        img, gt = self.datas[index]
        region = self.get_box_from_mat(index)
        region_score = self.get_boxScore_from_mat(index)
        if self.target_transform:
            gt = self.target_transform(gt["annotation"])
        
        w, h = img.size
        if self.image_set == "trainval":
            if self.transform is None:
                # follow by paper: randomly horiztontal flip and randomly resize
                for box in region:
                    box[0], box[1] = box[1], box[0]
                    box[2], box[3] = box[3], box[2]
                
                if np.random.random() > 0.5: # then flip
                    fliper = transforms.RandomHorizontalFlip(1)
                    img = fliper(img)
                    for box in gt: # change gt
                        box[0], box[2] = w - box[2], w - box[0]
                    for box in region: # ssw generate is [ymin, xmin, ymax, xmax]
                        box[0], box[2] = w - box[2], w - box[0]

                # then resize
                max_side = self.longer_sides[np.random.randint(5)]
                if (w > h):
                    resizer = transforms.Resize((int(max_side*h/w), max_side))
                    ratio = max_side/w
                else: # h >= w
                    resizer = transforms.Resize((max_side, int(max_side*w/h)))
                    ratio = max_side/h
                img = resizer(img)
                for box in gt:
                    box[0] = int(ratio * box[0])
                    box[1] = int(ratio * box[1])
                    box[2] = int(ratio * box[2])
                    box[3] = int(ratio * box[3])
                for box in region:
                    box[0] = int(ratio * box[0])
                    box[1] = int(ratio * box[1])
                    box[2] = int(ratio * box[2])
                    box[3] = int(ratio * box[3])
            else:
                raise NotImplementedError("This dataset can only be compatible with the paper's implementation")
            print(np.array(region).shape)

            
            totensor = transforms.ToTensor()
            img = totensor(img)
            gt = np.array(gt)
            gt_box = np.array(gt[:, :4])
            
            gt_target = gt[:, -1]
            target = [0 for _ in range(len(VOC_CLASSES))]
            for t in gt_target:
                target[t] = 1.0
            
            gt_target = np.array(target).astype(np.float32)
            gt_box = np.array(gt) # split gt -> gt_box,  gt_target
            
            
    
            region = np.array(region).astype(np.float32)

            region_filter = filter_small_boxes(region, 20)
            region = region[region_filter]
            
            if region_score:
                region_score = np.array(region_score)


        if "test" in self.image_set:
            pass
        

        if region_score is not None:
            return img, gt_box, gt_target, region, region_score
        else:
            return img, gt_box, gt_target, region
        
    def __len__(self):
        return len(self.datas)

In [20]:
voc = datasets.VOCDetection("~/data/", '2007', 'trainval', download=False)

In [21]:
vd = VOCDectectionDataset("~/data/", 2007, 'trainval', region_propose='edge_box')

In [22]:
vd[0]

(4000, 4)


(tensor([[[0.6353, 0.6392, 0.6392,  ..., 0.0471, 0.0471, 0.0431],
          [0.6314, 0.6353, 0.6353,  ..., 0.0392, 0.0314, 0.0275],
          [0.6353, 0.6353, 0.6353,  ..., 0.0510, 0.0588, 0.0667],
          ...,
          [0.2588, 0.2588, 0.2549,  ..., 0.1333, 0.1373, 0.1412],
          [0.2706, 0.2706, 0.2667,  ..., 0.1882, 0.1725, 0.1686],
          [0.2706, 0.2667, 0.2627,  ..., 0.2549, 0.2196, 0.1922]],
 
         [[0.7255, 0.7294, 0.7294,  ..., 0.0471, 0.0431, 0.0353],
          [0.7333, 0.7333, 0.7333,  ..., 0.0392, 0.0314, 0.0235],
          [0.7412, 0.7412, 0.7412,  ..., 0.0549, 0.0627, 0.0706],
          ...,
          [0.3137, 0.3137, 0.3137,  ..., 0.0392, 0.0353, 0.0353],
          [0.3294, 0.3294, 0.3255,  ..., 0.0510, 0.0431, 0.0431],
          [0.3294, 0.3255, 0.3216,  ..., 0.0863, 0.0745, 0.0667]],
 
         [[0.7490, 0.7529, 0.7529,  ..., 0.0471, 0.0471, 0.0392],
          [0.7569, 0.7608, 0.7608,  ..., 0.0392, 0.0353, 0.0314],
          [0.7686, 0.7686, 0.7686,  ...,

In [73]:
dl = data.DataLoader(vd, batch_size=1)

In [74]:
for img, gt, region, s in dl:
    print(img)
    print(gt)
    print(region)
    print(s)
    break

(4000, 4)
tensor([[[[0.0431, 0.0510, 0.0431,  ..., 0.6431, 0.6392, 0.6353],
          [0.0196, 0.0314, 0.0392,  ..., 0.6353, 0.6314, 0.6314],
          [0.1216, 0.0863, 0.0627,  ..., 0.6353, 0.6353, 0.6353],
          ...,
          [0.1294, 0.1333, 0.1216,  ..., 0.2392, 0.2431, 0.2431],
          [0.1569, 0.1608, 0.1686,  ..., 0.2588, 0.2706, 0.2706],
          [0.1922, 0.2314, 0.2706,  ..., 0.2549, 0.2667, 0.2706]],

         [[0.0353, 0.0471, 0.0431,  ..., 0.7333, 0.7294, 0.7255],
          [0.0196, 0.0314, 0.0392,  ..., 0.7373, 0.7373, 0.7333],
          [0.1333, 0.0902, 0.0667,  ..., 0.7412, 0.7412, 0.7412],
          ...,
          [0.0392, 0.0510, 0.0471,  ..., 0.2902, 0.2980, 0.2941],
          [0.0353, 0.0353, 0.0392,  ..., 0.3176, 0.3294, 0.3294],
          [0.0667, 0.0784, 0.0941,  ..., 0.3137, 0.3255, 0.3294]],

         [[0.0392, 0.0471, 0.0431,  ..., 0.7569, 0.7529, 0.7490],
          [0.0275, 0.0353, 0.0392,  ..., 0.7647, 0.7608, 0.7608],
          [0.1451, 0.1020, 0.070

In [67]:
x = region.numpy()[0]
x

array([[  1.,  61., 402., 430.],
       [  1.,   1., 574., 430.],
       [ 13., 147., 390., 430.],
       ...,
       [108., 199., 140., 260.],
       [282., 392., 332., 415.],
       [392., 225., 419., 275.]], dtype=float32)

In [68]:
f = filter_small_boxes(x, 20)

In [69]:
x[f]

array([[  1.,  61., 402., 430.],
       [  1.,   1., 574., 430.],
       [ 13., 147., 390., 430.],
       ...,
       [108., 199., 140., 260.],
       [282., 392., 332., 415.],
       [392., 225., 419., 275.]], dtype=float32)

In [11]:
x = [[2, 1, 2, 1, 10], [2, 3, 1, 2, 10], []]

In [14]:
x[:1]

[[2, 1, 2, 1]]