In [1]:
import torch 
import numpy as np
from torch.utils.data import Dataset
import torchvision.transforms as transforms

In [26]:
import os
import glob

dataset_path = './datasets/VOCdevkit/VOC2007/'

images_path = os.path.join(dataset_path, 'JPEGImages')
annotation_path = os.path.join(dataset_path, 'Annotations')
images_mode = os.path.join(dataset_path, 'ImageSets', 'Layout')


In [42]:
len(os.listdir(images_path))

9963

In [9]:
import xmltodict
import pprint

example_annotation = os.path.join(annotation_path, '000007.xml')

# xml_dict = xmltodict.parse(example_annotation)
with open(example_annotation, 'rb') as f:
    xml_todict = xmltodict.parse(f)
    pprint.pprint(xml_todict)


{'annotation': {'filename': '000007.jpg',
                'folder': 'VOC2007',
                'object': {'bndbox': {'xmax': '500',
                                      'xmin': '141',
                                      'ymax': '330',
                                      'ymin': '50'},
                           'difficult': '0',
                           'name': 'car',
                           'pose': 'Unspecified',
                           'truncated': '1'},
                'owner': {'flickrid': 'monsieurrompu', 'name': 'Thom Zemanek'},
                'segmented': '0',
                'size': {'depth': '3', 'height': '333', 'width': '500'},
                'source': {'annotation': 'PASCAL VOC2007',
                           'database': 'The VOC2007 Database',
                           'flickrid': '194179466',
                           'image': 'flickr'}}}


In [12]:
example_annotation = os.path.join(annotation_path, '000003.xml')

# xml_dict = xmltodict.parse(example_annotation)
with open(example_annotation, 'rb') as f:
    xml_todict = xmltodict.parse(f)
    # pprint.pprint(xml_todict)
    pprint.pprint(xml_todict['annotation']['object'])
    


[{'bndbox': {'xmax': '215', 'xmin': '123', 'ymax': '195', 'ymin': '155'},
  'difficult': '0',
  'name': 'sofa',
  'pose': 'Unspecified',
  'truncated': '0'},
 {'bndbox': {'xmax': '307', 'xmin': '239', 'ymax': '205', 'ymin': '156'},
  'difficult': '0',
  'name': 'chair',
  'pose': 'Left',
  'truncated': '0'}]


In [30]:
idx_to_labels = {0: "background", 1: "aeroplane",
                 2 : "bicycle",
                 3 : "bird",
                 4 : "boat", 
                 5: "bottle", 
                 6: "bus",
                 7: "car",
                 8: "cat",
                 9: "chair",
                 10: "cow",
                 11: "diningtable",
                 12: "dog",
                 13: "horse",
                 14: "motorbike",
                 15: "person",
                 16: "pottedplant",
                 17: "sheep",
                 18: "sofa",
                 19: "train",
                 20: "tvmonitor"}

labels_to_idx = {label: idx for idx, label in idx_to_labels.items()}

In [70]:
def extract_xml(xml_path):
    with open(xml_path, 'rb') as f:
        annotation_dict = xmltodict.parse(f)
        objects =  annotation_dict['annotation']['object']

        labels = list()
        # print(objects)
        if len(objects) == 1:
            object_name = objects['name']
            label = labels_to_idx[object_name]
            bndbox = objects['bndbox']
            bndbox = [int(bndbox['xmin']), int(bndbox['ymin']), 
                      int(bndbox['xmax']), int(bndbox['ymax'])]

            labels.append((label, bndbox))
        else:
            for obj in objects:
                object_name = obj['name']
                label = labels_to_idx[object_name]
                bndbox = obj['bndbox']
                bndbox = [int(bndbox['xmin']), int(bndbox['ymin']), 
                          int(bndbox['xmax']), int(bndbox['ymax'])]
    
                labels.append((label, bndbox))

        return labels


extract_xml(example_annotation)


{'name': 'sofa', 'pose': 'Unspecified', 'truncated': '0', 'difficult': '0', 'bndbox': {'xmin': '123', 'ymin': '155', 'xmax': '215', 'ymax': '195'}}
{'name': 'chair', 'pose': 'Left', 'truncated': '0', 'difficult': '0', 'bndbox': {'xmin': '239', 'ymin': '156', 'xmax': '307', 'ymax': '205'}}


[(18, [123, 155, 215, 195]), (9, [239, 156, 307, 205])]

In [71]:
import torchvision.transforms  as transforms
from PIL import Image


class PascalVOCDataset(Dataset):
    
    def __init__(self, dataset_path, transform, mode = 'train'):    

        self.transform = transform
        
        images_path = os.path.join(dataset_path, 'JPEGImages')
        annotation_path = os.path.join(dataset_path, 'Annotations')

        images_mode = os.path.join(dataset_path, 'ImageSets', 'Main')
        if mode == 'train':
            images_mode = os.path.join(images_mode, 'train.txt')
        elif mode == 'val':
            images_mode = os.path.join(images_mode, 'val.txt')
        elif mode == 'test':
            images_mode = os.path.join(images_mode, 'test.txt')

        self.images_path = []
        self.annotation_path = []
        with open(images_mode, 'r') as f:
            for file in f.readlines():
                file_id = file.strip()
                # print(file_id)
                file_path = os.path.join(images_path, file_id + ".jpg")
                # print(file_path)
                xml_path = os.path.join(annotation_path, file_id + ".xml")

                self.images_path.append(file_path)
                self.annotation_path.append(xml_path)
                
        # self.images_path = sorted(glob.glob(images_path + "/*.jpg"))
        # self.annotation_path = sorted(glob.glob(images_path + "/*.xml"))

    
    def __len__(self):
        return len(self.annotation_path)

    def __getitem__(self, index):
        image_file = self.images_path[index]
        annotation_file = self.annotation_path[index]
        
        try:
            image = Image.open(image_file)
        except IOError:
            print(f'Corrupt Image at {index}')
            if index == len(self) - 1:
                index = 0
            return self[index + 1]

        image = self.transform(image)
        bndboxes = extract_xml(annotation_file)

        return {"image": image,
               "bndboxes": bndboxes}
        

In [72]:
size = 600

transform = transforms.Compose([
    transforms.Resize(size),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

voc_dataset_train = PascalVOCDataset(dataset_path,
                              transform = transform,
                              mode = 'train')

print(len(voc_dataset_train))


2501


In [73]:
voc_dataset_train[0]

name


TypeError: string indices must be integers

In [56]:
voc_dataset_val = PascalVOCDataset(dataset_path,
                              transform = transform,
                              mode = 'val')

len(voc_dataset_val)

2510