# Test with VOC Dataset

The VOC datasets from 2007 and 2012 contain 20 object classes (plus 1 for the background class).

We will experiment with this dataset for object detection, and will work with another dataset later on.


In [7]:
import json
import os
import random
import xml.etree.ElementTree as ET

import torch
import torchvision.transforms.functional as FT
from PIL import Image
from torch.utils.data import Dataset
# from utils import transform

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Names of the 20 object classes; the background class is added separately below.
voc_labels = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
              'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')

# Class name -> integer label. Object classes are numbered from 1 in the order
# listed above; 0 is used for the background class, which is inserted last.
label_map = {name: index for index, name in enumerate(voc_labels, start=1)}
label_map['background'] = 0

# Inverse mapping: integer label -> class name.
rev_label_map = {index: name for name, index in label_map.items()}

# Original (misleading) name of the inverse mapping, kept as an alias so that
# existing references keep working. It maps labels to names, not to colours.
rev_color_map = rev_label_map

In [12]:
# Palette of 21 hex colour strings used to tell the classes apart.
distinct_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', '#46f0f0',
    '#f032e6', '#d2f53c', '#fabebe', '#008080', '#000080', '#aa6e28', '#fffac8',
    '#800000', '#aaffc3', '#808000', '#ffd8b1', '#e6beff', '#808080', '#FFFFFF',
]

# Class name -> colour: each label gets the palette entry at the same position
# as the label's position in label_map's key order.
label_color_map = {name: distinct_colors[position] for position, name in enumerate(label_map)}

# label_color_map

In [13]:
def parse_annotation(annotation_path, class_map=None):
    """
    Parse one XML annotation file into boxes, labels and difficulty flags.

    Every ``<object>`` element whose (lower-cased, stripped) ``<name>`` is a key
    of the class map contributes one entry to each returned list; objects with
    any other name are skipped.

    :param annotation_path: the XML annotation to read (anything ``ET.parse`` accepts)
    :param class_map: optional dict mapping class names to integer labels;
                      when omitted, the module-level ``label_map`` is used
    :return: dict with the keys
             'boxes' - list of [xmin, ymin, xmax, ymax] integer lists, each
             coordinate being the value in the file minus 1,
             'labels' - list of integer labels taken from the class map,
             'difficulties' - list of 0/1 flags, 1 when ``<difficult>`` is '1'
    """
    if class_map is None:
        class_map = label_map

    root = ET.parse(annotation_path).getroot()

    boxes = []
    labels = []
    difficulties = []

    # 'obj' rather than 'object' so the builtin is not shadowed
    for obj in root.iter('object'):

        label = obj.findtext('name').lower().strip()
        if label not in class_map:
            continue

        # A missing <difficult> tag is treated as "not difficult" instead of crashing
        difficult = int(obj.findtext('difficult', default='0') == '1')

        bbox = obj.find('bndbox')
        # Subtract 1 from every coordinate
        # (presumably 1-based -> 0-based pixel indices; confirm against the dataset spec)
        box = [int(bbox.find(tag).text) - 1 for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

        boxes.append(box)
        labels.append(class_map[label])
        difficulties.append(difficult)

    return {'boxes': boxes, 'labels': labels, 'difficulties': difficulties}
        

In [None]:
def create_data_lists(voc07_path, voc12_path, output_folder):
    """
    Create lists of images, bounding boxes, labels of the objects in these images,
    and save them to file.

    Only the 'trainval' image sets of the two folders are processed. Images whose
    annotation yields no boxes are left out. Two files are written:
    'TRAIN_images.json' (list of image paths) and 'TRAIN_objects.json' (list of
    the dicts returned by ``parse_annotation``, in the same order as the images).

    :param voc07_path: path to the 'VOC2007' folder
    :param voc12_path: path to the 'VOC2012' folder
    :param output_folder: folder where the JSONs must be saved (created if missing)
    """
    voc07_path = os.path.abspath(voc07_path)
    voc12_path = os.path.abspath(voc12_path)

    train_images = []
    train_objects = []

    # load the training data
    for path in [voc07_path, voc12_path]:

        with open(os.path.join(path, 'ImageSets/Main/trainval.txt')) as f:
            # one image id per line; blank lines are ignored
            ids = [line.strip() for line in f if line.strip()]

        for image_id in ids:
            objects = parse_annotation(os.path.join(path, 'Annotations', image_id + '.xml'))

            # parse_annotation always returns a dict with three keys, so the
            # emptiness check has to look at the boxes, not at the dict itself
            if not objects['boxes']:
                continue

            train_objects.append(objects)
            train_images.append(os.path.join(path, 'JPEGImages', image_id + '.jpg'))

    assert len(train_objects) == len(train_images)

    # save to file
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, 'TRAIN_images.json'), 'w') as j:
        json.dump(train_images, j)
    with open(os.path.join(output_folder, 'TRAIN_objects.json'), 'w') as j:
        json.dump(train_objects, j)
    
    
    

In [6]:

def transform(image, boxes, labels, difficulties, split):
    """
    Apply the preprocessing / augmentation pipeline to one image and its annotations.

    For the 'TRAIN' split the image is photometrically distorted, expanded
    (zoomed out) with probability 0.5, randomly cropped and flipped with
    probability 0.5. For both splits the image is then resized to (300, 300),
    converted to a tensor and normalized with ``mean`` and ``std``.

    :param image: image to transform, a PIL image
    :param boxes: bounding boxes in boundary coordinates, a tensor
    :param labels: labels of the objects, a tensor
    :param difficulties: difficulty flags of the objects; carried along so they
                         stay aligned with the boxes when objects are cropped away
    :param split: one of 'TRAIN' or 'TEST'; selects which transformations are applied
    :return: transformed image, transformed boxes, labels and difficulties
    """
    assert split in ('TRAIN', 'TEST')

    # Per-channel statistics used for normalization and as the fill value for expand()
    # (presumably the ImageNet mean/std; confirm against the pretrained base network)
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image
    new_boxes = boxes
    new_labels = labels
    new_difficulties = difficulties

    # Augmentations are applied to the training split only
    if split == 'TRAIN':

        # photometric_distort is not built in; it has to be provided separately
        new_image = photometric_distort(new_image)

        # expand() and random_crop() are given a tensor, not a PIL image
        new_image = FT.to_tensor(new_image)

        # expand i.e. zoom out the image with a 50% chance, filling with the mean;
        # expand is not built in either
        # NOTE(review): expand() is not defined in this file yet; make sure its
        # signature accepts the `filter` keyword used here (`filler` may have been meant)
        if random.random() < 0.5:
            new_image, new_boxes = expand(new_image, new_boxes, filter=mean)

        new_image, new_boxes, new_labels, new_difficulties = random_crop(new_image, new_boxes, new_labels,
                                                                        new_difficulties)

        # flip() and resize() are given a PIL image again
        new_image = FT.to_pil_image(new_image)

        # flip the image with a 50% chance; flip has to be provided separately
        if random.random() < 0.5:
            new_image, new_boxes = flip(new_image, new_boxes)

    # resize the image to (300, 300); per the original note this also converts
    # absolute boundary coordinates to their fractional form
    new_image, new_boxes = resize(new_image, new_boxes, dims=(300, 300))

    new_image = FT.to_tensor(new_image)

    new_image = FT.normalize(new_image, mean=mean, std=std)

    return new_image, new_boxes, new_labels, new_difficulties
    

In [None]:
# TODO: implement the helper functions that transform() calls:
# flip, expand, resize, photometric_distort, random_crop
