In [1]:
import torch 
import numpy as np
from torch.utils.data import Dataset
import torchvision.transforms as transforms

In [2]:
idx_to_labels = {0: "background", 1: "aeroplane",
                 2 : "bicycle",
                 3 : "bird",
                 4 : "boat", 
                 5: "bottle", 
                 6: "bus",
                 7: "car",
                 8: "cat",
                 9: "chair",
                 10: "cow",
                 11: "diningtable",
                 12: "dog",
                 13: "horse",
                 14: "motorbike",
                 15: "person",
                 16: "pottedplant",
                 17: "sheep",
                 18: "sofa",
                 19: "train",
                 20: "tvmonitor"}

labels_to_idx = {label: idx for idx, label in idx_to_labels.items()}

In [3]:
import os
import glob

dataset_path = './datasets/VOCdevkit/VOC2007/'

images_path = os.path.join(dataset_path, 'JPEGImages')
annotation_path = os.path.join(dataset_path, 'Annotations')
images_mode = os.path.join(dataset_path, 'ImageSets', 'Layout')


In [4]:
len(os.listdir(images_path))

9963

In [5]:
import xmltodict
import pprint

example_annotation = os.path.join(annotation_path, '000007.xml')

# xml_dict = xmltodict.parse(example_annotation)
with open(example_annotation, 'rb') as f:
    xml_todict = xmltodict.parse(f)
    pprint.pprint(xml_todict)


{'annotation': {'filename': '000007.jpg',
                'folder': 'VOC2007',
                'object': {'bndbox': {'xmax': '500',
                                      'xmin': '141',
                                      'ymax': '330',
                                      'ymin': '50'},
                           'difficult': '0',
                           'name': 'car',
                           'pose': 'Unspecified',
                           'truncated': '1'},
                'owner': {'flickrid': 'monsieurrompu', 'name': 'Thom Zemanek'},
                'segmented': '0',
                'size': {'depth': '3', 'height': '333', 'width': '500'},
                'source': {'annotation': 'PASCAL VOC2007',
                           'database': 'The VOC2007 Database',
                           'flickrid': '194179466',
                           'image': 'flickr'}}}


In [6]:
example_annotation = os.path.join(annotation_path, '000003.xml')

# xml_dict = xmltodict.parse(example_annotation)
with open(example_annotation, 'rb') as f:
    xml_todict = xmltodict.parse(f)
    # pprint.pprint(xml_todict)
    pprint.pprint(xml_todict['annotation']['object'])
    


[{'bndbox': {'xmax': '215', 'xmin': '123', 'ymax': '195', 'ymin': '155'},
  'difficult': '0',
  'name': 'sofa',
  'pose': 'Unspecified',
  'truncated': '0'},
 {'bndbox': {'xmax': '307', 'xmin': '239', 'ymax': '205', 'ymin': '156'},
  'difficult': '0',
  'name': 'chair',
  'pose': 'Left',
  'truncated': '0'}]


In [7]:
def extract_xml(xml_path):
    with open(xml_path, 'rb') as f:
        annotation_dict = xmltodict.parse(f)
        objects =  annotation_dict['annotation']['object']

        labels = list()
        bndboxes = list()
        # print(objects)
        if isinstance(objects, dict):
            object_name = objects['name']
            label = labels_to_idx[object_name]
            bndbox = objects['bndbox']
            bndbox = [int(bndbox['xmin']), int(bndbox['ymin']), 
                      int(bndbox['xmax']), int(bndbox['ymax'])]

            labels.append(label)
            bndboxes.append(bndbox)
        else:
            for obj in objects:
                object_name = obj['name']
                label = labels_to_idx[object_name]
                bndbox = obj['bndbox']
                bndbox = [int(bndbox['xmin']), int(bndbox['ymin']), 
                          int(bndbox['xmax']), int(bndbox['ymax'])]
    
                labels.append(label)
                bndboxes.append(bndbox)
    
        return torch.tensor(labels), torch.tensor(bndboxes)


labels, bndboxes = extract_xml(example_annotation)

In [43]:
def calculate_iou(boxes1, boxes2):

    # Calculate coordinates of intersection area
    x1 = max(boxes1[0], boxes2[0])
    y1 = max(boxes1[1], boxes2[1])
    x2 = min(boxes1[2], boxes2[2])
    y2 = min(boxes1[3], boxes2[3])

    # print(x1, y1, x2, y2)
    # Calculate area of the intersection
    intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
    # print(intersection_area)
    
    # Calculate area of union
    area_box1 = (boxes1[2] - boxes1[0]) * (boxes1[3] - boxes1[1])
    area_box2 = (boxes2[2] - boxes2[0]) * (boxes2[3] - boxes2[1])
    # print(area_box1)
    # print(area_box2)
    union_area = area_box1 + area_box2 - intersection_area
    # print(union_area)
    
    # Calculate IoU
    iou = intersection_area / union_area

    return iou

# Example usage:
box1 = torch.tensor([351,  82, 463, 545])
box2 = torch.tensor([239, 156, 307, 205])
iou = calculate_iou(box1, box2)
print("IoU:", iou.item())

IoU: 0.0


In [55]:
import cv2

def selective_search(image):
     # Create a Selective Search Segmentation object
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()

    # Set input image for selective search
    ss.setBaseImage(image)

    # Switch to fast but low recall selective search method
    ss.switchToSelectiveSearchQuality()

    # Run selective search on the input image
    rects = ss.process()

    rects[:, 2] += rects[:, 0] 
    rects[:, 3] += rects[:, 1] 
    
    return torch.tensor(rects)

image_3 = os.path.join(images_path, '000003.jpg')
image_3 = cv2.imread(image_3)
bounding_boxes = selective_search(image_3)
bounding_boxes

tensor([[203, 257, 209, 301],
        [ 53,  75,  63, 144],
        [152,  64, 255, 196],
        ...,
        [  0,   0, 426, 311],
        [  0,   0, 500, 211],
        [  0,   0, 500, 319]], dtype=torch.int32)

In [57]:
bounding_boxes.shape

torch.Size([9478, 4])

In [58]:
labels, bndboxes

(tensor([18,  9]),
 tensor([[123, 155, 215, 195],
         [239, 156, 307, 205]]))

In [59]:
labels.shape

torch.Size([2])

In [78]:
set_bndboxes = []
resulting_labels = []

positive_nums = 16
negative_nums = 64 - 16

positive_counter = 0
negative_counter = 0

count = 0

for bnd_box in range(bounding_boxes.shape[0]):

    if positive_counter >= positive_nums and negative_counter >= negative_nums:
        print(f"Stop at: {count}")
        break
    
    combination_iou = []
    for idx in range(labels.shape[0]):
        # print(bounding_boxes[bnd_box])
        # print(bndboxes[idx])
        iou = calculate_iou(bounding_boxes[bnd_box], bndboxes[idx])
        combination_iou.append(iou)
    combination_iou = np.array(combination_iou)
    # print(combination_iou)
    max_iou = np.max(combination_iou)
    label = None

    
    
    count += 1
    if max_iou >= 0.5 and positive_counter < positive_nums:
        element = np.argmax(combination_iou)
        label = labels[element]
        positive_counter += 1
    elif max_iou < 0.5 and max_iou >= 0.1 and negative_counter < negative_nums:
        label = 0
        negative_counter += 1
    else:
        continue

    

    resulting_labels.append(label)
    set_bndboxes.append(bounding_boxes[bnd_box])
    

print(count)
resulting_labels = torch.tensor(resulting_labels)
# set_bndboxes = torch.tensor(set_bndboxes)
# print(set_bndboxes)
print(resulting_labels)
    

Stop at: 5811
5811
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  9, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 18, 18,  9, 18,
        18, 18,  9,  9,  9, 18, 18, 18, 18, 18])


In [79]:
image_3.shape

(375, 500, 3)

In [71]:
len(resulting_labels)

64

In [63]:
resulting_labels

non_zero_indicies = (resulting_labels != 0 )
indicies = torch.nonzero(non_zero_indicies).squeeze()
indicies

tensor([ 24,  25,  51, 139, 157, 206, 285, 305, 324, 409, 428, 441, 484, 537,
        546, 554, 580, 606, 644, 647, 682, 691, 698, 726, 740, 767, 798, 812,
        813, 884, 886, 945])

In [None]:
resulting_labels

In [14]:
import torchvision.transforms  as transforms
from PIL import Image
import cv2

class PascalVOCDataset(Dataset):    
    def __init__(self, dataset_path, transform, R = 64, s = 600, mode = 'train'):    

        self.transform = transform
        self.s = s
        self.R = R  # Number of bounding boxes
        images_path = os.path.join(dataset_path, 'JPEGImages')
        annotation_path = os.path.join(dataset_path, 'Annotations')

        images_mode = os.path.join(dataset_path, 'ImageSets', 'Main')
        if mode == 'train':
            images_mode = os.path.join(images_mode, 'train.txt')
        elif mode == 'val':
            images_mode = os.path.join(images_mode, 'val.txt')
        elif mode == 'test':
            images_mode = os.path.join(images_mode, 'test.txt')

        self.images_path = []
        self.annotation_path = []
        with open(images_mode, 'r') as f:
            for file in f.readlines():
                file_id = file.strip()
                # print(file_id)
                file_path = os.path.join(images_path, file_id + ".jpg")
                # print(file_path)
                xml_path = os.path.join(annotation_path, file_id + ".xml")

                self.images_path.append(file_path)
                self.annotation_path.append(xml_path)
                
        # self.images_path = sorted(glob.glob(images_path + "/*.jpg"))
        # self.annotation_path = sorted(glob.glob(images_path + "/*.xml"))

    
    def __len__(self):
        return len(self.annotation_path)

    # Try to combine Selective Search to to generate 64 Bounding boxes in this stage 
    def __getitem__(self, index):        
        image_file = self.images_path[index]
        annotation_file = self.annotation_path[index]
            
        # try:
        #     image = Image.open(image_file)
        # except IOError:
        #     print(f'Corrupt Image at {index}')
        #     if index == len(self) - 1:
        #         index = 0
        #     return self[index + 1]

        image_cv = cv2.imread(image_file)
        
        if image_cv is None: 
            print(f'Failed to load image at index {index}')
            if index == len(self) - 1:
                index = 0
            return self[index + 1]
            
        resize_image_cv = cv2.resize(image_cv, (self.s, self.s))
        
        
        # image = self.transform(image)
        bndboxes = extract_xml(annotation_file)

        
        return {"image": image,
               "bndboxes": bndboxes}
        

In [15]:
size = 600

transform = transforms.Compose([
    transforms.
    transforms.Resize(size),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

voc_dataset_train = PascalVOCDataset(dataset_path,
                              transform = transform,
                              mode = 'train')

print(len(voc_dataset_train))


2501


In [16]:
for i in range(len(voc_dataset_train)):
    print(voc_dataset_train[i]['bndboxes'])

NameError: name 'image' is not defined

In [None]:
voc_dataset_val = PascalVOCDataset(dataset_path,
                              transform = transform,
                              mode = 'val')

len(voc_dataset_val)