In [None]:
import torch
import numpy as np

In [None]:
from torchvision.datasets import VOCDetection
from torchvision.transforms.functional import to_tensor, to_pil_image
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms
import xml.etree.ElementTree as ET
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [None]:
# The 20 PASCAL VOC object categories, in alphabetical order.
# A name's position in this list doubles as its integer class id
# (looked up with `classes.index(name)` / `classes[id]` elsewhere in the notebook).
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

In [None]:
def showImageWithBox(img, label_matrix, C=7, num_classes=20, conf_threshold=0.5):
  """Draw the boxes stored in a YOLO-style grid tensor on top of an image.

  The per-cell layout is expected to be
  ``[class scores (num_classes), (conf, x, y, w, h) * n_boxes]`` with
  ``x, y, w, h`` expressed as fractions of the full image size, i.e. the
  layout written by ``CustomVOCDetection.__getitem__`` in this notebook.

  Args:
    img: image tensor accepted by ``to_pil_image`` (e.g. ``(3, H, W)``).
    label_matrix: tensor of shape ``(grid, grid, num_classes + 5 * n_boxes)``.
    C: number of grid cells per side to scan (kept under its original name
      for backward compatibility; this is the grid size, not the class count).
    num_classes: number of leading class-score entries in every cell.
    conf_threshold: a box slot is drawn only when its confidence entry is at
      least this value. Ground-truth labels use exactly 0 or 1, so the
      default draws every labelled box.

  Returns:
    None. The annotated image is shown with matplotlib.
  """
  img = to_pil_image(img)
  draw = ImageDraw.Draw(img)
  W, H = img.size

  # Number of complete (conf, x, y, w, h) groups after the class scores.
  n_box_slots = max((label_matrix.size(-1) - num_classes) // 5, 0)
  # Never scan past the real grid, even if C is larger than the tensor.
  n_rows = min(C, label_matrix.size(0))
  n_cols = min(C, label_matrix.size(1))

  for i in range(n_rows):
    for j in range(n_cols):
      cell = label_matrix[i, j]
      for slot in range(n_box_slots):
        base = num_classes + 5 * slot
        if float(cell[base]) < conf_threshold:
          continue

        # Convert to plain floats: PIL does not reliably accept 0-d tensors.
        center_x, center_y = float(cell[base + 1]) * W, float(cell[base + 2]) * H
        box_w, box_h = float(cell[base + 3]) * W, float(cell[base + 4]) * H

        # Pillow requires x0 <= x1 and y0 <= y1 (image y grows downwards),
        # so sort each axis instead of trusting the sign of w/h.
        x0, x1 = sorted((center_x - box_w / 2, center_x + box_w / 2))
        y0, y1 = sorted((center_y - box_h / 2, center_y + box_h / 2))

        color = tuple(np.random.randint(0, 255, size=(3, ), dtype="uint8").tolist())
        draw.rectangle(((x0, y0), (x1, y1)), outline=color, width=3)

        # Caption the box with the highest-scoring class of this cell.
        class_id = int(cell[:num_classes].argmax())
        if class_id < len(classes):
          draw.text((x0, y0), classes[class_id], fill=(255, 255, 255))

  plt.figure(figsize=(15, 15))
  plt.imshow(np.array(img))

In [None]:
# Image preprocessing: resize every picture to a fixed 448x448 and convert it to a tensor.
# (Alternative without resizing: img_transform = transforms.ToTensor())
img_transform = transforms.Compose(
    [transforms.Resize((448, 448)), transforms.ToTensor()]
)

In [None]:
class CustomVOCDetection(VOCDetection):
  """PASCAL VOC detection dataset that yields YOLO-style grid labels.

  Each item is ``(img, label_matrix)``. ``label_matrix`` has shape
  ``(S, S, C + 5 * B)``; for a cell that holds an object centre:

  * ``[0:C]``      one-hot class vector (index taken from the global ``classes``),
  * ``[C]``        objectness flag, set to 1,
  * ``[C+1:C+5]``  ``x, y, w, h`` as fractions of the full image size.

  Only the first box slot is written; the remaining ``5 * (B - 1)`` entries
  stay zero. The first grid axis follows x (columns) and the second follows
  y (rows). If several objects fall into the same cell, only the first one
  listed in the annotation is kept.

  Args:
    root, year, image_set, download, transform, transforms: forwarded to
      ``VOCDetection``.
    target_transforms: forwarded to ``VOCDetection`` as ``target_transform``.
    S: number of grid cells per image side.
    B: number of box slots reserved per cell.
    C: number of classes.
  """

  def __init__(
    self,
    root: str,
    year: str="2012",
    image_set: str="train",
    download: bool = False,
    transform = None,
    target_transforms = None,
    transforms = None,
    S = 7,
    B = 2,
    C = 20,
  ):
    super().__init__(root, year, image_set, download, transform, target_transforms, transforms)
    self.S = S
    self.B = B
    self.C = C

  def __getitem__(self, index):
    """Return ``(img, label_matrix)`` for the sample at ``index``."""
    img = Image.open(self.images[index]).convert('RGB')
    # Box coordinates in the XML refer to the original image, so remember its
    # size before any transform is applied.
    org_width, org_height = img.size

    # Prefer the image-only `transform`; `self.transforms` is a two-argument
    # StandardTransform wrapper when the dataset was built with `transform=`.
    if self.transform is not None:
      img = self.transform(img)
    elif self.transforms is not None:
      img = self.transforms(img)

    target = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())

    label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
    for obj in target['annotation'].get('object', []):
      bndbox = obj['bndbox']
      # Normalise straight from the original size. This assumes the image
      # transform keeps relative geometry (e.g. a plain resize) -- the same
      # assumption the pixel-rescaling approach makes.
      xmin = float(bndbox['xmin']) / org_width
      xmax = float(bndbox['xmax']) / org_width
      ymin = float(bndbox['ymin']) / org_height
      ymax = float(bndbox['ymax']) / org_height

      # True division: the centre must not be floored.
      x, y = (xmin + xmax) / 2, (ymin + ymax) / 2
      width, height = xmax - xmin, ymax - ymin
      class_label = classes.index(obj['name'])

      # Clamp so a centre on the far border still maps to the last cell.
      i = min(max(int(x * self.S), 0), self.S - 1)
      j = min(max(int(y * self.S), 0), self.S - 1)

      if label_matrix[i, j, self.C] == 0:
        label_matrix[i, j, self.C] = 1
        label_matrix[i, j, self.C + 1:self.C + 5] = torch.tensor([x, y, width, height])
        label_matrix[i, j, class_label] = 1
    return img, label_matrix

In [None]:
# Build the two VOC2007 splits used by the notebook. download=True lets
# torchvision fetch and extract the archives under the current directory.
train_dataset = CustomVOCDetection(
    '.',
    year='2007',
    image_set='train',
    download=True,
    transforms=img_transform,
)
val_dataset = CustomVOCDetection(
    '.',
    year='2007',
    image_set='test',
    download=True,
    transforms=img_transform,
)

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar to ./VOCtrainval_06-Nov-2007.tar


  0%|          | 0/460032000 [00:00<?, ?it/s]

Extracting ./VOCtrainval_06-Nov-2007.tar to .
Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar to ./VOCtest_06-Nov-2007.tar


  0%|          | 0/451020800 [00:00<?, ?it/s]

Extracting ./VOCtest_06-Nov-2007.tar to .


In [None]:
# Fetch the first training sample: the transformed image and its grid label tensor.
img, label_matrix = train_dataset[0]

In [None]:
# showImageWithBox(img, label_matrix)  # disabled preview; arguments follow the current (img, label_matrix) signature

In [None]:
# Usage note for whoever runs the notebook: one print call, one line per argument.
print(
    "======== NOTICE ========",
    '1. Train dataset is stored in "train_dataset" variable',
    '2. Val dataset is stored in "val_dataset" variable',
    "",
    "* By using both variables, Create your own DataLoader",
    "  ex) train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)",
    sep="\n",
)

1. Train dataset is stored in "train_dataset" variable
2. Val dataset is stored in "val_dataset" variable

* By using both variables, Create your own DataLoader
  ex) train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [None]:
# Disabled example kept as a bare string literal; evaluating the cell only echoes the text.
# NOTE(review): the snippet is stale. CustomVOCDetection.__getitem__ returns two values
# (img, label_matrix), so the three-value unpack below would fail if re-enabled, and
# showImageWithBox is declared as showImageWithBox(img, label_matrix, C=7).
"""
img, targets, labels = val_dataset[4000]
showImageWithBox(img, targets, labels)
"""

'\nimg, targets, labels = val_dataset[4000]\nshowImageWithBox(img, targets, labels)\n'