In [1]:
import torch
import numpy as np

In [2]:
from torchvision.datasets import VOCDetection
from torchvision.transforms.functional import to_tensor, to_pil_image
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms
import xml.etree.ElementTree as ET

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [3]:
# Object category names, listed alphabetically. A category's position in this
# list is the integer class id used when encoding and decoding label matrices.
classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]

In [4]:
# Image preprocessing pipeline: resize to a fixed 448x448, then convert the
# PIL image to a tensor.
img_transform = transforms.Compose(
    [
        transforms.Resize(size=(448, 448)),
        transforms.ToTensor(),
    ]
)

In [72]:
def showImage(img, label_matrix, C=7, threshold=0.5):
  """Draw the boxes encoded in a grid label matrix onto `img` and display it.

  Args:
    img: image tensor accepted by `to_pil_image` (presumably the
      channel-first tensor produced by `img_transform` -- confirm at call site).
    label_matrix: tensor indexed as [row, col, channel] with either 25
      channels (dataset target) or 30 channels (model prediction).
      Channels 0..19 are per-class scores, channel 20 is the objectness
      flag/score and channels 21..24 hold (x, y, w, h): x, y are the box
      centre offset inside the cell and w, h the box size, all expressed
      in units of one grid cell.
    C: number of grid cells per image side. NOTE: this is the grid size,
      not the number of classes.
    threshold: minimum objectness (channel 20) for a prediction cell to be
      considered. Drawing of predictions is not implemented yet.

  Returns:
    None. The annotated image is shown with matplotlib.
  """
  img = to_pil_image(img)
  draw = ImageDraw.Draw(img)
  img_w, img_h = img.size
  cell_w, cell_h = img_w / C, img_h / C
  n_channels = label_matrix.size(-1)
  marker_half = 5  # half side length, in pixels, of the box-centre marker

  for i in range(C):
    for j in range(C):
      # =====================
      # FOR SHOWING A DATASET
      # =====================
      if n_channels == 25 and label_matrix[i, j, 20] == 1:
        color = tuple(np.random.randint(0, 255, size=(3,), dtype="uint8").tolist())
        # Convert to plain Python floats so PIL never receives 0-d tensors.
        x_off, y_off, w_rel, h_rel = label_matrix[i, j, 21:25].tolist()
        cx = cell_w * (j + x_off)
        cy = cell_h * (i + y_off)
        half_w = cell_w * w_rel / 2
        half_h = cell_h * h_rel / 2

        # Image coordinates grow downwards, so the top edge has the SMALLER y.
        # Recent Pillow releases raise ValueError when y1 < y0, so the corners
        # must be passed as (left, top), (right, bottom).
        left, top = cx - half_w, cy - half_h
        right, bottom = cx + half_w, cy + half_h

        # Small filled square marking the box centre.
        draw.rectangle(
          ((cx - marker_half, cy - marker_half), (cx + marker_half, cy + marker_half)),
          fill=color,
        )
        draw.rectangle(((left, top), (right, bottom)), outline=color, width=3)
        # `width` is not a documented ImageDraw.text option, so it is not passed.
        class_id = int(torch.argmax(label_matrix[i, j, :20]))
        draw.text((left, top), classes[class_id], fill=(255, 255, 255))
      # ========================
      # FOR SHOWING A PREDICTION
      # ========================
      elif n_channels == 30 and label_matrix[i, j, 20] > threshold:
        # TODO: decode and draw predicted boxes. The layout of channels
        # 25..29 is not established anywhere in this notebook yet.
        pass

  plt.figure(figsize=(10, 10))
  plt.imshow(img)
  plt.show()
  

In [38]:
class CustomVOCDetection(VOCDetection):
  """VOC detection dataset that returns grid-encoded targets.

  Each item is `(img, label_matrix)`, where `label_matrix` is a float tensor
  of shape (S, S, C + 5) indexed as [row, col, channel]:

    * channels 0 .. C-1 : one-hot class (index taken from the module-level
      `classes` list),
    * channel  C        : 1 if the cell holds an object, else 0,
    * channels C+1..C+4 : (x, y, w, h) -- x, y are the box-centre offset
      inside the cell and w, h the box size, all in units of one grid cell.

  Box values are computed from the original image size, before `transform`
  runs. Because they are relative to the cell size they stay valid under a
  plain resize, but not under crops or flips.

  Only the first object whose centre falls into a cell is kept. `B` is
  stored but not used here, and `target_transform` / `transforms` are
  forwarded to the base class but not applied by `__getitem__`.
  """

  def __init__(
      self,
      root: str,
      year: str = "2007",
      image_set: str = "train",
      download: bool = False,
      transform=None,
      target_transform=None,
      transforms=None,
      S=7,
      B=2,
      C=20,
  ):
    """Forward the dataset arguments to `VOCDetection` and store grid settings.

    Args:
      root, year, image_set, download, transform, target_transform,
        transforms: passed positionally to `VOCDetection.__init__`.
      S: number of grid cells per image side.
      B: boxes per cell (stored only).
      C: number of class channels in the label matrix.
    """
    super().__init__(root, year, image_set, download, transform, target_transform, transforms)
    self.S = S
    self.B = B
    self.C = C

  def __getitem__(self, index):
    """Load image `index` and encode its annotations as a grid label matrix.

    Returns:
      (img, label_matrix): `img` is the RGB image, passed through
      `self.transform` when one is set; `label_matrix` is described in the
      class docstring.

    Raises:
      ValueError: if an annotated class name is not in `classes`.
    """
    img = Image.open(self.images[index]).convert('RGB')
    target = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())

    img_w, img_h = img.size
    cell_w, cell_h = img_w / self.S, img_h / self.S
    last_cell = self.S - 1
    # The objectness channel directly follows the C class channels; deriving
    # it from self.C keeps the layout consistent with the tensor allocated below.
    obj_idx = self.C
    label_matrix = torch.zeros((self.S, self.S, self.C + 5))

    for obj in target['annotation']['object']:
      bndbox = obj['bndbox']
      xmin, ymin, xmax, ymax = (
        float(bndbox[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax')
      )
      x, y = (xmax + xmin) / 2, (ymax + ymin) / 2
      width, height = xmax - xmin, ymax - ymin
      class_label = classes.index(obj['name'])

      # A centre lying exactly on the right/bottom border would map to
      # cell index S; clamp so it lands in the last valid cell instead.
      j = min(max(int(x // cell_w), 0), last_cell)
      i = min(max(int(y // cell_h), 0), last_cell)

      # One object per cell: the first box claiming a cell wins.
      if label_matrix[i, j, obj_idx] == 0:
        label_matrix[i, j, obj_idx] = 1
        label_matrix[i, j, obj_idx + 1:obj_idx + 5] = torch.tensor([
          (x - cell_w * j) / cell_w,
          (y - cell_h * i) / cell_h,
          width / cell_w,
          height / cell_h,
        ])
        label_matrix[i, j, class_label] = 1

    if self.transform:
      img = self.transform(img)
    return img, label_matrix

    

In [39]:
# Build both datasets with identical settings; only the image set differs.
# Note that the "test" image set is what backs `val_dataset`.
train_dataset, val_dataset = (
    CustomVOCDetection(
        root=".",
        year="2007",
        image_set=split,
        download=True,
        transform=img_transform,
    )
    for split in ("train", "test")
)
print("Variables ( train_dataset, val_dataset ) are loaded. Use It.")

Using downloaded and verified file: ./VOCtrainval_06-Nov-2007.tar
Extracting ./VOCtrainval_06-Nov-2007.tar to .


In [73]:
# Visual sanity check: display a few randomly picked training samples
# together with the boxes encoded in their label matrices.
test_size = 10
for i in range(0, test_size):
  # With a single argument, randint draws from [0, len(train_dataset)).
  random_index = np.random.randint(len(train_dataset))
  img, label_matrix = train_dataset[random_index]
  showImage(img=img, label_matrix=label_matrix)

Output hidden; open in https://colab.research.google.com to view.