# **Import Libraries**

In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT
from torch.utils.data import DataLoader, Subset
from torchvision.datasets import VOCDetection
from tqdm import tqdm
import random  # imported here because the seeding below runs before this cell's later `import random`

# Seed every random number generator available to the notebook, not only torch's.
# The stdlib `random` module supplies the augmentation parameters drawn in
# VOCDataset.__getitem__, so leaving it unseeded makes runs non-repeatable.
# NumPy's global generator is seeded as well in case any cell draws from it.
seed = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
from collections import Counter

import random

# Contents

In this notebook, you will create a YOLO-based object detection model.

Use the Pascal VOC dataset

After training, show the performance of the model by visualizing some images with their predicted bounding boxes drawn on them.

# **Dataset Preprocessing**

In [27]:
# Pascal VOC object categories mapped to integer class ids.
# The position of each name in the tuple below is its class id (0-19).
voc_classes = {
    name: class_id
    for class_id, name in enumerate((
        "aeroplane",
        "bicycle",
        "bird",
        "boat",
        "bottle",
        "bus",
        "car",
        "cat",
        "chair",
        "cow",
        "diningtable",
        "dog",
        "horse",
        "motorbike",
        "person",
        "pottedplant",
        "sheep",
        "sofa",
        "train",
        "tvmonitor",
    ))
}

# Inverse lookup (class id -> label). Model predictions are class ids, so this
# mapping is what turns them back into readable names when visualizing results.
reverse_voc_classes = dict(zip(voc_classes.values(), voc_classes.keys()))

In [45]:
class VOCDataset(torch.utils.data.Dataset):
    """Pascal VOC detection samples encoded as YOLO-style grid targets.

    Each item is a pair ``(image, label_matrix)``:

    * ``image`` is the tensor produced by ``FT.to_tensor`` followed by the
      optional ``transform``, and by the random augmentation when enabled.
    * ``label_matrix`` has shape ``(S, S, C + 5 * B)``. For the grid cell
      containing an object's centre, channels ``0 .. C-1`` hold the one-hot
      class, channel ``C`` holds the objectness flag (1.0), and channels
      ``C+1 .. C+4`` hold ``(x, y, w, h)``: ``x, y`` are the centre offset
      inside the cell and ``w, h`` are the box size measured in cell units.
      Only the first object landing in a cell is stored; the remaining
      ``5 * (B - 1)`` channels stay zero.

    Args:
        root, year, image_set: forwarded to ``VOCDetection``.
        transform: optional callable applied to the image *tensor*.
        target_transform: optional callable invoked on the raw VOC annotation
            dict. Its return value is not used when building the label matrix.
        S: number of grid cells per image side.
        B: number of box slots reserved per cell.
        C: number of classes.
        augment: when True, every access applies a fresh random shift, zoom,
            hue and saturation jitter, and moves the boxes accordingly.
            Pass False for evaluation data.
    """

    def __init__(self, root, year='2012', image_set='train', transform=None,
                 target_transform=None, S=7, B=2, C=20, augment=True):
        # The transforms are applied by this class rather than by VOCDetection,
        # because the boxes are normalised with the original PIL image size.
        self.voc = VOCDetection(root, year=year, image_set=image_set, transform=None, target_transform=None, download=True)
        self.transform = transform
        self.target_transform = target_transform
        self.augment = augment

        # One slot per sample; a slot holds an (image, boxes) tuple once loaded.
        self.cache = [None] * len(self.voc)

        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.voc)

    def __getitem__(self, index):
        """Return ``(image, label_matrix)`` for sample ``index``."""
        image, boxes = self._load(index)

        if self.augment:
            # Augment copies of the cached data so the cache stays pristine and
            # the labels are encoded from the *moved* boxes.
            image, boxes = self._augment(image, boxes)

        return image, self._encode(boxes)

    def _load(self, index):
        """Return the cached ``(image, boxes)`` pair, parsing the sample on first use."""
        cached = self.cache[index]
        if isinstance(cached, tuple):
            return cached

        image, target = self.voc[index]
        img_width, img_height = image.size

        # Boxes are stored as [class_id, centre_x, centre_y, width, height],
        # all normalised by the image size.
        boxes = []
        for obj in target['annotation']['object']:
            bndbox = obj['bndbox']
            xmin = float(bndbox['xmin']) / img_width
            ymin = float(bndbox['ymin']) / img_height
            xmax = float(bndbox['xmax']) / img_width
            ymax = float(bndbox['ymax']) / img_height

            boxes.append([
                voc_classes[obj['name']],
                (xmax + xmin) / 2,
                (ymax + ymin) / 2,
                xmax - xmin,
                ymax - ymin,
            ])

        image = FT.to_tensor(image)

        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            # NOTE: the result is intentionally ignored; the boxes above were
            # already parsed from the untransformed annotation.
            self.target_transform(target)

        self.cache[index] = (image, boxes)
        return self.cache[index]

    def _augment(self, image, boxes):
        """Randomly shift/zoom/colour-jitter ``image`` and move ``boxes`` to match."""
        height, width = image.shape[-2:]

        x_shift = int((0.2 * random.random() - 0.1) * width)
        y_shift = int((0.2 * random.random() - 0.1) * height)
        scale = 1 + 0.2 * random.random()

        image = FT.affine(image, angle=0.0, scale=scale, translate=(x_shift, y_shift), shear=0.0)
        image = FT.adjust_hue(image, 0.2 * random.random() - 0.1)
        image = FT.adjust_saturation(image, 0.2 * random.random() + 0.9)

        # The affine transform zooms about the image centre and then translates,
        # so a normalised coordinate p moves to 0.5 + scale * (p - 0.5) + shift.
        dx, dy = x_shift / width, y_shift / height
        moved = []
        for class_label, cx, cy, box_w, box_h in boxes:
            x1 = 0.5 + scale * (cx - box_w / 2 - 0.5) + dx
            x2 = 0.5 + scale * (cx + box_w / 2 - 0.5) + dx
            y1 = 0.5 + scale * (cy - box_h / 2 - 0.5) + dy
            y2 = 0.5 + scale * (cy + box_h / 2 - 0.5) + dy

            # Keep only the part of the box that is still inside the image.
            x1, x2 = max(0.0, x1), min(1.0, x2)
            y1, y2 = max(0.0, y1), min(1.0, y2)
            if x2 <= x1 or y2 <= y1:
                continue  # the object was pushed completely out of view

            moved.append([class_label, (x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1])

        return image, moved

    def _encode(self, boxes):
        """Convert normalised ``[class, cx, cy, w, h]`` boxes into the grid label matrix."""
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        for class_label, x, y, width, height in boxes:
            # i, j are the cell row and column; clamp so that a centre lying
            # exactly on the right/bottom border still maps to a valid cell.
            i = min(self.S - 1, max(0, int(self.S * y)))
            j = min(self.S - 1, max(0, int(self.S * x)))
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # A cell spans 1/S of the image, so a size expressed as a fraction
            # of the image becomes size * S when measured in cells.
            width_cell, height_cell = width * self.S, height * self.S

            # Only the first object that lands in a cell is kept.
            if label_matrix[i, j, self.C] == 0:
                label_matrix[i, j, class_label] = 1
                label_matrix[i, j, self.C:self.C + 5] = torch.tensor(
                    [1.0, x_cell, y_cell, width_cell, height_cell]
                )

        return label_matrix


In [None]:
# Image pipeline: every image tensor is resized to 448x448 before it is cached.
transform = transforms.Compose(
    [transforms.Resize(size=(448, 448), antialias=None)]
)

# Training split of Pascal VOC 2007, stored under ./data.
train_dataset = VOCDataset(root='data', year='2007', image_set='train', transform=transform)

# **Model Architecture**

Define the model architecture

# **Model Training**

Train the model on VOC data

## Visualizing the results