In [35]:
# ! pip cache purge
# ! pip install --user albumentations


In [36]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
from collections import Counter
import cv2
from glob import glob
from tqdm import tqdm
# from termcolor import colored

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader

import torchvision
from torchvision import transforms

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [37]:
# from torchvision.datasets import VOCDetection

# dataset = VOCDetection(
#     root="path_to_save_voc",  # Directory where the dataset is stored
#     year="2007",             # VOC 2007 or 2012
#     image_set="train",        # 'train', 'val', or 'trainval'
#     download=True             # Download the dataset if it is not present yet
# )

In [38]:
import torchvision
import torch

class CustomVOCDataset(torchvision.datasets.VOCDetection):
    """Pascal VOC detection dataset that emits YOLOv1-style grid targets.

    ``__getitem__`` returns ``(image, labels_matrix)`` where ``labels_matrix``
    has shape ``(s, s, c + 5 * b)``. For every grid cell that contains the
    center of an object, the last axis is laid out as:

    * ``[0:c]``      one-hot class vector,
    * ``[c]``        objectness flag (1 when the cell owns an object),
    * ``[c+1:c+5]``  box as ``(x_cell, y_cell, width_cell, height_cell)``.

    Only one ground-truth box is stored per cell, so the remaining
    ``5 * (b - 1)`` slots always stay zero; they are kept so the target has
    the same depth as a prediction that carries ``b`` boxes per cell.
    """

    def __init__(self, root, year='2007', image_set='train', download=False,
                 class_mapping=None, s=7, b=2, c=20, custom_transforms=None):
        """Initialise the underlying VOCDetection dataset and the grid settings.

        Parameters:
        root (str): Root directory of the VOC data.
        year (str): VOC release, e.g. '2007' or '2012'.
        image_set (str): Split name, e.g. 'train', 'val' or 'trainval'.
        download (bool): Download the dataset if it is missing.
        class_mapping (dict): Mapping from class name to integer class id.
        s (int): Grid size; the image is divided into ``s x s`` cells.
        b (int): Number of boxes per cell reserved in the target depth.
        c (int): Number of classes.
        custom_transforms (callable | None): Zero-argument factory returning
            an albumentations pipeline that is called with ``image``,
            ``bboxes`` and ``labels`` keyword arguments.
        """
        # Call the VOCDetection constructor.
        super().__init__(root, year=year, image_set=image_set, download=download)
        self.s = s  # Grid size (s x s)
        self.b = b
        self.c = c
        self.class_mapping = class_mapping  # Class name -> class index
        self.custom_transforms = custom_transforms

    def __getitem__(self, index):
        """Return ``(image, labels_matrix)`` for the sample at ``index``."""
        image, target = super().__getitem__(index)

        # Use the mapping given to the constructor rather than a notebook global.
        boxes = self.convert_to_yolo_format(target, self.class_mapping)
        bboxes = boxes[:, 1:]
        labels = boxes[:, 0]

        # Convert the PIL image up front so that both the augmented and the
        # non-augmented path hand an ndarray to torch.as_tensor below.
        image = np.array(image)

        # Augmentation
        if self.custom_transforms:
            augmented = self.custom_transforms()(
                image=image,
                bboxes=bboxes,
                labels=labels,
            )
            image = augmented['image']
            bboxes = augmented['bboxes']
            labels = augmented['labels']

        # as_tensor avoids an extra copy when the data is already array-like.
        image = torch.as_tensor(image)

        return image, self._build_label_matrix(bboxes, labels)

    def _build_label_matrix(self, bboxes, labels):
        """Encode normalized YOLO boxes into the ``(s, s, c + 5 * b)`` target grid."""
        labels_matrix = torch.zeros(self.s, self.s, self.c + 5 * self.b)
        objectness_idx = self.c      # slot right after the one-hot class vector
        box_start = self.c + 1       # four box values follow the objectness flag

        for bbox, label in zip(bboxes, labels):
            # Boxes are normalized to [0, 1]: (x_center, y_center, width, height).
            x, y, width, height = (float(v) for v in bbox[:4])
            class_label = int(label)

            # Grid cell (row i, column j) that owns the box center, e.g.
            # int(7 * 0.53) = 3. Clamp so a center lying exactly on the
            # right/bottom border (coordinate 1.0) stays inside the grid.
            i = min(int(self.s * y), self.s - 1)
            j = min(int(self.s * x), self.s - 1)

            # Center position relative to the owning cell, e.g. 7 * 0.53 - 3 = 0.71.
            x_cell, y_cell = self.s * x - j, self.s * y - i

            # Box size in units of one cell, e.g. 0.2 * 7 = 1.4 cell widths.
            width_cell, height_cell = width * self.s, height * self.s

            # Only the first object that lands in a cell is kept.
            if labels_matrix[i, j, objectness_idx] == 0:
                labels_matrix[i, j, objectness_idx] = 1
                labels_matrix[i, j, box_start:box_start + 4] = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                labels_matrix[i, j, class_label] = 1

        return labels_matrix

    def convert_to_yolo_format(self, target, class_mapping=None):
        """
            Convert annotation data from VOC format to YOLO format.

            Parameters:
            target (dict): Annotation data from VOCDetection dataset; the image
                size is read from ``target['annotation']['size']``.
            class_mapping (dict | None): Mapping from class names to integer
                IDs. Falls back to ``self.class_mapping`` when omitted.

            Returns:
            np.ndarray: Array of shape [N, 5] for N bounding boxes, each with
            [class_id, x_center, y_center, width, height], where the box
            values are normalized by the image width/height. N may be 0.

            Raises:
            ValueError: If no class mapping is available.
        """
        if class_mapping is None:
            class_mapping = self.class_mapping
        if class_mapping is None:
            raise ValueError(
                "class_mapping must be provided to convert VOC annotations"
            )

        annotation = target['annotation']
        real_width = int(annotation['size']['width'])
        real_height = int(annotation['size']['height'])

        annotations = annotation.get('object', [])
        if isinstance(annotations, dict):
            # Tolerate a single object that was not wrapped in a list.
            annotations = [annotations]

        boxes = []

        # Loop through each annotation and convert it to YOLO format.
        for anno in annotations:
            bndbox = anno['bndbox']
            # float() gives the same result as int() for integer strings and
            # also accepts coordinates written with a fractional part.
            xmin = float(bndbox['xmin']) / real_width
            xmax = float(bndbox['xmax']) / real_width
            ymin = float(bndbox['ymin']) / real_height
            ymax = float(bndbox['ymax']) / real_height

            x_center = (xmin + xmax) / 2
            y_center = (ymin + ymax) / 2
            width = xmax - xmin
            height = ymax - ymin

            # NOTE(review): unknown class names silently fall back to id 0,
            # which is a real class in the mapping used by this notebook.
            class_id = class_mapping.get(anno['name'], 0)

            boxes.append([class_id, x_center, y_center, width, height])

        # reshape keeps the result two-dimensional even when there are no boxes.
        return np.array(boxes, dtype=np.float64).reshape(-1, 5)
    
# Class mapping: Pascal VOC class names listed in id order, so each name's
# position in the tuple is its integer class id (0..19).
class_mapping = {
    name: idx
    for idx, name in enumerate((
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ))
}

### Augmentation

In [39]:
# Target image size in pixels used by the resize step of the transform pipelines.
WIDTH = HEIGHT = 448

def get_train_transforms():
    """Build the albumentations pipeline used for training samples.

    The pipeline applies one random colour augmentation, an occasional
    grayscale conversion, random horizontal/vertical flips, a resize to
    ``HEIGHT x WIDTH`` and finally ``ToTensorV2``. Bounding boxes are expected
    in YOLO format and their classes are passed through the ``labels`` field.

    Returns:
    A.Compose: Pipeline to be called with ``image``, ``bboxes`` and ``labels``.
    """
    return A.Compose(
        [
            A.OneOf(
                [
                    # NOTE(review): albumentations documents these shift limits
                    # with much larger defaults (on the order of tens), so 0.2
                    # may be close to a no-op on uint8 images - confirm intent.
                    A.HueSaturationValue(
                        hue_shift_limit=0.2,
                        sat_shift_limit=0.2,
                        val_shift_limit=0.2,
                        p=0.9,
                    ),
                    A.RandomBrightnessContrast(
                        brightness_limit=0.2,
                        contrast_limit=0.2,
                        p=0.9,
                    ),
                ],
                p=0.9,
            ),
            A.ToGray(p=0.01),
            A.HorizontalFlip(p=0.2),
            A.VerticalFlip(p=0.2),
            # Use HEIGHT for the height so non-square targets resize correctly.
            A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
            # A.Cutout(num_holes=8, max_h_size=64, max_w_size=64, fill_value=0, p=0.5),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format="yolo",
            min_area=0,
            min_visibility=0,
            label_fields=["labels"],
        ),
    )


def get_valid_transforms():
    """Build the albumentations pipeline used for validation samples.

    Only a resize to ``HEIGHT x WIDTH`` and ``ToTensorV2`` are applied; no
    random augmentation. Bounding boxes are expected in YOLO format and their
    classes are passed through the ``labels`` field.

    Returns:
    A.Compose: Pipeline to be called with ``image``, ``bboxes`` and ``labels``.
    """
    return A.Compose(
        [
            # Use HEIGHT for the height so non-square targets resize correctly.
            A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format="yolo",
            min_area=0,
            min_visibility=0,
            label_fields=["labels"],
        ),
    )

In [None]:
# Create the dataset: VOC 2007 training split read from ./data (no download),
# with the train-time augmentation factory attached.
dataset = CustomVOCDataset(
    root='./data',
    year='2007',
    image_set='train',
    download=False,
    class_mapping=class_mapping,
    custom_transforms=get_train_transforms,
)

# Fetch the first sample.
image, labels_matrix = dataset[0]

tensor([[[69, 66, 67,  ..., 55, 54, 51],
         [71, 67, 67,  ..., 55, 52, 52],
         [69, 68, 67,  ..., 57, 54, 55],
         ...,
         [79, 80, 80,  ..., 80, 81, 81],
         [81, 82, 82,  ..., 79, 79, 79],
         [82, 83, 83,  ..., 80, 79, 79]],

        [[69, 66, 67,  ..., 56, 56, 53],
         [71, 67, 67,  ..., 56, 54, 54],
         [69, 68, 67,  ..., 58, 56, 57],
         ...,
         [78, 79, 80,  ..., 80, 81, 81],
         [80, 81, 81,  ..., 79, 79, 79],
         [81, 82, 81,  ..., 81, 79, 79]],

        [[67, 64, 65,  ..., 56, 55, 52],
         [68, 65, 65,  ..., 55, 53, 53],
         [67, 66, 65,  ..., 57, 55, 56],
         ...,
         [74, 75, 77,  ..., 78, 79, 79],
         [76, 77, 79,  ..., 76, 77, 77],
         [77, 78, 79,  ..., 76, 77, 77]]], dtype=torch.uint8)