[Reference](https://www.kaggle.com/code/benihime91/pytorch-fasterrcnn)

### Imports

In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image

# Sk learn
from sklearn.model_selection import train_test_split

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms, datasets, models
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


In [2]:
# Count the total number of images

img_list = list(sorted(os.listdir('./data/images/')))
len(img_list)

853

#### Split the data

In [3]:
train_img, test_img = train_test_split(img_list, test_size = 0.2, random_state=42)
print(f"No. of training images: {len(train_img)}")
print(f"No. of test images: {len(test_img)}")

No. of training images: 682
No. of test images: 171


### DataSet Class

In [29]:
from bs4 import BeautifulSoup
from torchvision import transforms as T

def generate_box(obj):
    xmin = int(obj.find('xmin').text)
    ymin = int(obj.find('ymin').text)
    xmax = int(obj.find('xmax').text)
    ymax = int(obj.find('ymax').text)
    return [xmin, ymin, xmax, ymax]

def generate_labels(obj):
    if obj.find('name').text == "with_mask":
        return 1
    elif obj.find('name').text == "mask_weared_incorrect":
        return 2
    elif obj.find('name').text == "without_mask":
        return 3
    return 0

def get_transform():
    transforms = []
    transforms.append(T.PILToTensor())
    transforms.append(T.ConvertImageDtype(torch.float),)
    return T.Compose(transforms)


class MaskDataset(Dataset):
    def __init__(self, img_dir, annot_dir, img_list, transforms):
        self.transforms = transforms
        self.imgs = img_list
        self.img_dir = img_dir
        self.annot_dir = annot_dir

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        f_image = 'maksssksksss'+ str(idx) + '.png'
        f_label = 'maksssksksss'+ str(idx) + '.xml'
        
        img_path = os.path.join(self.img_dir, f_image)
        label_path = os.path.join(self.annot_dir, f_label)
        
        # Open the image
        img = Image.open(img_path).convert("RGB")
        
        target = self.__generate_target(idx, label_path)
        
        if self.transforms is not None:
            # img, target = self.transforms(img, target)
            img = self.transforms(img)

        return img, target
    
    
    @staticmethod
    def __generate_target(img_id, file):
        with open(file, 'r') as f:
            data = f.read()
            soup = BeautifulSoup(data, 'xml')
            objects = soup.find_all('object')
            
            num_objects = len(objects)
            
            # Bounding boxes
            boxes = []
            labels = []
            
            for i in objects:
                boxes.append(generate_box(i))
                labels.append(generate_labels(i))
        
            # Convert to tensor
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            labels = torch.as_tensor(labels, dtype=torch.int64)
            
            img_id = torch.tensor(img_id)
            
            # Area is width and height
            area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
            iscrowd = torch.zeros((num_objects,), dtype=torch.int64)
            
            # Return a dictionary format
            target = {}
            target['boxes'] = boxes
            target['labels'] = labels
            target['image_id'] = img_id
            target['area'] = area
            target['iscrowd'] = iscrowd
            
            return target

### DataLoader Class

In [30]:
train_dataset = MaskDataset(img_dir= './data/images/',
                        annot_dir= './data/annotations/',
                        img_list= train_img,
                        transforms= get_transform())

In [40]:
train_dataset[0][0].size()

torch.Size([3, 366, 512])

In [41]:
train_dataset[1][0].size()

torch.Size([3, 156, 400])

In [36]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 2, num_workers = 0)

In [37]:
# for data in train_loader:
#     print(data)
#     break

RuntimeError: stack expects each tensor to be equal size, but got [3, 366, 512] at entry 0 and [3, 156, 400] at entry 1