<a href="https://colab.research.google.com/github/HaoYamado/notebooks/blob/master/Object_Detection_With_Torchvision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Torchvision Object Detection

> Examples taken from the official PyTorch documentation


![alt text](https://www.cis.upenn.edu/~jshi/ped_html/images/PennPed00015_1.png)

In [0]:
import os
import numpy as np
import torch
from PIL import Image

class PennFudanDataset(object):
  """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

  ``root`` must contain two sub-directories, ``PNGImages`` and ``PedMasks``.
  Both listings are sorted so that the i-th image is paired with the i-th
  mask.

  Args:
    root: path of the dataset directory.
    transforms: callable invoked as ``transforms(img, target)`` that returns
      the transformed ``(img, target)`` pair, or ``None`` to skip it.
  """

  def __init__(self, root, transforms):
    self.root = root
    self.transforms = transforms
    # load all image files, sorting them to
    # ensure that images and masks are aligned.
    # The sub-directory names must match the ones used in __getitem__.
    self.imgs = sorted(os.listdir(os.path.join(root, 'PNGImages')))
    self.masks = sorted(os.listdir(os.path.join(root, 'PedMasks')))

  def __getitem__(self, idx):
    """Return the ``idx``-th image and its detection target.

    Returns:
      ``(img, target)`` where ``img`` is the RGB PIL image (or whatever
      ``self.transforms`` turns it into) and ``target`` is a dict with the
      keys ``boxes`` (float32, ``[xmin, ymin, xmax, ymax]`` per instance),
      ``labels`` (int64, all ones), ``masks`` (uint8, one binary mask per
      instance), ``image_id``, ``area`` and ``iscrowd`` (int64, all zeros).
    """
    # load image and mask
    img_path = os.path.join(self.root, 'PNGImages', self.imgs[idx])
    mask_path = os.path.join(self.root, 'PedMasks', self.masks[idx])
    img = Image.open(img_path).convert('RGB')
    # Note that we haven't converted the mask to RGB
    # because each color corresponds to a different instance
    # with 0 being background
    mask = np.array(Image.open(mask_path))
    # instances are encoded as different colors; np.unique returns them
    # sorted, so the first id is the background and is dropped
    obj_ids = np.unique(mask)[1:]

    # split the color-encoded mask into a set
    # of binary masks (one leading-axis slice per instance)
    masks = mask == obj_ids[:, None, None]

    # get bounding box coordinates for each mask
    num_objs = len(obj_ids)
    boxes = []
    for instance_mask in masks:
      rows, cols = np.where(instance_mask)
      boxes.append([np.min(cols), np.min(rows), np.max(cols), np.max(rows)])

    # converting everything into a torch.Tensor; the reshape keeps the
    # (N, 4) layout even when the image contains no instance at all
    boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
    # there is only one class
    labels = torch.ones((num_objs,), dtype=torch.int64)
    masks = torch.as_tensor(masks, dtype=torch.uint8)

    image_id = torch.tensor([idx])
    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    # suppose all instances are not crowd
    iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

    target = {
      'boxes': boxes,
      'labels': labels,
      'masks': masks,
      'image_id': image_id,
      'area': area,
      'iscrowd': iscrowd,
    }

    if self.transforms is not None:
      img, target = self.transforms(img, target)

    return img, target

  def __len__(self):
    """Return the number of images found under ``PNGImages``."""
    return len(self.imgs)

# Defining model

In this notebook, we will be using Mask R-CNN (paper in the repository), which is built on top of Faster R-CNN. Faster R-CNN is a model that predicts both bounding boxes and class scores for potential objects in the image.
 
![alt text](https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image03.png) Mask R-CNN adds an extra branch to Faster R-CNN, which also predicts segmentation masks for each instance.
![alt text](https://pytorch.org/tutorials/_static/img/tv_tutorial/tv_image04.png)


## 1 - Finetuning from a pretrained model

In [2]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Fine-tuning: start from a pre-trained detector and swap only its
# box-prediction head so it outputs our own set of classes.

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# replace the classifier with a new one that has
# num_classes outputs, which is user-defined
num_classes = 2 # 1 class (person) + background
# get the number of input features of the existing classifier so the
# replacement head accepts the same input size
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:04<00:00, 33.5MB/s]


## 2 - Modifying the model to add a different backbone

In [0]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# Build a FasterRCNN around a different backbone (MobileNetV2 features).

# load a pre-trained model for classification and return
# only the features
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280,
# so we need to add it here
backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                          aspect_ratios=((0.5, 1.0, 2.0),))
# let's define which feature maps we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# If your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# OrderedDict[Tensor], and in featmap_names you can choose which
# feature maps to use.
# NOTE(review): other torchvision releases may expect string names
# here (e.g. ['0']) - confirm against the installed version.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                                output_size=7,
                                                sampling_ratio=2)
# put the pieces together inside a FasterRCNN model
model = FasterRCNN(backbone,
                   num_classes=2,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)