# Object Detection with Faster RCNN

Code is for the following video: https://www.youtube.com/watch?v=Uc90rr5jbA4&t=71s

Do give this notebook a thumbs-up if you liked it. Thanks!

## imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

We require the latest version of torchvision

In [None]:
!pip install -U torchvision # We need a new version of torchvision for this project

Here are all the necessary libraries

In [6]:
import torch
import torchvision
from torchvision import datasets, models
from torchvision.transforms import functional as FT
from torchvision import transforms as T
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, sampler, random_split, Dataset
import copy
import math
from PIL import Image
import cv2
import albumentations as A  # our data augmentation library

import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# suppress warnings (optional)
import warnings
warnings.filterwarnings("ignore")
from collections import defaultdict, deque
import datetime
import time
from tqdm import tqdm # progress bar
from torchvision.utils import draw_bounding_boxes

In [None]:
print(torch.__version__)
print(torchvision.__version__)

PyCOCOTools provides many utilities for dealing with datasets in the COCO format, and if you wanted, you could evaluate the model's performance on the dataset with some of the utilities provided with this library.

That is out of scope for this notebook, however.

In [None]:
# our dataset is in COCO format, so we need pycocotools
!pip install pycocotools
from pycocotools.coco import COCO

In [16]:
# Now, we will define our transforms
from albumentations.pytorch import ToTensorV2

We use albumentations as our data augmentation library due to its capability to deal with bounding boxes in multiple formats

In [5]:
def get_transforms(train=False):
    """Build the albumentations pipeline for a dataset split.

    Every pipeline starts with a resize to 600x600 and ends with
    ``ToTensorV2``. When ``train`` is true, random flips and colour
    augmentations are placed in between. Bounding boxes are declared to be
    in COCO format.
    """
    steps = [A.Resize(600, 600)]  # our input size can be 600px
    if train:
        steps += [
            A.HorizontalFlip(p=0.3),
            A.VerticalFlip(p=0.3),
            A.RandomBrightnessContrast(p=0.1),
            A.ColorJitter(p=0.1),
        ]
    steps.append(ToTensorV2())
    return A.Compose(steps, bbox_params=A.BboxParams(format='coco'))

## Dataset

This is our dataset class. It loads all the necessary files and it processes the data so that it can be fed into the model.

In [9]:
class SalatkopfDetection(datasets.VisionDataset):
    """COCO-annotated detection dataset producing Faster R-CNN style samples.

    The annotation file is read from ``<root>/<split>/_coco_annotations.json``
    and the images from the same ``<root>/<split>`` folder. Images that have
    no annotation are left out of the index.
    """

    def __init__(self, root, split='train', transform=None, target_transform=None, transforms=None):
        # the 3 transform parameters are required for datasets.VisionDataset
        super().__init__(root, transforms, transform, target_transform)
        self.split = split  # train, valid, test
        self.coco = COCO(os.path.join(root, split, "_coco_annotations.json"))  # annotations stored here
        # keep only the images that have at least one annotation
        self.ids = [img_id for img_id in sorted(self.coco.imgs.keys()) if self._load_target(img_id)]

    def _load_image(self, id: int):
        """Read the image with COCO image id ``id`` and return it in RGB order.

        Raises:
            FileNotFoundError: If OpenCV could not read the file.
        """
        file_name = self.coco.loadImgs(id)[0]['file_name']
        full_path = os.path.join(self.root, self.split, file_name)
        image = cv2.imread(full_path)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising
            raise FileNotFoundError(f"Could not read image: {full_path}")
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def _load_target(self, id):
        """Return the list of annotation dicts of the image with id ``id``."""
        return self.coco.loadAnns(self.coco.getAnnIds(id))

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at position ``index``.

        ``image`` is the transformed image divided by 255. ``target`` is a
        dict with the keys ``boxes`` (xyxy, float32, shape ``(N, 4)``),
        ``labels``, ``image_id``, ``area`` and ``iscrowd``; all entries
        describe the boxes that are still present after the transforms.
        """
        img_id = self.ids[index]
        image = self._load_image(img_id)
        target = copy.deepcopy(self._load_target(img_id))

        # Extra trailing elements of a box travel through albumentations, so
        # each box is tagged with the position of its annotation. That keeps
        # labels and boxes aligned even if the pipeline drops a box.
        boxes = [list(t['bbox']) + [pos] for pos, t in enumerate(target)]
        if self.transforms is not None:
            transformed = self.transforms(image=image, bboxes=boxes)
            image = transformed['image']
            boxes = transformed['bboxes']

        if not torch.is_tensor(image):
            # no tensor conversion happened in the pipeline: HWC array -> CHW tensor
            image = torch.from_numpy(np.ascontiguousarray(image)).permute(2, 0, 1)

        kept = [target[int(box[4])] for box in boxes]

        # convert from xywh to xyxy
        new_boxes = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in boxes]
        # reshape keeps the (N, 4) layout even when no box is left
        boxes = torch.tensor(new_boxes, dtype=torch.float32).reshape(-1, 4)

        targ = {}  # here is our transformed target
        targ['boxes'] = boxes
        targ['labels'] = torch.tensor([t['category_id'] for t in kept], dtype=torch.int64)
        targ['image_id'] = torch.tensor([t['image_id'] for t in kept], dtype=torch.int64)
        # the area is recomputed because the resize changed the box sizes
        targ['area'] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        targ['iscrowd'] = torch.tensor([t['iscrowd'] for t in kept], dtype=torch.int64)
        return image.div(255), targ  # scale images

    def __len__(self):
        return len(self.ids)

In [11]:
dataset_path_green = "/content/drive/MyDrive/faster_r-cnn/data/green"

In [12]:
dataset_path_black = "/content/drive/MyDrive/faster_r-cnn/data/black"

In [13]:
dataset_path_normal = "/content/drive/MyDrive/faster_r-cnn/data/normal"

In [None]:
# load the category table from the training annotations of the "normal" dataset
coco = COCO(os.path.join(dataset_path_normal, "train", "_coco_annotations.json"))
categories = coco.cats
# NOTE(review): using the category count as the number of model outputs presumes
# that the category ids run from 0 to n_classes - 1 — confirm against the annotation file
n_classes = len(categories)
categories

This code just gets a list of classes

In [None]:
# category names, in the iteration order of `categories`
classes = [category['name'] for category in categories.values()]
classes

In [None]:
train_dataset_normal = SalatkopfDetection(root=dataset_path_normal, transforms=get_transforms(True))

In [None]:
train_dataset_black = SalatkopfDetection(root=dataset_path_black, transforms=get_transforms(True))

In [None]:
train_dataset_green = SalatkopfDetection(root=dataset_path_green, transforms=get_transforms(True))

This is a sample image with its ground-truth bounding boxes. This code does not show the model's output.

In [None]:
# Let's view a sample from the "normal" training set
sample = train_dataset_normal[0]
# .to() converts the scaled image to uint8 for drawing; torch.tensor() should
# not be used to copy an existing tensor
img_int = (sample[0] * 255).to(torch.uint8)
plt.imshow(draw_bounding_boxes(
    img_int, sample[1]['boxes'], [classes[int(i)] for i in sample[1]['labels']], width=4
).permute(1, 2, 0))

In [None]:
# Let's view a sample from the "black" training set
sample = train_dataset_black[0]
# .to() converts the scaled image to uint8 for drawing; torch.tensor() should
# not be used to copy an existing tensor
img_int = (sample[0] * 255).to(torch.uint8)
plt.imshow(draw_bounding_boxes(
    img_int, sample[1]['boxes'], [classes[int(i)] for i in sample[1]['labels']], width=4
).permute(1, 2, 0))

In [None]:
# Let's view a sample from the "green" training set
sample = train_dataset_green[0]
# .to() converts the scaled image to uint8 for drawing; torch.tensor() should
# not be used to copy an existing tensor
img_int = (sample[0] * 255).to(torch.uint8)
plt.imshow(draw_bounding_boxes(
    img_int, sample[1]['boxes'], [classes[int(i)] for i in sample[1]['labels']], width=4
).permute(1, 2, 0))

In [None]:
len(train_dataset_normal)

## Model

Our model is Faster R-CNN with a ResNet-50 FPN (v2) backbone, created with `fasterrcnn_resnet50_fpn_v2`. We need to replace the output layer because the pretrained model predicts the COCO categories, whereas our dataset has only `n_classes` classes.

In [23]:
# let's load the Faster R-CNN model
def get_model(num_classes=None):
    """Create a pretrained Faster R-CNN whose box head fits our classes.

    Args:
        num_classes: Number of outputs of the new classification head. When
            omitted, the module-level ``n_classes`` is used.

    Returns:
        A ``fasterrcnn_resnet50_fpn_v2`` model whose
        ``roi_heads.box_predictor`` has been replaced by a new
        ``FastRCNNPredictor``.
    """
    if num_classes is None:
        num_classes = n_classes
    # `pretrained=True` is deprecated in torchvision; `weights` is its replacement
    model = models.detection.fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")
    in_features = model.roi_heads.box_predictor.cls_score.in_features  # we need to change the head
    model.roi_heads.box_predictor = models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)
    return model

This is our collate function for the train dataloader. It lets us create batches of data that can easily be passed into the model.

In [24]:
def collate_fn(batch):
    """Regroup a list of samples into one tuple per sample field.

    A batch of ``(image, target)`` samples becomes ``(images, targets)``.
    The fields stay plain tuples; nothing is stacked.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [25]:
train_loader_normal_4 = DataLoader(train_dataset_normal, batch_size=4, shuffle=True, num_workers=4, collate_fn=collate_fn)

In [50]:
train_loader_normal_2 = DataLoader(train_dataset_normal, batch_size=2, shuffle=True, num_workers=4, collate_fn=collate_fn)

In [27]:
train_loader_black_4 = DataLoader(train_dataset_black, batch_size=4, shuffle=True, num_workers=4, collate_fn=collate_fn)

In [51]:
train_loader_black_2 = DataLoader(train_dataset_black, batch_size=2, shuffle=True, num_workers=4, collate_fn=collate_fn)

In [29]:
train_loader_green_4 = DataLoader(train_dataset_green, batch_size=4, shuffle=True, num_workers=4, collate_fn=collate_fn)

In [52]:
train_loader_green_2 = DataLoader(train_dataset_green, batch_size=2, shuffle=True, num_workers=4, collate_fn=collate_fn)

The following blocks ensure that the model can take in the data and that it will not crash during training.

In [31]:
model = get_model()

In [33]:
# push one batch of the "normal" loader through the model
images, targets = next(iter(train_loader_normal_4))
images = list(images)
targets = [dict(t) for t in targets]  # shallow copy of every target dict
output = model(images, targets)  # just make sure this runs without error

In [34]:
# push one batch of the "black" loader through the model
images, targets = next(iter(train_loader_black_4))
images = list(images)
targets = [dict(t) for t in targets]  # shallow copy of every target dict
output = model(images, targets)  # just make sure this runs without error

In [35]:
# push one batch of the "green" loader through the model
images, targets = next(iter(train_loader_green_4))
images = list(images)
targets = [dict(t) for t in targets]  # shallow copy of every target dict
output = model(images, targets)  # just make sure this runs without error

In [20]:
# train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
model = model.to(device)

## Optimizer

Here, we define the optimizer. If you wish, you can also define the LR Scheduler, but it is not necessary for this notebook since our dataset is so small.

> Note, there are a few bugs with the current way `lr_scheduler` is implemented. If you wish to use the scheduler, you will have to fix those bugs

In [38]:
# Now, an optimizer
# NOTE: an optimizer only updates the parameters handed to it, here those of `model`
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = optim.SGD(
    params,
    lr=0.01,
    momentum=0.9,
    nesterov=True,
    weight_decay=1e-4,
)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[16, 22], gamma=0.1) # lr scheduler

In [39]:
import sys

## Training

The following function trains the model for one epoch. Torchvision object detection models have a built-in loss function, and the model calculates the loss automatically if you pass in both the `inputs` and the `targets`.

In [40]:
def train_one_epoch(model, optimizer, loader, device, epoch):
    """Train ``model`` for one pass over ``loader`` and print the mean losses.

    Args:
        model: Torchvision detection model; in training mode it returns a
            dict of losses when it is called with images and targets.
        optimizer: Optimizer that is stepped after every batch. It only
            updates the parameters it was created with, so it has to be
            built from the parameters of ``model``.
        loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the model and every batch are moved to.
        epoch: Epoch number, used only in the printed summary.

    The process is terminated with ``sys.exit(1)`` as soon as the summed
    loss of a batch is not finite.
    """
    model.to(device)
    model.train()

    all_losses = []
    all_losses_dict = []

    for images, targets in tqdm(loader):
        images = [image.to(device) for image in images]
        # as_tensor moves existing tensors as they are; torch.tensor() would
        # copy-construct them and warn about it
        targets = [{k: torch.as_tensor(v, device=device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)  # the model computes the loss automatically if we pass in targets
        losses = sum(loss_dict.values())
        loss_value = losses.item()

        all_losses.append(loss_value)
        all_losses_dict.append({k: v.item() for k, v in loss_dict.items()})

        if not math.isfinite(loss_value):
            # stop as soon as the loss becomes NaN or infinite
            print(f"Loss is {loss_value}, stopping trainig")
            print(loss_dict)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    if not all_losses:
        # an empty loader leaves nothing to average
        print(f"Epoch {epoch}: the loader produced no batches, nothing was trained")
        return

    all_losses_dict = pd.DataFrame(all_losses_dict)  # for printing
    print("Epoch {}, lr: {:.6f}, loss: {:.6f}, loss_classifier: {:.6f}, loss_box: {:.6f}, loss_rpn_box: {:.6f}, loss_object: {:.6f}".format(
        epoch, optimizer.param_groups[0]['lr'], np.mean(all_losses),
        all_losses_dict['loss_classifier'].mean(),
        all_losses_dict['loss_box_reg'].mean(),
        all_losses_dict['loss_rpn_box_reg'].mean(),
        all_losses_dict['loss_objectness'].mean()
    ))

10 epochs should be enough to train this model to a high accuracy; the cells below nevertheless use `num_epochs=100`.

In [None]:
num_epochs = 100

model_normal_4 = get_model()
model_normal_4 = model_normal_4.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_normal_4.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_normal_4, optimizer, train_loader_normal_4, device, epoch)

In [42]:
torch.save(model_normal_4, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_normal_4_100_epochs.pth")

In [None]:
num_epochs = 100

model_black_4 = get_model()
model_black_4 = model_black_4.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_black_4.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_black_4, optimizer, train_loader_black_4, device, epoch)

In [44]:
torch.save(model_black_4, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_black_4_100_epochs.pth")

In [None]:
num_epochs = 100

model_green_4 = get_model()
model_green_4 = model_green_4.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_green_4.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_green_4, optimizer, train_loader_green_4, device, epoch)

In [47]:
torch.save(model_green_4, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_green_4_100_epochs.pth")

In [None]:
num_epochs = 100

model_normal_2 = get_model()
model_normal_2 = model_normal_2.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_normal_2.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_normal_2, optimizer, train_loader_normal_2, device, epoch)

In [54]:
torch.save(model_normal_2, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_normal_2_100_epochs.pth")

In [None]:
num_epochs = 100

model_black_2 = get_model()
model_black_2 = model_black_2.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_black_2.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_black_2, optimizer, train_loader_black_2, device, epoch)

In [57]:
torch.save(model_black_2, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_black_2_100_epochs.pth")

In [None]:
num_epochs = 100

model_green_2 = get_model()
model_green_2 = model_green_2.to(device)

# The optimizer has to be built from the parameters of the model that is trained
# here; an optimizer created for another model would leave this one unchanged.
params = [p for p in model_green_2.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)

for epoch in range(num_epochs):
    train_one_epoch(model_green_2, optimizer, train_loader_green_2, device, epoch)

In [59]:
torch.save(model_green_2, "/content/drive/MyDrive/faster_r-cnn/saved_models/resnet50_green_2_100_epochs.pth")

In [None]:
%tb

In [None]:
# our learning rate was too low, due to an lr scheduler bug. For this task, we won't need a scheduler

## Trying on sample Images

This is the inference code for the model. First, we set the model to evaluation mode and clear the GPU Cache. We also load a test dataset, so that we can use fresh images that the model hasn't seen.

In [None]:
# we will watch the first epoch to ensure there are no errors
# while it is training, let's write code to see the model's predictions. let's try again
# NOTE(review): `model` is the instance created earlier by `model = get_model()`,
# not one of the model_* instances passed to train_one_epoch — confirm it is the
# model you want to inspect
model.eval()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): the "normal" dataset is assumed here; use dataset_path_black or
# dataset_path_green to evaluate on the other variants
test_dataset = SalatkopfDetection(root=dataset_path_normal, split="test", transforms=get_transforms(False))

In [None]:
img, _ = test_dataset[2]
# uint8 version of the image for drawing; .to() converts the existing tensor,
# torch.tensor() should not be used to copy one
img_int = (img * 255).to(torch.uint8)
with torch.no_grad():  # inference only, no gradients needed
    prediction = model([img.to(device)])
    pred = prediction[0]  # one image went in, so there is a single result

In [None]:
# it did learn

In [None]:
prediction

In [None]:
fig = plt.figure(figsize=(14, 10))
confident = pred['scores'] > 0.8  # only draw detections scored above 0.8
confident_names = [classes[i] for i in pred['labels'][confident].tolist()]
annotated = draw_bounding_boxes(img_int, pred['boxes'][confident], confident_names, width=4)
plt.imshow(annotated.permute(1, 2, 0))