# Import Libraries

In [1]:
import sys
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image 

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils import data

from MSCOCO import MSCOCO

# Dataset and Dataloader

In [2]:
coco_object_categories = []
coco_classes = {}
target_classes = ['__bgr__', 'person', 'car']


with open('./mscoco_labels.txt', 'r') as f:
    for id, category in enumerate(f.readlines()):
        category = re.sub('[\d]+\W+','', category.rstrip())         
        coco_object_categories.append(category)
        coco_classes[category] = id

In [3]:
root_path = '../../../Datasets/MS COCO'

In [4]:
coco_interface = MSCOCO(root_path, coco_classes, target_classes, 'annotation_data.json')

In [5]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

print(f'Available device: {device}')

Available device: cuda


In [6]:
dataloader_args = {'batch_size':1, 'shuffle':True}
train_dataloader = data.DataLoader(coco_interface, **dataloader_args)

In [7]:
idx, img, labels = coco_interface[0]

# Faster RCNN model

## Finetuning from pre trained model

In [8]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [9]:
frcnn_args = {'box_score_thresh':0.5, 'num_classes':91, 'rpn_batch_size_per_image':256, 'box_batch_size_per_image':256}
frcnn_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True, **frcnn_args)
frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [10]:
torch.save(frcnn_model.state_dict(), './fasterrcnn_object_detection.pth')

In [10]:
# replace the classifier
num_classes = 3  # 20 object + background (Pascal VOC 2012)
# get number of input features for the classifier
in_features = frcnn_model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
frcnn_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [11]:
# move model to cuda
if device == torch.device('cuda'):
    frcnn_model = frcnn_model.to(device)

In [12]:
optimizer_pars = {'lr':1e-5, 'weight_decay':1e-3}
optimizer = torch.optim.Adam(list(frcnn_model.parameters()),**optimizer_pars)
total_epoch = 1

In [13]:
idx, X, y = next(iter(train_dataloader))

In [14]:
y

{'labels': tensor([[1, 1, 1]]),
 'boxes': tensor([[[284.5600, 270.7100, 389.8000, 447.8900],
          [181.4800, 209.5000, 303.8900, 409.2300],
          [  4.3000, 264.7500, 162.5100, 442.3300]]])}

In [15]:
for e in range(total_epoch):
    epoch_loss = 0
    for id, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        idx, X,y = batch
        if device == torch.device('cuda'):
            X,y['labels'],y['boxes'] = X.to(device), y['labels'].to(device), y['boxes'].to(device)
        # list of images        
        images = [im for im in X]                
        targets = []
        lab={}
        # THIS IS IMPORTANT!!!!!        
        # get rid of the first dimension (batch)        
        # REPEAT: DO NOT USE BATCH DIMENSION           
        # 
        lab['boxes'] = y['boxes'].squeeze_(0)
        lab['labels']= y['labels'].squeeze_(0)
        targets.append(lab)
        # avoid empty objects
        if len(targets)>0:
            loss = frcnn_model(images, targets)            
            total_loss = 0
            print(loss)
            for k in loss.keys():
                total_loss += loss[k]
            total_loss.backward()
            optimizer.step()
            epoch_loss +=total_loss.item()
        print("dddd")
    epoch_loss/=len(dataloader)
    
    print("Loss={0:.4f} in Epoch={1:d}".format(epoch_loss, e))

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


{'loss_classifier': tensor(2.4504, device='cuda:0', grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.1695, device='cuda:0', grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.3341, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0026, device='cuda:0', grad_fn=<DivBackward0>)}
dddd


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 2.00 GiB total capacity; 1.13 GiB already allocated; 0 bytes free; 1.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [22]:
def predict(img):
    frcnn_model.eval()
    #img = Image.open(img)
    img = np.array(img)
    img_tensor = transforms.ToTensor()(img)   
    out = frcnn_model([img_tensor])
    scores = out[0]['scores'].detach().numpy()
    bboxes = out[0]['boxes'].detach().numpy()
    classes = out[0]['labels'].detach().numpy()
    print(scores)
    fig, ax = plt.subplots()
    ax.imshow(img)
    for i in range(len(classes)):
        if scores[i] > 0.75:
            bbox = bboxes[i]
            rect = patches.Rectangle((bbox[0],bbox[1]),bbox[2]-bbox[0],bbox[3]-bbox[1], edgecolor='r', facecolor="none")
            ax.add_patch(rect)
            #ax.text((bbox[0]+bbox[2])/2 - 30, bbox[1]-5, pascal_voc_classes_names[i], c='r')

    plt.axis('off')
    plt.show()