Vehicle Detection with Faster RCNN and Transfer Learning

In [None]:
# Import necessary libraries and modules
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import cv2
from PIL import Image
from tempfile import TemporaryDirectory
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Default size for every matplotlib figure created in this notebook,
# so the annotated sample images are large enough to inspect.
plt.rcParams['figure.figsize'] = [15, 15]
# Uncomment for the interactive notebook backend.
#%matplotlib notebook

In [None]:
#----Housekeeping----
# Number of output classes for the detector head: car + background
CLASSES = 2

# Mini-batch size shared by the data loaders
BATCH_SIZE = 16

# Pick the compute device: GPU when CUDA is usable, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Image directories and COCO-style annotation files, training split first
train_data_path = 'vehicle_data/train_images/'
train_annotation_path = 'train_annotations.json'
# ... then the validation split
val_data_path = 'vehicle_data/val_images/'
val_annotation_path = 'val_annotations.json'

Now we must construct our dataset object. It inherits from the PyTorch *Dataset* class, and we must override some of its methods (`__len__` and `__getitem__`) for our specific problem. We use *pycocotools* to read our JSON annotation file. See the [documentation](https://pytorch.org/vision/0.12/generated/torchvision.models.detection.fasterrcnn_resnet50_fpn.html#torchvision.models.detection.fasterrcnn_resnet50_fpn) for our model to understand why we format the data in this way.

In [None]:
#----Data loader----

import utils #downloaded from torch/vision/references/detection
class VehicleDetectionDataset(Dataset):
    """Dataset of images with car-only detection targets read from a COCO file.

    Each item is ``(img, target_dict)`` where ``target_dict`` holds the keys
    ``"boxes"``, ``"labels"``, ``"image_id"`` and ``"area"`` as tensors, with
    one row/entry per car annotation of the image.
    """

    # Value of 'category_id' that marks an annotation as a car.
    CAR_CATEGORY_ID = 1

    def __init__(self, images_root, annot_file, transform=None, target_transform=None):
        """Load the annotation file and remember where the images live.

        Args:
            images_root: path to the image directory.
            annot_file: path to the COCO-format JSON annotation file.
            transform: optional callable applied to the loaded RGB image.
            target_transform: optional callable applied to the raw list of
                annotations before the car annotations are extracted.
        """
        from pycocotools.coco import COCO # Module for parsing JSON annotation files in COCO format
        self.img_root = images_root # Path to image directory
        self.coco = COCO(annot_file) # COCO object representing annotations
        self.ids = list(self.coco.imgs.keys()) # List of unique image identifiers in annotation file
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the image at position ``idx`` and its car-only target dict.

        Boxes are converted from (xmin, ymin, width, height) to
        (xmin, ymin, xmax, ymax). The file name of any image that contains
        no car annotation is printed.
        """
        # Get image and corresponding annotations from index
        coco = self.coco
        img_id = self.ids[idx]
        target = coco.loadAnns(coco.getAnnIds(imgIds=img_id))

        path = coco.loadImgs(img_id)[0]['file_name']

        # Use a context manager so the underlying file handle is closed as
        # soon as the pixel data has been converted.
        with Image.open(os.path.join(self.img_root, path)) as raw_img:
            img = raw_img.convert('RGB')

        # Perform transformations on images if desired
        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        # Our images contain annotations for other objects that we don't care
        # about, so retain only those which are of vehicles.
        cars = [ann for ann in target if ann['category_id'] == self.CAR_CATEGORY_ID]

        bbox = []
        area = []
        for ann in cars:
            xmin, ymin, width, height = ann['bbox'][:4]
            # Convert bounding box format from (xmin, ymin, width, height) to (xmin, ymin, xmax, ymax)
            bbox.append([xmin, ymin, xmin + width, ymin + height])
            area.append(width * height)
        image_id = [ann['image_id'] for ann in cars]
        class_labels = [ann['category_id'] for ann in cars]

        # Per documentation of our pretrained model, our training target must
        # be a dictionary with the keys as specified and the values as
        # pytorch tensors. Dtypes are given explicitly because an empty list
        # would otherwise become a float32 tensor (wrong for labels), and the
        # reshape keeps an image without cars at shape (0, 4) instead of (0,).
        target_dict = {
            "boxes": torch.as_tensor(bbox, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.as_tensor(class_labels, dtype=torch.int64),
            "image_id": torch.as_tensor(image_id, dtype=torch.int64),
            "area": torch.as_tensor(area, dtype=torch.float32),
        }

        # Report images that contain no car annotation at all
        if not cars:
            print(path)

        return img, target_dict
    
    

# Image-side preprocessing applied by the dataset to every loaded image
transform = transforms.Compose([
    # you can add other transformations in this list
    transforms.ToTensor(),
])

# Instantiate data set objects and create the data loaders.
# Both loaders use utils.collate_fn: every image has its own number of boxes,
# so the default collation (which stacks samples into one tensor) cannot
# batch the target dictionaries.
train_dataset = VehicleDetectionDataset(train_data_path, train_annotation_path, transform)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=utils.collate_fn)

val_dataset = VehicleDetectionDataset(val_data_path, val_annotation_path, transform)
# Validation order does not affect metrics, so keep it deterministic
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=utils.collate_fn)

num_train_images = len(train_dataset)
num_val_images = len(val_dataset)

print("Number of images in training set: ", num_train_images)
print("Number of images in validation set: ", num_val_images)

Before we get into the model, we'll define some helper functions for viewing images and overlaying bounding boxes on them.

In [None]:
#---- Function to pre-process frame for inference, transform image data to be operable with PyTorch ----
def prepare_img_for_inference(img, resize=False):
    """Convert a cv2 image into a batched float tensor on the compute device.

    Args:
        img: image array in cv2's BGR channel order.
            NOTE(review): assumes height x width x channel layout with
            values in 0-255, per the transpose and /255 below - confirm.
        resize: accepted for callers' convenience; not used here.

    Returns:
        A tuple ``(tensor, original)``: the model-ready tensor with a leading
        batch axis, and an untouched copy of ``img`` for display.
    """
    # Preserve the caller's pixels for drawing later
    untouched = img.copy()

    # cv2 delivers BGR; the model is fed RGB with channels first
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    channels_first = np.transpose(rgb, (2, 0, 1))

    # Prepend the batch axis and rescale to [0, 1]
    batched = channels_first[np.newaxis, ...] / 255.0

    # Recast to a PyTorch float tensor and move it to the compute device
    tensor = torch.FloatTensor(batched).to(device)

    return tensor, untouched


#---- Function which takes model prediction and imposes predicted bounding boxes on the image for display ----
def display_image_and_inferences(detections, orig_image, conf_threshold, display=False):
    """Draw every sufficiently confident predicted box onto ``orig_image``.

    Args:
        detections: mapping with "boxes" and "scores" sequences of equal
            length, as produced by the detection model for one image.
        orig_image: image to draw on; it is modified in place.
            It is treated as BGR (cv2 convention) when shown inline.
        conf_threshold: only detections whose score is strictly greater
            than this value are drawn.
        display: when True, also show the annotated image with matplotlib.

    Returns:
        ``orig_image`` with the bounding boxes and labels drawn on it.
    """
    box_color = (0, 0, 255)

    # Loop through all predicted bounding boxes
    for box, score in zip(detections["boxes"], detections["scores"]):
        # extract the confidence (i.e., probability) associated with the prediction
        confidence = float(score)
        # filter out weak detections by ensuring the confidence is
        # greater than the minimum confidence
        if not confidence > conf_threshold:
            continue

        # compute the integer (x, y)-coordinates of the bounding box; plain
        # Python ints are passed to cv2 rather than numpy scalars
        start_x, start_y, end_x, end_y = (
            int(coord) for coord in box.detach().cpu().numpy()
        )

        # display the prediction to our terminal
        label = "Car: {:.2f}%".format(confidence * 100)
        print("[INFO] {}".format(label))

        # draw the bounding box and label on the image; the label goes above
        # the box unless that would push it off the top of the image
        cv2.rectangle(orig_image, (start_x, start_y), (end_x, end_y), box_color, 2)
        text_y = start_y - 15 if start_y - 15 > 15 else start_y + 15
        cv2.putText(orig_image, label, (start_x, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 1)

    # Used for inline display of sample
    if display:
        # matplotlib expects RGB while the cv2 image is BGR; convert a copy
        # for display so the returned image stays usable with cv2.imshow
        plt.imshow(cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB))
        plt.show()

    # Return the image with bounding boxes drawn
    return orig_image


#---- Function to take test video, make model predictions frame by frame, and display annotations in video ----
def annotate_video(model, video_path, conf):
    """Play a video in a window with the model's predicted boxes drawn on it.

    Press 'p' to toggle pause/play and 'q' to stop early.

    Args:
        model: detection model, called on one pre-processed frame at a time.
        video_path: path of the video file to read.
        conf: confidence threshold passed to display_image_and_inferences.

    Returns:
        None, in every case.
    """
    # Create opencv VideoCapture object to read MP4 file
    video = cv2.VideoCapture(video_path)

    # Check if VideoCapture has opened video without problem
    if not video.isOpened():
        print("VideoCapture initialization failed!")
        return None

    try:
        # Read initial frame from video
        ret_val, frame = video.read()

        # Flag to indicate if video is playing or paused
        playing = True

        # Do not collect gradients to speed up inferences
        with torch.no_grad():
            # The first return of VideoCapture.read() will become false when the video is out of frames
            while ret_val:
                # Only resume inferences if playing flag is True
                if playing:
                    # Use function to pre-process the frame for our model
                    frame_tensor, original_frame = prepare_img_for_inference(frame)

                    # Perform inference on video frame
                    detections = model(frame_tensor)[0]

                    # Use defined function to draw bounding boxes from inference onto frame
                    display_frame = display_image_and_inferences(detections, original_frame, conf)

                    # Place frame onto display window
                    cv2.imshow("Video Model Predictions", display_frame)
                # Block for minimal amount and proceed
                key = cv2.waitKey(1) & 0xFF
                # If 'q' key is pressed, stop the process and return
                if key == ord('q'):
                    return None
                # If 'p' key is pressed, toggle between pause and play
                if key == ord('p'):
                    playing = not playing
                # Grab next frame from video; while paused the current frame
                # is held so that resuming continues where playback stopped
                if playing:
                    ret_val, frame = video.read()
    finally:
        # Runs on normal end, on 'q', and on errors, so the window and the
        # capture handle are always released
        cv2.destroyAllWindows()
        video.release()
    return None

Now we'll load the model and perform a sample inference to see how it works. Note that the pre-trained model already detects cars; we are applying transfer learning here purely as a demonstration.

In [None]:
#Load model
rcnn_model = models.detection.fasterrcnn_mobilenet_v3_large_fpn(
    pretrained=True, progress=True, pretrained_backbone=True)

# Send model to compute device and set evaluation mode to dictate the behavior of certain layers (dropout, batchnorm, etc.)
rcnn_model.to(device)
rcnn_model.eval()

# Load and process sample image
sample_img_path = "test_image.png"
sample_img = cv2.imread(sample_img_path)
# cv2.imread signals an unreadable/missing file by returning None instead of
# raising, so fail here with a clear message rather than later on None.copy()
if sample_img is None:
    raise FileNotFoundError("Could not read sample image: {}".format(sample_img_path))
sample_img_inference, original_sample = prepare_img_for_inference(sample_img)
# Run inference and display predicted classifications and detections on original image.
# Gradients are not needed for a forward-only pass, so skip tracking them.
with torch.no_grad():
    sample_output = rcnn_model(sample_img_inference)[0]
_ = display_image_and_inferences(sample_output, original_sample, 0.7, display=True)


In [None]:
# Freeze every weight currently in the network so it is not updated in training
rcnn_model.requires_grad_(False)

#---Reset final fully connected layer to reflect desired number of classes---
# The replacement predictor is created after the freeze, so its parameters
# are the only ones left with requires_grad=True.
num_features = rcnn_model.roi_heads.box_predictor.cls_score.in_features
rcnn_model.roi_heads.box_predictor = FastRCNNPredictor(num_features, CLASSES)

# Place model (including the new predictor) on compute device
rcnn_model = rcnn_model.to(device)

# Create optimizer - note that only final layer parameters are being optimized
params = list(filter(lambda p: p.requires_grad, rcnn_model.parameters()))
opt = optim.SGD(params, lr=0.001)

In [None]:
from engine import train_one_epoch, evaluate #downloaded from torch/vision/references/detection

# Choose how many passes to make over the training data, then run them;
# the epoch index is handed to train_one_epoch along with a logging interval
num_epochs = 10
for ep in range(num_epochs):
    train_one_epoch(
        rcnn_model,
        opt,
        train_dataloader,
        device,
        ep,
        print_freq=20,
    )

In [None]:
# Put the model in evaluation mode (changes the behavior of layers such as
# dropout and batchnorm) for the inference that follows
rcnn_model.eval()

# Run inference and display predicted classifications and detections on original image




Now we'll load our video and pass it through our model, frame by frame, to predict the vehicles in each frame.