# Object Detection with YOLO (You Only Look Once)

Matheus Schmitz  
<a href="https://www.linkedin.com/in/matheusschmitz/">LinkedIn</a>  
<a href="https://matheus-schmitz.github.io/">GitHub Portfolio</a>  

**Configuration File**: https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg

## Part 1 - Neural Network Backend

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Ignore warnings
# Silence all warnings to keep the notebook output clean, but only when no
# warning options were passed to the interpreter (sys.warnoptions is empty),
# so an explicit user choice is not overridden.
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter('ignore')

In [8]:
# Function to read the configuration file
# Returns the blocks used to build the neural network
def parse_config_file(config_file):
    """Parse a Darknet-style ``.cfg`` file into a list of hyperparameter blocks.

    Args:
        config_file: Path of the configuration file to read.

    Returns:
        A list of dicts, one per ``[section]`` in file order. Each dict holds
        the section name (without the brackets) under the key ``'type'`` plus
        one entry per ``key=value`` line; all values are kept as strings.
        Blank lines and lines starting with ``#`` are ignored. A file without
        any content yields an empty list.

    Raises:
        ValueError: If a non-section line does not contain ``=``.
    """
    # The context manager closes the file even if reading fails
    with open(config_file, 'r') as file:
        raw_lines = file.read().split('\n')

    # Strip first, then filter: this way whitespace-only lines and indented
    # comments are discarded instead of crashing the parser further down
    lines = [line.strip() for line in raw_lines]
    lines = [line for line in lines if line and not line.startswith('#')]

    # Dictionary (current block) and list of all hyperparameter blocks
    block = {}
    blocks = []

    for line in lines:

        # A '[' opens a new block: store the one being built, then start over
        if line.startswith('['):
            if block:
                blocks.append(block)
                block = {}
            # Name the block type accordingly (minus the [] brackets)
            block['type'] = line[1:-1].strip()

        # Otherwise the line is a "key = value" hyperparameter of the current block
        else:
            # partition() splits on the first '=' only, so values may contain '='
            key, separator, value = line.partition('=')
            if not separator:
                raise ValueError(f"Malformed configuration line (expected key=value): {line!r}")
            block[key.strip()] = value.strip()

    # The last block is not followed by a new header, so it is appended here
    if block:
        blocks.append(block)

    return blocks

In [9]:
# Function to create the network modules
# Returns PyTorch objects
def create_modules(blocks):
    """Build the PyTorch modules described by the parsed configuration blocks.

    Args:
        blocks: List of dicts as produced by parse_config_file. ``blocks[0]``
            is returned untouched as the network information; every following
            block describes one layer and must have a ``'type'`` of
            'convolutional', 'upsample', 'route', 'shortcut' or 'yolo'.

    Returns:
        A tuple ``(net_info, module_list)`` where ``net_info`` is ``blocks[0]``
        and ``module_list`` is an ``nn.ModuleList`` holding one
        ``nn.Sequential`` per layer block, in the same order.

    Side effects:
        For 'route' blocks, ``block['layers']`` is replaced in place by a list
        of strings (Darknet.forward iterates over that list).
    """
    # Info about the neural network input parameters
    net_info = blocks[0]

    # https://pytorch.org/docs/master/generated/torch.nn.ModuleList.html
    module_list = nn.ModuleList()

    # Depth of the tensor fed to the next layer; starts at the 3 color channels
    num_filters = 3
    filters = num_filters

    # Output depth of every layer built so far, needed by the route layers
    output_filters = []

    # Iterate through the blocks and create the neural network's modules (layers)
    for index, block in enumerate(blocks[1:]):

        # https://pytorch.org/docs/master/generated/torch.nn.Sequential.html
        module = nn.Sequential()
        block_type = block['type']

        if block_type == 'convolutional':

            # Extract the hyperparameters for a convolutional layer
            activation = block['activation']
            filters = int(block['filters'])
            padding = int(block['pad'])
            kernel_size = int(block['size'])
            stride = int(block['stride'])
            batch_normalize = int(block.get('batch_normalize', 0))

            # A convolution followed by batch normalization needs no bias of its
            # own; without batch normalization the bias must exist, because
            # Darknet.load_weights reads conv.bias for those layers
            bias = not batch_normalize

            # "pad" is only a flag: the actual padding keeps the spatial size
            pad = (kernel_size - 1) // 2 if padding else 0

            # https://pytorch.org/docs/master/generated/torch.nn.Conv2d.html
            conv = nn.Conv2d(in_channels = num_filters,
                             out_channels = filters,
                             kernel_size = kernel_size,
                             stride = stride,
                             padding = pad,
                             bias = bias)
            module.add_module(f'conv_{index}', conv)

            # https://pytorch.org/docs/master/generated/torch.nn.BatchNorm2d.html
            if batch_normalize:
                bn = nn.BatchNorm2d(num_features = filters)
                module.add_module(f'batch_norm_{index}', bn)

            # YOLO v3 uses only LeakyReLU activations; any other value adds no layer
            # https://pytorch.org/docs/master/generated/torch.nn.LeakyReLU.html
            if activation == 'leaky':
                activn = nn.LeakyReLU(negative_slope = 0.1,
                                      inplace = True)
                module.add_module(f'leaky_{index}', activn)

        # Upsampling layer - enlarges the feature map by the configured stride
        # https://pytorch.org/docs/stable/generated/torch.nn.Upsample.html
        elif block_type == 'upsample':
            stride = int(block['stride'])
            upsample = nn.Upsample(scale_factor = stride,
                                   mode = 'bilinear')
            module.add_module(f'upsample_{index}', upsample)

        # Route layer - forwards (or concatenates) feature maps of earlier layers.
        # With a single value, the layer outputs the feature maps of the indexed layer,
        # e.g. route = -4 takes the feature maps of the layer 4 steps behind it.
        # With two values, the feature maps of both layers are concatenated along the
        # depth dimension, e.g. route = -1, 61 joins the previous layer and layer 61.
        # https://github.com/pjreddie/darknet/issues/545
        # https://github.com/AlexeyAB/darknet/issues/487#issuecomment-374902735
        # https://github.com/AlexeyAB/darknet/issues/279#issuecomment-397248821
        # https://github.com/AlexeyAB/darknet#how-to-train-to-detect-your-custom-objects
        elif block_type == 'route':
            layers = block['layers']
            # Still a raw "a, b" string on the first pass; already a list if the
            # same blocks are processed again
            if isinstance(layers, str):
                layers = [item.strip() for item in layers.split(',')]
            block['layers'] = layers

            # Route "start", and route "end" when a second value exists (0 = absent)
            start = int(layers[0])
            end = int(layers[1]) if len(layers) > 1 else 0

            # Convert absolute layer numbers into positions relative to this layer
            if start > 0:
                start = start - index
            if end > 0:
                end = end - index

            # The routing itself happens in the forward pass; this is a placeholder
            route = EmptyLayer()
            module.add_module(f'route_{index}', route)

            # Output depth: sum of both sources when concatenating, else the single source
            if end < 0:
                filters = output_filters[index + start] + output_filters[index + end]
            else:
                filters = output_filters[index + start]

        # Shortcut layer - same as a skip layer in ResNet.
        # E.g. if the hyperparameter is -3, the shortcut's output is obtained by adding the
        # feature maps of the previous layer and of the layer 3 steps behind the shortcut.
        # The addition happens in the forward pass; the depth is left unchanged.
        elif block_type == 'shortcut':
            shortcut = EmptyLayer()
            module.add_module(f'shortcut_{index}', shortcut)

        # YOLO layer - the detection layer.
        # Anchors: predetermined set of bounding boxes given as (width, height) pairs.
        # Mask: list of anchor IDs which this layer is responsible for predicting.
        # Each YOLO layer lists all anchors but only keeps the ones selected by its mask,
        # e.g. mask = 0, 1, 2 selects the first, second and third anchors.
        elif block_type == 'yolo':

            # Extract the mask values
            mask = list(map(int, block['mask'].split(',')))

            # Extract the anchor values and pair them up
            anchors = list(map(int, block['anchors'].split(',')))
            anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]

            # Filter the list of anchors using the mask
            anchors = [anchors[i] for i in mask]

            detection = DetectAnchors(anchors)
            module.add_module(f'detection_{index}', detection)

        # Register the layer and remember its output depth for the following layers
        module_list.append(module)
        num_filters = filters
        output_filters.append(filters)

    return (net_info, module_list)

# Function summary:
# YOLO has 5 layer types: Convolutional, Upsample, Route, Shortcut and YOLO.
# All customization of a YOLO model is done by adjusting hyperparameter values in the configuration file.
# The configuration file describes the YOLO network layout block by block.
# The YOLO v3 backbone network is known as Darknet (Darknet-53), which is also the name of the framework it was originally implemented in.

In [11]:
# Function to make predictions
# Turns the raw feature map of a detection layer into a table of bounding boxes,
# one line per box, expressed in pixels of the input image.
def make_predictions(prediction, input_dim, anchors, num_classes, CUDA = True):
    """Decode a detection feature map into bounding-box attributes.

    Args:
        prediction (tensor): output of the previous layer; it is viewed as
            [batch size, (5 + num_classes) * number of anchors, grid * grid].
        input_dim (int): dimension of the input image.
        anchors (list(tuple)): anchors used in the YOLO detection layer.
        num_classes (int): total number of classes.
        CUDA (bool): when true, the tensors are moved to the GPU.

    Returns:
        prediction (tensor): 3D tensor of shape
        [batch size, grid * grid * number of anchors, 5 + num_classes]; the
        first four attributes of each box are multiplied by the stride.
    """
    # Sizes derived from the inputs
    n_images = prediction.size(0)
    cell_px = input_dim // prediction.size(2)
    cells_per_side = input_dim // cell_px
    attrs_per_box = num_classes + 5
    boxes_per_cell = len(anchors)
    n_cells = cells_per_side * cells_per_side

    # Flatten the grid, move it in front of the attributes, then give every box its own line
    prediction = prediction.view(n_images, attrs_per_box * boxes_per_cell, n_cells)
    prediction = prediction.transpose(1, 2).contiguous()
    prediction = prediction.view(n_images, n_cells * boxes_per_cell, attrs_per_box)

    # Sigmoid transformation: centre_X (0), centre_Y (1), objectness score (4)
    for column in (0, 1, 4):
        prediction[:, :, column] = torch.sigmoid(prediction[:, :, column])

    # Column/row index of every grid cell, as column vectors
    axis = np.arange(cells_per_side)
    col_idx, row_idx = np.meshgrid(axis, axis)
    x_offset = torch.FloatTensor(col_idx).view(-1, 1)
    y_offset = torch.FloatTensor(row_idx).view(-1, 1)

    # Anchors expressed in grid cells instead of pixels
    # [(,),(,),(,)] -> tensor([[,],[,],[,]])
    anchor_sizes = torch.FloatTensor(
        [(anchor[0] / cell_px, anchor[1] / cell_px) for anchor in anchors]
    )

    # If using GPU, send everything involved to the GPU
    if CUDA:
        x_offset = x_offset.cuda()
        y_offset = y_offset.cuda()
        prediction = prediction.cuda()
        anchor_sizes = anchor_sizes.cuda()

    # Add each cell's offset to the centre coordinates of all its boxes
    cell_offsets = torch.cat((x_offset, y_offset), dim=1)
    cell_offsets = cell_offsets.repeat(1, boxes_per_cell).view(-1, 2).unsqueeze(0)
    prediction[:, :, :2] += cell_offsets

    # Width/height: exponential of the raw value times the matching anchor
    anchor_sizes = anchor_sizes.repeat(n_cells, 1).unsqueeze(0)
    prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchor_sizes

    # Sigmoid activation for the class scores
    class_columns = slice(5, 5 + num_classes)
    prediction[:, :, class_columns] = torch.sigmoid(prediction[:, :, class_columns])

    # Scale the box coordinates from grid cells to the size of the input image
    prediction[:, :, :4] *= cell_px

    return prediction

In [12]:
# Placeholder layer without parameters or operations of its own.
# create_modules registers it for the route and shortcut blocks, whose actual
# work (concatenation / addition) is carried out in Darknet.forward.
class EmptyLayer(nn.Module):
    """Parameter-free placeholder module."""

    def __init__(self):
        super().__init__()

In [13]:
# Anchor detection class
# Carries the anchors assigned to one detection layer so they can be read back later.
class DetectAnchors(nn.Module):
    """Module that only stores the anchors it is given."""

    def __init__(self, anchors):
        super().__init__()
        # Kept exactly as received
        self.anchors = anchors

In [17]:
# YOLO architecture, also referred to as Darknet in the paper's documentation
# Has the convolutional layers
class Darknet(nn.Module):
    """YOLO network assembled from a Darknet configuration file.

    Attributes:
        blocks: parsed configuration blocks (``blocks[0]`` is the network info).
        net_info: the first configuration block.
        module_list: ``nn.ModuleList`` with one module per layer block.
        header, seen: set by load_weights from the weights file header.
    """

    # Class constructor
    def __init__(self, config_file):
        """Parse ``config_file`` and build the corresponding modules."""
        super().__init__()

        # Read the configuration file
        self.blocks = parse_config_file(config_file)

        # Create the neural network's modules
        self.net_info, self.module_list = create_modules(self.blocks)

    # Forward propagation
    def forward(self, x, CUDA):
        """Run the network on ``x`` and collect the detections of all YOLO layers.

        Args:
            x: input tensor fed to the first layer.
            CUDA: forwarded to make_predictions to decide whether to use the GPU.

        Returns:
            The outputs of make_predictions for every 'yolo' layer,
            concatenated along dimension 1, or None if the configuration
            contains no 'yolo' layer.
        """
        # Layer descriptions (the first block is the network info, not a layer)
        layer_blocks = self.blocks[1:]

        # Cache of all layer outputs, needed for the route/shortcut layers
        outputs = {}

        # Stays None until the first detection layer has been processed
        detections = None

        for idx, block in enumerate(layer_blocks):
            module_type = block['type']

            # Convolutional and upsample layers: simply apply the module
            if module_type in ('convolutional', 'upsample'):
                x = self.module_list[idx](x)

            # Route layers: reuse one earlier feature map or concatenate two of them
            elif module_type == 'route':
                raw_layers = block['layers']
                # Accept the raw "a, b" string as well as the already split list
                if isinstance(raw_layers, str):
                    raw_layers = raw_layers.split(',')
                layers = [int(a) for a in raw_layers]

                # Convert absolute layer numbers to positions relative to this layer
                layers = [layer - idx if layer > 0 else layer for layer in layers]

                if len(layers) == 1:
                    x = outputs[idx + layers[0]]
                else:
                    feature_map_1 = outputs[idx + layers[0]]
                    feature_map_2 = outputs[idx + layers[1]]

                    # Concatenate along the depth dimension
                    x = torch.cat((feature_map_1, feature_map_2), dim = 1)

            # Shortcut layer: add the previous output to the one "from" steps away
            elif module_type == 'shortcut':
                from_layer = int(block['from'])
                x = outputs[idx - 1] + outputs[idx + from_layer]

            # YOLO layer
            elif module_type == 'yolo':

                # Neural network's hyperparameters
                anchors = self.module_list[idx][0].anchors
                input_dim = int(self.net_info['height'])
                num_classes = int(block['classes'])

                # Make predictions (aka detect objects in the image)
                x = make_predictions(prediction = x,
                                     input_dim = input_dim,
                                     anchors = anchors,
                                     num_classes = num_classes,
                                     CUDA = CUDA)

                # The first detection layer starts the result; later ones are appended to it
                if detections is None:
                    detections = x
                else:
                    detections = torch.cat((detections, x), dim = 1)

            outputs[idx] = x

        return detections

    # Loading weights from the pre-trained model
    def load_weights(self, weights_file):
        """Copy pre-trained weights from a binary weights file into the model.

        The file starts with five int32 header values (stored in
        ``self.header``; element 3 is stored in ``self.seen``) followed by
        float32 values. Only convolutional blocks consume values, in this
        order: with batch normalization - BN biases, BN weights, BN running
        mean, BN running variance, convolution weights; without it -
        convolution biases, convolution weights.

        Args:
            weights_file: path of the weights file.
        """
        # Read everything up front; the context manager closes the file
        with open(weights_file, 'rb') as wf:
            header = np.fromfile(wf, dtype=np.int32, count=5)
            weights = np.fromfile(wf, dtype=np.float32)

        self.header = torch.from_numpy(header)
        self.seen = self.header[3]

        # Position of the next unread value in the weights array
        ptr = 0

        def next_chunk(reference):
            """Read as many values as ``reference`` holds, shaped like it."""
            nonlocal ptr
            count = reference.numel()
            chunk = torch.from_numpy(weights[ptr : ptr + count])
            ptr += count
            return chunk.view_as(reference)

        for idx, model in enumerate(self.module_list):
            block = self.blocks[idx + 1]

            # Only convolutional layers have weights stored in the file
            if block['type'] != 'convolutional':
                continue

            batch_normalize = int(block.get('batch_normalize', 0))
            conv = model[0]

            if batch_normalize:
                # Batch normalization parameters come first
                bn = model[1]
                bn.bias.data.copy_(next_chunk(bn.bias.data))
                bn.weight.data.copy_(next_chunk(bn.weight.data))
                bn.running_mean.copy_(next_chunk(bn.running_mean))
                bn.running_var.copy_(next_chunk(bn.running_var))
            else:
                # Without batch normalization the convolution has its own bias
                conv.bias.data.copy_(next_chunk(conv.bias.data))

            # Convolution weights (same with or without batch normalization)
            conv.weight.data.copy_(next_chunk(conv.weight.data))

## Part 2 - Input Processor Frontend

In [20]:
# Py Data Stack
import numpy as np
import pandas as pd

# System manipulation
import os
import time
import argparse
import random
import pickle as pkl

# Image Processing
import cv2
import torch
import torch.nn as nn
from torch.autograd import Variable

In [21]:
# Function to parse command line arguments
def arg_parse():
    """Parse the detector's command line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``input``, ``output``,
        ``batch`` (int), ``confidence`` (float), ``nms_thresh`` (float),
        ``cfg_file``, ``weights`` and ``resolution`` (string).
    """
    parser = argparse.ArgumentParser(description='Object Detector with YOLO')

    # Every option is registered on the same parser object that is used for parsing.
    # Numeric options declare their type so command line values are not left as strings.
    parser.add_argument("--input", dest="input", help="Directory with images/videos for object detection.", default="input", type=str)
    parser.add_argument("--output", dest="output", help="Directory to store images/videos with detected objects.", default="output", type=str)
    parser.add_argument("--batch", dest="batch", help="Batch size.", default=1, type=int)
    parser.add_argument("--confidence", dest="confidence", help="Confidense threshold to filter predictions.", default=0.7, type=float)
    parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS threshold.", default=0.4, type=float)
    parser.add_argument("--cfg_file", dest="cfg_file", help="YOLO configuration file.", default="config/yolov3.cfg", type=str)
    parser.add_argument("--weights", dest="weights", help="File with pretrained weights.", default="weights/yolov3.weights", type=str)
    parser.add_argument("--resolution", dest="resolution", help="Resolution of the input images. Increase to improve accuracy. Decrease to speed up detection.", default="384", type=str)

    return parser.parse_args()

In [22]:
# Function to draw the bounding box
def draw_bbox(x, results, classes, colors):
    """Draw one detection (box plus class label) on its image.

    Args:
        x: one detection; ``x[0]`` is the index of the image in ``results``,
            ``x[1:3]`` and ``x[3:5]`` are the two box corners and ``x[-1]`` is
            the class index.
        results: sequence of images, indexed by ``x[0]``.
        classes: sequence of class names, indexed by the class index.
        colors: sequence of colors; one is picked at random for this box.

    Returns:
        The image the box was drawn on (modified in place).
    """
    # Box corners as plain Python ints, which is what the OpenCV drawing functions expect
    corner1 = tuple(int(value) for value in x[1:3])
    corner2 = tuple(int(value) for value in x[3:5])

    # Image
    img = results[int(x[0])]

    # Line thickness grows with the image size
    line_thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    cls = int(x[-1])

    # Color
    color = random.choice(colors)

    # Label
    label = f'{classes[cls]}'

    # Draw the bounding box's outline
    cv2.rectangle(img, corner1, corner2, color, thickness=line_thickness)

    # Font thickness and scale; the same values are used to measure and to draw
    # the text so that the filled label background matches the label
    font_thickness = max(line_thickness - 1, 1)
    font_scale = line_thickness / 3

    # Label text size
    font_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]

    # Opposite corner of the label background, placed just above the box
    corner2 = corner1[0] + font_size[0] + 3, corner1[1] - font_size[1] - 3

    # Draw the filled label background and write the class label on it
    cv2.rectangle(img, corner1, corner2, color, -1)
    cv2.putText(img,
                label,
                (corner1[0], corner1[1] - 2),
                0,
                font_scale,
                [255, 255, 255],
                thickness = font_thickness,
                lineType = cv2.LINE_AA)

    return img

In [26]:
# Function to calculate the IoU of two bounding boxes.
# The Intersection over Union (IoU) is a measure of the overlap between two bounding boxes.
# IoU is the Jaccard Similarity of the areas of two objects in a plane.
# By convention, a predicted bounding box is considered correct if its IoU with the real one is greater than 0.5.
# Increasing the threshold improves precision but worsens recall.
# If the predicted bbox and the real bbox overlapped perfectly, the IoU would be 1.
# Non-Maximal Suppression (NMS) cleans up multiple detections of the same object: it keeps the highest
# scoring bbox and suppresses the other bboxes whose IoU with it exceeds a threshold.
def bbox_iou(box1, box2):
    """Compute the IoU between boxes given as (x1, y1, x2, y2) corners.

    Args:
        box1: 2D tensor whose columns 0-3 hold x1, y1, x2, y2.
        box2: 2D tensor with the same column layout; its rows are broadcast
            against the rows of ``box1``.

    Returns:
        1D tensor with one IoU value per broadcast pair of boxes. Pairs whose
        union area is zero yield 0 instead of NaN.
    """
    # The intersection spans from the LARGER of the two top-left corners
    # to the SMALLER of the two bottom-right corners
    inter_max_xy = torch.min(box1[:, 2:4], box2[:, 2:4])
    inter_min_xy = torch.max(box1[:, 0:2], box2[:, 0:2])

    # Negative extents mean the boxes do not overlap: clamp them to zero
    inter_size = torch.clamp((inter_max_xy - inter_min_xy), min=0)
    inter_area = inter_size[:, 0] * inter_size[:, 1]

    # Calculate the areas
    b1_area = (box1[:, 2] - box1[:, 0])*(box1[:, 3] - box1[:, 1])
    b2_area = (box2[:, 2] - box2[:, 0])*(box2[:, 3] - box2[:, 1])

    # Guard against a zero union (degenerate boxes) to avoid dividing by zero
    union_area = torch.clamp(b1_area + b2_area - inter_area, min=1e-16)

    # Calculate IoU
    iou = inter_area / union_area

    return iou

In [25]:
# Function to make the detections
# To obtain true detections, the output is subjected to the objectness threshold and to Non-Maximal Suppression (NMS)
def write_results(prediction, confidence, num_classes, nms_conf=0.4):
    """Filter raw predictions by objectness and apply per-class NMS.

    Args:
        prediction: 3D tensor [batch, boxes, 5 + num_classes] whose box
            attributes are centre x, centre y, width, height, objectness and
            the class scores. It is not modified.
        confidence: objectness threshold; boxes at or below it are discarded.
        num_classes: number of class scores per box.
        nms_conf: IoU threshold; within one class, a box whose IoU with a
            higher-scoring kept box is not below this value is suppressed.

    Returns:
        A tensor of shape (D, 8), D being the number of detections kept in all
        images. Each line holds: index of the image in the batch, the 4 bbox
        corner coordinates (x1, y1, x2, y2), objectness score, score of the
        max confidence class, and class index. Returns 0 if nothing is kept.
    """
    # Task 1: Objectness confidence thresholding
    # Multiplying into a new tensor zeroes the rejected boxes without touching the caller's data
    conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
    prediction = prediction * conf_mask

    # Task 2: Locate the bbox corners (centre -/+ half of the width/height)
    box_corner = prediction.clone()
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction = box_corner

    # Detections kept so far, one tensor per (image, class)
    kept_detections = []

    # Task 3: Loop through the batch's images
    for ib in range(prediction.size(0)):

        # Image prediction
        image_prediction = prediction[ib]

        # Only the largest class score matters: keep its value and its index
        max_conf, max_conf_indexes = torch.max(image_prediction[:, 5:5+num_classes], dim=1)
        max_conf = max_conf.float().unsqueeze(1)
        max_conf_indexes = max_conf_indexes.float().unsqueeze(1)
        image_prediction = torch.cat((image_prediction[:, :5], max_conf, max_conf_indexes), dim=1)

        # Get rid of the lines zeroed by the objectness threshold
        image_prediction_ = image_prediction[image_prediction[:, 4] != 0]

        # If there is no prediction, go to next iteration
        if image_prediction_.shape[0] == 0:
            continue

        # NMS of each class detected in the image
        for cls in torch.unique(image_prediction_[:, -1]):

            # Get the bboxes attributed to the current class
            img_pred_classes = image_prediction_[image_prediction_[:, -1] == cls]

            # Sort the detections by objectness score, from highest to lowest
            obj_conf_desc_indices = torch.sort(img_pred_classes[:, 4], descending=True)[1]
            img_pred_classes = img_pred_classes[obj_conf_desc_indices]

            # NMS: each kept box suppresses the lower-scoring boxes that overlap it too much.
            # The tensor shrinks while looping, so its size is re-read on every pass.
            i = 0
            while i < img_pred_classes.size(0) - 1:
                following = img_pred_classes[i+1:]

                # IoU between the current box and ALL boxes below it
                ious = bbox_iou(img_pred_classes[i].unsqueeze(0), following)

                # Keep only the following boxes that are distinct enough from the current one
                img_pred_classes = torch.cat((img_pred_classes[:i+1], following[ious < nms_conf]))
                i += 1

            # For the image with index ib and k detections, batch_indexes is a k-by-1 tensor filled with ib
            batch_indexes = img_pred_classes.new_full((img_pred_classes.size(0), 1), ib)

            kept_detections.append(torch.cat((batch_indexes, img_pred_classes), dim=1))

    # No detection at all is signalled with 0
    if not kept_detections:
        return 0

    return torch.cat(kept_detections)

In [32]:
# Function to write the bounding box for each image
def write_bbox(x, results, classes, colors):
    """Draw one detection (box plus class label) on its image.

    Args:
        x: one detection; ``x[0]`` is the index of the image in ``results``,
            ``x[1:3]`` and ``x[3:5]`` are the two box corners and ``x[-1]`` is
            the class index.
        results: sequence of images, indexed by ``x[0]``.
        classes: sequence of class names, indexed by the class index.
        colors: sequence of colors; one is picked at random for this box.

    Returns:
        The image the box was drawn on (modified in place).
    """
    # bbox coordinates as plain Python ints, which is what the OpenCV drawing functions expect
    corner1 = tuple(int(value) for value in x[1:3])
    corner2 = tuple(int(value) for value in x[3:5])

    # Image
    img = results[int(x[0])]

    # Bbox line thickness grows with the image size
    line_thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    # Class
    cls = int(x[-1])

    # Color
    color = random.choice(colors)

    # Label
    label = f"{classes[cls]}"

    # Create the rectangle (bbox) in the image using OpenCV
    cv2.rectangle(img, corner1, corner2, color, thickness=line_thickness)

    # Font thickness and scale; the same values are used to measure and to draw
    # the text so that the filled label background matches the label
    font_thickness = max(line_thickness - 1, 1)
    font_scale = line_thickness / 3

    # Font size
    font_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]

    # Opposite corner of the label background, placed just above the box
    corner2 = corner1[0] + font_size[0] + 3, corner1[1] - font_size[1] - 3

    # Write the filled label background and the text
    cv2.rectangle(img, corner1, corner2, color, -1)
    cv2.putText(img,
                label,
                (corner1[0], corner1[1] - 2),
                0,
                font_scale,
                [255, 255, 255],
                thickness=font_thickness,
                lineType=cv2.LINE_AA)

    return img

In [30]:
def main():
    """Run YOLO object detection on the input image(s) and save annotated copies.

    Reads the options returned by ``arg_parse()``, loads the class names and the
    Darknet model, runs inference batch by batch, maps the detected boxes back to
    the coordinates of the original images, draws them with ``write_bbox``, writes
    every image to ``args.output`` as ``det_<file name>`` and prints a timing
    summary.

    Returns:
        None. Returns early (after printing a message) when no readable image is
        found or when no detection was made.

    Raises:
        ValueError: If the batch size is smaller than 1 or the input resolution is
            not a positive multiple of 32.
        SystemExit: If the input path does not exist.
    """
    # Get user inputs
    args = arg_parse()

    # Input path, batch size, prediction confidence and Non-Maximal Suppression (NMS) threshold
    images = args.input
    batch_size = int(args.batch)
    confidence = float(args.confidence)
    nms_thresh = float(args.nms_thresh)
    if batch_size < 1:
        raise ValueError(f'Batch size must be at least 1, got {batch_size}')

    # Check if there is an available GPU
    CUDA = torch.cuda.is_available()

    # Load the classes (one name per line)
    num_classes = 80
    with open('classes/coco.names', 'r') as classes_file:
        classes = classes_file.read().splitlines()

    # Load the YOLO model
    print("\nLoading Model...")
    model = Darknet(args.cfg_file)
    model.load_weights(args.weights)
    print("\nModel Loaded Successfully!")

    # Define the resolution of the input images
    model.net_info['height'] = args.resolution
    input_dim = int(model.net_info['height'])

    # Explicit check instead of assert, which is stripped when running with -O
    if input_dim <= 0 or input_dim % 32 != 0:
        raise ValueError(f'Input resolution must be a positive multiple of 32, got {input_dim}')

    # If there is a GPU, send the model to it
    if CUDA:
        model.cuda()

    # Set the model to evaluation mode
    model.eval()

    # Checkpoint
    read_dir = time.time()

    # Fetch the images for detection: either every file in a directory or a single file
    try:
        candidate_paths = [os.path.join(os.path.realpath('.'), images, img) for img in os.listdir(images)]
    except NotADirectoryError:
        candidate_paths = [os.path.join(os.path.realpath('.'), images)]
    except FileNotFoundError:
        print(f'Could not find a file named {images}')
        raise SystemExit(1) from None

    # If the output directory does not exist, create it
    os.makedirs(args.output, exist_ok=True)

    # Checkpoint
    load_batch = time.time()

    # Load the images with OpenCV; cv2.imread returns None for files it cannot decode,
    # so those are skipped to keep img_list and loaded_images aligned
    img_list = []
    loaded_images = []
    for path in candidate_paths:
        img = cv2.imread(path)
        if img is None:
            print(f'Skipping unreadable image file: {path}')
            continue
        img_list.append(path)
        loaded_images.append(img)

    if not img_list:
        print(f'No readable images found in {images}')
        return

    # Pre-process every image for the network
    img_batches = [prep_image(img, input_dim) for img in loaded_images]

    # Original image dimensions as (width, height), repeated to (w, h, w, h) per image
    img_dim_list = torch.FloatTensor([(img.shape[1], img.shape[0]) for img in loaded_images]).repeat(1, 2)

    # Each batch is a concatenation of up to batch_size images (the last one may be smaller)
    if batch_size != 1:
        img_batches = [torch.cat(img_batches[first:first + batch_size], dim=0)
                       for first in range(0, len(img_batches), batch_size)]

    if CUDA:
        img_dim_list = img_dim_list.cuda()

    # Accumulated detections over all batches (None until the first detection)
    output = None

    # Checkpoints
    start_det_loop = time.time()
    end = start_det_loop

    # Loop through the image batches and detect
    for idx, batch in enumerate(img_batches):

        # Checkpoint
        start = time.time()

        # If using GPU, send the batch there
        if CUDA:
            batch = batch.cuda()

        # Predict with the model
        with torch.no_grad():
            prediction = model(batch, CUDA)

        # Store the prediction details
        prediction = write_results(prediction=prediction, confidence=confidence, num_classes=num_classes, nms_conf=nms_thresh)

        # Checkpoint
        end = time.time()

        # Files that belong to this batch
        batch_files = img_list[idx * batch_size:(idx + 1) * batch_size]

        # An int result means there is no prediction for this batch
        if isinstance(prediction, int):
            for image in batch_files:
                print(f"{os.path.basename(image):20s} predicted in {(end-start)/batch_size:6.3f} seconds.")
                print(f"{'Deteted objects:':20s}")
                print("-----------------------------------------------")
            continue

        # Turn the per-batch image indexes into indexes over all images
        prediction[:, 0] += idx * batch_size

        output = prediction if output is None else torch.cat((output, prediction))

        # Report the detections of every image in this batch
        for img_idx, image in enumerate(batch_files):

            # Image index over all images
            global_img_index = idx * batch_size + img_idx

            # Get the class names according to the indexes
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == global_img_index]

            # Print the result
            print(f"{os.path.basename(image):20s} predicted in {(end-start)/batch_size:6.3f} seconds.")
            print(f"{'Deteted objects:':20s} {' '.join(objs)}")
            print("-----------------------------------------------")

        # Synchronize the GPU with the CPU
        if CUDA:
            torch.cuda.synchronize()

    # Nothing to draw if no batch produced a detection
    if output is None:
        print("No detection was made!")
        return

    # Checkpoint: detection finished, output post-processing starts
    output_recast = time.time()

    # Everything below runs once, after all batches, so the accumulated detections
    # are rescaled exactly one time

    # Dimensions of the image each detection belongs to
    det_dims = torch.index_select(img_dim_list, 0, output[:, 0].long())

    # Image scale factor
    scaling_factors = torch.min(input_dim / det_dims, 1)[0].view(-1, 1)

    # x coordinate of the bbox corners
    output[:, [1, 3]] -= (input_dim - scaling_factors * det_dims[:, 0].view(-1, 1)) / 2

    # y coordinate of the bbox corners
    output[:, [2, 4]] -= (input_dim - scaling_factors * det_dims[:, 1].view(-1, 1)) / 2

    # Resize to the original image size
    output[:, 1:5] /= scaling_factors

    # Clip the bboxes whose limits are outside the image borders
    # https://pytorch.org/docs/stable/generated/torch.clamp.html
    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, float(det_dims[i, 0]))
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, float(det_dims[i, 1]))

    # Color palette
    # NOTE: pickle can execute arbitrary code while loading; only use a trusted palette file
    with open('palette', 'rb') as palette_file:
        colors = pkl.load(palette_file)

    # Checkpoints
    class_load = time.time()
    draw_start = time.time()

    # Draw every detection on its image
    for detection in output:
        write_bbox(detection, loaded_images, classes, colors)

    # Checkpoint
    draw_end = time.time()

    # Save every image (with its detections drawn) as det_<file name> in the output directory
    for path, img in zip(img_list, loaded_images):
        cv2.imwrite(f"{args.output}/det_{os.path.basename(path)}", img)

    print("Summary")
    print("------------------------------------------------")
    print(f"{'Task':25s}: {'Total Time (seconds)'}")
    print()
    print(f"{'Checking Directory':25s}: {load_batch - read_dir:2.3f}")
    print(f"{'Loading Batch':25s}: {start_det_loop - load_batch:2.3f}")
    print(f"{'Detecting ('+str(len(img_list))+' images':25s}: {output_recast - start_det_loop:2.3f}")
    print(f"{'Processing Outputs':25s}: {class_load - output_recast:2.3f}")
    print(f"{'Drawing Bounding Boxes':25s}: {draw_end - draw_start:2.3f}")
    print(f"{'Mean Image Loading Time':25s}: {(end - load_batch) / len(img_list):2.3f}")
    print("------------------------------------------------")

    # Clean the GPU cache
    if CUDA:
        torch.cuda.empty_cache()

    print('\nDetection Finished!')