<a href="https://colab.research.google.com/github/Jaseelkt007/ML/blob/master/Object%20detection%20using%20Yolov1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import torch
import torch.nn as nn

"""
About the architecture config:
A tuple is structured as (kernel_size, filters, stride, padding).
"M" is a max-pooling layer with a 2x2 kernel and stride 2.
A list holds conv tuples followed by an int giving the number of times that sequence is repeated.
"""

# Ordered layer specification that Yolov1._create_conv_layers turns into the
# convolutional backbone. Tuples are (kernel_size, filters, stride, padding),
# "M" is a 2x2 max pool with stride 2, and a list is
# [conv tuple, conv tuple, number of repeats].
architecture_config = [
    (7, 64, 2, 3),  # 7x7 conv, 64 filters, stride 2, padding 3
    "M",  # 2x2 max pool, stride 2
    (3, 192, 1, 1),  # 3x3 conv, 192 filters
    "M",  # 2x2 max pool, stride 2
    (1, 128, 1, 0),  # 1x1 conv, 128 filters
    (3, 256, 1, 1),  # 3x3 conv, 256 filters
    (1, 256, 1, 0),  # 1x1 conv, 256 filters
    (3, 512, 1, 1),  # 3x3 conv, 512 filters
    "M",  # 2x2 max pool, stride 2
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],  # (1x1 conv 256, 3x3 conv 512) repeated 4 times
    (1, 512, 1, 0),  # 1x1 conv, 512 filters
    (3, 1024, 1, 1),  # 3x3 conv, 1024 filters
    "M",  # 2x2 max pool, stride 2
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # (1x1 conv 512, 3x3 conv 1024) repeated 2 times
    (3, 1024, 1, 1),  # 3x3 conv, 1024 filters
    (3, 1024, 2, 1),  # 3x3 conv, 1024 filters, stride 2
    (3, 1024, 1, 1),  # 3x3 conv, 1024 filters
    (3, 1024, 1, 1),  # 3x3 conv, 1024 filters
]


class CNNBlock(nn.Module):
    """Convolution followed by batch normalisation and LeakyReLU(0.1).

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of filters produced by the convolution.
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (e.g. ``kernel_size``,
            ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # The convolution is created without a bias term (bias=False); the
        # batch-norm layer that follows it operates on out_channels channels.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batch norm -> leaky ReLU to ``x`` and return the result."""
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.leakyrelu(normalised)


class Yolov1(nn.Module):
    """YOLOv1 network: a convolutional backbone followed by a fully connected head.

    The backbone is built from the module-level ``architecture_config``; the
    head maps the flattened backbone output to ``S * S * (C + B * 5)`` values
    per sample.

    Args:
        in_channels: Number of channels of the input image (default 3).
        **kwargs: Forwarded to ``_create_fcs``; must provide ``split_size``,
            ``num_boxes`` and ``num_classes``.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Run the backbone, flatten everything but the batch axis, run the head."""
        features = self.darknet(x)
        flattened = torch.flatten(features, start_dim=1)
        return self.fcs(flattened)

    def _create_conv_layers(self, architecture):
        """Translate the architecture description into an ``nn.Sequential``.

        Tuples become a ``CNNBlock``, strings become a 2x2/stride-2 max pool,
        and lists become two ``CNNBlock``s repeated the given number of times.
        Entries of any other type are ignored.
        """

        def conv_block(channels_in, spec):
            # spec is (kernel_size, filters, stride, padding)
            return CNNBlock(
                channels_in,
                spec[1],
                kernel_size=spec[0],
                stride=spec[2],
                padding=spec[3],
            )

        modules = []
        channels = self.in_channels

        for entry in architecture:
            entry_type = type(entry)

            if entry_type is tuple:
                modules.append(conv_block(channels, entry))
                channels = entry[1]

            elif entry_type is str:
                modules.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            elif entry_type is list:
                first_spec = entry[0]
                second_spec = entry[1]
                repeats = entry[2]

                for _ in range(repeats):
                    modules.append(conv_block(channels, first_spec))
                    # The second conv consumes the first conv's filters.
                    modules.append(conv_block(first_spec[1], second_spec))
                    channels = second_spec[1]

        return nn.Sequential(*modules)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the fully connected head.

        The first linear layer expects ``1024 * split_size * split_size`` input
        features, i.e. it assumes the backbone emits a 1024-channel map of
        spatial size split_size x split_size (TODO confirm for input sizes
        other than 448x448).
        """
        in_features = 1024 * split_size * split_size
        out_features = split_size * split_size * (num_classes + num_boxes * 5)
        head = [
            nn.Flatten(),
            nn.Linear(in_features, 496),  # reduced to 496 for simplification
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, out_features),
        ]
        return nn.Sequential(*head)

def test(S=7, B=2, C=20):
    """Smoke test: push a random batch of two 448x448 RGB images through Yolov1
    and print the shape of the output."""
    network = Yolov1(split_size=S, num_boxes=B, num_classes=C)
    batch = torch.randn((2, 3, 448, 448))
    output = network(batch)
    print(output.shape)
#test()

torch.Size([2, 1470])


In [26]:
from collections import Counter


def _box_corners(boxes, box_format):
    """Return (x1, y1, x2, y2) slices of ``boxes``, each keeping the last axis."""
    if box_format == "midpoint":  # box = [x_center, y_center, w, h]
        half_w = boxes[..., 2:3] / 2
        half_h = boxes[..., 3:4] / 2
        return (
            boxes[..., 0:1] - half_w,
            boxes[..., 1:2] - half_h,
            boxes[..., 0:1] + half_w,
            boxes[..., 1:2] + half_h,
        )
    if box_format == "corners":  # box = [x1, y1, x2, y2]
        # Slices (not integer indices) are used so the last axis is retained.
        return (
            boxes[..., 0:1],
            boxes[..., 1:2],
            boxes[..., 2:3],
            boxes[..., 3:4],
        )
    raise ValueError(
        f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
    )


def intersection_over_union(boxes_preds, box_labels, box_format="midpoint"):
    """Compute the intersection-over-union between predicted and label boxes.

    Args:
        boxes_preds: Tensor whose last axis holds the 4 box coordinates.
        box_labels: Tensor with the same layout, broadcastable against
            ``boxes_preds``.
        box_format: ``"midpoint"`` for (x_center, y_center, w, h) or
            ``"corners"`` for (x1, y1, x2, y2).

    Returns:
        Tensor of IoU values; the last axis has size 1.

    Raises:
        ValueError: If ``box_format`` is neither ``"midpoint"`` nor ``"corners"``.
    """
    box1_x1, box1_y1, box1_x2, box1_y2 = _box_corners(boxes_preds, box_format)
    box2_x1, box2_y1, box2_x2, box2_y2 = _box_corners(box_labels, box_format)

    # Corners of the overlapping rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) makes the overlap zero when the boxes do not intersect.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    # The height must be y2 - y1; the previous y2 - y2 made this area always 0.
    box1_area = ((box1_x2 - box1_x1) * (box1_y2 - box1_y1)).abs()
    box2_area = ((box2_x2 - box2_x1) * (box2_y2 - box2_y1)).abs()

    # The small constant avoids division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)


# Non Max Suppression
def non_max_suppresion(
    bboxes,
    prob_threshold,
    iou_threshold,
    box_format='corners',
):
    """Filter overlapping detections with non-max suppression.

    Args:
        bboxes: List of boxes, each ``[class, prob, c1, c2, c3, c4]`` where the
            four coordinates are interpreted according to ``box_format``.
        prob_threshold: Boxes whose probability is not above this are dropped.
        iou_threshold: A box of the same class as an already chosen box is
            kept only if its IoU with that box is below this value.
        box_format: ``"corners"`` or ``"midpoint"``; forwarded to
            ``intersection_over_union``.

    Returns:
        The list of surviving boxes, ordered by descending probability.
    """
    assert isinstance(bboxes, list)

    # Keep confident boxes only, most confident first.
    candidates = sorted(
        (box for box in bboxes if box[1] > prob_threshold),
        key=lambda box: box[1],
        reverse=True,
    )
    bboxes_after_nms = []

    while candidates:
        chosen_box = candidates.pop(0)  # highest remaining probability
        chosen_coords = torch.tensor(chosen_box[2:])

        # A candidate survives if it belongs to a different class, or if it
        # overlaps the chosen box by less than iou_threshold. box_format is
        # an argument of intersection_over_union, not of torch.tensor.
        candidates = [
            box
            for box in candidates
            if box[0] != chosen_box[0]
            or intersection_over_union(
                chosen_coords,
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold
        ]
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms


def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """Compute the mean average precision over all classes.

    Args:
        pred_boxes: List of predictions, each
            ``[train_idx, class_idx, prob_score, c1, c2, c3, c4]``.
        true_boxes: List of ground-truth boxes in the same layout.
        iou_threshold: A prediction counts as a match only if its best IoU
            with a ground-truth box of the same image exceeds this value.
        box_format: ``"midpoint"`` or ``"corners"``; forwarded to
            ``intersection_over_union``.
        num_classes: Classes ``0 .. num_classes - 1`` are evaluated.

    Returns:
        Scalar tensor with the mean of the per-class average precisions.
        Classes without any ground-truth box are skipped; if every class is
        skipped the result is ``tensor(0.)``.
    """
    average_precisions = []  # one AP per class that has ground truths
    epsilon = 1e-6  # numerical stability for the divisions below

    for c in range(num_classes):
        detections = [det for det in pred_boxes if det[1] == c]
        ground_truths = [gt for gt in true_boxes if gt[1] == c]

        total_true_bboxes = len(ground_truths)
        if total_true_bboxes == 0:
            continue

        # For every image, one flag per ground-truth box recording whether
        # that box has already been matched to a prediction,
        # e.g. {0: tensor([0, 0, 0]), 1: tensor([0, 0, 0, 0, 0])}.
        gt_counts = Counter(gt[0] for gt in ground_truths)
        matched = {img: torch.zeros(count) for img, count in gt_counts.items()}

        # Most confident predictions get the first chance to claim a box.
        detections.sort(key=lambda det: det[2], reverse=True)
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            # Ground-truth boxes of this class in the detection's image.
            image_gts = [gt for gt in ground_truths if gt[0] == detection[0]]
            detection_box = torch.tensor(detection[3:])

            best_iou = 0.0
            best_gt_idx = None
            for idx, gt in enumerate(image_gts):
                iou = float(
                    intersection_over_union(
                        detection_box,
                        torch.tensor(gt[3:]),
                        box_format=box_format,
                    )
                )
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # True positive only if the overlap is large enough AND the
            # ground-truth box has not been claimed by an earlier prediction.
            if (
                best_gt_idx is not None
                and best_iou > iou_threshold
                and matched[detection[0]][best_gt_idx] == 0
            ):
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
        # Prepend the (recall=0, precision=1) starting point of the curve.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # Area under the precision-recall curve via the trapezoidal rule.
        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        # No class had ground truths; avoid dividing by zero.
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)

In [29]:
# YOLO loss

class YoloLoss(nn.Module):
    """Sum-of-squares YOLOv1 loss.

    The last axis of predictions is read as
    ``[C class scores, conf_1, box_1 (x, y, w, h), conf_2, box_2 (x, y, w, h)]``;
    the target uses the same class / conf_1 / box_1 layout. Only two predicted
    boxes per cell are used, whatever value ``B`` has.

    Args:
        S: Grid size (the image is divided into S x S cells).
        B: Number of boxes predicted per cell.
        C: Number of classes.
    """

    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = 5  # weight of the box-coordinate term
        self.lambda_noobj = 0.5  # weight of the no-object confidence term

    def forward(self, predictions, target):
        """Return the scalar loss.

        Args:
            predictions: Tensor reshaped here to ``(-1, S, S, C + B * 5)``.
            target: 4-D tensor whose last axis holds at least ``C + 5``
                entries (class scores, objectness, box).
        """
        C = self.C
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        # Offsets are derived from C instead of being hard-coded for C == 20.
        conf1 = slice(C, C + 1)
        box1 = slice(C + 1, C + 5)
        conf2 = slice(C + 5, C + 6)
        box2 = slice(C + 6, C + 10)

        # IoU of each predicted box with the target box. The second box must
        # be taken from the LAST axis (the ellipsis was missing before, which
        # sliced the batch axis instead).
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        # best_box is 0 where box 1 has the higher IoU and 1 where box 2 does.
        _, best_box = torch.max(ious, dim=0)
        exists_box = target[..., C].unsqueeze(3)  # 1 where the cell holds an object

        # Box coordinates: take the responsible box, zeroed in empty cells.
        box_predictions = exists_box * (
            best_box * predictions[..., box2]
            + (1 - best_box) * predictions[..., box1]
        )  # x, y, w, h
        box_targets = exists_box * target[..., box1]

        # Square root of width/height; sign/abs keep it defined for negative
        # predictions and the epsilon keeps the gradient finite at zero.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        # (N, S, S, 4) -> (N * S * S, 4). Compare against the targets; the
        # previous code compared the predictions with themselves.
        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # Object loss: confidence of the responsible box.
        pred_box = (
            best_box * predictions[..., conf2]
            + (1 - best_box) * predictions[..., conf1]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # No-object loss: both confidences, in cells without an object.
        # (N, S, S, 1) -> (N, S * S)
        no_object = 1 - exists_box
        no_object_target = torch.flatten(no_object * target[..., conf1], start_dim=1)
        no_object_loss = self.mse(
            torch.flatten(no_object * predictions[..., conf1], start_dim=1),
            no_object_target,
        )
        no_object_loss += self.mse(
            torch.flatten(no_object * predictions[..., conf2], start_dim=1),
            no_object_target,
        )

        # Class loss: (N, S, S, C) -> (N * S * S, C)
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        return (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )
