In [1]:
import numpy as np
import torch.nn as nn
import torchvision.transforms as transforms
import torch
from torch.utils.data import DataLoader
import pandas as pd
from utils import bbox_iou
from dataset import DetectionDataset, Normalise, Pad, ToTensor

## Labels

In [2]:
mean = [92.11938007161459, 102.83839236762152, 104.90335580512152]
std = [66.09941202519124, 70.6808655565459, 75.05305001603533]

## load custom dataset + transforms
transformed_train_data = DetectionDataset(
    label_dict="det_train_shortened.json",
    root_dir='images/',
    classes_file="data/bdd100k.names",
    grid_sizes=[13, 26, 52],
    anchors = np.array([
            [[116,90], [156,198], [373,326]],
            [[30, 61], [62, 45], [59,119]],
            [[10, 13], [16, 30], [33, 23]],
        ]),
    transform=transforms.Compose([
        Normalise(
            mean=mean,
            std=std
        ),
        Pad(416),
        ToTensor()
    ])
)

# separate into batches
train_loader = DataLoader(
    transformed_train_data,
    batch_size=1,
    shuffle=True,
    num_workers=0
)

In [56]:

for i, data in enumerate(train_loader):
    image, labels = data.values()
labels.shape

torch.Size([1, 10647, 17])

### simulate batch size 2

In [57]:
labels = torch.cat((labels[0], labels[0]), 0).reshape(2, -1, 17)
labels.shape

torch.Size([2, 10647, 17])

### reshape to 2d

In [53]:
labels = labels.reshape(-1, 17)
labels.shape

torch.Size([21294, 17])

In [47]:
labels[93]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Predictions

In [62]:
coco_preds = torch.load("ex_tensors/yolo_layer_output_size10647.pt")

In [82]:
coco_preds[coco_preds[:,:,4] > 0.9].shape

torch.Size([8, 85])

In [58]:
pretend_preds = torch.load("ex_tensors/yolo_layer_output_size10647.pt")[:,:,:17]

### simulate batch size 2

In [59]:
prediction = torch.cat((pretend_preds[0], pretend_preds[0]), 0).reshape(2, -1, 17)
# prediction = prediction.reshape(-1,17)
prediction.shape

torch.Size([2, 10647, 17])

## Testing each loss component

In [5]:
labels[:,:,4]

array([0., 1.], dtype=float32)

In [85]:
pretend_preds[:,:,4]

tensor([[3.4648e-07, 2.7254e-08, 9.5856e-09,  ..., 8.5092e-06, 7.8421e-08,
         2.4701e-08]])

In [15]:
t1 = labels[:,:,4]*torch.log(pretend_preds[:,:,4])
t2 = (1 - labels[:,:,4])*torch.log(1 - pretend_preds[:,:,4])
-torch.sum(t1 + t2)

tensor(355.5738)

In [61]:
x_mse = torch.square(prediction[:,:,0] - labels[:,:,0])
y_mse = torch.square(prediction[:,:,1] - labels[:,:,1])
torch.sum(x_mse + y_mse)/prediction.shape[0]

tensor(1.2275e+09)

In [128]:
obj_i = (labels[:,:,4] == 1)

In [130]:
-torch.sum(obj_i*torch.sum(labels[:,:,-12:]*torch.log(pretend_preds[:,:,-12:]), 2))

tensor(250.8586)

In [123]:
-torch.sum(labels[:,:,-12:]*torch.log(pretend_preds[:,:,-12:]))

tensor(250.8586)

obj_ij = 1 if
- there is an obj in cell i 
- confidence of predictor j of this cell is highest among all predictors of this cell

In [148]:
confs = prediction[:,:,4]
confs.shape

torch.Size([2, 10647])

In [172]:
confs.reshape(2, -1, 3).shape

torch.Size([2, 3549, 3])

In [168]:
highest_conf = torch.argmax(confs.reshape(2, -1, 3), axis=2)
highest_conf

tensor([[0, 0, 0,  ..., 2, 0, 0],
        [0, 0, 0,  ..., 2, 0, 0]])

In [174]:
mask = torch.arange(confs.reshape(2, -1, 3).size(2)).reshape(1, 1, -1) == highest_conf.unsqueeze(2)
mask

tensor([[[ True, False, False],
         [ True, False, False],
         [ True, False, False],
         ...,
         [False, False,  True],
         [ True, False, False],
         [ True, False, False]],

        [[ True, False, False],
         [ True, False, False],
         [ True, False, False],
         ...,
         [False, False,  True],
         [ True, False, False],
         [ True, False, False]]])

In [177]:
mask.reshape(2, -1).shape

torch.Size([2, 10647])

In [189]:
obj = (labels[:,:,4] == 1)*mask.reshape(2, -1)

In [190]:
noobj = (labels[:,:,4] == 0)*mask.reshape(2, -1)

In [186]:
x_mse = torch.square(prediction[:,:,0] - labels[:,:,0])
y_mse = torch.square(prediction[:,:,1] - labels[:,:,1])

In [192]:
torch.sum(x_mse+y_mse)

tensor(2.4550e+09)

In [193]:
torch.sum((x_mse+y_mse)*obj)

tensor(138753.9219)

In [194]:
w_mse = torch.square(torch.sqrt(prediction[:,:,2]) - torch.sqrt(labels[:,:,2]))
h_mse = torch.square(torch.sqrt(prediction[:,:,3]) - torch.sqrt(labels[:,:,3]))
torch.sum(w_mse + h_mse)

tensor(1696083.5000)

In [196]:
torch.sum((w_mse*h_mse)*obj)

tensor(31479.4844)

In [197]:
t1 = labels[:,:,4]*torch.log(prediction[:,:,4])
t2 = (1 - labels[:,:,4])*torch.log(1 - prediction[:,:,4])

In [202]:
-torch.sum(t1+t2)

tensor(535.5921)

In [206]:
-torch.sum(noobj*(t1+t2))

tensor(91.4740)

In [204]:
-torch.sum(obj*(t1+t2))

tensor(146.5062)

## Loss function

In [None]:
class Yolo_Loss(nn.Module):
    def __init__(self):
        super().__init__()
        # lambda constants
        self.lambda_class = 1
        self.lambda_noobj = 5
        self.lambda_box = 5
        self.lambda_obj = 1


    def forward(self, prediction, label):
        """
        Computes difference between prediction and label.
        
        Input:
        =prediction=    Tensor of all prediction arrays of size (n_batches, 10647, 5+n_classes).
        =label=         Tensor of all label arryays of size (n_batches, 10647, 5+n_classes).
        
        Output:
        =loss=          Total loss computed for this batch.
        """
        batch_size = prediction.shape[0]

        # I^obj_i in paper
        # mask for actual object in grid
        obj_i = (label[:,:,4] == 1) # size (batch_size, 10647)

        # I^obj_ij in paper
        # mask for when there IS obj in label AND box has highest conf score
        confs = prediction[:,:,4].reshape(batch_size, -1, 3) # reshape to easily find argmax(box1,box2,box3)
        highest_conf = torch.argmax(confs.reshape(batch_size, -1, 3), axis=2)
        mask = torch.arange(confs.reshape(batch_size, -1, 3).size(2)).reshape(1, 1, -1) == highest_conf.unsqueeze(2) # create T/F mask
        mask = mask.reshape(batch_size, -1) # reshape back to (batch_size, 10647)
        # now AND with (there is object) mask
        obj_ij = mask*obj_i # size (batch_size, 10647)

        # I^noobj_ij in paper
        noobj_i = (label[:,:,4] == 0) # true if there are no objects, size (batch_size, 10647)
        noobj_ij = mask*noobj_i # size (batch_size, 10647)

        ## box loss
        # use generic square diff loss (mse)
        x_mse = torch.square(prediction[:,:,0] - label[:,:,0])
        y_mse = torch.square(prediction[:,:,1] - label[:,:,1])
        bbox_centre_mse = x_mse + y_mse
        bbox_centre_loss = torch.sum(obj_ij*bbox_centre_mse)

        w_mse = torch.square(torch.sqrt(prediction[:,:,2]) - torch.sqrt(label[:,:,2]))
        h_mse = torch.square(torch.sqrt(prediction[:,:,3]) - torch.sqrt(label[:,:,3]))
        bbox_dims_mse = w_mse + h_mse
        bbox_dims_loss = torch.sum(obj_ij*bbox_dims_mse)

        bbox_loss = (1/batch_size)*(bbox_centre_loss + bbox_dims_loss)

        ## object loss
        # use binary cross entropy loss
        t1 = label[:,:,4]*torch.log(prediction[:,:,4])
        t2 = (1 - label[:,:,4])*torch.log(1 - prediction[:,:,4])
        obj_bce = t1 + t2
        obj_loss = -(1/batch_size)*torch.sum(obj_ij*obj_bce)

        ## no object loss
        noobj_loss = -(1/batch_size)*torch.sum(noobj_ij*obj_bce)

        ## class loss
        # use cross entropy loss
        class_loss = -torch.sum(obj_i*torch.sum(label[:,:,-12:]*torch.log(prediction[:,:,-12:]), axis=2))

        loss = self.lambda_box*bbox_loss + self.lambda_obj*obj_loss + self.lambda_noobj*noobj_loss + self.lambda_class*class_loss

        return loss

In [None]:
def forward(self, prediction, label, anchors):
    """
    Computes difference between prediction and label.
    
    Input:
    =prediction=    Tensor of all prediction arrays of size (n_batches, 10647, 5+n_classes).
    =label=         Tensor of all label arryays of size (n_batches, 10647, 5+n_classes).
    
    Output:
    =loss=          Total loss computed for this batch.
    """
    # check objectness for identity function - 4th ix in labels and predictions
    obj = (label[:,:,4] == 1) # I^obj_ij
    noobj = (label[:,:,4] == 0) # I^noobj_ij

    ## box coordinate loss
    prediction[None] = self.sigmoid(prediction[None]) # currently x,y coords
    label[None] = torch.log((1e-16 + label[None]/anchors)) # width and height coords
    bbox_coord_loss = self.mse(prediction[None][obj], label[None][obj])

    ## object loss
    anchors = anchors.reshape(1,3,1,1,2)
    box_preds = torch.cat([self.sigmoid(prediction[None]), torch.exp(prediction[None])*anchors], dim=-1)
    result = bbox_iou(box_preds[obj], label[None][obj]).detach()
    obj_loss = self.mse(self.sigmoid(prediction[None][obj]), result*label[None][obj])

    ## no object loss
    noobj_loss = self.bcwell((prediction[None][noobj]), (label[None][noobj]))

    ## class loss
    class_loss = self.cross_entropy((prediction[None][obj]), (label[None][obj].long()))

    loss = self.lambda_box*bbox_coord_loss + self.lambda_obj*obj_loss + self.lambda_noobj*noobj_loss + self.lambda_class*class_loss

    return loss