In [1]:
import numpy as np
import torch.nn as nn
import torchvision.transforms as transforms
import torch
from torch.utils.data import DataLoader
import pandas as pd
from utils import bbox_iou
from dataset import DetectionDataset, Normalise, Pad, ToTensor

## Labels

In [2]:
# Per-channel statistics passed to the Normalise transform below.
mean = [92.11938007161459, 102.83839236762152, 104.90335580512152]
std = [66.09941202519124, 70.6808655565459, 75.05305001603533]

# 3 x 3 x 2 anchor array handed to the dataset together with the three grid sizes.
anchor_boxes = np.array([
    [[116, 90], [156, 198], [373, 326]],
    [[30, 61], [62, 45], [59, 119]],
    [[10, 13], [16, 30], [33, 23]],
])

# Transforms are applied in this order: Normalise, Pad(416), ToTensor.
preprocessing = transforms.Compose([
    Normalise(mean=mean, std=std),
    Pad(416),
    ToTensor(),
])

## load custom dataset + transforms
transformed_train_data = DetectionDataset(
    label_dict="det_train_shortened.json",
    root_dir='images/',
    classes_file="data/bdd100k.names",
    grid_sizes=[13, 26, 52],
    anchors=anchor_boxes,
    transform=preprocessing,
)

# separate into batches (one sample per batch, shuffled, loaded in the main process)
train_loader = DataLoader(
    transformed_train_data,
    batch_size=1,
    shuffle=True,
    num_workers=0,
)

In [56]:

# Pull a single batch instead of looping over the whole loader just to keep
# the last one: the loader is shuffled, so any batch is an equally arbitrary
# sample, and this avoids loading every image in the dataset.
data = next(iter(train_loader))
image, labels = data.values()
labels.shape

torch.Size([1, 10647, 17])

### simulate batch size 2

In [57]:
# Duplicate the first sample so the tensor looks like a batch of two.
labels = torch.stack((labels[0],) * 2).reshape(2, -1, 17)
labels.shape

torch.Size([2, 10647, 17])

### reshape to 2d

In [53]:
# Collapse every leading axis so each row holds one 17-entry label vector.
labels = torch.reshape(labels, (-1, 17))
labels.shape

torch.Size([21294, 17])

In [47]:
# Inspect the label entry at index 93.
labels[93]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Predictions

In [62]:
# Load a saved tensor from disk; the cells below index it as a 3-D tensor
# whose last axis has 85 entries.
coco_preds = torch.load("ex_tensors/yolo_layer_output_size10647.pt")

In [82]:
# Select the rows whose entry at index 4 exceeds 0.9 and show the shape of that selection.
coco_preds[coco_preds[:,:,4] > 0.9].shape

torch.Size([8, 85])

In [97]:
# Last 80 entries of the first selected row.
coco_preds[coco_preds[:,:,4] > 0.9][0][-80:]

tensor([2.1258e-04, 1.3264e-05, 2.4101e-01, 1.8465e-05, 1.6691e-05, 2.1706e-03,
        4.3476e-05, 8.3802e-01, 1.8591e-04, 6.2785e-06, 5.8800e-05, 1.1566e-05,
        3.5152e-06, 1.5744e-05, 2.4891e-07, 1.0322e-07, 4.5328e-06, 3.4355e-05,
        1.3557e-05, 3.3143e-05, 1.4100e-05, 4.8677e-06, 1.0699e-05, 3.8718e-06,
        4.3630e-06, 1.5898e-05, 2.1193e-05, 4.1297e-07, 5.3927e-06, 1.0333e-05,
        3.0426e-06, 1.2371e-05, 3.0568e-06, 3.5038e-05, 1.8086e-06, 3.6378e-06,
        8.2389e-06, 9.5498e-06, 1.3817e-06, 6.0767e-05, 1.4973e-06, 1.4722e-04,
        5.9483e-06, 1.3926e-06, 1.6157e-06, 1.3894e-05, 1.3107e-05, 6.3812e-06,
        2.6948e-05, 4.2925e-05, 2.3591e-05, 7.2650e-05, 2.0674e-05, 1.2399e-05,
        2.1565e-05, 8.5081e-05, 2.1246e-05, 4.1816e-05, 1.6182e-04, 2.3307e-06,
        8.8631e-06, 2.0375e-06, 8.7221e-06, 1.2796e-05, 2.2218e-06, 1.8633e-06,
        2.5516e-05, 1.0967e-05, 1.4625e-05, 4.6067e-06, 1.0541e-05, 8.3772e-07,
        1.9108e-05, 1.2276e-05, 1.2190e-

In [98]:
# Run a softmax over the same 80 trailing entries to compare with the raw values.
sm = nn.Softmax(dim=-1)
first_selected_tail = coco_preds[coco_preds[:, :, 4] > 0.9][0][-80:]
sm(first_selected_tail)

tensor([0.0123, 0.0123, 0.0156, 0.0123, 0.0123, 0.0123, 0.0123, 0.0283, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123,
        0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123, 0.0123])

In [83]:
# Select the rows whose index-4 entry exceeds 0.9 once (the mask does not
# change between iterations) and print the sum of each row's last 80 entries.
# Iterating over the selection avoids hard-coding how many rows it contains.
confident_rows = coco_preds[coco_preds[:, :, 4] > 0.9]
for row in confident_rows:
    print(torch.sum(row[-80:]))

tensor(1.0830)
tensor(1.0040)
tensor(0.9999)
tensor(1.0000)
tensor(1.0014)
tensor(0.9986)
tensor(0.9994)
tensor(1.1833)


In [88]:
# Same sum over the last 80 entries, for the first 8 rows of batch element 0 with no filtering.
for trailing_entries in coco_preds[0, :8, -80:]:
    print(torch.sum(trailing_entries))

tensor(0.6226)
tensor(1.1889)
tensor(2.1072)
tensor(0.3755)
tensor(0.3186)
tensor(0.4990)
tensor(0.4447)
tensor(0.3065)


In [58]:
# Load the saved tensor again and keep only the first 17 entries of the last
# axis, which matches the width of 17 used for the labels.
# NOTE(review): this is the same file loaded into coco_preds in another cell.
pretend_preds = torch.load("ex_tensors/yolo_layer_output_size10647.pt")[:,:,:17]

### simulate batch size 2

In [59]:
# Duplicate the first prediction so it lines up with a two-sample label batch.
prediction = torch.stack((pretend_preds[0],) * 2).reshape(2, -1, 17)
prediction.shape

torch.Size([2, 10647, 17])

## Testing each loss component

In [5]:
# Entry 4 of every label vector (the loss code in this notebook treats index 4 as objectness).
# NOTE(review): this indexing needs a 3-D `labels`; an earlier cell in file
# order reshapes `labels` to 2-D, so confirm which state this cell expects.
labels[:,:,4]

array([0., 1.], dtype=float32)

In [85]:
# Entry 4 of every prediction vector, for comparison with the same index of the labels.
pretend_preds[:,:,4]

tensor([[3.4648e-07, 2.7254e-08, 9.5856e-09,  ..., 8.5092e-06, 7.8421e-08,
         2.4701e-08]])

In [15]:
# Hand-written binary cross-entropy on entry 4, summed over every element.
target_conf = labels[:, :, 4]
predicted_conf = pretend_preds[:, :, 4]
t1 = target_conf * torch.log(predicted_conf)            # term weighted by the label
t2 = (1 - target_conf) * torch.log(1 - predicted_conf)  # term weighted by (1 - label)
-(t1 + t2).sum()

tensor(355.5738)

In [61]:
# Squared error on entries 0 and 1, summed and divided by the size of the first axis.
x_mse = (prediction[:, :, 0] - labels[:, :, 0]).pow(2)
y_mse = (prediction[:, :, 1] - labels[:, :, 1]).pow(2)
(x_mse + y_mse).sum() / prediction.shape[0]

tensor(1.2275e+09)

In [102]:
# Negated sum of label * log(prediction) over the last 12 entries of each vector.
-torch.sum(labels[:,:,-12:]*torch.log(pretend_preds[:,:,-12:]))

tensor(250.8586)

In [None]:
class Yolo_Loss(nn.Module):
    """
    Loss made of four weighted terms: box, objectness, no-object and class.

    Both inputs are indexed on their last axis as
    (x, y, w, h, objectness, class scores...). The objectness and class
    entries of `prediction` are passed to `torch.log`, so they are assumed to
    already be probabilities in [0, 1] -- TODO confirm against the model head.
    """

    # Floor/ceiling used before log and sqrt so those calls stay finite.
    _EPS = 1e-7

    def __init__(self):
        super().__init__()
        # losses and functions (forward below does not call these; they are
        # kept so existing attribute access keeps working)
        self.bcwell = nn.BCEWithLogitsLoss()
        self.mse = nn.MSELoss()
        self.cross_entropy = nn.CrossEntropyLoss()
        self.sigmoid = nn.Sigmoid()

        # lambda constants: weight of each term in the total loss
        self.lambda_class = 1
        self.lambda_noobj = 10
        self.lambda_box = 10
        self.lambda_obj = 1


    def forward(self, prediction, label):
        """
        Computes difference between prediction and label.

        Input:
        =prediction=    Tensor of all prediction arrays of size (n_batches, 10647, 5+n_classes).
        =label=         Tensor of all label arrays of size (n_batches, 10647, 5+n_classes).

        Output:
        =loss=          Total loss computed for this batch (scalar tensor). Every
                        term is divided by the batch size.
        """
        batch_size = prediction.shape[0]
        eps = self._EPS

        # check objectness for identity function - index 4 in labels and predictions
        obj = (label[:, :, 4] == 1)    # I^obj_ij
        noobj = (label[:, :, 4] == 0)  # I^noobj_ij

        ## box loss
        # squared difference, restricted to cells that contain an object so
        # that empty cells do not contribute to the coordinate error
        pred_box = prediction[:, :, :4][obj]
        true_box = label[:, :, :4][obj]
        bbox_centre_loss = torch.sum(torch.square(pred_box[:, :2] - true_box[:, :2]))

        # clamp before sqrt: a negative width/height would otherwise give NaN
        pred_dims = torch.sqrt(pred_box[:, 2:].clamp(min=eps))
        true_dims = torch.sqrt(true_box[:, 2:].clamp(min=eps))
        bbox_dims_loss = torch.sum(torch.square(pred_dims - true_dims))

        bbox_loss = (bbox_centre_loss + bbox_dims_loss) / batch_size

        ## object / no object loss
        # binary cross entropy, split by mask so the two lambdas weight
        # different cells; the clamp keeps log() away from 0
        confidence = prediction[:, :, 4].clamp(min=eps, max=1 - eps)
        obj_loss = -torch.sum(torch.log(confidence[obj])) / batch_size
        noobj_loss = -torch.sum(torch.log(1 - confidence[noobj])) / batch_size

        ## class loss
        # cross entropy over every entry after the first five, object cells only
        class_probs = prediction[:, :, 5:][obj].clamp(min=eps)
        class_loss = -torch.sum(label[:, :, 5:][obj] * torch.log(class_probs)) / batch_size

        loss = (
            self.lambda_box * bbox_loss
            + self.lambda_obj * obj_loss
            + self.lambda_noobj * noobj_loss
            + self.lambda_class * class_loss
        )

        return loss

In [None]:
# NOTE(review): this function is defined at module level, not inside a class,
# although it takes `self` and reads self.sigmoid, self.mse, self.bcwell,
# self.cross_entropy and the self.lambda_* weights.
def forward(self, prediction, label, anchors):
    """
    Computes difference between prediction and label.

    NOTE(review): every `[None]` index below sits where a column slice would
    be expected; they look like placeholders for indices not yet filled in.
    Confirm the intended slices before running this.

    Input:
    =prediction=    Tensor of all prediction arrays of size (n_batches, 10647, 5+n_classes).
                    Written to in place by the box coordinate step.
    =label=         Tensor of all label arrays of size (n_batches, 10647, 5+n_classes).
                    Written to in place by the box coordinate step.
    =anchors=       Divides the label values in the box coordinate step and is
                    then reshaped to (1, 3, 1, 1, 2).

    Output:
    =loss=          Total loss computed for this batch.
    """
    # check objectness for identity function - 4th ix in labels and predictions
    obj = (label[:,:,4] == 1) # I^obj_ij
    noobj = (label[:,:,4] == 0) # I^noobj_ij

    ## box coordinate loss
    # both assignments below modify the caller's tensors in place
    prediction[None] = self.sigmoid(prediction[None]) # currently x,y coords
    label[None] = torch.log((1e-16 + label[None]/anchors)) # width and height coords
    bbox_coord_loss = self.mse(prediction[None][obj], label[None][obj])

    ## object loss
    # NOTE(review): the 5-D anchor shape does not obviously match the 3-D
    # inputs described in the docstring -- confirm the expected layout
    anchors = anchors.reshape(1,3,1,1,2)
    box_preds = torch.cat([self.sigmoid(prediction[None]), torch.exp(prediction[None])*anchors], dim=-1)
    # bbox_iou comes from utils; its result is detached before being used to scale the target
    result = bbox_iou(box_preds[obj], label[None][obj]).detach()
    obj_loss = self.mse(self.sigmoid(prediction[None][obj]), result*label[None][obj])

    ## no object loss
    # BCEWithLogitsLoss on the cells selected by the noobj mask
    noobj_loss = self.bcwell((prediction[None][noobj]), (label[None][noobj]))

    ## class loss
    # CrossEntropyLoss with the label cast to long
    class_loss = self.cross_entropy((prediction[None][obj]), (label[None][obj].long()))

    # weighted sum of the four terms
    loss = self.lambda_box*bbox_coord_loss + self.lambda_obj*obj_loss + self.lambda_noobj*noobj_loss + self.lambda_class*class_loss

    return loss