In [None]:
# Install the COCO API bindings and the Weights & Biases client imported by the cells below.
!pip install pycocotools
!pip install wandb

In [None]:
# Clone the repository that contains the YOLOv3 config file into the Kaggle working directory.
%cd /kaggle/working
!git clone https://github.com/KaushalJadhav/config_for_YOLOV3

In [None]:
# NOTE(review): this cell deletes the clone created by the previous cell, while cfg_path
# (set in the configuration cell further down) points inside config_for_YOLOV3.
# Running every cell top to bottom therefore removes the config before it is read;
# this looks like a manual clean-up cell that should be skipped or removed - confirm.
%cd /kaggle/working
!rm -r config_for_YOLOV3

In [3]:
from __future__ import division
import pycocotools
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import cv2
from collections import defaultdict
import json
import tempfile
import matplotlib.pyplot as plt
import argparse
import yaml
import random
import wandb

In [4]:
def preprocess(img, imgsize, jitter, random_placing=False):
    """
    Resize an image while keeping (optionally jittered) aspect ratio and paste it
    onto a square grey canvas.

    Args:
        img (numpy.ndarray): image of shape (H, W, 3). The channel order is reversed
            before any other processing (BGR -> RGB when the image comes from cv2.imread).
        imgsize (int): side length of the square output canvas.
        jitter (float): relative amplitude of the random aspect-ratio change.
            A value <= 0 disables jitter.
        random_placing (bool): if True the resized image is placed at a random offset
            inside the canvas, otherwise it is centred.
    Returns:
        sized (numpy.ndarray): uint8 canvas of shape (imgsize, imgsize, 3), filled with 127
            outside the pasted image.
        info_img (tuple): (h, w, nh, nw, dx, dy) - original height/width, resized
            height/width and the x/y offset of the pasted image in the canvas.
    """
    # Validate before touching the array; the check is useless once .shape has been read.
    assert img is not None
    h, w, _ = img.shape
    img = img[:, :, ::-1]

    if jitter > 0:
        # add jitter
        # resize the images to random scale. Uniform distribution used.
        dw = jitter * w
        dh = jitter * h
        new_ar = (w + np.random.uniform(low=-dw, high=dw))/(h + np.random.uniform(low=-dh, high=dh))
    else:
        new_ar = w / h

    # The longer side is pinned to imgsize so the other side never exceeds the canvas
    # after truncation to int.
    if new_ar < 1:
        nh = imgsize
        nw = nh * new_ar
    else:
        nw = imgsize
        nh = nw / new_ar
    nw, nh = int(nw), int(nh)

    if random_placing:
        # Explicit [0, free space) range. Passing the free space as the only positional
        # argument would make it the *lower* bound with high=1.0, a low > high call that
        # NumPy documents as undefined.
        dx = int(np.random.uniform(0, imgsize - nw))
        dy = int(np.random.uniform(0, imgsize - nh))
    else:
        dx = (imgsize - nw) // 2
        dy = (imgsize - nh) // 2

    img = cv2.resize(img, (nw, nh))
    sized = np.ones((imgsize, imgsize, 3), dtype=np.uint8) * 127
    sized[dy:dy+nh, dx:dx+nw, :] = img

    info_img = (h, w, nh, nw, dx, dy)
    return sized, info_img

def rand_scale(s):
    """
    Draw a random scale factor from [1, s) and return either it or its reciprocal.

    Args:
        s (float): upper bound of the uniform draw.
    Returns:
        float: the drawn factor when a second uniform draw exceeds 0.5,
        otherwise its reciprocal.
    """
    factor = np.random.uniform(low=1, high=s)
    coin = np.random.rand()
    return factor if coin > 0.5 else 1 / factor

def random_distort(img, hue, saturation, exposure):
    """
    Apply a random colour distortion in HSV space.

    Args:
        img (numpy.ndarray): image passed to cv2.cvtColor with COLOR_RGB2HSV.
        hue (float): the hue shift is drawn uniformly from [-hue, hue).
        saturation (float): bound handed to rand_scale for the saturation gain.
        exposure (float): bound handed to rand_scale for the value (brightness) gain.
    Returns:
        numpy.ndarray: distorted image converted back with COLOR_HSV2RGB, as float32.
    """
    hue_shift = np.random.uniform(low=-hue, high=hue)
    sat_gain = rand_scale(saturation)
    exp_gain = rand_scale(exposure)

    # Work on the HSV planes scaled by 1/255.
    hsv = np.asarray(cv2.cvtColor(img, cv2.COLOR_RGB2HSV), dtype=np.float32) / 255.
    hsv[:, :, 1] *= sat_gain
    hsv[:, :, 2] *= exp_gain

    # Shift the hue plane and wrap values that leave the [0, 1] interval.
    shifted = hsv[:, :, 0] + hue_shift
    if hue_shift > 0:
        shifted[shifted > 1.0] -= 1.0
    else:
        shifted[shifted < 0.0] += 1.0
    hsv[:, :, 0] = shifted

    # Back to 8-bit HSV, then to RGB, returned as float32.
    hsv_bytes = (hsv * 255).clip(0, 255).astype(np.uint8)
    rgb = cv2.cvtColor(hsv_bytes, cv2.COLOR_HSV2RGB)
    return np.asarray(rgb, dtype=np.float32)


In [5]:
class conv_labels():
    """
    Convert bounding boxes between original-image coordinates and the coordinates of
    the letterboxed (resized + padded) network input.

    The conversion runs in the constructor; the result is stored in ``self.output``.

    Args:
        info_img (tuple): (h, w, nh, nw, dx, dy) - original height/width, resized
            height/width and x/y padding offset of the image inside the canvas.
        label2box (bool): True  -> convert ``labels`` with ``label2yolobox``;
                          False -> convert ``box`` with ``yolobox2label``.
        labels (numpy.ndarray): array of shape (N, 5) with rows
            [class, x_top_left, y_top_left, width, height] in original-image pixels.
            Only needed when ``label2box`` is True.
        box (sequence): (y1, x1, y2, x2) in canvas pixels. Only needed when
            ``label2box`` is False.
        maxsize (int): side length of the square canvas (used by ``label2yolobox``).
        lrflip (bool): mirror the x centre horizontally (used by ``label2yolobox``).
    """
    def __init__(self,info_img,label2box,labels=None,box=None,maxsize=None,lrflip=None):
        self.info_img = info_img
        self.labels = labels
        self.box = box
        if label2box:
            self.output = self.label2yolobox(maxsize, lrflip)
        else:
            self.output = self.yolobox2label()

    def label2yolobox(self,maxsize,lrflip):
        """
        Turn [class, x1, y1, w, h] pixel labels into [class, xc, yc, w, h] normalised
        by ``maxsize``, accounting for the resize and padding described by ``info_img``.

        Float arrays are modified in place (and returned); arrays of any other dtype are
        first converted to float64, because an integer array cannot hold the fractional
        results.
        """
        h, w, nh, nw, dx, dy = self.info_img
        labels = np.asarray(self.labels)
        if not np.issubdtype(labels.dtype, np.floating):
            labels = labels.astype(np.float64)
        self.labels = labels

        # Corners as fractions of the original image size.
        x1, y1 = labels[:, 1] / w, labels[:, 2] / h
        x2, y2 = x1 + (labels[:, 3] / w), y1 + (labels[:, 4] / h)
        # Box centre mapped into the padded canvas, then normalised by the canvas size.
        labels[:, 1] = (((x1 + x2) / 2) * nw + dx) / maxsize
        labels[:, 2] = (((y1 + y2) / 2) * nh + dy) / maxsize
        # Width/height only need the resize factor, not the padding offset.
        labels[:, 3] *= nw / w / maxsize
        labels[:, 4] *= nh / h / maxsize
        if lrflip:
            labels[:, 1] = 1 - labels[:, 1]
        return labels


    def yolobox2label(self):
        """
        Map a (y1, x1, y2, x2) box from canvas pixels back to original-image pixels.

        Returns:
            list: [y1, x1, y2, x2] in the original image.
        """
        h, w, nh, nw, dx, dy = self.info_img
        y1, x1, y2, x2 = self.box
        box_h = ((y2 - y1) / nh) * h
        box_w = ((x2 - x1) / nw) * w
        # Remove the padding offset before undoing the resize.
        y1 = ((y1 - dy) / nh) * h
        x1 = ((x1 - dx) / nw) * w
        label = [y1, x1, y1 + box_h, x1 + box_w]
        return label

In [8]:
class COCODataset(Dataset):
    """
    COCO detection dataset that yields letterboxed images together with padded labels.

    Args:
        model (str): model name; labels are converted to YOLO format when it contains 'YOLO'.
        data_dir (str): dataset root. Annotations are read from
            ``<data_dir>/annotations_trainval2017/annotations/<json_file>`` and images from
            ``<data_dir>/<split>/<split>/``.
        json_file (str): name of the COCO annotation file.
        train (bool): use the 'train2017' image folder if True, 'val2017' otherwise.
        img_size (int): side length of the square network input.
        augmentation (dict or None): keys LRFLIP, JITTER, RANDOM_PLACING, HUE, SATURATION,
            EXPOSURE, RANDOM_DISTORT. Missing keys (or None) fall back to "no augmentation".
        min_size (int): boxes whose width or height is not larger than this are dropped.
        debug (bool): restrict the dataset to a single image.
    """

    # Settings used when no augmentation dict (or only part of one) is supplied.
    # SATURATION/EXPOSURE of 1 make rand_scale return exactly 1 should distortion be enabled.
    _DEFAULT_AUGMENTATION = {'LRFLIP': False, 'JITTER': 0, 'RANDOM_PLACING': False,
                             'HUE': 0, 'SATURATION': 1, 'EXPOSURE': 1, 'RANDOM_DISTORT': False}

    def __init__(self,model,data_dir,json_file='instances_train2017.json',train=True,img_size=416,augmentation=None,min_size=1,debug=False):
        self.data_dir = data_dir
        self.json_file = json_file
        self.model = model

        self.coco = COCO(self.data_dir+'/annotations_trainval2017/annotations/'+self.json_file)
        self.ids = self.coco.getImgIds()
        if debug:
            self.ids = self.ids[1:2]
            print("debug mode...", self.ids)
        self.class_ids = sorted(self.coco.getCatIds())
        self.name = 'train2017' if train else 'val2017'
        self.max_labels = 50   # every sample is padded/truncated to this many boxes
        self.img_size = img_size
        self.min_size = min_size
        self._configure_augmentation(augmentation)

    def _configure_augmentation(self, augmentation):
        """Store the augmentation settings, always defining every attribute the getters read."""
        settings = dict(self._DEFAULT_AUGMENTATION)
        if augmentation is not None:
            settings.update(augmentation)
        self.lrflip = settings['LRFLIP']
        self.jitter = settings['JITTER']
        self.random_placing = settings['RANDOM_PLACING']
        self.hue = settings['HUE']
        self.saturation = settings['SATURATION']
        self.exposure = settings['EXPOSURE']
        self.random_distort = settings['RANDOM_DISTORT']

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.ids)

    def __getitem__(self, index):
        """
        Returns:
            img (numpy.ndarray): image of shape (3, img_size, img_size) scaled by 1/255.
            padded_labels (torch.Tensor): (max_labels, 5) label tensor, zero padded.
            info_img (tuple): (h, w, nh, nw, dx, dy) as produced by ``preprocess``.
            img_id (int): COCO image id.
        """
        img_id = self.ids[index]
        annotations = self.get_annotations(img_id)
        lrflip = self.get_lrflip()

        # load image and preprocess
        img = self.get_img(img_id)
        assert img is not None
        img, info_img = self.aug_img(img, lrflip)
        # load labels
        padded_labels = self.load_labels_with_padding(annotations, info_img, lrflip)

        return img, padded_labels, info_img, img_id

    def get_annotations(self,id):
        """Load every annotation (crowd and non-crowd) of image ``id``."""
        anno_ids = self.coco.getAnnIds(imgIds=[int(id)],iscrowd=None)
        return self.coco.loadAnns(anno_ids)

    def get_lrflip(self):
        """Decide with probability 0.5 whether to mirror the sample (only if LRFLIP is on)."""
        # The random draw comes first so the RNG stream does not depend on the setting.
        return bool(np.random.rand() > 0.5 and self.lrflip == True)

    def get_img(self,id):
        """Read image ``id`` with cv2.imread; returns None when the file cannot be read."""
        img_file = os.path.join(self.data_dir, self.name,self.name,'{:012}'.format(id) + '.jpg')
        img = cv2.imread(img_file)
        # The val5k split also references images stored in the train folder.
        if self.json_file == 'instances_val5k.json' and img is None:
            img_file = os.path.join(self.data_dir, 'train2017','train2017','{:012}'.format(id) + '.jpg')
            img = cv2.imread(img_file)
        return img

    def aug_img(self,img,lrflip):
        """Letterbox, optionally colour-distort and flip the image; return it as CHW in [0, 1]."""
        img, info_img = preprocess(img, self.img_size, jitter=self.jitter, random_placing=self.random_placing)
        if self.random_distort:
            img = random_distort(img, self.hue, self.saturation, self.exposure)
        img = np.transpose(img/255., (2, 0, 1))
        if lrflip:
            # axis 2 is the width axis after the transpose; copy() removes negative strides
            img = np.flip(img, axis=2).copy()
        return img, info_img

    def load_labels_with_padding(self,annotations,info_img,lrflip):
        """Collect [class_index, x, y, w, h] rows for all large-enough boxes and pad them."""
        labels = []
        for anno in annotations:
            if anno['bbox'][2] > self.min_size and anno['bbox'][3] > self.min_size:
                row = [self.class_ids.index(anno['category_id'])]
                row.extend(anno['bbox'])
                labels.append(row)
        return self.get_padded_labels(labels, info_img, lrflip)

    def get_padded_labels(self,labels,info_img,lrflip):
        """
        Convert the label rows (to YOLO format if applicable) and place them in a
        zero-initialised (max_labels, 5) tensor; surplus rows are dropped.
        """
        padded_labels = np.zeros((self.max_labels, 5))
        if len(labels) > 0:
            # Force a float array: annotations made only of integers would otherwise
            # produce an integer array that cannot hold normalised coordinates.
            labels = np.stack(labels).astype(np.float64)
            if 'YOLO' in self.model:
                conv = conv_labels(info_img, True, labels=labels, maxsize=self.img_size, lrflip=lrflip)
                labels = conv.output
            kept = labels[:self.max_labels]
            padded_labels[:len(kept)] = kept
        return torch.from_numpy(padded_labels)

In [None]:
class YOLOLayer(nn.Module):
    """
    Detection head for one output scale of YOLOv3.

    A 1x1 convolution produces, for each of the layer's anchors and each grid cell,
    a vector [x, y, w, h, objectness, class scores...]. Without labels the layer returns
    decoded boxes; with labels it builds the training targets and returns the losses.

    Args:
        config_model (dict): needs the keys 'ANCHORS' (list of (w, h) in input pixels),
            'ANCH_MASK' (per-layer list of anchor indices) and 'N_CLASSES'.
        layer_no (int): index of this head (0, 1 or 2); selects the stride and anchor mask.
        in_ch (int): number of input channels of the 1x1 convolution.
        threshold (float): predictions whose best IoU with a ground-truth box exceeds this
            value get objectness mask 0 (unless they are the assigned cell).
    """
    def __init__(self,config_model,layer_no,in_ch,threshold=0.7):

        super(YOLOLayer, self).__init__()
        self.strides = [32,16,8] # fixed
        self.anchors = config_model['ANCHORS']
        self.layer_no=layer_no
        self.anchor_mask = config_model['ANCH_MASK'][self.layer_no]
        self.n_anchors = len(self.anchor_mask)
        self.n_classes = config_model['N_CLASSES']
        self.threshold= threshold
        # reduction='sum' is the supported spelling of the deprecated size_average=False
        self.loss={
            "L2":nn.MSELoss(reduction='sum'),
            "BCE":nn.BCELoss(reduction='sum')
        }
        stride = self.strides[self.layer_no]
        # All anchors expressed in grid cells of this layer (downsampled anchors)
        self.anchors_grid = [(w / stride, h / stride) for w, h in self.anchors]
        # Anchors actually predicted by this layer
        self.masked_anchors = [self.anchors_grid[i] for i in self.anchor_mask]
        # Every anchor as a (0, 0, w, h) box, used to pick the best anchor for a ground truth
        ref_anchors = np.zeros((len(self.anchors_grid), 4))
        ref_anchors[:, 2:] = np.array(self.anchors_grid)
        self.ref_anchors = torch.from_numpy(ref_anchors).float()
        self.conv = nn.Conv2d(in_channels=in_ch,out_channels=self.n_anchors*(self.n_classes + 5),kernel_size=1,stride=1,padding=0)

    def forward(self,x,labels=None):
        """
        Args:
            x (torch.Tensor): input feature map of shape (B, in_ch, F, F).
            labels (torch.Tensor or None): (B, N, 5) rows [class, xc, yc, w, h], coordinates
                normalised to [0, 1]; rows that sum to 0 are treated as padding.
        Returns:
            labels is None: tensor (B, n_anchors*F*F, 5+n_classes) with xywh in input pixels.
            otherwise: tuple (loss, loss_coord, loss_iou, loss_cls, loss_l2).
        """
        n_ch = 5+self.n_classes   # 4 box coordinates + objectness + class scores
        self.dtype = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

        output = self.get_output(x,n_ch)
        # calculate pred - xywh obj cls
        pred=self.prediction(output)

        if labels is None:  # not training
            pred[..., :4] *= self.strides[self.layer_no]
            return pred.contiguous().view(self.batchsize, -1, n_ch).data

        pred = pred[..., :4].data

        # target assignment
        target,obj_mask,tgt_mask,tgt_scale=self.initialise()
        labels = labels.cpu().data
        nlabel = (labels.sum(dim=2) > 0).sum(dim=1)  # number of objects
        # Ground truth in grid units (the class column is scaled too but never read from here)
        truth_all=labels*self.fsize
        truth_x_all = truth_all[:, :, 1]
        truth_y_all = truth_all[:, :, 2]
        truth_w_all = truth_all[:, :, 3]
        truth_h_all = truth_all[:, :, 4]

        for b in range(self.batchsize):
            n = int(nlabel[b])
            if n == 0:
                continue
            truth_box = torch.zeros(n, 4).type(self.dtype)
            truth_box[:n, 2] = truth_w_all[b, :n]
            truth_box[:n, 3] = truth_h_all[b, :n]
            # Grid cell that contains each ground-truth centre
            truth_i = truth_x_all.to(torch.int16).numpy()[b, :n]
            truth_j = truth_y_all.to(torch.int16).numpy()[b, :n]

            # calculate iou between truth and reference anchors (centres still at 0, 0)
            best_n,best_n_mask=self.best_n(truth_box)

            truth_box[:n, 0] = truth_x_all[b, :n]
            truth_box[:n, 1] = truth_y_all[b, :n]

            # Mask out predictions that already overlap a ground truth well enough
            pred_iou_best=self.pred_iou_best(pred[b],truth_box)
            obj_mask[b]=~pred_iou_best

            if not bool(best_n_mask.any()):
                continue

            for ti in range(best_n.shape[0]):
                if not bool(best_n_mask[ti]):
                    continue
                i, j = int(truth_i[ti]), int(truth_j[ti])
                a = int(best_n[ti])
                cls_idx = int(labels[b, ti, 0])
                obj_mask[b, a, j, i] = 1
                tgt_mask[b, a, j, i, :] = 1
                target[b, a, j, i, 0] = self.roundint16(truth_x_all[b, ti])
                target[b, a, j, i, 1] = self.roundint16(truth_y_all[b, ti])
                target[b, a, j, i, 2] = self.log(truth_w_all,best_n,b,ti,0)
                target[b, a, j, i, 3] = self.log(truth_h_all,best_n,b,ti,1)
                target[b, a, j, i, 4] = 1
                target[b, a, j, i, 5 + cls_idx] = 1
                # Larger weight for smaller boxes
                tgt_scale[b, a, j, i, :] = torch.sqrt(2 - truth_w_all[b, ti] * truth_h_all[b, ti] /self.fsize /self.fsize)

        # loss calculation
        return self.get_losses(output,target,obj_mask,tgt_scale,tgt_mask,n_ch)

    def get_losses(self,output,target,obj_mask,tgt_scale,tgt_mask,n_ch):
        """
        Compute the loss terms. ``output`` and ``target`` are masked/scaled in place.

        Returns:
            (loss, loss_coord, loss_iou, loss_cls, loss_l2) where
            loss = loss_coord + loss_iou + loss_cls and loss_l2 is a plain summed
            squared error between the masked output and target.
        """
        # Objectness split into cells kept by obj_mask and the remaining cells
        y_out= output[..., 4]*obj_mask
        n_out= output[..., 4]*(1-obj_mask)
        output[..., np.r_[0:4, 5:n_ch]] *= tgt_mask
        output[..., 2:4] *= tgt_scale
        y_tar=target[..., 4]*obj_mask
        n_tar=target[..., 4]*(1-obj_mask)
        target[..., np.r_[0:4, 5:n_ch]] *= tgt_mask
        target[..., 2:4] *= tgt_scale

        loss_xy = 5*self.loss["L2"](output[..., :2], target[..., :2])
        loss_wh = 5*self.loss["L2"](output[..., 2:4], target[..., 2:4])
        loss_obj = self.loss["BCE"](y_out,y_tar)
        loss_nobj = 0.5*self.loss["BCE"](n_out,n_tar)
        loss_cls = self.loss["BCE"](output[..., 5:], target[..., 5:])
        loss_l2 = self.loss["L2"](output, target)
        loss_coord=loss_xy + loss_wh
        loss_iou=loss_obj+loss_nobj
        loss = loss_coord + loss_iou + loss_cls
        return loss,loss_coord,loss_iou,loss_cls,loss_l2

    def get_output(self,x,n_ch):
        """
        Run the 1x1 convolution and reshape its result.

        Args:
            x: input feature map
            n_ch: number of values predicted per anchor = 5+self.n_classes
        Returns:
            output: tensor of shape (B, n_anchors, F, F, n_ch) with a sigmoid applied to
            x, y, objectness and the class scores. Also stores self.batchsize and self.fsize.
        """
        output = self.conv(x)
        self.batchsize = output.shape[0]
        self.fsize = output.shape[2]
        output = output.view(self.batchsize, self.n_anchors,n_ch, self.fsize, self.fsize)
        output = output.permute(0, 1, 3, 4, 2)  # .contiguous()

        # logistic activation for xy, obj, cls
        output[..., np.r_[:2, 4:n_ch]] = torch.sigmoid(output[..., np.r_[:2, 4:n_ch]])
        return output

    def prediction(self,output):
        """
        Decode the raw output into boxes in grid units.

        Returns:
            pred: copy of ``output`` where x/y have the cell offset added and w/h are
            exp(raw) multiplied by the anchor size.
        """
        anchors = torch.tensor(self.masked_anchors, dtype=output.dtype, device=output.device)
        w_anchors = anchors[:, 0].view(1, self.n_anchors, 1, 1)
        h_anchors = anchors[:, 1].view(1, self.n_anchors, 1, 1)

        # Cell offsets: x varies along the last spatial axis, y along the one before it
        grid = torch.arange(self.fsize, dtype=output.dtype, device=output.device)
        x_shift = grid.view(1, 1, 1, self.fsize)
        y_shift = grid.view(1, 1, self.fsize, 1)

        pred = output.clone()
        pred[..., 0] += x_shift
        pred[..., 1] += y_shift
        pred[..., 2] = torch.exp(pred[..., 2]) * w_anchors
        pred[..., 3] = torch.exp(pred[..., 3]) * h_anchors
        return pred

    def initialise(self):
        """Allocate the target tensor and the masks/scales used for the loss."""
        obj_mask = torch.ones(self.batchsize, self.n_anchors,self.fsize, self.fsize).type(self.dtype)
        tgt_mask = torch.zeros(self.batchsize, self.n_anchors,self.fsize, self.fsize, 4 + self.n_classes).type(self.dtype)
        target = torch.zeros(self.batchsize,self.n_anchors,self.fsize,self.fsize,5+self.n_classes).type(self.dtype)
        tgt_scale = torch.zeros(self.batchsize, self.n_anchors,self.fsize, self.fsize, 2).type(self.dtype)
        return target,obj_mask,tgt_mask,tgt_scale

    def bboxes_iou(self,bboxes_a, bboxes_b, xyxy=True):
        """
        Pairwise IoU between two sets of boxes.

        Args:
            bboxes_a : tensor of shape (N, 4).
            bboxes_b : tensor of shape (K, 4).
            xyxy : True if boxes are (x1, y1, x2, y2); False if they are (xc, yc, w, h).
        Returns:
            tensor of shape (N, K); element (n, k) is the IoU between the nth box of
            bboxes_a and the kth box of bboxes_b.
        Raises:
            IndexError: if either input does not have 4 columns.
        Reference: https://github.com/chainer/chainercv
        """
        if bboxes_a.shape[1]!= 4 or bboxes_b.shape[1]!= 4:
            raise IndexError

        if xyxy:
            # top left
            tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
            # bottom right
            br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
            area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
            area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
        else:
            # top left
            tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2))
            # bottom right
            br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2))
            area_a = torch.prod(bboxes_a[:, 2:], 1)
            area_b = torch.prod(bboxes_b[:, 2:], 1)
        # en is 1 only where the boxes really overlap in both dimensions
        en = (tl < br).type(tl.type()).prod(dim=2)
        area_i = torch.prod(br - tl, 2) * en
        return area_i / (area_a[:, None] + area_b - area_i)

    def pred_iou_best(self,pred,truth_box):
        """
        Args:
            pred: decoded boxes of one image, shape (n_anchors, F, F, 4), as (xc, yc, w, h)
            truth_box: ground truth boxes of that image, shape (n, 4), as (xc, yc, w, h)
        Returns:
            bool tensor of shape (n_anchors, F, F): True where the best IoU of the
            prediction with any ground truth box is greater than self.threshold.
        """
        pred_ious = self.bboxes_iou(pred.contiguous().view(-1, 4),truth_box,xyxy=False)

        # get the best iou
        pred_iou_best, _ = pred_ious.max(dim=1)
        pred_iou_best = (pred_iou_best > self.threshold) # check whether iou is greater than threshold
        pred_iou_best = pred_iou_best.view(pred.shape[:3])
        return pred_iou_best

    def best_n(self,truth_box):
        """
        Pick, for every ground truth box, the anchor (among all anchors) with the highest IoU.

        Args:
            truth_box: tensor (n, 4) holding (0, 0, w, h) rows in grid units.
        Returns:
            best_n: index of the best anchor modulo 3.
            best_n_mask: bool tensor, True where the best anchor belongs to this layer's mask.
        """
        anchor_ious_all = self.bboxes_iou(truth_box.cpu(),self.ref_anchors)
        best_n_all = torch.argmax(anchor_ious_all, dim=1)
        # NOTE(review): the modulo assumes each mask is a run of 3 consecutive anchor
        # indices starting at a multiple of 3 (e.g. [6, 7, 8]) - confirm for custom configs.
        best_n = best_n_all % 3
        best_n_mask = torch.zeros_like(best_n_all, dtype=torch.bool)
        for anchor_idx in self.anchor_mask:
            best_n_mask |= (best_n_all == anchor_idx)
        return best_n,best_n_mask

    def log(self,num,den,b,ti,temp=0):
        """
        Log-ratio between a ground-truth size and the size of its assigned anchor.

        Args:
            num: (B, N) tensor of ground-truth widths or heights in grid units.
            den: per-ground-truth anchor indices into self.masked_anchors.
            b, ti: batch index and ground-truth index.
            temp: 0 to use the anchor width, 1 to use the anchor height.
        """
        anchors = torch.Tensor(self.masked_anchors)
        # 1e-16 keeps the logarithm finite for a zero-sized box
        return torch.log(num[b,ti] / anchors[int(den[ti]),temp] + 1e-16)

    def roundint16(self,arr):
        """Fractional part of ``arr`` (value minus its int16 truncation)."""
        return arr-arr.to(torch.int16).to(torch.float)

In [10]:
def add_conv(in_ch:int,out_ch:int,kernel_size,stride):
    """
    Build a convolution block: Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.1).

    Args:
        in_ch : number of input channels of the convolution layer.
        out_ch : number of output channels of the convolution layer.
        kernel_size: kernel size of the convolution layer.
        stride : stride of the convolution layer.
    Returns:
        nn.Sequential whose children are named 'conv', 'batch_norm' and 'leaky'.
    """
    # Zero padding of (k - 1) // 2 on each side
    same_pad = (kernel_size - 1) // 2
    named_layers = (
        ('conv', nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=kernel_size,
                           stride=stride, padding=same_pad, bias=False)),
        ('batch_norm', nn.BatchNorm2d(out_ch)),
        ('leaky', nn.LeakyReLU(0.1)),
    )
    block = nn.Sequential()
    for layer_name, layer in named_layers:
        block.add_module(layer_name, layer)
    return block
    
class resblock(nn.Module):
    """
    A stack of residual blocks with a constant channel count.

    Each block is a 1x1 convolution block (ch -> ch//2) followed by a 3x3 convolution
    block (ch//2 -> ch), both built with add_conv.

    Args:
        ch: number of input and output channels (they are equal).
        nblocks: number of residual blocks. Default=1
        shortcut: if True the block input is added to its output, otherwise the
            output replaces it. Default=True
    """
    def __init__(self, ch:int, nblocks=1, shortcut=True):
        super().__init__()
        self.shortcut = shortcut
        self.module_list = nn.ModuleList()
        for _ in range(nblocks):
            bottleneck = add_conv(ch, ch//2, 1, 1)   # 1x1, halves the channels
            expand = add_conv(ch//2, ch, 3, 1)       # 3x3, restores the channels
            self.module_list.append(nn.ModuleList([bottleneck, expand]))

    def forward(self, x):
        """Apply every block in turn, adding the skip connection when enabled."""
        for pair in self.module_list:
            branch = x
            for layer in pair:
                branch = layer(branch)
            if self.shortcut:
                x = x + branch
            else:
                x = branch
        return x




class YOLOv3(nn.Module):
    """
    YOLOv3 network: a DarkNet53 backbone followed by three detection branches.
    With targets the forward pass returns the summed loss of the three YOLO layers;
    without targets it returns the concatenated detections.
    """
    def __init__(self,config_model,threshold=0.7):
        """
        Args:
            config_model (dict): model configuration, forwarded to every YOLOLayer.
                Its 'TYPE' entry must be 'YOLOv3'.
            threshold(float): forwarded to every YOLOLayer.
        Raises:
            Exception: if config_model['TYPE'] is not 'YOLOv3'.
        """
        super(YOLOv3, self).__init__()

        self.config_model=config_model
        self.threshold=threshold

        if config_model['TYPE'] != 'YOLOv3':
            raise Exception('Model name {} is not available'.format(config_model['TYPE']))
        self.module_list = nn.ModuleList()
        self.create_yolov3_modules()

    def forward(self, x, targets=None):
        """
        Forward path of YOLOv3.
        Args:
            x: input batch
            targets: label tensor; when given the network runs in training mode
        Returns:
            training: sum of the losses returned by the three YOLO layers
                (the individual terms are accumulated in self.loss_dict)
            inference: detections of the three YOLO layers concatenated along dim 1
        """
        yolo_positions = (14, 22, 28)      # indices of the YOLO layers in module_list
        saved_positions = (6, 8, 12, 20)   # outputs kept for later routing
        restart_from = {14: 2, 22: 3}      # after a YOLO layer: continue from this saved output
        concat_with = {16: 1, 24: 0}       # after an upsample: concatenate this saved output
        loss_names = ['coord', 'iou','cls', 'l2']

        training = targets is not None
        collected = []
        self.loss_dict = defaultdict(float)
        saved = []
        for idx, layer in enumerate(self.module_list):
            if idx not in yolo_positions:
                x = layer(x)
            else:
                if training:
                    x, *loss_terms = layer(x, targets)
                    for loss_name, loss_value in zip(loss_names, loss_terms):
                        self.loss_dict[loss_name] += loss_value
                else:
                    x = layer(x)
                collected.append(x)

            # route layers
            if idx in saved_positions:
                saved.append(x)
            if idx in restart_from:
                x = saved[restart_from[idx]]
            elif idx in concat_with:
                x = torch.cat((x, saved[concat_with[idx]]), 1)

        if training:
            return sum(collected)
        return torch.cat(collected, 1)

    def create_yolov3_modules(self):
        """
        Append all YOLOv3 layers to self.module_list, in execution order.
        Uses self.config_model and self.threshold for the three YOLO layers.
        """
        cfg, thr = self.config_model, self.threshold

        # DarkNet53
        # Reference- Table-1 of paper- YOLOv3: An Incremental Improvement
        self.module_list.extend([
            add_conv(in_ch=3, out_ch=32, kernel_size=3, stride=1),
            add_conv(in_ch=32, out_ch=64, kernel_size=3, stride=2),
            resblock(ch=64),
            add_conv(in_ch=64, out_ch=128, kernel_size=3, stride=2),
            resblock(ch=128, nblocks=2),
            add_conv(in_ch=128, out_ch=256, kernel_size=3, stride=2),
            resblock(ch=256, nblocks=8),
            add_conv(in_ch=256, out_ch=512, kernel_size=3, stride=2),
            resblock(ch=512, nblocks=8),
            add_conv(in_ch=512, out_ch=1024, kernel_size=3, stride=2),
            resblock(ch=1024, nblocks=4),
        ])

        # YOLOv3 head, 1st yolo branch
        self.module_list.extend([
            resblock(ch=1024, nblocks=2, shortcut=False),
            add_conv(in_ch=1024, out_ch=512, kernel_size=1, stride=1),
            add_conv(in_ch=512, out_ch=1024, kernel_size=3, stride=1),
            YOLOLayer(cfg, layer_no=0, in_ch=1024, threshold=thr),
        ])

        # 2nd yolo branch
        self.module_list.extend([
            add_conv(in_ch=512, out_ch=256, kernel_size=1, stride=1),
            nn.Upsample(scale_factor=2, mode='nearest'),
            add_conv(in_ch=768, out_ch=256, kernel_size=1, stride=1),
            add_conv(in_ch=256, out_ch=512, kernel_size=3, stride=1),
            resblock(ch=512, nblocks=1, shortcut=False),
            add_conv(in_ch=512, out_ch=256, kernel_size=1, stride=1),
            add_conv(in_ch=256, out_ch=512, kernel_size=3, stride=1),
            YOLOLayer(cfg, layer_no=1, in_ch=512, threshold=thr),
        ])

        # 3rd yolo branch
        self.module_list.extend([
            add_conv(in_ch=256, out_ch=128, kernel_size=1, stride=1),
            nn.Upsample(scale_factor=2, mode='nearest'),
            add_conv(in_ch=384, out_ch=128, kernel_size=1, stride=1),
            add_conv(in_ch=128, out_ch=256, kernel_size=3, stride=1),
            resblock(ch=256, nblocks=2, shortcut=False),
            YOLOLayer(cfg, layer_no=2, in_ch=256, threshold=thr),
        ])

In [11]:
class COCOAPIEvaluator():
    """
    COCO AP Evaluation class.
    All the data in the val2017 dataset are processed and evaluated by COCO API.
    """
    def __init__(self,model,data_dir,img_size,conf_thresh,nms_thresh=0.45):
        """
        Args:
            model: model name specified in config file
            data_dir: dataset root directory
            img_size: image size after preprocess. images are resized to squares whose shape is (img_size, img_size).
            conf_thresh: accepted for interface compatibility but NOT used: the confidence
                threshold is fixed to 0.005 (darknet's evaluation value).
            nms_thresh: IoU threshold of non-max suppression ranging from 0 to 1.
        """

        augmentation = {'LRFLIP': False, 'JITTER': 0, 'RANDOM_PLACING': False,
                        'HUE': 0, 'SATURATION': 0, 'EXPOSURE': 0, 'RANDOM_DISTORT': False}

        self.dataset = COCODataset(model=model,data_dir=data_dir,img_size=img_size,augmentation=augmentation,
                                   json_file='instances_val2017.json',train=False)
        self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=1, shuffle=False, num_workers=0)
        self.img_size = img_size
        self.conf_thresh = 0.005 # from darknet
        self.nms_thresh = nms_thresh # 0.45 (darknet)
        self.num_classes=80

    def get_dtype(self):
        """Return the float tensor type matching the available device."""
        cuda = torch.cuda.is_available()
        dtype= torch.cuda.FloatTensor if cuda else torch.FloatTensor
        return dtype

    def evaluate(self, model):
        """
        COCO average precision (AP) Evaluation. Iterate inference on the test dataset
        and the results are evaluated by COCO API. The model is switched to eval mode.
        Args:
            model : model object
        Returns:
            ap50_95 (float) : calculated COCO AP for IoU=50:95
            ap50 (float) : calculated COCO AP for IoU=50
        """
        model.eval()
        dtype=self.get_dtype()
        ids = []
        data_dict = []
        for img, _, info_img, id_ in self.dataloader:   # batches of one image
            info_img = [float(info) for info in info_img]
            id_ = int(id_)
            ids.append(id_)
            with torch.no_grad():
                outputs = self.postprocess(model(img.type(dtype)))
            if outputs[0] is None:
                continue
            outputs = outputs[0].cpu().data

            for output in outputs:
                # float() only accepts one-element tensors, so convert each corner separately
                x1, y1, x2, y2 = (float(v) for v in output[:4])
                label = self.dataset.class_ids[int(output[6])]
                conv=conv_labels(info_img,label2box=False,box=(y1, x1, y2, x2))
                box = conv.output   # [y1, x1, y2, x2] in original-image pixels
                bbox = [box[1], box[0], box[3] - box[1], box[2] - box[0]]   # COCO [x, y, w, h]
                score = float(output[4].data.item() * output[5].data.item()) # object score * class score
                A = {"image_id": id_, "category_id": label, "bbox": bbox,
                     "score": score, "segmentation": []} # COCO json format
                data_dict.append(A)

        return self.eval(data_dict, ids)

    def eval(self,data_dict,ids=None):
        """
        Score the detections with the COCO API.
        Args:
            data_dict (list): detections in COCO result format.
            ids (list or None): image ids to evaluate on. When None the COCOeval
                default image set is left untouched.
        Returns:
            (AP@[.5:.95], AP@.5), or (0, 0) when there are no detections.
        """
        annType = ['segm', 'bbox', 'keypoints']
        if len(data_dict) == 0:
            return 0, 0

        # Evaluate the Dt (detection) json comparing with the ground truth
        cocoGt = self.dataset.coco
        # The detections are handed to pycocotools through a temporary json file,
        # which is closed and removed once it has been loaded.
        fd, tmp = tempfile.mkstemp(suffix='.json')
        try:
            with os.fdopen(fd, 'w') as f:
                json.dump(data_dict, f)
            cocoDt = cocoGt.loadRes(tmp)
        finally:
            os.remove(tmp)
        cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1])
        if ids is not None:
            cocoEval.params.imgIds = ids
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        return cocoEval.stats[0], cocoEval.stats[1]

    def postprocess(self,pred):
        """
        Threshold the raw predictions and run per-class non-max suppression.
        Args:
            pred (torch.Tensor): (B, N, 5 + num_classes) rows
                [xc, yc, w, h, obj_conf, class scores...]. The first four columns are
                overwritten in place with corner coordinates (x1, y1, x2, y2).
        Returns:
            list of length B; each entry is None (no detection) or a tensor with rows
            (x1, y1, x2, y2, obj_conf, class_conf, class_pred).
        """
        box_corner = torch.empty_like(pred)
        box_corner[:, :, 0] = pred[:, :, 0] - pred[:, :, 2] / 2
        box_corner[:, :, 1] = pred[:, :, 1] - pred[:, :, 3] / 2
        box_corner[:, :, 2] = pred[:, :, 0] + pred[:, :, 2] / 2
        box_corner[:, :, 3] = pred[:, :, 1] + pred[:, :, 3] / 2
        pred[:, :, :4] = box_corner[:, :, :4]

        output = [None for _ in range(len(pred))]
        # Everything below must run once per image, hence inside the loop.
        for i, img_pred in enumerate(pred):
            # Filter out confidence scores below threshold
            img_pred=self.filter_pred(img_pred)
            # If none are remaining => process next image
            if not img_pred.size(0):
                continue
            # Get detections with higher confidence scores than the threshold
            detections=self.get_detections(img_pred)

            # Iterate through all predicted classes
            unique_labels = self.get_unique_labels(detections, pred)
            for c in unique_labels:
                # Get the detections with the particular class
                detections_class = self.get_detection_class(detections,c)
                if output[i] is None:
                    output[i] = detections_class
                else:
                    output[i] = torch.cat((output[i], detections_class))

        return output

    def filter_pred(self,img_pred):
        """Keep the rows whose obj_conf * best class score reaches self.conf_thresh."""
        best_class_score = torch.max(img_pred[:, 5:5 + self.num_classes], 1)[0]
        # The mask is already one-dimensional; squeezing it would break the one-row case.
        conf_mask = img_pred[:, 4] * best_class_score >= self.conf_thresh
        return img_pred[conf_mask]

    def nms(self,bbox,score=None,limit=None):
        """Suppress bounding boxes according to their IoUs and confidence scores.
        The IoU threshold is self.nms_thresh.
        Args:
             bbox: array (R, 4) of boxes as (x1, y1, x2, y2).
             score: An array of confidences. If given, boxes are visited by descending score.
             limit: The upper bound of the number of the output bounding boxes.
                    If it is not specified, this method selects as many bounding boxes as possible.
        Returns:
             array: An int32 array with indices of bounding boxes that are selected.
                    They are sorted by the scores of bounding boxes in descending order.
        """

        if len(bbox) == 0:
            return np.zeros((0,),dtype=np.int32)

        if score is not None:
            order = score.argsort()[::-1]
            bbox = bbox[order]   # reorder bboxes

        bbox_area = np.prod(bbox[:, 2:] - bbox[:, :2], axis=1)
        selec = np.zeros(bbox.shape[0], dtype=bool)
        for i,b in enumerate(bbox):
            # Intersection of the current box with every box selected so far
            tl = np.maximum(b[:2],bbox[selec,:2])
            br = np.minimum(b[2:],bbox[selec,2:])
            area = np.prod(br - tl, axis=1) * (tl < br).all(axis=1)
            iou = area / (bbox_area[i] + bbox_area[selec] - area)
            if (iou >= self.nms_thresh).any():
                continue
            selec[i] = True
            if limit is not None and np.count_nonzero(selec) >= limit:
                break

        selec = np.where(selec)[0]
        if score is not None:
            selec = order[selec]   # back to indices of the caller's ordering
        return selec.astype(np.int32)

    def get_detections(self,img_pred):
        """
        Expand the predictions into one row per (box, class) pair whose
        obj_conf * class score reaches self.conf_thresh.
        """
        ind = torch.nonzero(img_pred[:, 5:] * img_pred[:, 4][:, None] >= self.conf_thresh, as_tuple=False)
        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((img_pred[ind[:, 0], :5],img_pred[ind[:, 0], 5 + ind[:, 1]].unsqueeze(1),ind[:, 1].float().unsqueeze(1)),1)
        return detections

    def get_unique_labels(self,detections,pred):
        """Distinct class indices present in ``detections``, on the same device type as ``pred``."""
        unique_labels = detections[:, -1].cpu().unique()
        if pred.is_cuda:
            unique_labels = unique_labels.cuda()
        return unique_labels

    def get_detection_class(self,detections,c):
        """Detections of class ``c`` that survive non-max suppression."""
        detections_class = detections[detections[:, -1] == c]
        nms_in = detections_class.cpu().numpy()
        nms_out_index = self.nms(nms_in[:, :4],score=nms_in[:, 4]*nms_in[:, 5])
        keep = torch.as_tensor(nms_out_index, dtype=torch.long, device=detections_class.device)
        return detections_class[keep]

In [12]:
# Run configuration for the training cell below.
checkpoint_dir="/kaggle/working/checkpoints"   # created with os.makedirs before training starts
cfg_path= "/kaggle/working/config_for_YOLOV3/yolov3_default.cfg"   # YAML config, read with yaml.safe_load
weights_path=None   # NOTE(review): not used in the visible part of the notebook - confirm
checkpoint= None    # path of a saved state to resume from; None starts from scratch
debug=False         # NOTE(review): the training dataset is built with a literal debug=False, so this flag looks unused - confirm
datadir="/kaggle/input/coco2017"   # dataset root passed to COCODataset / COCOAPIEvaluator
cuda=True           # overwritten by torch.cuda.is_available() in the training cell
eval_interval=100   # a COCO evaluation runs every eval_interval iterations (skipping iteration 0)
checkpoint_interval=10   # presumably the number of iterations between checkpoints - TODO confirm

In [None]:
wandb.login()
cuda = torch.cuda.is_available()
os.makedirs(checkpoint_dir,exist_ok=True)
# Parse config settings
with open(cfg_path, 'r') as f:
    cfg = yaml.safe_load(f)
print("successfully loaded config file: ", cfg)
momentum = cfg['TRAIN']['MOMENTUM']
decay = cfg['TRAIN']['DECAY']
burn_in = cfg['TRAIN']['BURN_IN']
iter_size = cfg['TRAIN']['MAXITER']
steps = eval(cfg['TRAIN']['STEPS'])
batch_size = cfg['TRAIN']['BATCHSIZE']
subdivision = cfg['TRAIN']['SUBDIVISION']
ignore_thre = cfg['TRAIN']['IGNORETHRE']
random_resize = cfg['AUGMENTATION']['RANDRESIZE']
base_lr = cfg['TRAIN']['LR'] / batch_size / subdivision
wandb.init(name="YOLO-V3",config=cfg,project="YOLO-V3",resume='allow',id="samplerun4")
print('effective_batch_size = batch_size * iter_size = %d * %d' %(batch_size, subdivision))

# Learning rate setup
def burnin_schedule(i):
    """Return the learning-rate multiplier for scheduler step ``i``.

    Reads the module-level ``burn_in`` and ``steps``:
      * i < burn_in   -> (i / burn_in) ** 4   (warm-up ramp)
      * i < steps[0]  -> 1.0
      * i < steps[1]  -> 0.1
      * otherwise     -> 0.01
    """
    if i < burn_in:
        return (i / burn_in) ** 4
    if i < steps[0]:
        return 1.0
    if i < steps[1]:
        return 0.1
    return 0.01

# Initiate model
model = YOLOv3(cfg['MODEL'],threshold=ignore_thre)
if checkpoint is not None:
    print("loading pytorch ckpt...",checkpoint)
    # Load onto the CPU first so a checkpoint written on a GPU machine can be
    # restored on a CPU-only one; the model is moved to the GPU below if needed.
    state = torch.load(checkpoint, map_location='cpu')
    # A checkpoint is either a full training state (dict holding
    # 'model_state_dict') or a bare model state dict.
    if 'model_state_dict' in state.keys():
        model.load_state_dict(state['model_state_dict'])
    else:
        model.load_state_dict(state)

if cuda:
    print("using cuda") 
    model = model.cuda()

model.train()

# Training data: the dataset is built at the configured training resolution.
imgsize = cfg['TRAIN']['IMGSIZE']
# Pass the notebook-level `debug` switch through instead of a hard-coded False
# so the configuration cell actually controls it.
dataset = COCODataset(model=cfg['MODEL']['TYPE'],data_dir=datadir,img_size=imgsize,augmentation=cfg['AUGMENTATION'],debug=debug)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
dataiterator = iter(dataloader)

# Evaluator uses the TEST section of the config (image size and thresholds).
evaluator = COCOAPIEvaluator(model=cfg['MODEL']['TYPE'],data_dir=datadir,img_size=cfg['TEST']['IMGSIZE'],
                             conf_thresh=cfg['TEST']['CONFTHRE'],nms_thresh=cfg['TEST']['NMSTHRE'])

# Tensor type that input batches are cast to before the forward pass.
dtype = torch.cuda.FloatTensor if cuda else torch.FloatTensor

# optimizer setup
# set weight decay only on conv.weight
# The decay is scaled by the effective batch size because base_lr was divided
# by it.
scaled_decay = decay * batch_size * subdivision
params_dict = dict(model.named_parameters())
params = [
    {'params': value, 'weight_decay': scaled_decay if 'conv.weight' in key else 0.0}
    for key, value in params_dict.items()
]
optimizer = optim.SGD(params, lr=base_lr, momentum=momentum,dampening=0, weight_decay=scaled_decay)

# Resume the optimizer (and the iteration counter) when the checkpoint holds
# a full training state.
iter_state = 0
if checkpoint is not None and 'optimizer_state_dict' in state.keys():
    optimizer.load_state_dict(state['optimizer_state_dict'])
    iter_state = state['iter'] + 1

# Tell the scheduler where training resumes; otherwise burn-in and step decay
# would restart from iteration 0 after loading a checkpoint. LambdaLR requires
# 'initial_lr' in every param group when last_epoch != -1, so make sure it is
# present (every group was created with lr=base_lr).
for group in optimizer.param_groups:
    group.setdefault('initial_lr', base_lr)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule, last_epoch=iter_state - 1)

# start training loop
for iter_i in range(iter_state, iter_size + 1):

    # COCO evaluation
    if iter_i % eval_interval == 0 and iter_i > 0:
        ap50_95, ap50 = evaluator.evaluate(model)
        model.train()  # make sure training mode is active again after evaluation
        print('val/COCOAP50', ap50, iter_i)
        print('val/COCOAP50_95', ap50_95, iter_i)
        wandb.log(
            {'val/COCOAP50': ap50,
             'val/COCOAP50_95': ap50_95,
             'val_step': iter_i // eval_interval
            }
        )

    # subdivision loop: accumulate gradients over `subdivision` batches before
    # a single optimizer step.
    optimizer.zero_grad()
    total_loss = []
    for inner_iter_i in range(subdivision):
        try:
            imgs, targets, _, _ = next(dataiterator)  # load a batch
        except StopIteration:
            # Dataset exhausted: start a new pass.
            dataiterator = iter(dataloader)
            imgs, targets, _, _ = next(dataiterator)  # load a batch
        imgs = imgs.type(dtype)
        targets = targets.type(dtype)
        loss = model(imgs, targets)
        loss_value = float(loss)
        wandb.log(
            {'Train_loss/Step': loss_value,
             # Global batch counter; `iter_i + inner_iter_i` would repeat
             # values across iterations whenever subdivision > 1.
             'train_step': iter_i * subdivision + inner_iter_i}
        )
        loss.backward()
        total_loss.append(loss_value)

    optimizer.step()
    scheduler.step()

    # logging
    # base_lr was divided by the effective batch size; multiply back so the
    # reported value is on the scale of the configured LR.
    current_lr = scheduler.get_last_lr()[0] * batch_size * subdivision
    print('[Iter %d/%d] [lr %f][Losses: coord %f, iou %f,cls %f, l2 %f, imgsize %d]'
          % (iter_i, iter_size, current_lr,model.loss_dict['coord'],model.loss_dict['iou'], model.loss_dict['cls'], 
             model.loss_dict['l2'], imgsize),flush=True)
    wandb.log(
        {
            'Learning Rate': current_lr,
            'Total Loss/Epoch': sum(total_loss)/len(total_loss),
            'train_epoch':iter_i}
    )

    # random resizing: pick a new square input size (multiple of 32, from 320
    # to 608) and rebuild the loader so following batches use it.
    # NOTE(review): this runs on every iteration; confirm that is intended.
    if random_resize:
        imgsize = random.randint(10, 19) * 32
        dataset.img_shape = (imgsize, imgsize)
        dataset.img_size = imgsize
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
        dataiterator = iter(dataloader)

    # save checkpoint
    if iter_i > 0 and (iter_i % checkpoint_interval == 0):
        ckpt_path = os.path.join(checkpoint_dir, "YOLOV3-iter()"+str(iter_i)+".ckpt")
        torch.save({'iter': iter_i,'model_state_dict': model.state_dict(),'optimizer_state_dict': optimizer.state_dict(),},
                   ckpt_path)
        wandb.save(ckpt_path)

wandb.finish()
