In [1]:
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from model import parse_cfg, create_modules, EmptyLayer, DetectionLayer
from utils import *

In [None]:
# Parse the network cfg into block dicts, build the layers from them, and
# display the resulting module list as the cell output.
blocks = parse_cfg('./cfg/model.cfg')
net_info, module_list = create_modules(blocks)
module_list

## Define Network

In [38]:
class Net(nn.Module):
    """Network assembled from a cfg file.

    ``parse_cfg`` turns the cfg into a list of block dicts and
    ``create_modules`` returns ``(net_info, module_list)`` built from them.
    The first block is skipped when iterating layers in ``forward``, so
    ``module_list[i]`` is paired with ``blocks[i + 1]``.
    """

    def __init__(self, cfgfile):
        """Parse ``cfgfile`` and build the per-block modules."""
        super(Net, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)

    @staticmethod
    def _route_sources(layers, idx):
        """Resolve a route block's ``layers`` entry to absolute cache indices.

        ``layers`` may be a list of ints/numeric strings or a single
        comma-separated string. A positive entry is an absolute layer
        index; any other entry is an offset relative to the current layer
        ``idx``.
        """
        if isinstance(layers, str):
            layers = layers.split(",")
        refs = [int(n) for n in layers]
        if not refs:
            raise ValueError("route block at layer {} lists no layers".format(idx))
        return [ref if ref > 0 else idx + ref for ref in refs]

    def forward(self, x, CUDA=None):
        """Run ``x`` through every block and gather the detection outputs.

        Args:
            x: input tensor fed to the first layer.
            CUDA: flag forwarded unchanged to ``predict_transform``. When
                omitted (``None``) it defaults to ``x.is_cuda`` so the flag
                cannot disagree with the device the input lives on.

        Returns:
            The ``predict_transform`` output of the only yolo block, or the
            outputs of all yolo blocks concatenated along dim 1.

        Raises:
            ValueError: if the cfg contains no ``yolo`` block (previously
                this surfaced as an ``UnboundLocalError`` on return), or if
                a route block lists no layers.
            KeyError: if a route/shortcut refers to a layer whose output has
                not been cached yet.
        """
        if CUDA is None:
            CUDA = x.is_cuda

        modules = self.blocks[1:]
        outputs = {}     # idx -> output of that layer, read by route/shortcut
        detections = []  # one transformed prediction per yolo block

        for idx, module in enumerate(modules):
            module_type = module["type"]

            if module_type in ("convolutional", "upsample"):
                # plain feed-forward: input -> module -> output
                x = self.module_list[idx](x)

            elif module_type == "route":
                # One entry: reuse that layer's cached output as-is.
                # Several entries: concatenate their outputs along the
                # depth (channel) dimension, in the order listed.
                sources = self._route_sources(module["layers"], idx)
                feature_maps = [outputs[src] for src in sources]
                if len(feature_maps) == 1:
                    x = feature_maps[0]
                else:
                    x = torch.cat(feature_maps, 1)

            elif module_type == "shortcut":
                # previous layer's output + output of the layer at offset `from`
                from_ = int(module["from"])
                x = outputs[idx - 1] + outputs[idx + from_]

            elif module_type == "yolo":
                # predict_transform reshapes this detection feature map so
                # that maps from different yolo blocks can be joined along
                # dim 1 once the loop is done.
                anchors = self.module_list[idx][0].anchors
                in_dims = int(self.net_info["height"])
                n_classes = int(module["classes"])

                x = predict_transform(
                    prediction=x.data,  # .data: hand over a tensor cut off from autograd
                    in_dims=in_dims,
                    anchors=anchors,
                    n_classes=n_classes,
                    CUDA=CUDA
                )
                detections.append(x)

            # Any other block type leaves x untouched; either way the
            # current x is cached under this layer's index.
            outputs[idx] = x

        if not detections:
            raise ValueError("network has no 'yolo' block, so there is nothing to return")
        if len(detections) == 1:
            return detections[0]
        return torch.cat(detections, 1)

## Test Forward Pass

In [41]:
# create a dummy input to test the forward pass
def test_forward_pass():
    img = cv2.imread("images/0000f77c-62c2a288.jpg")
    img = cv2.resize(img, (416,416)) # resize to input dims
    reshaped_img = img[:,:,::-1].transpose((2,0,1)) # H x W x C -> C x H x W
    reshaped_img = reshaped_img[np.newaxis,:,:,:]/255.0 # add channel at 0 for batch norm
    reshaped_img = torch.from_numpy(reshaped_img).float() # convert to float
    reshaped_img = Variable(reshaped_img) # convert to variable
    return reshaped_img

In [42]:
# Build the network from its cfg and run a single forward pass on the sample image.
model = Net("cfg/model.cfg")
model.eval()  # inference run: put any batch-norm/dropout layers into evaluation mode

# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# so that any other cell referring to it keeps working.
input = test_forward_pass()

# Take the CUDA flag from the tensor itself rather than from
# torch.cuda.is_available(): nothing here moves the model or the input to the
# GPU, so on a CUDA-capable machine the two could disagree.
with torch.no_grad():  # no gradients are needed for a forward-only check
    pred = model(input, input.is_cuda)
pred

tensor([[[1.8070e+01, 2.6456e+01, 1.0828e+02,  ..., 4.7025e-01,
          6.3105e-01, 5.4467e-01],
         [2.1004e+01, 2.3269e+01, 1.3810e+02,  ..., 5.0359e-01,
          5.6842e-01, 5.2905e-01],
         [2.5882e+01, 1.7357e+01, 2.7089e+02,  ..., 5.4827e-01,
          5.1742e-01, 5.8414e-01],
         ...,
         [5.6657e+02, 5.6628e+02, 7.6416e+00,  ..., 4.9057e-01,
          5.3440e-01, 4.1428e-01],
         [5.6748e+02, 5.6707e+02, 1.9345e+01,  ..., 5.0483e-01,
          5.9651e-01, 5.2278e-01],
         [5.6637e+02, 5.6587e+02, 1.9670e+01,  ..., 4.9743e-01,
          4.9942e-01, 6.0571e-01]]])