In [1]:
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from model import parse_cfg, create_modules, EmptyLayer, DetectionLayer

In [12]:
# Parse the network configuration file into its blocks.
blocks = parse_cfg('./cfg/model.cfg')
# Build the modules for those blocks and display element 1 of the result
# (the ModuleList whose repr is shown in the cell output).
create_modules(blocks)[1]

ModuleList(
  (0): Sequential(
    (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_0): LeakyReLU(negative_slope=0.1, inplace=True)
  )
  (1): Sequential(
    (conv_1): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (batch_norm_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_1): LeakyReLU(negative_slope=0.1, inplace=True)
  )
  (2): Sequential(
    (conv_2): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (batch_norm_2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (leaky_2): LeakyReLU(negative_slope=0.1, inplace=True)
  )
  (3): Sequential(
    (conv_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (batch_norm_3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True

## Define Network

In [None]:
class Net(nn.Module):
    """Network assembled from a cfg file.

    ``parse_cfg`` turns the cfg file into ``self.blocks`` and
    ``create_modules`` turns those blocks into ``self.net_info`` and
    ``self.module_list`` (both helpers come from the project's ``model``
    module).

    Args:
        cfgfile: path of the configuration file handed to ``parse_cfg``.
    """

    def __init__(self, cfgfile):
        super(Net, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)

    @staticmethod
    def _resolve_layer_index(ref, idx):
        """Turn a cfg layer reference into an absolute index into the cache.

        Negative references are relative to the current block ``idx``
        (``-1`` is the previous block); non-negative references are already
        absolute block indices (Darknet convention), so ``0`` means the very
        first block rather than the block currently being computed.
        """
        return ref if ref >= 0 else idx + ref

    ## Forward pass
    # 1. calculate output
    # 2. transform output detection feature maps so that they can be
    #    processed more easily (not implemented yet, see the YOLO note below)
    def forward(self, x, CUDA=False):
        """Push ``x`` through every block described by the cfg file.

        Args:
            x: input tensor fed to the first block.
            CUDA: currently unused; kept so existing ``(x, CUDA)`` callers
                keep working. Defaults to ``False`` so ``net(x)`` works too.

        Returns:
            The tensor produced by the last block that was processed.
            NOTE: YOLO blocks are not decoded yet, so this is the raw
            feature map, not a tensor of detections.

        Raises:
            KeyError: if a route/shortcut block refers to a block whose
                output has not been computed yet.
        """
        # self.blocks[0] is skipped; index ``idx`` below therefore lines up
        # with ``self.module_list[idx]``.
        modules = self.blocks[1:]
        outputs = {}  # idx -> output of that block, read by route/shortcut

        # in -> [module 0] -> [module 1] -> ...
        for idx, module in enumerate(modules):
            module_type = module["type"]

            ## CONV and UPSAMPLE LAYERS
            # input -> conv/upsample module -> output
            if module_type in ("convolutional", "upsample"):
                x = self.module_list[idx](x)

            ## ROUTE LAYERS
            # layers = n       -> output of the referenced block
            # layers = n, m, ..-> referenced outputs concatenated along the
            #                     depth (channel) dimension
            elif module_type == "route":
                layers = module["layers"]
                if isinstance(layers, str):
                    # tolerate a raw "n, m" string that was never split
                    layers = layers.split(",")
                sources = [self._resolve_layer_index(int(n), idx) for n in layers]

                if len(sources) == 1:
                    x = outputs[sources[0]]
                else:
                    x = torch.cat([outputs[src] for src in sources], 1)

            ## SHORTCUT LAYERS
            # from = n
            # output = (feature map of previous block)
            #        + (feature map of the block n positions back)
            elif module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[idx - 1] + outputs[idx + from_]

            ## YOLO LAYERS  (TODO: not implemented; x passes through as is)
            # The output is a conv feature map holding the bbox attributes
            # along its depth: the attributes of each predicted box are
            # stacked one after another.
            # So accessing the 3rd bbox of cell (5, 6) requires the index
            #          map[5, 6, 2*(5+C): 3*(5+C)], where C is n_classes
            # This layout is awkward to work with.
            # Another issue: detection happens at 3 different scales
            #              -> dims of the prediction maps differ
            #              -> use predict_transform

            # Cache every block's output so that later route/shortcut blocks
            # can look it up.
            outputs[idx] = x

        return x

    # Backward-compatible alias for the original (misspelled) method name.
    foward = forward