In [1]:
import numpy as np

# Non-Model Specific Parameters

In [2]:
# Hardware limits. The compartment count is a Loihi N2 constraint; in N3 it is
# configurable on a core by core basis.
numCoresPerChip = 128
numCompartmentsPerCore = 1024

# Fraction by which the xy dimension of each layer is scaled (1 = unchanged)
resolutionMultiplier = 1

# Fraction by which the number of channels in each layer is scaled (1 = unchanged)
thinning = 1


# Tiny Yolo v3

In [3]:
def tinyYoloV3():
    """Describe the convolution layers of the Tiny Yolo v3 model.

    Only convolution layers are listed (open question: how would max pooling
    map to hardware?).

    Returns:
        dict mapping each parameter name ("numFilters", "filter_x", "filter_y",
        "stride", "input_w", "input_h", "input_c", "numGroups", "depthWise")
        to a list with one entry per convolution layer.
    """
    # One row per convolution layer:
    # (number of filters, square filter size, square input size, input channels)
    convRows = [
        (16, 3, 416, 3),
        (32, 3, 208, 16),
        (64, 3, 104, 32),
        (128, 3, 52, 64),
        (256, 3, 26, 128),
        (512, 3, 13, 256),
        (1024, 3, 13, 512),
        (256, 1, 13, 1024),
        (512, 3, 13, 256),
        (255, 1, 13, 512),
        (128, 1, 13, 256),
        (256, 3, 26, 384),
        (255, 1, 26, 256),
    ]
    filterCounts, kernelSizes, inputSizes, inputChannels = (
        list(column) for column in zip(*convRows)
    )
    layerCount = len(convRows)

    return {
        "numFilters": filterCounts,
        "filter_x": list(kernelSizes),
        "filter_y": list(kernelSizes),
        "stride": [1] * layerCount,
        "input_w": list(inputSizes),
        "input_h": list(inputSizes),
        "input_c": inputChannels,
        # Groups and depthwise convolution are not part of the Yolo model
        "numGroups": [1] * layerCount,
        "depthWise": [0] * layerCount,
    }

# MobileNet v2

In [4]:
def mobileNetV2():
    """Describe the convolution layers of the MobileNet v2 model.

    The layer list is built from Table 2 of https://arxiv.org/pdf/1801.04381.pdf:
    an initial full 3x3 convolution, seven groups of bottleneck blocks, and a
    final 1x1 convolution to 1280 channels. Every bottleneck block is modelled
    as three layers (1x1 expansion, 3x3 depthwise, 1x1 projection), including
    the t=1 group.

    Returns:
        dict mapping each parameter name ("numFilters", "filter_x", "filter_y",
        "stride", "input_w", "input_h", "input_c", "numGroups", "depthWise")
        to a list with one entry per layer.
    """

    def appendLayer(layer1, layer2):
        """Concatenate the per-key lists of two layer dicts (layer1 first)."""
        if layer1.keys():
            combined = dict()
            for kk in layer1.keys():
                combined.update({kk: layer1[kk] + layer2[kk]})
        else:
            # Nothing to prepend: the result is just layer2
            combined = layer2

        return combined

    def mobileNetLayerGroup(t, cIn, cOut, n, s, w, h):
        """Create a group of mobileNet v2 layers as per https://arxiv.org/pdf/1801.04381.pdf

        t: expansion factor, cIn/cOut: channels entering/leaving the group,
        n: number of blocks in the group, s: stride of the first block,
        w/h: input width/height of the group.
        """
        def mobileNetLayer(t, cIn, cOut, s, w, h):
            """Create a single mobileNet v2 layer, which consists of multiple internal layers"""
            layerDict = dict(numFilters=[cIn*t, cIn*t, cOut],
                             filter_x=[1,3,1],
                             filter_y=[1,3,1],
                             stride=[1,s,1],            # only the depthwise layer strides
                             input_w=[w,w,w/s],
                             input_h=[h,h,h/s],
                             input_c=[cIn, cIn*t, cIn*t],
                             numGroups=[1,1,1],
                             depthWise=[0,1,0])
            return layerDict

        # The first block in a group implements the stride
        groupDict = mobileNetLayer(t, cIn, cOut, s, w, h)

        # The remaining blocks are identical and have stride 1. They operate on
        # the already-downsampled (w/s x h/s) output of the first block, so
        # passing s here again would downsample a second time.
        layerDict = mobileNetLayer(t, cOut, cOut, 1, w/s, h/s)
        for nn in range(1,n):
            groupDict = appendLayer(groupDict, layerDict)

        return groupDict

    # from https://arxiv.org/pdf/1801.04381.pdf Table 2
    t = [1, 6, 6, 6, 6, 6, 6]
    c = [32, 16, 24, 32, 64, 96, 160, 320]
    cIn  = c[:-1]
    cOut = c[1:]
    n = [1, 2, 3, 4, 3, 3, 1]
    s = [1, 2, 2, 2, 1, 2, 1]
    w = [112, 112, 56, 28, 14, 14, 7]
    h = w

    # First layer is full convolution
    allLayers = dict(numFilters=[32],
                   filter_x=[3],
                   filter_y=[3],
                   stride=[2],
                   input_w=[224],
                   input_h=[224],
                   input_c=[3],
                   numGroups=[1],
                   depthWise=[0])

    # next layers are repeated building blocks
    for gg in range(len(t)):
        groupLayers = mobileNetLayerGroup(t[gg], cIn[gg], cOut[gg], n[gg], s[gg], w[gg], h[gg])
        allLayers = appendLayer(allLayers, groupLayers)

    # Final 1x1 convolution expanding 320 channels to 1280
    lastLayers = dict(numFilters=[1280],
                      filter_x=[1],
                      filter_y=[1],
                      stride=[1],
                      input_w=[7],
                      input_h=[7],
                      input_c=[320],
                      numGroups=[1],
                      depthWise=[0])

    allLayers = appendLayer(allLayers, lastLayers)

    return allLayers

## Choose which model to analyze

In [5]:
# Select the network to analyze; swap which line is commented out to switch models.
allLayers = mobileNetV2()
#allLayers = tinyYoloV3()

## Unpack parameters

In [6]:
# Convert every per-layer list of the chosen model into a numpy array so the
# calculations below can be vectorized across layers.
(numFilters, filter_x, filter_y, stride,
 input_w, input_h, input_c,
 numGroups, depthWise) = (
    np.array(allLayers[paramName])
    for paramName in ("numFilters", "filter_x", "filter_y", "stride",
                      "input_w", "input_h", "input_c",
                      "numGroups", "depthWise")
)

## Output parameters

These parameters are computed without taking into account the core structure of Loihi (i.e. they assume the Loihi system is one big core).

In [7]:
# numSplits is implementation specific, not model specific
# Most likely numSplits would be determined automatically by a mapper
numSplits = np.ones(shape=(numFilters.shape))

# determine how thinning and reducing resolution affects the model
# NOTE: these lines rebind the unpacked arrays, so re-running this cell without
# re-running the unpacking cell applies thinning/resolutionMultiplier again.
numFilters = numFilters*thinning
input_c = input_c*thinning
input_h = input_h*resolutionMultiplier
input_w = input_w*resolutionMultiplier


# channels per filter (a depthwise filter only sees a single channel)
filter_c = input_c/numGroups
filter_c[depthWise==1] = 1

# How many input synapses per neuron
fanIn = filter_x*filter_y*filter_c

# How many neurons (synapses) does each input spike go to.
# An input only reaches the filters that include its channel, i.e. a fraction
# filter_c/input_c of all filters (1/numGroups for grouped convolution,
# 1/input_c for depthwise). With this factor the synapse count is conserved:
# input_w*input_h*input_c*fanOut == numNeurons*fanIn
fanOut = (filter_x/stride)*(filter_y/stride)*numFilters*filter_c/input_c

# One neuron per filter at every output x-y location
numNeurons = (input_w/stride)*(input_h/stride)*numFilters

# Each SynOp is 1 MAC, which is 2 FLOPs in darknet (1 mult + 1 add)
numSynapses = numNeurons*fanIn

# Unique weights per layer (weights are shared across x-y locations)
numWeights = numFilters*fanIn

## Per Core

Thus far the parameters indicate "averages" without quantizing to core boundaries. Actual numbers can be quite different.

In [8]:
# Groups is a form of core splitting where each input only fans out to numFilters/numGroups different filters
# The weight memory is reduced to a fraction (1/numGroups)^2 because the number of filters per core reduces
# by 1/numGroups, and because each filter size reduces by 1/numGroups
# i.e. for 2 groups (numGroups=2), the "numFilters" different filters at a given x-y location could be split
# across 2 cores, each core holding numFilters/numGroups different filters.  Each input would only need to be
# routed to one of these cores. Weight memory gets reduced by 1/2*1/2 because the number of filters per core
# halves and each individual filter size halves
#
# Core splitting does something similar (splits numFilters across "numCoreSplits" cores).
# The number of filters on a single core is now numFilters/numCoreSplits, so weight memory also reduces by
# 1/numCoreSplits.
# Each filter is still full sized so fan-in and fan-out remain the same.
# Each input spike will need to be routed to numCoreSplits different cores to hit all filters (plus additional
# cores if its projection crosses core boundaries)
numCoreSplits = np.ones(shape=numFilters.shape)

# Assuming numFilters/numGroups/numCoreSplits different filters on a single core
# groups can be used to reduce memory/computation requirements at the potential cost of accuracy
# numCoreSplits can be used to split computation across cores (reduce per-core weight memory)
# at no accuracy cost, but requires more spike routing (each core needs to receive the input spikes)
# splitting across cores allows each core to process more x-y region, which also reduces the need
# for routing multiple spikes at core edges, so there is a tradeoff
filtersPerCore = numFilters/numGroups/numCoreSplits

# filter extent in x and y assuming square filters. A number less than 1 cannot be accommodated. A number of 1
# means that only 1 x-y location is processed by the core, so if filters are of size x*y, each input spike will
# need to project to x*y different cores to cover all locations (plus additional cores due to numCoreSplits)
filterExtentPerCore_x = np.sqrt(np.divide(numCompartmentsPerCore,filtersPerCore))
# x could be rounded here (floor) allowing non square y. Example 2 locations per core, x=1 y=2 instead of
# x = y = floor(sqrt(2)) = 1
filterExtentPerCore_y = np.divide(numCompartmentsPerCore,filtersPerCore)/filterExtentPerCore_x

# weights per core is a measure of core memory needed to store weights
# core splitting (numCoreSplits) and grouping (numGroups) reduce weightsPerCore
weightsPerCore = filtersPerCore*fanIn

# how many synapses on each core
# core splitting (numCoreSplits) and grouping (numGroups) reduce synapsesPerCore
synapsesPerCore = filtersPerCore*fanIn*filterExtentPerCore_x*filterExtentPerCore_y

# Everything else revolves around this number which we hardcoded above, but if we choose to fix a different number
# in the future (N3 configurable, or something else is the bottleneck) this is how it would be calculated from
# other parameters
compartmentsPerCore = filtersPerCore*filterExtentPerCore_x*filterExtentPerCore_y

# One axon routes to synapses for all filters and all x-y within extent (or core, depending on which is small)
# assumes there is no gap in the input (stride<=filter_x) and (stride<=filter_y)
# .all() must be called: the bare bound method `.all` is always truthy, which would make this check a no-op
assert (stride<=filter_x).all() and (stride<=filter_y).all(), "stride must be less than or equal to filter size"
axonsPerCore = np.divide(input_c,numGroups)*(filterExtentPerCore_x*stride-1+filter_x)*(filterExtentPerCore_y*stride-1+filter_y)

# How many cores and chips for this layer
numCores = numNeurons/compartmentsPerCore
numChips = numCores/numCoresPerChip

# How many cores does each input spike need to route to?
# NOTE(review): the comments above say each spike must reach numCoreSplits cores to hit all filters, which
# suggests multiplying by numCoreSplits rather than dividing. No effect while numCoreSplits is all ones -
# confirm before using core splitting.
numCoresPerSpike = (np.divide(filter_x,stride)/filterExtentPerCore_x)*(np.divide(filter_y,stride)/filterExtentPerCore_y)/numCoreSplits

## Print the results

In [9]:
# Print one table row per layer. In the Weights / Compartments / Synapses
# columns the first number is the whole-layer total and the number in
# parentheses is the per-core value; Axons shows only the per-core value.
np.set_printoptions(precision=2, suppress=True)
# Column headers
print(\
    "  #  |"\
      + "     Input Size   |"\
      + "  Filter Size \t|"\
      + "Str Grp Spl|"\
      + "\t  Weights    |"\
      + "  Compartments  |"\
      + "    Synapses    |"\
      + "Axons\t|"\
      + "cor/spk|"\
      + "core"\
     )
# One row per layer
for ii in range(len(numFilters)):
    print(\
          "{:3d}  "                 #"|Layer number|"\
          "|{:3.0f} x {:3.0f} x {:4.0f}\t"         #"Input Size|"\
          "|{:2.0f} x {:2.0f} x {:4.0f}\t"         #"Filter Size|"\
          "|{:2.0f} {:3.0f} {:3.0f} "                 #"|Stride, Groups, Core Split|"\
          "|{:1.1e}({:1.1e})"           #"|Weights|"\
          "|{:1.1e}({:1.1e})"   #"|Compartments|"\
          "|{:1.1e}({:1.1e})"             #"|Synapses|"\
          "|({:.0f})\t"               #"|Axons|"\
          "|{:.2f}\t"                 #"|Cores/spike|"\
          "|{:.0f}"                 #"|Cores|"\
          .format( \
          ii,                      #"|Layer number|"\
          input_w[ii], input_h[ii], input_c[ii],#"Input Size|"\
          filter_x[ii], filter_y[ii], filter_c[ii],#"Filter Size|"\
          stride[ii],              #"|Stride|"\
          numGroups[ii],           #"|Groups|"\
          numCoreSplits[ii],       #"|Core Split|"\
          numWeights[ii], weightsPerCore[ii],#"|Weights|"\
          numNeurons[ii], compartmentsPerCore[ii],#"|Compartments|"\
          numSynapses[ii], synapsesPerCore[ii],#"|Synapses|"\
          axonsPerCore[ii],        #"|Axons|"\
          numCoresPerSpike[ii],    #"|Cores/spike|"\
          numCores[ii]                 #"|Cores|"\
         ))
    
# Totals row: sums over all layers for the columns where a sum is meaningful
print(\
      "     "                 #"|Layer number|"\
      "|\t\t\t"         #"Input Size|"\
      "|\t\t"         #"Filter Size|"\
      "|\t    "                 #"|Stride|"\
      "|{:1.1e}\t     "           #"|Weights|"\
      "|{:1.1e}\t      "   #"|Compartments|"\
      "|{:1.1e}\t       "             #"|Synapses|"\
      "|\t"               #"|Axons|"\
      "|\t"                 #"|Cores/spike|"\
      "|{:.0f}"                 #"|Cores|"\
      .format( \
              np.sum(numWeights),#"|Weights|"\
              np.sum(numNeurons),#"|Compartments|"\
              np.sum(numSynapses),#"|Synapses|"\
              np.sum(numCores)                 #"|Cores|"\
             ))

  #  |     Input Size   |  Filter Size 	|Str Grp Spl|	  Weights    |  Compartments  |    Synapses    |Axons	|cor/spk|core
  0  |224 x 224 x    3	| 3 x  3 x    3	| 2   1   1 |8.6e+02(8.6e+02)|4.0e+05(1.0e+03)|1.1e+07(2.8e+04)|(532)	|0.07	|392
  1  |112 x 112 x   32	| 1 x  1 x   32	| 1   1   1 |1.0e+03(1.0e+03)|4.0e+05(1.0e+03)|1.3e+07(3.3e+04)|(1024)	|0.03	|392
  2  |112 x 112 x   32	| 3 x  3 x    1	| 1   1   1 |2.9e+02(2.9e+02)|4.0e+05(1.0e+03)|3.6e+06(9.2e+03)|(1876)	|0.28	|392
  3  |112 x 112 x   32	| 1 x  1 x   32	| 1   1   1 |5.1e+02(5.1e+02)|2.0e+05(1.0e+03)|6.4e+06(3.3e+04)|(2048)	|0.02	|196
  4  |112 x 112 x   16	| 1 x  1 x   16	| 1   1   1 |1.5e+03(1.5e+03)|1.2e+06(1.0e+03)|1.9e+07(1.6e+04)|(171)	|0.09	|1176
  5  |112 x 112 x   96	| 3 x  3 x    1	| 2   1   1 |8.6e+02(8.6e+02)|3.0e+05(1.0e+03)|2.7e+06(9.2e+03)|(6988)	|0.21	|294
  6  | 56 x  56 x   96	| 1 x  1 x   96	| 1   1   1 |2.3e+03(2.3e+03)|7.5e+04(1.0e+03)|7.2e+06(9.8e+04)|(4096)	|0.02	|74
  7  | 56 x  56 x   24	| 1 x  1 x