In [1]:
from utils import (
        tvmc_compile_and_unpack, 
        relay_soma_conv2d,
        create_demo_file, 
        parse_cli_options,
        load_or_create_random_array
        )
import tvm
import tvm.relay as relay
import tvm.relay.transform as transform
from tvm.driver.tvmc.model import TVMCModel
from tvm.driver.tvmc.compiler import compile_model
from tvm.relay.backend import Executor, Runtime
import numpy as np

# Define a model in relay

Below, you can find examples of manually constructed relay graphs with the TVM python relay API (https://tvm.apache.org/docs/reference/api/python/relay/index.html).

This is an example graph with a single (compound) 2D convolution that is supported by the digital accelerator.

In [2]:
def create_model_single_layer(seed=None):
    """Build a relay graph containing one quantized 2D convolution block.

    The graph is ``qnn.conv2d -> bias_add -> right_shift -> clip ->
    cast(int8) -> clip`` on a (1, 3, 32, 32) int8 input with a
    (5, 3, 3, 3) int8 kernel and a 5-element int32 bias.

    Parameters
    ----------
    seed : int, optional
        Seed for the randomly generated weights and bias. With the default
        ``None`` the values are drawn from NumPy's global random state, as
        before, so they differ from run to run. Pass an integer to obtain
        the same parameters on every call.

    Returns
    -------
    mod : tvm.ir.IRModule
        Module created from the relay expression.
    params : dict
        Maps ``"weights"`` and ``"bias"`` to ``tvm.nd.array`` values.
    """
    input_shape = (1, 3, 32, 32)
    weights_shape = (5, 3, 3, 3)
    conv_channels = weights_shape[0]
    shift_bits = 0
    weights_name = "weights"
    bias_name = "bias"

    # define variables
    x = relay.var("input", relay.TensorType(input_shape, 'int8'))
    w = relay.var(weights_name, relay.TensorType(weights_shape, 'int8'))
    b = relay.var(bias_name, relay.TensorType((conv_channels,), 'int32'))

    # define weights and bias values
    # A private RandomState is used only when a seed is given, so the default
    # call neither reseeds nor bypasses the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    w_value = rng.uniform(low=-10, high=10, size=weights_shape).astype(np.int8)
    b_value = rng.uniform(low=-10, high=10, size=conv_channels).astype(np.int32)
    params = {weights_name: tvm.nd.array(w_value), bias_name: tvm.nd.array(b_value)}

    # define diana composite convolution op
    # The four constants are presumably the input/kernel zero points (0) and
    # input/kernel scales (1.0) -- confirm against the qnn.conv2d signature.
    x = relay.qnn.op.conv2d(x, w, relay.const(0), relay.const(0), relay.const(1.0), relay.const(1.0), weights_shape[-2:], channels=conv_channels, padding=(1, 1))
    x = relay.op.nn.bias_add(x, b)
    x = relay.op.right_shift(x, relay.const(shift_bits))     # power-of-two quantization scale
    x = relay.op.clip(x, a_min=-128, a_max=127)              # keep values inside the int8 range before the cast
    x = relay.op.cast(x, 'int8')
    x = relay.op.clip(x, a_min=0, a_max=127)                 # Relu

    # create an IR module from the relay expression
    mod = tvm.ir.IRModule.from_expr(x)

    return mod, params

A similar example with two convolutions. In order to make it more readable, it uses our relay_soma_conv2d utility function to construct a 2D convolution.
When the attributes of a convolution don't match the attributes supported by the digital accelerator, the convolution is offloaded to the CPU instead. In that case, a warning is issued to notify the user.

In [3]:
def create_model_two_layers(strides_conv2=(1, 1)):
    """Build a relay graph of two chained convolutions.

    Both layers are constructed with ``relay_soma_conv2d``; ``strides_conv2``
    is forwarded as the ``strides`` argument of the second one.

    Returns a ``(mod, params)`` tuple: the IR module created from the final
    relay expression and the parameters of both layers merged together.
    """
    conv1_shape = (32, 3, 3, 3)
    conv2_shape = (8, 32, 3, 3)

    net = relay.var("input", relay.TensorType((1, 3, 32, 32), 'int8'))

    # first convolution (act=True), bias of all ones
    conv1_weights = load_or_create_random_array("weights1.npy", conv1_shape, np.int8)
    conv1_bias = np.ones(conv1_shape[0]).astype(np.int32)
    net, params = relay_soma_conv2d(net, 'conv1', conv1_shape, conv1_weights,
                                    conv1_bias, act=True, shift_bits=4)

    # second convolution (act=False), caller-selected strides
    conv2_weights = load_or_create_random_array("weights2.npy", conv2_shape, np.int8)
    conv2_bias = np.ones(conv2_shape[0]).astype(np.int32)
    net, conv2_params = relay_soma_conv2d(net, 'conv2', conv2_shape, conv2_weights,
                                          conv2_bias, strides=strides_conv2,
                                          act=False, shift_bits=4)

    # fold the second layer's parameters into the first layer's container
    params.update(conv2_params)

    # create an IR module from the relay expression
    return tvm.ir.IRModule().from_expr(net), params

A similar example with two convolutions and a residual skip connection. Element-wise sums are currently offloaded to the CPU.

In [4]:
def create_model_two_layers_and_residual():
    """Build a relay graph of two chained convolutions plus a skip connection.

    The graph input is added element-wise to the output of the second
    convolution. Both convolutions are constructed with ``relay_soma_conv2d``.

    Returns a ``(mod, params)`` tuple: the IR module created from the sum and
    the parameters of both layers merged together.
    """
    conv1_shape = (16, 32, 3, 3)
    conv2_shape = (32, 16, 3, 3)

    skip = relay.var("input", relay.TensorType((1, 32, 32, 32), 'int8'))

    # NOTE(review): "weights1.npy"/"weights2.npy" are also used by
    # create_model_two_layers with different shapes -- confirm that
    # load_or_create_random_array copes with an existing file of another shape.
    conv1_weights = load_or_create_random_array("weights1.npy", conv1_shape, np.int8)
    conv1_bias = np.ones(conv1_shape[0]).astype(np.int32)
    branch, params = relay_soma_conv2d(skip, 'conv1', conv1_shape, conv1_weights,
                                       conv1_bias, act=True, shift_bits=4)

    conv2_weights = load_or_create_random_array("weights2.npy", conv2_shape, np.int8)
    conv2_bias = np.ones(conv2_shape[0]).astype(np.int32)
    branch, conv2_params = relay_soma_conv2d(branch, 'conv2', conv2_shape, conv2_weights,
                                             conv2_bias, act=False, shift_bits=4)

    # residual: graph input + output of the convolution branch
    total = relay.add(skip, branch)

    # fold the second layer's parameters into the first layer's container
    params.update(conv2_params)

    # create an IR module from the relay expression
    return tvm.ir.IRModule().from_expr(total), params

## Create and show the relay graph

In [5]:
# Build one of the example models defined above. Exactly one line should be
# active; uncomment another line to try a different graph.
mod, params = create_model_single_layer()
#mod, params = create_model_two_layers()
#mod, params = create_model_two_layers((1, 2))   # second convolution with strides (1, 2)
#mod, params = create_model_two_layers_and_residual()
# Show the textual form of the relay module.
print(mod)

def @main(%input: Tensor[(1, 3, 32, 32), int8], %weights: Tensor[(5, 3, 3, 3), int8], %bias: Tensor[(5), int32]) {
  %0 = qnn.conv2d(%input, %weights, 0, 0, 1f, 1f, padding=[1, 1, 1, 1], channels=5, kernel_size=[3, 3], out_dtype="int32");
  %1 = nn.bias_add(%0, %bias);
  %2 = right_shift(%1, 0);
  %3 = clip(%2, a_min=-128f, a_max=127f);
  %4 = cast(%3, dtype="int8");
  clip(%4, a_min=0f, a_max=127f)
}



## Make it a TVMCModel

In [6]:
# Bundle the relay module and its parameters into a TVMCModel, the object
# that is handed to tvmc_compile_and_unpack in the compile step.
model = TVMCModel(mod, params)

# Compile the model to C code

## Compilation options

In [7]:
# Compilation target string; keep exactly one of the two assignments active.
target = "soma_dory, c"      # send supported operations to the digital accelerator, generate C code for the CPU for all other operations
#target = "c"                # generate C code for the CPU only

# Passed as the fuse_layers argument of tvmc_compile_and_unpack.
fuse_layers = True           # enable/disable layer fusion

## Compile the model

Compile the TVM model and unpack the generated .tar file to a given build folder (build_path).

The generated source code can be found in `<build_path>/codegen/host/src` (here: `build/codegen/host/src`).

The output contains a number of C files named `default_libX.c`, where `X` is an incrementing number when more than one file is generated.

In [8]:
# Compile the model with the options chosen above; the build output is placed
# in the 'build' folder (relative to the notebook's working directory).
tvmc_compile_and_unpack(model, target=target, fuse_layers=fuse_layers, build_path='build')


Backend: Matching patterns from generated DORY ONNX to HW Nodes.

Find One other solution, It will not work for real networks with multiple strides = 2

Diana Backend: Adjusting Data Layout to HWC and CoutKCin.

Updating memory occupation and MACs of tensors in layers

Insert tiling parameters per layer inside graph nodes

DORY Backend: Renaming Weights tensors.

Mapping the layers files to their templates and copying the kernels associated.

Generating weight string for tvmgen_default_soma_dory_main_0.
def @main(%input: Tensor[(1, 3, 32, 32), int8] /* ty=Tensor[(1, 3, 32, 32), int8] */) -> Tensor[(1, 5, 32, 32), int8] {
  %0 = reshape(%input, newshape=[768, 4]) /* ty=Tensor[(768, 4), int8] */;
  %1 = reverse(%0, axis=1) /* ty=Tensor[(768, 4), int8] */;
  %2 = reshape(%1, newshape=[1, 3, 32, 32]) /* ty=Tensor[(1, 3, 32, 32), int8] */;
  %3 = @tvmgen_default_soma_dory_main_0(%2) /* ty=Tensor[(1, 5, 32, 32), int8] */;
  %4 = reshape(%3, newshape=[1280, 4]) /* ty=Tensor[(1280, 4), int8] 

[06:23:46] /esat/sol1/users/jvandelm/gitlab-runner/builds/soma_compiler/tvm-fork/src/relay/backend/aot_executor_codegen.cc:497: CreateFuncCall: tvmgen_default_fused_reshape_reverse_reshape -> tir.tvm_check_return(0, -1, tir.call_extern("tvmgen_default_fused_reshape_reverse_reshape", input_buffer_var, sid_1))

[06:23:46] /esat/sol1/users/jvandelm/gitlab-runner/builds/soma_compiler/tvm-fork/src/relay/backend/aot_executor_codegen.cc:497: CreateFuncCall: tvmgen_default_soma_dory_main_0 -> tir.tvm_check_return(0, -1, tir.call_extern("tvmgen_default_soma_dory_main_0", sid_1, sid_2))

[06:23:46] /esat/sol1/users/jvandelm/gitlab-runner/builds/soma_compiler/tvm-fork/src/relay/backend/aot_executor_codegen.cc:497: CreateFuncCall: tvmgen_default_fused_reshape_reverse_reshape_1 -> tir.tvm_check_return(0, -1, tir.call_extern("tvmgen_default_fused_reshape_reverse_reshape_1", sid_2, output_buffer_var))



# Compile generated C code for DIANA
## Copy the DORY runtime library files

The generated model code makes calls to the DORY runtime library, which contains the microkernel implementation of all supported operations for the digital accelerator. We need to include these files into the build. A clone of DORY is installed in `/dory`

In [9]:
# DORY_SRC_DIR: location of the installed DORY clone.
# DORY_DST_DIR: local folder that receives the copied runtime library files.
DORY_SRC_DIR="/dory"
DORY_DST_DIR="dory"

In [10]:
# Create the destination folders (no error if they already exist).
!mkdir -p $DORY_DST_DIR/include
!mkdir -p $DORY_DST_DIR/src

# Copy headers and sources from the dory-hal backend kernels and from the
# Diana_TVM utility files into the destination include/ and src/ folders.
!cp $DORY_SRC_DIR/dory/Hardware_targets/Diana/Backend_Kernels/dory-hal/include/*.h $DORY_DST_DIR/include
!cp $DORY_SRC_DIR/dory/Hardware_targets/Diana/Backend_Kernels/dory-hal/src/*.c $DORY_DST_DIR/src
!cp $DORY_SRC_DIR/dory/Hardware_targets/Diana/Diana_TVM/Utils_files/*.h $DORY_DST_DIR/include
!cp $DORY_SRC_DIR/dory/Hardware_targets/Diana/Diana_TVM/Utils_files/*.c $DORY_DST_DIR/src

## Generate a template application

For this, we:
* copy a few C files (including a malloc wrapper, profiling tools and some required header files)
* generate a template main function

In [11]:
# APP_DST_DIR: local folder for the template application.
# APP_SRC_DIR: folder (relative to the notebook) the application C sources and headers are copied from.
APP_DST_DIR="app"
APP_SRC_DIR="../../byoc"

In [12]:
# Create the application folders (no error if they already exist).
!mkdir -p $APP_DST_DIR/src
!mkdir -p $APP_DST_DIR/include

# Copy the application C sources and headers into them.
!cp $APP_SRC_DIR/src/*.c $APP_DST_DIR/src
!cp $APP_SRC_DIR/include/*.h $APP_DST_DIR/include

In [13]:
# The demo file is generated from the relay module `mod` and written into the
# application's src folder.
create_demo_file(mod, APP_DST_DIR + '/src/demo.c') # generate demo.c, the template main function

Creating demo file: Inferring shapes and types...
Creating demo file: Inferred shapes:
	input (int8):
	 [1 3 32 32]
	output (int8):
	 [1 5 32 32]


## Cross-compile for DIANA

Open a terminal and execute `make -f Makefile.pulprt clean all`. Note that we don't execute this command in the notebook because of the large number of environment variables that need to be set in advance.