## Imports - Creating a model and converting it for µTensor

In [1]:
!pip install utensor_cgen

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
from utensor_cgen.api.export import tflm_keras_export

ModuleNotFoundError: No module named 'utensor_cgen.api'

In [3]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras import Model

## Define Model

In [4]:
class MyModel(Model):
    """Small convolutional classifier: conv -> max-pool -> flatten -> dense -> dense.

    The last layer has 10 units and no activation, so the model returns raw
    (un-normalised) scores rather than probabilities.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order in which `call` applies them.
        self.conv1 = Conv2D(filters=32, kernel_size=3, activation='relu')
        self.pool = MaxPooling2D(pool_size=(5, 5))
        self.flatten = Flatten()
        self.d1 = Dense(units=32, activation='relu')
        self.d2 = Dense(units=10)

    def call(self, x):
        """Run the forward pass on the batch `x` and return the output of `d2`."""
        pooled = self.pool(self.conv1(x))
        hidden = self.d1(self.flatten(pooled))
        return self.d2(hidden)

# Create an instance of the model
model = MyModel()

## Training

In [6]:
# Load the MNIST train/test splits through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide the pixel values by 255 and append a trailing channels axis.
x_train = np.expand_dims(x_train / 255.0, axis=-1)
x_test = np.expand_dims(x_test / 255.0, axis=-1)

In [7]:
# Training pipeline: shuffled with a 10000-element buffer, then batched by 32.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(32)
)

# Test pipeline: batched by 32, kept in its original order.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((x_test, y_test))
    .batch(32)
)

In [8]:
# The model outputs logits, hence `from_logits=True` on the loss.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

# Running mean of the loss and running accuracy, tracked separately per split.
train_loss, train_accuracy = (
    tf.keras.metrics.Mean(name='train_loss'),
    tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy'),
)
test_loss, test_accuracy = (
    tf.keras.metrics.Mean(name='test_loss'),
    tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy'),
)

In [9]:
@tf.function
def train_step(images, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Computes the loss of `model` on `images`/`labels` under a gradient tape,
    applies the resulting gradients with `optimizer`, and feeds the batch into
    `train_loss` and `train_accuracy`.
    """
    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        batch_loss = loss_object(labels, logits)

    # Read the variables only after the forward pass above has run.
    variables = model.trainable_variables
    grads = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    train_loss(batch_loss)
    train_accuracy(labels, logits)

In [9]:
@tf.function
def test_step(images, labels):
    """Evaluate one batch with `training=False` and update the test metrics."""
    logits = model(images, training=False)

    test_loss(loss_object(labels, logits))
    test_accuracy(labels, logits)

In [10]:
EPOCHS = 1

for epoch in range(EPOCHS):
    # Start every epoch with freshly reset running metrics.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        metric.reset_states()

    # One full pass over the training data, followed by one over the test data.
    for images, labels in train_ds:
        train_step(images, labels)
    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    # Report the epoch number (1-based) and the accumulated metrics,
    # with accuracies scaled to percent.
    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(
        epoch + 1,
        train_loss.result(),
        train_accuracy.result() * 100,
        test_loss.result(),
        test_accuracy.result() * 100,
    ))

NameError: name 'test_step' is not defined

In [11]:
# Write the trained model to disk; the export step below reads it back from this path.
model.save(filepath='saved_model/test_model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved_model/test_model/assets


## Exporting to uTensor

In [12]:
# Settings for the representative (calibration) data generator:
# how many samples it yields and the dtype each sample is cast to.
num_calibration_steps = 128
calibration_dtype = tf.float32

def representative_dataset_gen():
    """Yield `num_calibration_steps` randomly chosen test images.

    Each yielded item is a one-element list holding a single image from
    `x_test` with a leading batch axis added, cast to `calibration_dtype`.
    Images are drawn with replacement using NumPy's global random state.
    """
    num_samples = x_test.shape[0]
    for _ in range(num_calibration_steps):
        # `np.random.randint` excludes its upper bound, so pass the sample
        # count itself; subtracting 1 made the last test image unreachable.
        rand_idx = np.random.randint(0, num_samples)
        sample = x_test[rand_idx]
        sample = sample[tf.newaxis, ...]
        sample = tf.cast(sample, dtype=calibration_dtype)
        yield [sample]

In [13]:
# Export the saved Keras model for the uTensor target.
# NOTE(review): the keyword is spelled `representive_dataset` on purpose here;
# presumably it matches the utensor_cgen signature - confirm before "fixing" it.
tflm_keras_export(
    'saved_model/test_model',
    target='utensor',
    model_name='my_model',
    representive_dataset=representative_dataset_gen,
)

NameError: ignored

### Generated Cpp Files

####  `models/my_model/my_model.hpp`

```cpp
/* Auto-generated by utensor cli */
#ifndef __MY_MODEL_INTERFACE_H
#define __MY_MODEL_INTERFACE_H
#include "uTensor.h"

using namespace uTensor;

// Generated model wrapper. Declares the operators and the two arena allocators
// that My_model::compute() (see my_model.cpp) wires together.
// NOTE(review): ModelInterface<1, 1> presumably means one input and one output,
// matching the single-entry enums below - confirm against uTensor.
class My_model : public ModelInterface<1, 1> 
{
 public:
  // Names used to index the model's input and output slots.
  enum input_names : uint8_t { input_0 };
  enum output_names : uint8_t { output_0 };
  My_model();
 protected:
  // Evaluates the whole graph; implemented in my_model.cpp.
  virtual void compute();
 private:
  // Operators
  // Note: the numeric suffixes do not follow execution order. compute() runs
  // them as op_000 -> op_004 -> op_005 -> op_002 -> op_003 -> op_001 -> op_006.

  // float -> int8 quantization of the model input
  TflmSymQuantOps::QuantizeOperator<int8_t, float> op_000;

  // final fully connected layer (constructed with no fused activation)
  TflmSymQuantOps::FullyConnectedOperator<int8_t> op_001;

  // reshape of the pooled feature map to { 1, 800 }
  ReferenceOperators::ReshapeOperator<int8_t> op_002;

  // first fully connected layer (constructed with fused ReLU)
  TflmSymQuantOps::FullyConnectedOperator<int8_t> op_003;

  // convolution layer (constructed with fused ReLU)
  TflmSymQuantOps::DepthwiseSeparableConvOperator<int8_t> op_004;

  // max pooling over the convolution output
  ReferenceOperators::MaxPoolOperator<int8_t> op_005;

  // int8 -> float dequantization of the model output
  TflmSymQuantOps::DequantizeOperator<float, int8_t> op_006;

  // memory allocators
  // ram_allocator holds tensor values, metadata_allocator holds operator/tensor
  // metadata (see the comments in the constructor in my_model.cpp).
  // NOTE(review): the first template argument is presumably the arena size in bytes - confirm.
  localCircularArenaAllocator<25432, uint16_t> ram_allocator;
  localCircularArenaAllocator<896, uint16_t> metadata_allocator;
};

#endif // __MY_MODEL_INTERFACE_H
```

#### `models/my_model/my_model.cpp`

```cpp
/* Auto-generated by utensor cli */
#include "uTensor.h"
#include "models/my_model/my_model.hpp"
#include "constants/my_model/params_my_model.hpp"


// Constructs every operator with its fixed configuration and registers this
// model's allocators with the default context.
My_model::My_model () :
// input quantization: no configuration
op_000()
// final fully connected layer: no fused activation
, op_001(TFLM::TfLiteFusedActivation::kTfLiteActNone)
// reshape target shape
, op_002({ 1, 800 })
// first fully connected layer: fused ReLU
, op_003(TFLM::TfLiteFusedActivation::kTfLiteActRelu)
// convolution with fused ReLU
// NOTE(review): the leading arguments are presumably strides, padding,
// depth multiplier and dilation - confirm against the uTensor operator signature.
, op_004({ 1, 1 }, VALID, 32, { 1, 1 }, TFLM::TfLiteFusedActivation::kTfLiteActRelu)
// max pooling
// NOTE(review): presumably pool size { 5, 5 }, strides { 1, 5, 5, 1 }, VALID padding - confirm.
, op_005({ 5, 5 }, { 1, 5, 5, 1 }, VALID)
// output dequantization: no configuration
, op_006()
{
  // meta_allocator and ram_allocator will limit the memory usage for
  // 1. the meta data: the meta data of the operators or the tensors, such as shape, dimensions, ...etc
  // 2. the ram: it's where to store all the values of tensors in the model
  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
  Context::get_default_context()->set_metadata_allocator(&metadata_allocator);
}

// Evaluates the graph once: quantize input -> convolution -> max pool ->
// reshape -> fully connected (ReLU) -> fully connected -> dequantize output.
// Each stage creates its tensors, attaches their quantization parameters,
// evaluates one operator, and then calls free() on tensors it no longer uses.
void My_model::compute()
{
  // update context in case there are multiple models being run
  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
  Context::get_default_context()->set_metadata_allocator(&metadata_allocator);
  // start rendering local snippets

  // ---- op_000: quantize the float model input into an int8 tensor ----
  Tensor t_input_1_int80 = new RamTensor({ 1, 28, 28, 1 }, i8);
    int32_t t_input_1_int80_zp = -128;
    float t_input_1_int80_scale = 0.003921569;
    PerTensorQuantizationParams t_input_1_int80_quant_params(t_input_1_int80_zp, t_input_1_int80_scale);
    t_input_1_int80->set_quantization_params(t_input_1_int80_quant_params);


  op_000
    .set_inputs({
        { TflmSymQuantOps::QuantizeOperator<int8_t, float>::input, inputs[input_0].tensor() },
    })
    .set_outputs({
        { TflmSymQuantOps::QuantizeOperator<int8_t, float>::output, t_input_1_int80}
    })
    .eval();

  // ---- op_004: convolution; filter and bias carry per-channel (32) quantization params ----
  Tensor t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0 = new RomTensor({ 1, 3, 3, 32 }, i8, data_StatefulPartitionedCall_my_model_conv2d_Conv2D_ReadVariableOp_0);
    int32_t arr_t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_zp[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    float arr_t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_scale[32] = { 0.0031114144, 0.002765589, 0.0031076465, 0.004831041, 0.0031503097, 0.0045163357, 0.004590568, 0.0035090088, 0.0015731168, 0.0062146154, 0.0043545393, 0.003243946, 0.003846171, 0.0016646852, 0.0023602743, 0.004048715, 0.0039715217, 0.0019010877, 0.0022127707, 0.0020530561, 0.0020105331, 0.0035584292, 0.0019574412, 0.0036743984, 0.0039481325, 0.0046360553, 0.005471392, 0.003234954, 0.0021135923, 0.0019120594, 0.0028307263, 0.0020734943 };
    PerChannelQuantizationParams t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_quant_params(arr_t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_zp, arr_t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_scale);
    t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0->set_quantization_params(t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0_quant_params);


  Tensor t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0 = new RomTensor({ 32 }, i32, data_StatefulPartitionedCall_my_model_conv2d_Conv2D_bias_0);
    int32_t arr_t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_zp[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    float arr_t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_scale[32] = { 1.2201625e-05, 1.0845448e-05, 1.218685e-05, 1.894526e-05, 1.2354156e-05, 1.7711121e-05, 1.8002229e-05, 1.37608195e-05, 6.1690857e-06, 2.4371042e-05, 1.7076625e-05, 1.2721358e-05, 1.5083025e-05, 6.5281774e-06, 9.2559785e-06, 1.5877315e-05, 1.5574597e-05, 7.4552463e-06, 8.677533e-06, 8.051201e-06, 7.884444e-06, 1.39546255e-05, 7.676241e-06, 1.44094065e-05, 1.5482874e-05, 1.818061e-05, 2.145644e-05, 1.2686095e-05, 8.288598e-06, 7.4982727e-06, 1.11008885e-05, 8.13135e-06 };
    PerChannelQuantizationParams t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_quant_params(arr_t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_zp, arr_t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_scale);
    t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0->set_quantization_params(t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0_quant_params);


  Tensor t_StatefulPartitionedCallmy_modelconv2dRelu0 = new RamTensor({ 1, 26, 26, 32 }, i8);
    int32_t t_StatefulPartitionedCallmy_modelconv2dRelu0_zp = -128;
    float t_StatefulPartitionedCallmy_modelconv2dRelu0_scale = 0.0058112345;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modelconv2dRelu0_quant_params(t_StatefulPartitionedCallmy_modelconv2dRelu0_zp, t_StatefulPartitionedCallmy_modelconv2dRelu0_scale);
    t_StatefulPartitionedCallmy_modelconv2dRelu0->set_quantization_params(t_StatefulPartitionedCallmy_modelconv2dRelu0_quant_params);


  op_004
    .set_inputs({
        { TflmSymQuantOps::DepthwiseSeparableConvOperator<int8_t>::in, t_input_1_int80 },
        { TflmSymQuantOps::DepthwiseSeparableConvOperator<int8_t>::filter, t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0 },
        { TflmSymQuantOps::DepthwiseSeparableConvOperator<int8_t>::bias, t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0 },
    })
    .set_outputs({
        { TflmSymQuantOps::DepthwiseSeparableConvOperator<int8_t>::out, t_StatefulPartitionedCallmy_modelconv2dRelu0}
    })
    .eval();

  // the convolution inputs are not used again after this point
  t_StatefulPartitionedCallmy_modelconv2dConv2DReadVariableOp0.free();

  t_input_1_int80.free();

  t_StatefulPartitionedCallmy_modelconv2dConv2D_bias0.free();

  // ---- op_005: max pooling, { 1, 26, 26, 32 } -> { 1, 5, 5, 32 } ----
  Tensor t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0 = new RamTensor({ 1, 5, 5, 32 }, i8);
    int32_t t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_zp = -128;
    float t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_scale = 0.0058112345;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_quant_params(t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_zp, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_scale);
    t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0->set_quantization_params(t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0_quant_params);


  op_005
    .set_inputs({
        { ReferenceOperators::MaxPoolOperator<int8_t>::in, t_StatefulPartitionedCallmy_modelconv2dRelu0 },
    })
    .set_outputs({
        { ReferenceOperators::MaxPoolOperator<int8_t>::out, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0}
    })
    .eval();

  t_StatefulPartitionedCallmy_modelconv2dRelu0.free();

  // ---- op_002: reshape the pooled map to { 1, 800 } (same zero point and scale) ----
  Tensor t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00 = new RamTensor({ 1, 800 }, i8);
    int32_t t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_zp = -128;
    float t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_scale = 0.0058112345;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_quant_params(t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_zp, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_scale);
    t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00->set_quantization_params(t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00_quant_params);


  op_002
    .set_inputs({
        { ReferenceOperators::ReshapeOperator<int8_t>::input, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0 },
    })
    .set_outputs({
        { ReferenceOperators::ReshapeOperator<int8_t>::output, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00}
    })
    .eval();

  t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool0.free();

  // ---- op_003: first fully connected layer, { 1, 800 } -> { 1, 32 } ----
  Tensor t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0 = new RomTensor({ 800, 32 }, i8, data_StatefulPartitionedCall_my_model_dense_MatMul_ReadVariableOp_transpose_0);
    int32_t t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_zp = 0;
    float t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_scale = 0.004301873;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_quant_params(t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_zp, t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_scale);
    t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0->set_quantization_params(t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0_quant_params);


  Tensor t_StatefulPartitionedCallmy_modeldenseMatMul_bias0 = new RomTensor({ 32 }, i32, data_StatefulPartitionedCall_my_model_dense_MatMul_bias_0);
    int32_t t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_zp = 0;
    float t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_scale = 2.4999194e-05;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_quant_params(t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_zp, t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_scale);
    t_StatefulPartitionedCallmy_modeldenseMatMul_bias0->set_quantization_params(t_StatefulPartitionedCallmy_modeldenseMatMul_bias0_quant_params);


  Tensor t_StatefulPartitionedCallmy_modeldenseRelu0 = new RamTensor({ 1, 32 }, i8);
    int32_t t_StatefulPartitionedCallmy_modeldenseRelu0_zp = -128;
    float t_StatefulPartitionedCallmy_modeldenseRelu0_scale = 0.060061626;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modeldenseRelu0_quant_params(t_StatefulPartitionedCallmy_modeldenseRelu0_zp, t_StatefulPartitionedCallmy_modeldenseRelu0_scale);
    t_StatefulPartitionedCallmy_modeldenseRelu0->set_quantization_params(t_StatefulPartitionedCallmy_modeldenseRelu0_quant_params);


  op_003
    .set_inputs({
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::input, t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00 },
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::filter, t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0 },
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::bias, t_StatefulPartitionedCallmy_modeldenseMatMul_bias0 },
    })
    .set_outputs({
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::output, t_StatefulPartitionedCallmy_modeldenseRelu0}
    })
    .eval();

  t_StatefulPartitionedCallmy_modeldenseMatMul_bias0.free();

  t_StatefulPartitionedCallmy_modelmax_pooling2dMaxPool_0_Reshape00.free();

  t_StatefulPartitionedCallmy_modeldenseMatMulReadVariableOptranspose0.free();

  // ---- op_001: final fully connected layer, { 1, 32 } -> { 1, 10 } ----
  Tensor t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0 = new RomTensor({ 32, 10 }, i8, data_StatefulPartitionedCall_my_model_dense_1_MatMul_ReadVariableOp_transpose_0);
    int32_t t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_zp = 0;
    float t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_scale = 0.0049738125;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_quant_params(t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_zp, t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_scale);
    t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0->set_quantization_params(t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0_quant_params);


  Tensor t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0 = new RomTensor({ 10 }, i32, data_StatefulPartitionedCall_my_model_dense_1_MatMul_bias_0);
    int32_t t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_zp = 0;
    float t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_scale = 0.00029873528;
    PerTensorQuantizationParams t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_quant_params(t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_zp, t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_scale);
    t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0->set_quantization_params(t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0_quant_params);


  Tensor t_Identity_int80 = new RamTensor({ 1, 10 }, i8);
    int32_t t_Identity_int80_zp = 4;
    float t_Identity_int80_scale = 0.13305335;
    PerTensorQuantizationParams t_Identity_int80_quant_params(t_Identity_int80_zp, t_Identity_int80_scale);
    t_Identity_int80->set_quantization_params(t_Identity_int80_quant_params);


  op_001
    .set_inputs({
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::input, t_StatefulPartitionedCallmy_modeldenseRelu0 },
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::filter, t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0 },
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::bias, t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0 },
    })
    .set_outputs({
        { TflmSymQuantOps::FullyConnectedOperator<int8_t>::output, t_Identity_int80}
    })
    .eval();

  t_StatefulPartitionedCallmy_modeldenseRelu0.free();

  t_StatefulPartitionedCallmy_modeldense_1MatMul_bias0.free();

  t_StatefulPartitionedCallmy_modeldense_1MatMulReadVariableOptranspose0.free();

  // ---- op_006: dequantize the int8 result into the caller-provided output tensor ----
  op_006
    .set_inputs({
        { TflmSymQuantOps::DequantizeOperator<float, int8_t>::a, t_Identity_int80 },
    })
    .set_outputs({
        { TflmSymQuantOps::DequantizeOperator<float, int8_t>::b, outputs[output_0].tensor()}
    })
    .eval();

  t_Identity_int80.free();
  // end of rendering local snippets
}
```

### The input data: `input_image.h`

```cpp
// the input image pixel values, 28x28 image is flattened into a 1D array
const float arr_input_image[784] = {
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.32941176470588235, 0.7254901960784313, 0.6235294117647059, 0.592156862745098, 
  0.23529411764705882, 0.1411764705882353, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8705882352941177, 0.996078431372549, 0.996078431372549, 
  0.996078431372549, 0.996078431372549, 0.9450980392156862, 0.7764705882352941, 0.7764705882352941, 0.7764705882352941, 
  0.7764705882352941, 0.7764705882352941, 0.7764705882352941, 0.7764705882352941, 0.7764705882352941, 
  0.6666666666666666, 0.20392156862745098, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2627450980392157, 
  0.4470588235294118, 0.2823529411764706, 0.4470588235294118, 0.6392156862745098, 0.8901960784313725, 0.996078431372549, 
  .8823529411764706, 0.996078431372549, 0.996078431372549, 0.996078431372549, 0.9803921568627451, 0.8980392156862745, 
  0.996078431372549, 0.996078431372549, 0.5490196078431373, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.06666666666666667, 0.25882352941176473, 0.054901960784313725, 0.2627450980392157, 0.2627450980392157, 
  0.2627450980392157, 0.23137254901960785, 0.08235294117647059, 0.9254901960784314, 0.996078431372549, 0.41568627450980394, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.3254901960784314, 0.9921568627450981, 0.8196078431372549, 0.07058823529411765, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08627450980392157, 0.9137254901960784, 1.0, 
  0.3254901960784314, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.5058823529411764, 0.996078431372549, 0.9333333333333333, 0.17254901960784313, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23137254901960785, 
  0.9764705882352941, 0.996078431372549, 0.24313725490196078, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5215686274509804, 0.996078431372549, 0.7333333333333333, 
  0.0196078431372549, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.03529411764705882, 0.803921568627451, 0.9725490196078431, 0.22745098039215686, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.49411764705882355, 
  0.996078431372549, 0.7137254901960784, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.29411764705882354, 0.984313725490196, 0.9411764705882353, 0.2235294117647059, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07450980392156863, 
  0.8666666666666667, 0.996078431372549, 0.6509803921568628, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.011764705882352941, 0.796078431372549, 0.996078431372549, 0.8588235294117647, 0.13725490196078433, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14901960784313725, 
  0.996078431372549, 0.996078431372549, 0.30196078431372547, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12156862745098039, 0.8784313725490196, 0.996078431372549, 0.45098039215686275, 0.00392156862745098, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5215686274509804, 
  0.996078431372549, 0.996078431372549, 0.20392156862745098, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23921568627450981, 0.9490196078431372, 0.996078431372549, 0.996078431372549, 0.20392156862745098, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4745098039215686, 
  0.996078431372549, 0.996078431372549, 0.8588235294117647, 0.1568627450980392, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4745098039215686, 0.996078431372549, 0.8117647058823529, 0.07058823529411765, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
```

### Write `main.cpp`

```cpp
#include <cmath>
#include <iostream>

// include the generated model header and uTensor
// (the exporter was run with model_name='my_model', so the header lives at
// models/my_model/my_model.hpp)
#include "models/my_model/my_model.hpp"
// include the input image data for this demo
#include "input_image.h"
#include "uTensor.h"

using namespace uTensor;

using std::cout;
using std::endl;


// Runs the generated model once on the bundled input image and prints the
// index of the largest of the 10 output values as the predicted label.
int main(int argc, const char** argv) {
  My_model model;
  // create the input/output tensor
  Tensor input_image = new RomTensor({1, 28, 28, 1}, flt, arr_input_image);
  Tensor logits = new RamTensor({1, 10}, flt);
  
  // setup inputs/outputs and eval
  // (both slot names are enums declared on the generated My_model class)
  model
    .set_inputs(
      {{My_model::input_0, input_image}})
    .set_outputs({{My_model::output_0, logits}})
    .eval();
  // arg-max over the 10 outputs; on ties the later index wins because of `>=`
  float max_value = static_cast<float>(logits(0));
  int max_index = 0;
  for (int i = 1; i < 10; ++i) {
    float value = static_cast<float>(logits(i));
    if (value >= max_value) {
      max_value = value;
      max_index = i;
    }
  }
  cout << "pred label: " << max_index << endl;
  return 0;
}
```

### Compile and Run

![end2end-output](images/end2end_keras_output.png)