In [None]:
#  Copyright (c) 2021 Arm Limited. All rights reserved.
#  SPDX-License-Identifier: Apache-2.0
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# Train and Deploy your NPU-enabled models

> Using the Arm Corstone-300 with Cortex-M55 and Ethos-U55.

## Summary

This notebook presents a flow to help bridge the gap between data scientists and embedded engineers.

## Training a Model

In this example we are going to train a "toy" model. We will create a basic convolutional neural network model to solve the MNIST problem.

The [MNIST database](http://yann.lecun.com/exdb/mnist/) is a dataset of handwritten digits which can be used to train a digit classifier. It is often used as a starter dataset.

Let's start off by importing the required Python dependencies. For this we will use the [TensorFlow](https://github.com/tensorflow/tensorflow) framework for the model and [TensorFlow Datasets](https://github.com/tensorflow/datasets) to download the MNIST dataset. If you're using Google Colab, these dependencies come preinstalled.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

We can now download the MNIST dataset using TensorFlow datasets.

In [None]:
# Fetch the train and test splits together with the dataset metadata.
# `as_supervised=True` yields (image, label) tuples instead of feature dicts.
(ds_train, ds_test), ds_info = tfds.load(
  'mnist',
  split=['train', 'test'],
  shuffle_files=True,
  as_supervised=True,
  with_info=True,
)

Once downloaded, we write a function to preprocess the MNIST dataset so that it is ready for use in a neural network. The images come in `uint8` format, so to normalize them such that all values lie in `[0, 1]` we divide by `255` (the maximum `uint8` value).

In [None]:
def normalize_img(image, label):
  """Cast `image` to `float32` and divide by 255; `label` is returned untouched.

  Returns:
    A `(scaled_image, label)` tuple.
  """
  image_as_float = tf.cast(image, tf.float32)
  scaled_image = image_as_float / 255.
  return scaled_image, label

Let's apply this function to the dataset using `.map` and take a batch size of `128`.

In [None]:
# Training pipeline: normalize -> cache -> shuffle over the whole split ->
# batches of 128 -> prefetch.
ds_train = (
  ds_train
  .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  .cache()
  .shuffle(ds_info.splits['train'].num_examples)
  .batch(128)
  .prefetch(tf.data.experimental.AUTOTUNE)
)

# Evaluation pipeline: no shuffling, and batching happens before caching.
ds_test = (
  ds_test
  .map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  .batch(128)
  .cache()
  .prefetch(tf.data.experimental.AUTOTUNE)
)

We are now ready to create the model using the `Sequential` functionality. 

Although we could achieve a model with high accuracy using a fully connected model, this would require a lot of weights and biases. The Ethos-U55 is designed to be used with a Cortex-M55, meaning there will be memory limits. For this reason we build a convolutional network with small (3x3) kernels, whose weights are shared across the image, to reduce the number of weights.

In [None]:
# Small CNN for 28x28 single-channel images with 10 output classes.
# The input shape is declared once, on the InputLayer; repeating it on the
# first Conv2D is redundant (and newer Keras versions warn about it).
model = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(input_shape=(28,28,1)),
  tf.keras.layers.Conv2D(32, (3, 3), activation=tf.nn.relu),
  tf.keras.layers.MaxPooling2D((2, 2)),
  tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu),
  tf.keras.layers.MaxPooling2D((2, 2)),
  tf.keras.layers.Conv2D(64, (3, 3), activation=tf.nn.relu),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(64, activation=tf.nn.relu),
  # Softmax output: the model emits class probabilities, not raw logits.
  tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])

We are now ready to train the model. For this toy example we will just train for a single epoch.

In [None]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(0.001),
  # The model's final Dense layer already applies softmax, so its outputs are
  # probabilities rather than logits. `from_logits` must therefore be False;
  # with True the loss would apply a second softmax to the probabilities.
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# One pass over the training set, evaluating on the test split afterwards.
model.fit(ds_train, epochs=1, validation_data=ds_test)

## Quantize the Model

The next step is to quantize the model. This converts the weights from floating-point numbers to integer numbers. The Ethos-U55 supports 8 bit weights, and 8 bit and 16 bit activations. 

In this example we will quantize the model into `int8` format. 

Let's first `unbatch` the dataset from 128 samples at a time. In inference we will only be running one image at a time.

In [None]:
# Undo the batching of 128 applied for training so that each element is a
# single (image, label) pair again.
# NOTE(review): this rebinds `ds_train`, so re-running this cell would unbatch
# a second time -- restart and run all cells in order if in doubt.
ds_train = ds_train.unbatch()

We can then build a generator function to use in the conversion process. 

The generator supplies representative input samples, which allow the TensorFlow Lite converter to calibrate the quantization ranges it uses, based on real input data.

In [None]:
def representative_data_gen():
  """Yield 100 single-image batches from `ds_train`, each wrapped in a list.

  The labels are not needed here and are discarded.
  """
  calibration_samples = ds_train.batch(1).take(100)
  for image_batch, _ in calibration_samples:
    yield [image_batch]

Finally we are ready to convert the model. We can use the `from_keras_model` method to create a converter from our model:

In [None]:
# Build a TFLite converter directly from the in-memory Keras model.
converter = tf.lite.TFLiteConverter.from_keras_model(model)

We can then set the `inference_input_type`, `inference_output_type` and `supported_ops` to `int8`:

In [None]:
# Use int8 tensors at the model's input and output boundary.
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

# Restrict the converter to the int8 builtin op set and turn on the default
# optimizations.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.optimizations = [tf.lite.Optimize.DEFAULT]

We then add the `representative_dataset` to be our generator.

In [None]:
# Hand the generator function itself (not a call to it) to the converter.
converter.representative_dataset = representative_data_gen

The last step is to run the conversion process.

In [None]:
# Run the conversion with the settings configured on `converter`; the result
# is the serialized quantized model.
tflite_model_quant = converter.convert()

We now have a quantized model in TFLite format. Let's save this to our files as `my_model.tflite`:

In [None]:
# Persist the serialized model to the working directory in binary mode.
with open("my_model.tflite", "wb") as model_file:
  model_file.write(tflite_model_quant)

## Vela Compiler

When creating a model for use on Ethos-U55 we need to use the Vela Compiler to optimise the model.

This is a command-line tool written in Python which takes a `.tflite` file and outputs another `.tflite` file. The new file is restructured in a way that Ethos-U understands.

To do this, let's first install `ethos-u-vela` for the compiler and `xxd` which will be used to convert binary files into hexdumps.

In [None]:
!pip install ethos-u-vela
!apt install -y xxd

We can now compile the model. For this we will specify the accelerator config as `ethos-u55-128`. This is one of the commonly used configurations for Ethos-U55, with 128 MACs. We will create a `vela.ini` file with our system configuration description. This information helps Vela optimize the model efficiently.

In [None]:
%%writefile vela.ini

; System configuration; selected on the vela command line with
; --system-config=Ethos_U55_High_End_Embedded.
; The two AXI ports are mapped to Sram and OffChipFlash, and each memory gets
; its own clock scale, burst length and read/write latency settings.
[System_Config.Ethos_U55_High_End_Embedded]
core_clock=500e6
axi0_port=Sram
axi1_port=OffChipFlash
Sram_clock_scale=1.0
Sram_burst_length=32
Sram_read_latency=32
Sram_write_latency=32
OffChipFlash_clock_scale=0.125
OffChipFlash_burst_length=128
OffChipFlash_read_latency=64
OffChipFlash_write_latency=64

; Shared SRAM: the SRAM is shared between the Ethos-U and the Cortex-M software
; The non-SRAM memory is assumed to be read-only
; Selected on the vela command line with --memory-mode=Shared_Sram.
[Memory_Mode.Shared_Sram]
const_mem_area=Axi1
arena_mem_area=Axi0
cache_mem_area=Axi0

In [None]:
%%bash
# Compile my_model.tflite for the Ethos-U55 (128 MAC configuration) using the
# system config and memory mode defined in vela.ini.
vela \
  --accelerator-config=ethos-u55-128 \
  --optimise=Performance \
  --memory-mode=Shared_Sram \
  --system-config=Ethos_U55_High_End_Embedded \
  --config=vela.ini \
  my_model.tflite

We can then convert the `.tflite` binary into a C header file containing a hexdump of the model.

In [None]:
# `xxd -i` emits the file as a C array definition; write it to my_network_model.h.
!xxd -i output/my_model_vela.tflite my_network_model.h

The last step is to do some cleaning up of the file for the application. Here we rename the model from `output_my_model_vela_tflite` to `network_model` and add some header guards to the file.

Most importantly, we add the variable attribute `__attribute__((aligned(16)))` to the model array so that it is aligned to 16 bytes.

In [None]:
!sed -i 's/unsigned int output_my_model_vela_tflite_len/const unsigned int network_model_len/' my_network_model.h
!sed -i 's/unsigned char output_my_model_vela_tflite\[\]/const unsigned char network_model\[\] __attribute__((aligned(16)))/' my_network_model.h

!sed -i '1s/^/#define NETWORK_MODEL_H\n/' my_network_model.h
!sed -i '1s/^/#ifndef NETWORK_MODEL_H\n/' my_network_model.h
!echo "#endif //NETWORK_MODEL_H" >> my_network_model.h