In [1]:
%%capture cap
import numpy as np
import warnings; warnings.simplefilter('ignore')
import torch
import torch.nn as nn
from torch.autograd import Variable
import os
import logging
import onnx
import tvm
import tvm.micro as micro
from tvm.contrib import graph_executor, utils
from tvm import relay
import subprocess
from tvm.micro.contrib import zephyr
import sys
%matplotlib inline

In [2]:
# Directory that onnx_relay() switches into (os.chdir) before writing
# lstm.pt and lstm.onnx. This is a machine-specific absolute path; adjust it
# for the local environment before running the notebook.
temp_file_directory = '/home/vagrant/test_models/un1'

In [3]:
# microTVM target description and the Zephyr board the firmware is built for.
TARGET = tvm.target.target.micro("stm32f746xx")
BOARD = "nucleo_f746zg"  # alternative board name: "stm32f746g_disco"

# The Zephyr demo runtime project is looked up relative to the top level of
# the git checkout that contains the current working directory.
repo_root = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"],
    encoding='utf-8',
).strip()
project_dir = os.path.join(repo_root, "apps", "microtvm", "zephyr", "demo_runtime")

compiler = zephyr.ZephyrCompiler(
    project_dir=project_dir,
    board=BOARD,
    zephyr_toolchain_variant="zephyr",
)

# Shared build options, scratch workspace and flasher; build_flash() reads
# `compiler`, `opts` and `flasher` from module scope.
opts = micro.default_options(f"{project_dir}/crt")
workspace = micro.Workspace()
flasher = compiler.flasher()

def onnx_relay(model,L1,L2,L3,node,i_size):
    """Instantiate a network, export it to ONNX and import it into Relay.

    Parameters
    ----------
    model : callable
        Factory (here: the network class) invoked as ``model(L1, L2, L3)``.
    L1, L2, L3
        Positional arguments forwarded to ``model``.
    node
        Name of the ONNX graph input; converted with ``str()`` before use.
    i_size
        Shape of the random tensor used to trace the network, and the shape
        registered for the input in the Relay shape dictionary.

    Returns
    -------
    (mod, params)
        The pair returned by ``relay.frontend.from_onnx``.

    Side effects
    ------------
    Changes the process working directory to ``temp_file_directory`` and
    writes ``lstm.pt`` and ``lstm.onnx`` there. The directory is not restored
    afterwards, so later relative paths resolve inside it.
    """
    net = model(L1, L2, L3)
    print(net)

    param_count = 0
    for tensor in net.parameters():
        param_count += tensor.numel()
    print("Total Number of Parameters: ", param_count)

    # Everything below uses paths relative to the scratch directory.
    os.chdir(str(temp_file_directory))

    # Round-trip the whole module through disk and switch it to eval mode
    # before tracing.
    torch.save(net, "./lstm.pt")
    reloaded = torch.load("./lstm.pt")
    reloaded.eval()

    dummy_input = Variable(torch.randn(i_size))
    torch.onnx.export(reloaded, dummy_input, "lstm.onnx", verbose=False)

    onnx_model = onnx.load(os.path.join(os.getcwd(), "lstm.onnx"))
    shape_dict = {str(node): tuple(i_size)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
    return mod, params


def build_flash(mod,TARGET,params,input_mcu,node):
    """Build a Relay module for the micro target, flash it and run one inference.

    Parameters
    ----------
    mod, params
        Relay module and parameters, as returned by ``onnx_relay``.
    TARGET
        TVM target passed to ``relay.build``.
    input_mcu
        Input data; wrapped with ``tvm.nd.array`` before being set on the graph.
    node
        Name of the graph input; converted with ``str()`` before use.

    Returns
    -------
    torch.Tensor
        Output 0 of the graph, with one extra leading dimension added by
        ``torch.unsqueeze(..., 0)``.

    Notes
    -----
    Reads ``compiler``, ``opts`` and ``flasher`` from module scope. The line
    starting with ``!`` is an IPython shell escape, so this function can only
    be defined and run inside IPython/Jupyter.
    """
    # A fresh workspace per call; this local shadows the module-level `workspace`.
    workspace = tvm.micro.Workspace()
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params)
        micro_binary = tvm.micro.build_static_runtime(workspace,compiler,c_mod,opts,extra_libs=[tvm.micro.get_standalone_crt_lib("memory")],)
        print(os.path.join(micro_binary.base_dir, micro_binary.binary_file))
        print("****************************************************************************")
        # Shell out to the Zephyr SDK `size` tool to report the section sizes of the built binary.
        !~/zephyr-sdk/arm-zephyr-eabi/bin/arm-zephyr-eabi-size {os.path.join(micro_binary.base_dir, micro_binary.binary_file)}
        print("****************************************************************************")

    # Open a session on the flashed binary; it is closed when the `with` block exits.
    with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session:
        graph_mod = tvm.micro.create_local_debug_executor(graph, session.get_system_lib(), session.device)
        # Load the compiled parameters first, then the single named input.
        graph_mod.set_input(**c_params)
        graph_mod.set_input(str(node), tvm.nd.array(input_mcu))
        graph_mod.run()
        tvm_output = graph_mod.get_output(0).asnumpy()
        print(tvm_output.shape)
        # Convert to a torch tensor and prepend one dimension.
        tvm_output = torch.unsqueeze(torch.tensor(tvm_output),0)
        print(tvm_output.shape)
    return tvm_output 

In [4]:
# Per-layer size tables, filled in by the configuration cell below:
#   i_ifc / o_ifc : input / output sizes of the fully connected layers before the LSTM blocks
#   i_l   / o_l   : input / output sizes of the LSTM blocks
#   i_ffc / o_ffc : input / output sizes of the fully connected layers after the LSTM blocks
i_ifc, o_ifc = [], []
i_l, o_l = [], []
i_ffc, o_ffc = [], []

# Per-LSTM-block sequence length and number of stacked layers.
sequence_length = []
stack = []

# Remaining notebook state, all starting at zero.
splits = [0] * 6
flag = index = mode = user = 0

In [5]:
# Ask whether the network should be described interactively.
# The next cell takes the interactive path only when the answer is exactly 1;
# any other integer selects the built-in defaults. A non-integer answer makes
# int() raise ValueError.
print("****************************************************************************")
print("Enable User Input? 1 for Yes 0 for No")
user = int(input())
print("****************************************************************************")

****************************************************************************
Enable User Input? 1 for Yes 0 for No
1
****************************************************************************


In [6]:
# Build the per-layer size tables, either interactively (user == 1) or from
# the built-in defaults (any other value).
#
# Results (read by the following cells):
#   initial_fc_layers, lstm_layers, final_fc_layers, total_layers
#   i_ifc/o_ifc, i_l/o_l, i_ffc/o_ffc, sequence_length, stack
if (user == 1):
    # Start from empty tables so that re-running this cell does not append to
    # values left over from a previous run.
    i_ifc, o_ifc = [], []
    i_l, o_l = [], []
    i_ffc, o_ffc = [], []
    sequence_length, stack = [], []

    print("****************************************************************************")
    initial_fc_layers = int(input("Enter the number of fc layers before lstm blocks"))
    print("****************************************************************************")
    lstm_layers = int(input("MAX number of lstm blocks"))
    print("****************************************************************************")
    final_fc_layers = int(input("Enter the number of fc layers after the lstm blocks"))
    print("****************************************************************************")


    total_layers = initial_fc_layers + lstm_layers + final_fc_layers

    # ---- fully connected layers before the LSTM blocks ----
    if (initial_fc_layers != 0):
        for each in range(0,initial_fc_layers):
            if (each == 0):
                print("****************************************************************************")
                print("Enter Input size of the Neural Network")
                t1 = int(input("Size: "))
                i_ifc.append(t1)
            print("****************************************************************************")
            print("Enter Output size of initial_fc_layer: ", (each+1))
            t2 = int(input("Size: "))
            o_ifc.append(t2)
            # Each layer's output is the next layer's input (except after the last one).
            if (each != (initial_fc_layers-1)):
                i_ifc.append(t2)
    else:
        # Placeholder tables for the *initial* FC section. The final-FC tables
        # must stay untouched here, otherwise the final-FC loop below would
        # append real sizes behind a run of zeros.
        i_ifc= [0]* total_layers
        o_ifc= [0]* total_layers


    # ---- LSTM blocks ----
    if (lstm_layers != 0):
        for each in range(0,lstm_layers):
            print("****************************************************************************")
            # d1: 1 => one common sequence length for all blocks, asked once.
            if (each == 0):
                print("Keep same time sequence processing length of all lstm blocks? 0 for NO and 1 for Yes")
                d1 = int(input())
            if (d1 != 1):
                print("Enter sequence length of lstm layer: ", each+1)
                t3 = int(input("length: "))
                sequence_length.append(t3)
            else:
                if (each == 0):
                    print("Enter common sequence length for all the blocks:")
                    t3 = int(input("length: "))
                    sequence_length = [t3]*lstm_layers
            # d2: 1 => one common stacking depth for all blocks, asked once.
            if (each == 0):
                print("****************************************************************************")
                print("Keep same stacking of all lstm blocks? 0 for NO and 1 for Yes")
                d2 = int(input())
            if (d2 != 1):
                print("****************************************************************************")
                print("Enter stacking layers in lstm layer: ", each+1)
                t4 = int(input("stacked by: "))
                stack.append(t4)
            else:
                if (each == 0):
                    print("Enter common stacking for all the blocks:")
                    t4 = int(input("length: "))
                    stack = [t4]*lstm_layers

            # The first block's input is either asked for or taken from the
            # last initial FC layer.
            if (each == 0):
                if (initial_fc_layers == 0):
                    print("####################################################################")
                    print("Enter Input size of lstm_layer: ", (each+1))
                    t1 = int(input("Size: "))
                    i_l.append(t1)
                else:
                    i_l.append(o_ifc[initial_fc_layers-1])

            print("****************************************************************************")
            print("Enter Output size of lstm_layer: ", (each+1))
            t2 = int(input("Size: "))
            o_l.append(t2)

            # Input size of every later block: the previous block's output is
            # reshaped from (prev_seq, prev_out) to (this_seq, this_in), so the
            # element count prev_seq * prev_out must be preserved.
            if (each>0):
                if (sequence_length[each] == sequence_length[each-1]):
                    i_l.append(o_l[each-1])
                else:
                    a = int((sequence_length[each-1])*(o_l[each-1]))
                    b = int(sequence_length[each])
                    if (a % b != 0):
                        raise ValueError(
                            "lstm layer %d: previous block emits %d values, which cannot be "
                            "reshaped to a sequence of length %d" % (each+1, a, b)
                        )
                    i_l.append(a // b)
    else:
        stack = [1]* total_layers
        sequence_length = [1]* total_layers
        i_l= [0]* total_layers
        o_l= [0]* total_layers


    # ---- fully connected layers after the LSTM blocks ----
    if (final_fc_layers != 0):
        for each in range(0,final_fc_layers):
            if (each == 0):
                if ((initial_fc_layers == 0) and (lstm_layers == 0)):
                    print("****************************************************************************")
                    print("Enter Input size of final_fc_layer: ", (each+1))
                    t1 = int(input("Size: "))
                    i_ffc.append(t1)
                else:
                    if (lstm_layers != 0):
                        i_ffc.append(o_l[lstm_layers-1])
                    else:
                        i_ffc.append(o_ifc[initial_fc_layers-1])

            print("****************************************************************************")
            print("Enter Output size of final_fc_layer: ", (each+1))
            t2 = int(input("Size: "))
            o_ffc.append(t2)
            # Chain output -> next input; compare against the *final* FC count
            # so i_ffc ends up with exactly final_fc_layers entries.
            if (each != (final_fc_layers-1)):
                i_ffc.append(t2)

    else:
        i_ffc= [0]* total_layers
        o_ffc= [0]* total_layers

else:
    print("****************************************************************************")
    print("Using Default values written in the script!!")

    # Default topology: 6 -> 8 -> 16 | lstm 16 -> 8 -> 4 | 4 -> 3 -> 2 -> 1.
    # Every table has one entry per layer and adjacent sizes line up.
    i_ifc = [6,8]
    o_ifc = [8,16]
    i_l= [o_ifc[-1],8]
    o_l= [8,4]
    i_ffc = [o_l[-1],3,2]
    o_ffc = [3,2,1]
    sequence_length = [8,8]
    stack = [1,1]

    # Layer counts are derived from the tables so they cannot drift apart.
    initial_fc_layers = len(o_ifc)
    lstm_layers = len(o_l)
    final_fc_layers = len(o_ffc)

    splits = [0,0]
    flag = 0
    index = 0
    mode = 0 #1
    user = 0

    total_layers = initial_fc_layers + lstm_layers + final_fc_layers
    print("****************************************************************************")

****************************************************************************
Enter the number of fc layers before lstm blocks4
****************************************************************************
MAX number of lstm blocks4
****************************************************************************
Enter the number of fc layers after the lstm blocks4
****************************************************************************
****************************************************************************
Enter Input size of the Neural Network
Size: 6
****************************************************************************
Enter Output size of initial_fc_layer:  1
Size: 8
****************************************************************************
Enter Output size of initial_fc_layer:  2
Size: 10
****************************************************************************
Enter Output size of initial_fc_layer:  3
Size: 12
*******************************************************

In [7]:
# Summarise the configured topology, one "(in)=>(out)" line per layer.
# Each section is guarded by its own layer count, so a section is shown
# exactly when that part of the network exists.
print("Final Network:")
if (initial_fc_layers != 0):
    print("Initial Fully Connected layers:")
    for each in range(0,initial_fc_layers):
        print("("+str(i_ifc[each])+")=>("+str(o_ifc[each])+")")

if (lstm_layers != 0):
    print("lstms:")
    for each in range(0,lstm_layers):
        print("("+str(i_l[each])+")=>("+str(o_l[each])+")")

if (final_fc_layers!= 0): 
    print("Final Fully Connected layers:")
    for each in range(0,final_fc_layers):
        print("("+str(i_ffc[each])+")=>("+str(o_ffc[each])+")")

Final Network:
Initial Fully Connected layers:
(6)=>(8)
(8)=>(10)
(10)=>(12)
(12)=>(16)
lstms:
(16)=>(16)
(16)=>(12)
(12)=>(10)
(10)=>(8)
Final Fully Connected layers:
(8)=>(6)
(6)=>(4)
(4)=>(2)
(2)=>(1)


In [8]:
class LSTM(nn.Module):
    """Configurable stack: optional FC layers -> optional LSTM blocks -> optional FC layers.

    Layer sizes are not passed in; they are read from the notebook-level
    tables ``i_ifc``/``o_ifc``, ``i_l``/``o_l``, ``i_ffc``/``o_ffc``,
    ``stack`` and ``sequence_length``, and the layer counts from
    ``initial_fc_layers``, ``lstm_layers`` and ``final_fc_layers``.

    Parameters
    ----------
    a, b, c
        Non-zero enables the initial FC section, the LSTM section and the
        final FC section respectively (stored as ``p1``, ``p2``, ``p3``).
    """

    def __init__(self,a,b,c):
        super(LSTM, self).__init__()

        self.p1 = a
        self.p2 = b
        self.p3 = c

        # Initial fc layers
        if (self.p1 != 0):
            self.fc = nn.ModuleList(
                [nn.Linear(i_ifc[i], o_ifc[i]) for i in range(0,initial_fc_layers)]
            )

        # lstm layers
        if (self.p2 != 0):
            self.lstm = nn.ModuleList(
                [nn.LSTM(input_size=i_l[i], hidden_size=o_l[i],
                         num_layers=stack[i], batch_first=True)
                 for i in range(0,lstm_layers)]
            )

        # Final fc layers
        if (self.p3 != 0):
            linears2 =[]
            # Width feeding the first final FC layer: the last LSTM block's
            # hidden size when LSTM blocks exist. Without LSTM blocks o_l only
            # holds placeholders, so use i_ffc[0], which forward() also uses
            # to reshape the tensor entering this section.
            if (self.p2 != 0):
                in1 = o_l[lstm_layers-1]
            else:
                in1 = i_ffc[0]
            for i in range(0,final_fc_layers):
                linears2.append(nn.Linear(in1,o_ffc[i]))
                in1 = o_ffc[i]
            self.fc2 = nn.ModuleList(linears2)


    def forward(self, x):
        """Run ``x`` through the enabled sections and return the last section's output.

        ``x`` is reshaped with ``view`` to the width of the first enabled
        section, so its element count must be compatible with that width.
        The ``print`` calls trace which layer is being executed.
        """

        # Scratch list: slot 0 carries the tensor between sections, slots
        # 1..n hold the per-layer outputs inside an FC section.
        out =  [0] * (total_layers+2)

        if (self.p1 != 0):
            x = x.view(-1, i_ifc[0])
            out[0] = x
            for i in range(0,initial_fc_layers):
                print("here ifc", i)
                out[i+1]= self.fc[i](out[i])
            out2 = out[initial_fc_layers]
            out[0] = out2
        else:
            out2 =x


        if (self.p2 != 0):
            out2 = out2.view(1,sequence_length[0],i_l[0])
            for i in range(0,lstm_layers):
                # Zero initial hidden and cell state for this block.
                h= torch.zeros(stack[i], out2.size(0), o_l[i])
                c= torch.zeros(stack[i], out2.size(0), o_l[i])
                print("here l", i)
                ula, (h_out1, _) = self.lstm[i](out2, (h, c))
                if (i == (lstm_layers-1)):
                    # Last block: continue with its final hidden state.
                    h_out = h_out1.view(-1, o_l[i])
                else:
                    # Otherwise reshape the full output sequence to the next
                    # block's (sequence length, input size).
                    out2 = ula.view(1,sequence_length[i+1],i_l[i+1])

            out[0] = h_out

        else:
            # Only a final FC section: feed the raw input straight into it.
            if (self.p3 != 0) and (self.p1 == 0):
                x = x.view(-1, i_ffc[0])
                out[0] = x



        if (self.p3 != 0):
            out[0] = out[0].view(-1, i_ffc[0])
            for i in range(0,final_fc_layers):
                print("here ffc", i)
                out[i+1]= self.fc2[i](out[i])
            out[0]= out[final_fc_layers]

        return out[0]

In [9]:
# Width of the network input = input size of the first section that exists.
# (i_ifc is only meaningful when there are initial FC layers.)
if (initial_fc_layers != 0):
    first_layer_width = i_ifc[0]
elif (lstm_layers != 0):
    first_layer_width = i_l[0]
else:
    first_layer_width = i_ffc[0]

input_size = (1,sequence_length[0],first_layer_width)
# Random test input with values drawn uniformly from [0.1, 0.2).
tensor1 = torch.Tensor(np.random.uniform(low=0.1, high=0.2, size=input_size))
# Graph input name; onnx_relay() and build_flash() both convert it with str().
node = 0
mod , params = onnx_relay(LSTM,initial_fc_layers,lstm_layers,final_fc_layers,node,input_size)
tvm_output = build_flash(mod,TARGET,params,tensor1,node)

LSTM(
  (fc): ModuleList(
    (0): Linear(in_features=6, out_features=8, bias=True)
    (1): Linear(in_features=8, out_features=10, bias=True)
    (2): Linear(in_features=10, out_features=12, bias=True)
    (3): Linear(in_features=12, out_features=16, bias=True)
  )
  (lstm): ModuleList(
    (0): LSTM(16, 16, batch_first=True)
    (1): LSTM(16, 12, batch_first=True)
    (2): LSTM(12, 10, batch_first=True)
    (3): LSTM(10, 8, batch_first=True)
  )
  (fc2): ModuleList(
    (0): Linear(in_features=8, out_features=6, bias=True)
    (1): Linear(in_features=6, out_features=4, bias=True)
    (2): Linear(in_features=4, out_features=2, bias=True)
    (3): Linear(in_features=2, out_features=1, bias=True)
  )
)
Total Number of Parameters:  5797
here l 0
here l 1
here l 2
here l 3


Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 16), 'float32'), ('TENSOR', (64, 16), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 12), 'float32'), ('TENSOR', (48, 12), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 10), 'float32'), ('TENSOR', (40, 10), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 8), 'float32'), ('TENSOR', (32, 8), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regres

/tmp/tmpfwdpz4m3/build/runtime/zephyr/zephyr.elf
****************************************************************************
   text	   data	    bss	    dec	    hex	filename
  51018	   1429	 235616	 288063	  4653f	/tmp/tmpfwdpz4m3/build/runtime/zephyr/zephyr.elf
****************************************************************************
Node Name                                                                                                 Ops                                                                                                       Time(us)   Time(%)  Shape       Inputs  Outputs  
---------                                                                                                 ---                                                                                                       --------   -------  -----       ------  -------  
fused_nn_contrib_dense_pack_add_4                                                                         fused_nn_contrib_dense_pack_a

In [10]:
# Show the device result, then dump the captured text to output.txt.
# NOTE(review): `cap` comes from `%%capture cap` on the import cell, so
# cap.stdout holds that cell's output only, not the build/run log above.
# The relative path resolves against the current working directory, which
# onnx_relay() changed with os.chdir().
print(tvm_output)
with open('output.txt', 'w') as f:
    f.write(cap.stdout)

tensor([[[0.6193]]])


In [11]:
# Inspect the capture object created by `%%capture cap` on the import cell.
print(cap)




In [12]:
# Sanity check: input sizes of the final fully connected layers.
print(i_ffc)

[8, 6, 4, 2]


In [13]:
# Sanity check: output sizes of the final fully connected layers.
print(o_ffc)

[6, 4, 2, 1]
