In [1]:
from nntool.api import NNGraph
from nntool.api.utils import model_settings, quantization_options, RandomIter, qsnr
from nntool.quantization.qtype import QType
import numpy as np
import random, os
from PIL import Image
from scipy.spatial import distance
from numpy.linalg import norm
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib widget
import logging
logging.basicConfig(level=logging.ERROR)

## Load Net and Print Stats

In [2]:
# Load the same TFLite model twice so that two independent graph objects
# (G1 and G2) are available; load_quantization=False is passed for both.
G1, G2 = (
    NNGraph.load_graph("face_detection_front.tflite", load_quantization=False)
    for _ in range(2)
)

# Figures reported in the summary below (taken from G1 only).
max_activ_size, total_params = G1.total_memory_usage
ops = G1.total_ops
# Uncomment to render the graph to a file and open it:
#G1.draw(filepath="draw", view=True)

summary_lines = [
    f"{G1.name}:",
    f"\tMax Active Size:\t{max_activ_size} elements",
    f"\tTotal # Parameters:\t{total_params} elements",
    f"\tTotal # Operations:\t{ops / 1e6:.2f} MOps",
]
print("\n".join(summary_lines))

face_detection_front:
	Max Active Size:	344064 elements
	Total # Parameters:	101390 elements
	Total # Operations:	31.86 MOps


## Quantize Net

In [3]:
# Statistics are collected first, on the graph as loaded, from NNTool's fake
# random input iterator.
stats = G1.collect_statistics(RandomIter.fake(G1))

# Drop the two concat nodes (CONCAT_0_163 first, then CONCAT_0_162).
for concat_name in ("CONCAT_0_163", "CONCAT_0_162"):
    G1.remove_nodes(G1[concat_name], leave=False, up=False)

# Flag the four output nodes with fixed_order.
for output_name in ('output_1', 'output_2', 'output_3', 'output_4'):
    G1.node(output_name).fixed_order = True

# Graph-wide scheme is FLOAT/float16; only input_1 gets an explicit uint8 qtype
# built from the range [-1, 1].
# Alternative that was tried for the input:
#   node_options={n: quantization_options(scheme="SQ8", sq_bits=8) for n in ["input_1"]}
G1.quantize(
    stats,
    graph_options=quantization_options(scheme="FLOAT", float_type="float16"),
    node_options={
        'input_1': {'qtype_ind': QType.from_min_max_sq(-1, 1, dtype=np.uint8)},
    },
)

G1.adjust_order()
G1.add_dimensions()
for fusion_name in ('scaled_match_group', 'expression_matcher'):
    G1.fusions(fusion_name)
# Resizer on the first node with a (480, 480) size; for this graph the spatial
# axes are given as (1, 2).
G1.insert_resizer(G1[0], (480, 480), spatial_axes=(1, 2))



In [4]:
# Statistics are collected first, on the graph as loaded, from NNTool's fake
# random input iterator.
stats = G2.collect_statistics(RandomIter.fake(G2))

# Drop the two concat nodes (CONCAT_0_163 first, then CONCAT_0_162).
for concat_name in ("CONCAT_0_163", "CONCAT_0_162"):
    G2.remove_nodes(G2[concat_name], leave=False, up=False)

# Flag the four output nodes with fixed_order.
for output_name in ('output_1', 'output_2', 'output_3', 'output_4'):
    G2.node(output_name).fixed_order = True

# Same FLOAT/float16 scheme as G1 but with hwc=True; only input_1 gets an
# explicit uint8 qtype built from the range [-1, 1].
# Alternative that was tried for the input:
#   node_options={n: quantization_options(scheme="SQ8", sq_bits=8) for n in ["input_1"]}
G2.quantize(
    stats,
    graph_options=quantization_options(hwc=True, scheme="FLOAT", float_type="float16"),
    node_options={
        'input_1': {'qtype_ind': QType.from_min_max_sq(-1, 1, dtype=np.uint8)},
    },
)

G2.adjust_order()
G2.add_dimensions()
for fusion_name in ('scaled_match_group', 'expression_matcher'):
    G2.fusions(fusion_name)
# Resizer on the first node with a (480, 480) size; for this graph the spatial
# axes are given as (0, 1).
G2.insert_resizer(G2[0], (480, 480), spatial_axes=(0, 1))


## Eval Quantization

In [5]:
# --- G1 input -----------------------------------------------------------------
# Read the test image, scale pixels with x / 128 - 1 and move the last axis to
# the front with transpose(2, 0, 1). G1 was quantized without hwc=True, so it
# presumably expects channel-first data - TODO confirm.
G1_francesco_1 = np.array(Image.open("../input_rgb.ppm")).astype(np.float32)
G1_francesco_1 = (G1_francesco_1 / 128) - 1
G1_francesco_1 = G1_francesco_1.transpose(2, 0, 1)
G1_float_execution_0 = G1.execute([G1_francesco_1], quantize=False)
G1_quant_execution_0 = G1.execute([G1_francesco_1], quantize=True, dequantize=True)

# --- G2 input -----------------------------------------------------------------
# Same image and scaling, but no transpose: G2 was quantized with hwc=True, so
# the array is passed in the layout produced by PIL.
G2_francesco_1 = np.array(Image.open("../input_rgb.ppm")).astype(np.float32)
G2_francesco_1 = (G2_francesco_1 / 128) - 1
G2_float_execution_0 = G2.execute([G2_francesco_1], quantize=False)
G2_quant_execution_0 = G2.execute([G2_francesco_1], quantize=True, dequantize=True)

# --- QSNR at the graph outputs (float run vs quantized+dequantized run) --------
# For a per-layer breakdown, enumerate the list returned by G1.qsnrs(...) and
# skip entries whose node is a ConstantInputNode (from nntool.graph.types).
for graph, float_run, quant_run in (
    (G1, G1_float_execution_0, G1_quant_execution_0),
    (G2, G2_float_execution_0, G2_quant_execution_0),
):
    qsnrs = graph.qsnrs(float_run, quant_run)
    print([qsnrs[node.step_idx] for node in graph.output_nodes()])


  result = np.matmul(im2col_buff, weights_mat).transpose((1, 0))
  slabhw = np.multiply(in_tensor_padded[cur_h * filt_dil_h:


[42, 38, 39, 42]
[42, 38, 39, 42]


In [58]:
# Reference run inside NNTool with quantize=True and dequantize=False, so the
# recorded tensors stay in the quantized representation and can be compared
# directly with what the target produces.
G1_int_execution = G1.execute([G1_francesco_1], dequantize=False, quantize=True)

# Autotiler options: make the autotiler allocate the input of the network and
# reuse that space after the first layer - more L2 for the rest of the network.
G1[0].at_options.allocate = 1

# Build and run the generated project on the gvsoc platform under FreeRTOS.
# The memory settings match the G2 target run so both runs are comparable
# (l2_size was 9000000 here, ten times the value used for G2).
G1_res = G1.execute_on_target(
    pmsis_os='freertos',
    platform="gvsoc",
    directory="test_run_G1",
    input_tensors=G1_int_execution[0],
    output_tensors=4,
    write_out_to_file=True,
    at_log=True,
    dont_run=False,
    settings=model_settings(
        graph_dump_tensor_to_file=True,
        l1_size=110000,
        l2_size=900000,
        graph_l2_static_memory_budget=200000,
        tensor_directory="./tensors",
    ),
    cmake=True,
    at_loglevel=1,
    print_output=True
)

# Show the tail of the autotiler log (the very last line is skipped).
for log_line in G1_res.at_log[-29:-1]:
    print(log_line)


Script started, output log file is '/tmp/tmpjgcaakzu/log.txt'.
-- [[36mCustom BSP[m] Custom BSP unused.
-- [[1;36mPython[m] Interpreter found here /home/francesco/libraries/anaconda3/envs/std_sdk/bin/python3.10
-- [[1;32mKconfig[m] KCONFIG_CONFIG environnement variable has not been exported.
-- [[1;32mKconfig[m] Using default value sdk: "sdk.config" and app: "sdk.config".
-- [[1;36mNNTOOL[m] Using nntool in /home/francesco/works/release/gitlab_sdk/tools/nntool/scripts/nntool
-- [[1;32mSFU[m] Using SFU in 
-- [1;31m[ CMAKE_C_FLAGS ][m  -march=rv32imcxgap9 -mPE=8 -mFC=1 -mint64  -fcommon -fno-jump-tables -fno-tree-loop-distribute-patterns -fno-delete-null-pointer-checks -fomit-frame-pointer -Os -fmessage-length=0 -fno-exceptions -ffunction-sections -fdata-sections -funsigned-char -Wall -Wextra -Werror -Wno-unused-parameter -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-implicit-fallthrough -g -gdwarf-2 -gstrict-dwarf  -Wno-discarded-qualifiers
-

SchedulePriorTracktype called
GetDntDataPriorQueue sql exec success




	 *** NNTOOL face_detection_front Example ***

Entering main controller
FC Frequency = 370000000 Hz CL Frequency = 370000000 Hz PERIPH Frequency = 370000000 Hz
Voltage: 800mV
Constructor
L1 Promotion copy took 2179 FC Cycles
Call cluster
Start timer
Node: S1_input_1_resizer, Argument: S1_Output, Dim: 3, [1][1][3][128][128] ItemSize: 1 ----> Writing to: S1_input_1_resizer_ArgNameS1_Output_ItemSize-1_Dim3_1x1x3x128x128.dat
Node: S2_input_1_qout0, Argument: S2_Output, Dim: 2, [1][1][1][1][49152] ItemSize: 2 ----> Writing to: S2_input_1_qout0_ArgNameS2_Output_ItemSize-2_Dim2_1x1x1x1x49152.dat
Node: S5_CONV_2D_0_2_fusion, Argument: S5_Output, Dim: 3, [1][1][24][64][64] ItemSize: 2 ----> Writing to: S5_CONV_2D_0_2_fusion_ArgNameS5_Output_ItemSize-2_Dim3_1x1x24x64x64.dat
Node: S8_DW_CONV_2D_0_6, Argument: S8_Output, Dim: 3, [1][1][24][64][64] ItemSize: 2 ----> Writing to: S8_DW_CONV_2D_0_6_ArgNameS8_Output_ItemSize-2_Dim3_1x1x24x64x64.dat


KeyboardInterrupt: 

In [62]:
# Reference run inside NNTool with quantize=True and dequantize=False, so the
# recorded tensors stay in the quantized representation and can be compared
# directly with what the target produces.
G2_int_execution = G2.execute([G2_francesco_1], dequantize=False, quantize=True)

# Autotiler options: make the autotiler allocate the input of the network and
# reuse that space after the first layer - more L2 for the rest of the network.
G2[0].at_options.allocate = 1

# Build and run the generated project on the gvsoc platform under FreeRTOS.
G2_res = G2.execute_on_target(
    pmsis_os='freertos',
    platform="gvsoc",
    directory="test_run_G2",
    input_tensors=G2_int_execution[0],
    output_tensors=4,
    write_out_to_file=True,
    at_log=True,
    dont_run=False,
    settings=model_settings(
        graph_dump_tensor_to_file=True,
        l1_size=110000,
        l2_size=900000,
        graph_l2_static_memory_budget=200000,
        tensor_directory="./tensors",
    ),
    cmake=True,
    at_loglevel=1,
    print_output=True
)

# Show the tail of the autotiler log (the very last line is skipped).
for log_line in G2_res.at_log[-29:-1]:
    print(log_line)


Script started, output log file is '/tmp/tmpt20utcff/log.txt'.
-- [[36mCustom BSP[m] Custom BSP unused.
-- [[1;36mPython[m] Interpreter found here /home/francesco/libraries/anaconda3/envs/std_sdk/bin/python3.10
-- [[1;32mKconfig[m] KCONFIG_CONFIG environnement variable has not been exported.
-- [[1;32mKconfig[m] Using default value sdk: "sdk.config" and app: "sdk.config".
-- [[1;36mNNTOOL[m] Using nntool in /home/francesco/works/release/gitlab_sdk/tools/nntool/scripts/nntool
-- [[1;32mSFU[m] Using SFU in 
-- [1;31m[ CMAKE_C_FLAGS ][m  -march=rv32imcxgap9 -mPE=8 -mFC=1 -mint64  -fcommon -fno-jump-tables -fno-tree-loop-distribute-patterns -fno-delete-null-pointer-checks -fomit-frame-pointer -Os -fmessage-length=0 -fno-exceptions -ffunction-sections -fdata-sections -funsigned-char -Wall -Wextra -Werror -Wno-unused-parameter -Wno-unused-function -Wno-unused-variable -Wno-unused-but-set-variable -Wno-implicit-fallthrough -g -gdwarf-2 -gstrict-dwarf  -Wno-discarded-qualifiers
-

SchedulePriorTracktype called
GetDntDataPriorQueue sql exec success


[ 64%] [34m[1mRunning Genface_detection_front to create Kernel C files : face_detection_frontKernels.c face_detection_frontKernels.h[0m
[34m[1mRUNNING AUTOTILER MODEL - START[0m

Flash image face_detection_front_L3_Flash_Const.dat (size 202788) for device AT_MEM_L3_DEFAULTFLASH successfuly generated

      Shared L1 Memory size (Bytes)       : Given:     115712, Used:     115688
             L2 Memory size (Bytes)       : Given:     900000, Used:     899992
      L2 Static Memory Usage (Bytes)      :                              159640
         L2 Dyn Memory Usage (Bytes)      :                              740352
     DefaultRam Memory size (Bytes)       : Given:    8000000, Used:     355436
   DefaultFlash Memory size (Bytes)       : Given:   67108864, Used:     202788
    Total Flash Used at runtime (Bytes)   :                    Used:      43148 (0+0)

L3 Memory bandwidth for 1 graph run       :     552044 Bytes
L2 Memory bandwidth for 1 graph run       :   12032676 Bytes
Sum

ScheduleTimerQuery called
GetDntDataTotal sql exec success
GetDntDataQueue sql exec success


Node: S45_DW_CONV_2D_0_42, Argument: S45_Output, Dim: 3, [1][1][32][32][36] ItemSize: 2 ----> Writing to: S45_DW_CONV_2D_0_42_ArgNameS45_Output_ItemSize-2_Dim3_1x1x32x32x36.dat
Node: S48_CONV_2D_0_45, Argument: S48_Output, Dim: 3, [1][1][32][32][42] ItemSize: 2 ----> Writing to: S48_CONV_2D_0_45_ArgNameS48_Output_ItemSize-2_Dim3_1x1x32x32x42.dat
Node: S49_PADDED_ADD_0_47_trans_in1, Argument: S49_Output, Dim: 3, [1][1][1][42][1024] ItemSize: 2 ----> Writing to: S49_PADDED_ADD_0_47_trans_in1_ArgNameS49_Output_ItemSize-2_Dim3_1x1x1x42x1024.dat
Node: S50_PADDED_ADD_0_47PadBot, Argument: S50_Output, Dim: 3, [1][1][6][32][32] ItemSize: 2 ----> Writing to: S50_PADDED_ADD_0_47PadBot_ArgNameS50_Output_ItemSize-2_Dim3_1x1x6x32x32.dat
Node: S50_PADDED_ADD_0_47Body, Argument: S50_Output, Dim: 3, [1][1][36][32][32] ItemSize: 2 ----> Writing to: S50_PADDED_ADD_0_47Body_ArgNameS50_Output_ItemSize-2_Dim3_1x1x36x32x32.dat
Node: S51_PADDED_ADD_0_47_trans_out0, Argument: S51_Output, Dim: 3, [1][1][1][102

SchedulePriorTracktype called
GetDntDataPriorQueue sql exec success


Node: S107_PADDED_ADD_0_102_trans_out0, Argument: S107_Output, Dim: 3, [1][1][1][256][88] ItemSize: 2 ----> Writing to: S107_PADDED_ADD_0_102_trans_out0_ArgNameS107_Output_ItemSize-2_Dim3_1x1x1x256x88.dat
Node: S149_CONV_2D_0_148, Argument: Output_2, Dim: 3, [1][1][16][16][2] ItemSize: 2 ----> Writing to: S149_CONV_2D_0_148_ArgNameOutput_2_ItemSize-2_Dim3_1x1x16x16x2.dat
Node: S108_MAX_POOL_2D_0_107, Argument: S108_Output, Dim: 3, [1][1][88][8][8] ItemSize: 2 ----> Writing to: S108_MAX_POOL_2D_0_107_ArgNameS108_Output_ItemSize-2_Dim3_1x1x88x8x8.dat
Node: S109_PADDED_ADD_0_112_trans_in0, Argument: S109_Output, Dim: 3, [1][1][1][88][64] ItemSize: 2 ----> Writing to: S109_PADDED_ADD_0_112_trans_in0_ArgNameS109_Output_ItemSize-2_Dim3_1x1x1x88x64.dat
Node: S112_DW_CONV_2D_0_106, Argument: S112_Output, Dim: 3, [1][1][8][8][88] ItemSize: 2 ----> Writing to: S112_DW_CONV_2D_0_106_ArgNameS112_Output_ItemSize-2_Dim3_1x1x8x8x88.dat
Node: S115_CONV_2D_0_110, Argument: S115_Output, Dim: 3, [1][1][8

In [12]:
from nntool.graph.types import ConstantInputNode

# Per-layer QSNR between the tensors returned by the target run (G1_res) and
# NNTool's own quantized execution of G1. Constant input nodes are skipped.
G1_qsnrs_by_layer = G1.qsnrs(G1_res.output_tensors, G1_int_execution)
print("QSNR layer by layer (skipping constant layers like weights and biases):")
print(f"{'Layer Name':>30} (  #): {'QSNR':4}")
for i, q in enumerate(G1_qsnrs_by_layer):
    if isinstance(G1[i], ConstantInputNode):
        continue
    print(f"{G1[i].name:>30} ({i}): {q}")

# Per-output QSNR: each output is read back from the .bin file found in the
# target run directory, using the dtype of the output node's first input qtype,
# and compared with the flattened NNTool result for the same node.
print("Out Name, QSNR NNTool vs Target run")
for outn in G1.output_nodes():
    out_target = np.fromfile(f"test_run_G1/{outn.name.capitalize()}.bin", G1.quantization[outn.name].in_qs[0].dtype)
    out_nntool = G1_int_execution[G1[outn.name].step_idx]
    print(f"{outn.name}, {G1.qsnrs([out_target], [out_nntool[0].flatten()])}")

Out Name, QSNR NNTool vs Target run
                       input_1 (0): None
               input_1_resizer (1): -inf
                 input_1_qout0 (2): 74
            CONV_2D_0_2_fusion (5): 62
                DW_CONV_2D_0_6 (8): 57
                   CONV_2D_0_9 (11): 54
               ADD_0_10_fusion (12): 52
               DW_CONV_2D_0_14 (15): 53
                  CONV_2D_0_17 (18): 52
               PADDED_ADD_0_19 (19): 51
              MAX_POOL_2D_0_24 (20): 53
               DW_CONV_2D_0_23 (23): 56
                  CONV_2D_0_27 (26): 52
               PADDED_ADD_0_29 (27): 53
               DW_CONV_2D_0_33 (30): 53
                  CONV_2D_0_36 (33): 50
               PADDED_ADD_0_38 (34): 52
               DW_CONV_2D_0_42 (37): 52
                  CONV_2D_0_45 (40): 52
               PADDED_ADD_0_47 (41): 49
              MAX_POOL_2D_0_52 (42): 51
               DW_CONV_2D_0_51 (45): 52
                  CONV_2D_0_55 (48): 51
               PADDED_ADD_0_57 (49): 51
     

In [46]:
# Per-layer QSNR between the tensors returned by the target run (G2_res) and
# NNTool's own quantized execution of G2.
from nntool.graph.types import ConstantInputNode
G2_qsnrs_by_layer = G2.qsnrs(G2_res.output_tensors, G2_int_execution)
print("QSNR layer by layer (skipping constant layers like weights and biases):")
print(f"{'Layer Name':>30} (  #): {'QSNR':4}")
for i, q in enumerate(G2_qsnrs_by_layer):
    # Only non-constant nodes are listed.
    if not isinstance(G2[i], ConstantInputNode):
        print(f"{G2[i].name:>30} ({i}): {q}")

# Per-output QSNR: each output is read back from the .bin file found in the
# target run directory and compared with the flattened NNTool result.
print("Out Name, QSNR NNTool vs Target run")
for outn in G2.output_nodes():
    target_dtype = G2.quantization[outn.name].in_qs[0].dtype
    out_target = np.fromfile(f"test_run_G2/{outn.name.capitalize()}.bin", target_dtype)
    out_nntool = G2_int_execution[G2[outn.name].step_idx]
    print(f"{outn.name}, {G2.qsnrs([out_target], [out_nntool[0].flatten()])}")

# Render both graphs to files for side-by-side inspection.
G1.draw("temp/grapG1")
G2.draw("temp/grapG2")


QSNR layer by layer (skipping constant layers like weights and biases):
                    Layer Name (  #): QSNR
                       input_1 (0): None
               input_1_resizer (1): -inf
                 input_1_qout0 (2): 74
            CONV_2D_0_2_fusion (5): 58
                DW_CONV_2D_0_6 (8): 54
                   CONV_2D_0_9 (11): 51
               ADD_0_10_fusion (12): 50
     PADDED_ADD_0_19_trans_in0 (13): 50
               DW_CONV_2D_0_14 (16): 51
                  CONV_2D_0_17 (19): 49
     PADDED_ADD_0_19_trans_in1 (20): 49
               PADDED_ADD_0_19 (21): 49
    PADDED_ADD_0_19_trans_out0 (22): 49
              MAX_POOL_2D_0_24 (23): 51
     PADDED_ADD_0_29_trans_in0 (24): 51
               DW_CONV_2D_0_23 (27): 54
                  CONV_2D_0_27 (30): 50
     PADDED_ADD_0_29_trans_in1 (31): 50
               PADDED_ADD_0_29 (32): 51
         DW_CONV_2D_0_33_trans (33): 51
               DW_CONV_2D_0_33 (36): 51
                  CONV_2D_0_36 (39): 48
     P

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/francesco/snap/code/common/.cache/gio-modules/libgiolibproxy.so


/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/francesco/snap/code/common/.cache/gio-modules/libgiolibproxy.so
Gtk-Message: 16:38:05.484: Failed to load module "gail"
Gtk-Message: 16:38:05.484: Failed to load module "gail"
Gtk-Message: 16:38:05.484: Failed to load module "atk-bridge"
Gtk-Message: 16:38:05.484: Failed to load module "atk-bridge"













































GTK+ 2.x symbols detected. Using GTK+ 2.x and GTK+ 3 in the same process is not supported.

GTK+ 2.x symbols detected. Using GTK+ 2.x and GTK+ 3 in the same process is not supported.
Gtk-Message: 16:38:05.536: Failed to load module "canberra-gtk-module"
Gtk-Message: 16:38:05.536: Failed to load module "canberra-gtk-module"
[TimeStamp]-----main-----begin
[TimeStamp]-----main-----begin
svn:  rb70e8df


##################################################
##############Foxit Reader S

In [61]:
import sys
# Print arrays in full, without numpy's summarisation.
np.set_printoptions(threshold=sys.maxsize)

# The first index of output_tensors selects the layer (step); [0] then takes
# the first entry recorded for that step.
# Example: print(G1_res.output_tensors[138][0])

# QSNR between matching steps of the two target runs, as (G1 step, G2 step).
# NOTE(review): the step indices are hard-coded - confirm they still point at
# corresponding layers if either graph changes.
for g1_step, g2_step in ((138, 161), (126, 151), (132, 156), (144, 166)):
    print(qsnr(G1_res.output_tensors[g1_step][0], G2_res.output_tensors[g2_step][0]))

52
56
49
53


In [50]:
# Dump the first entry recorded for step 161 of the G2 target run.
g2_step_161_outputs = G2_res.output_tensors[161]
print(g2_step_161_outputs[0])

[[ 3.0908e-01 -5.5713e-01  2.6141e+01  2.6141e+01 -1.3350e+00 -3.6895e+00
   3.0586e+00 -3.7832e+00  1.2754e+00  2.6172e-01  2.0703e-01  4.1875e+00
  -7.6055e+00 -2.3828e+00  5.7266e+00 -2.2109e+00]
 [ 8.0469e-01  4.8438e-01  3.9688e+01  3.9688e+01 -3.4824e+00 -5.1055e+00
   5.7930e+00 -4.6992e+00  5.9180e-01  1.3047e+00 -7.5781e-01  7.5625e+00
  -1.1312e+01 -3.9570e+00  1.0570e+01 -2.2422e+00]
 [ 6.6602e-01 -8.8428e-01  2.4281e+01  2.4281e+01 -1.4727e+00 -2.6348e+00
   4.7148e+00 -3.6484e+00  2.4414e+00  1.8145e+00  1.9482e+00  4.8828e+00
  -7.5820e+00 -2.8281e+00  8.9531e+00 -4.9570e+00]
 [ 6.0742e-01  5.3809e-01  3.6875e+01  3.6875e+01 -3.9531e+00 -2.9707e+00
   7.2734e+00 -4.2148e+00  2.4453e+00  3.4160e+00  1.9150e+00  8.3984e+00
  -1.1812e+01 -3.0977e+00  1.3742e+01 -5.3047e+00]
 [-7.8125e-03 -7.7686e-01  2.4422e+01  2.4422e+01 -3.1406e+00 -2.3281e+00
   3.0352e+00 -3.4766e+00  1.0840e-01  1.6172e+00  1.5710e-01  4.7539e+00
  -8.0000e+00 -2.4141e+00  9.0000e+00 -4.5664e+00]
 [-1.

SchedulePriorTracktype called
GetDntDataPriorQueue sql exec success
ScheduleTimerQuery called
GetDntDataTotal sql exec success
GetDntDataQueue sql exec success
