In [1]:
from qkeras import *
from tensorflow.keras.layers import Input, AveragePooling2D, Flatten, Softmax, Add, MaxPooling2D
import numpy as np
from collections import namedtuple
import pickle

c:\ProgramData\Miniconda3\envs\qkeras\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\ProgramData\Miniconda3\envs\qkeras\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
# Rebuild the quantized model from its JSON description, then restore the trained weights.
with open('../models/resnet_50fix.json', 'r') as f:
    model_json = f.read()

model = utils.quantized_model_from_json(model_json)
model.load_weights('../models/resnet50_81q.h5')

In [3]:
# Load the compile-time hardware parameters and expose them as an immutable namedtuple `c`.
# NOTE: unpickling executes arbitrary code; only load a pickle produced by a trusted run.
with open('../compile.pickle', 'rb') as f:
    compile_d = pickle.load(f)

c = namedtuple('Compile', list(compile_d))(**compile_d)

In [4]:
# Print every layer with its index so later cells can be cross-referenced by position.
for i, layer in enumerate(model.layers):
    print(f'{i} {layer}')

0 <keras.engine.input_layer.InputLayer object at 0x000001CF117C0AC0>
1 <qkeras.qlayers.QActivation object at 0x000001CF117C1450>
2 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm object at 0x000001CF117C1990>
3 <qkeras.qlayers.QActivation object at 0x000001CF117C28F0>
4 <keras.layers.pooling.max_pooling2d.MaxPooling2D object at 0x000001CF117C2950>
5 <qkeras.qlayers.QActivation object at 0x000001CF117C2F50>
6 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm object at 0x000001CF117C34F0>
7 <qkeras.qlayers.QActivation object at 0x000001CF118F83A0>
8 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm object at 0x000001CF117C3E80>
9 <qkeras.qlayers.QActivation object at 0x000001CF118F97B0>
10 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm object at 0x000001CF118F9810>
11 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm object at 0x000001CF118FAB30>
12 <keras.layers.merging.add.Add object at 0x000001CF118FBCD0>
13 <qkeras.qlayers.QActivation object at 0x000001CF118FBF10>
14 <qkeras.qconv2d_batchnorm.QConv2DBatchnorm obj

## Quantization Validation

- Pass random input
- Record intermediate outputs
- Scale the input, output and kernel using given quantizers, assert & save as integer
- Chain layers

In [5]:
XN = c.ROWS # batch size same as ROWS

# Seed the generator so that Restart & Run All reproduces the same input and
# therefore the same intermediate tensors recorded by the cells below.
np.random.seed(0)
x = np.random.randn(c.ROWS, *model.input.shape[1:])
x.shape

(8, 32, 32, 3)

In [6]:
# For every layer after the input layer:
#   1. run a sub-model ending at that layer and store its float output in layer.y
#   2. record neighbouring layers in layer.prev / layer.next
#   3. derive the fixed-point format of the output (layer.y_frac, layer.y_bits)
#   4. scale the output by 2**y_frac, assert it is integral and store it in layer.y_int
for i, layer in enumerate(model.layers[1:]):
    print(i+1, layer.name)
    '''
    Get intermediate output
    '''
    # A fresh Model is built per layer; it shares the layers of `model`.
    temp_model = Model(inputs=model.input, outputs=layer.output)
    y = temp_model(x, training=False).numpy()
    layer.y = y

    '''
    Get inputs & outputs
    '''
    # Normalise to a list so single-input and multi-input layers are handled alike.
    layer_input = layer.input if isinstance(layer.input, list) else [layer.input]
    layer.prev = [t.node.layer for t in layer_input]

    # NOTE(review): layer_output is computed but not used below.
    layer_output = layer.output if isinstance(layer.output, list) else [layer.output]
    layer.next = [n.layer for n in layer.outbound_nodes]


    '''
    Scale it to integer
    '''
    if isinstance(layer, QActivation):
        d = layer.quantizer.get_config()

        # Sign bit: taken from 'keep_negative' when the config has it, otherwise
        # a non-zero 'negative_slope' counts as signed; 0 if neither key is present.
        sign_bit = d['keep_negative'] if 'keep_negative' in d else (d['negative_slope'] !=0 if 'negative_slope' in d else (0))
        int_bit = d['integer'] if 'integer' in d else 0
        # Fractional bits are what remains after the integer and sign bits.
        frac = d['bits']-int_bit-sign_bit
        layer.y_frac = frac
        layer.y_bits = d['bits']

    elif isinstance(layer, QDense) or isinstance(layer, QConv2D) or isinstance(layer, QConv2DBatchnorm):
        '''
        Kernel
        '''
        # For batchnorm-fused convolutions the folded weights are used instead of the raw kernel.
        k = layer.get_folded_weights()[0] if isinstance(layer, QConv2DBatchnorm) else layer.kernel
        k = layer.kernel_quantizer_internal(k).numpy()
        k_config = layer.kernel_quantizer_internal.get_config()
        k_frac = k_config['bits']-k_config['integer']-k_config['keep_negative']
        k_int = k * 2**k_frac
        # The scaled kernel must be exactly integral, otherwise k_frac is wrong.
        assert (k_int == k_int.astype(int)).all()
        k_int = k_int.astype(int)
        layer.k_int, layer.k_frac, layer.k_bits = k_int, k_frac, k_config['bits']

        '''
        Bias
        '''
        if layer.bias is not None:
            b = layer.get_folded_weights()[1] if isinstance(layer, QConv2DBatchnorm) else layer.bias
            b = layer.bias_quantizer_internal(b).numpy()
            b_config = layer.bias_quantizer_internal.get_config()
            b_frac = b_config['bits']-b_config['integer']-b_config['keep_negative']
            b_int = b * 2**b_frac
            assert (b_int == b_int.astype(int)).all()
            b_int = b_int.astype(int)
            layer.b_int, layer.b_frac, layer.b_bits = b_int, b_frac, b_config['bits']
        else:
            layer.b_int, layer.b_frac, layer.b_bits = None, None, None

        '''
        Outputs
        '''
        # Product of two fixed-point numbers: fractional bits add up.
        x_frac = layer.prev[0].y_frac
        y_frac = x_frac + k_frac
        layer.y_frac = y_frac

        # Number of products accumulated per output = product of all kernel dims except the last.
        adds = np.prod(np.array(layer.kernel.shape[:-1]))
        layer.y_bits = int(layer.k_bits + layer.prev[0].y_bits + np.ceil(np.log2(adds)))

    elif isinstance(layer, InputLayer):
        pass
    else:
        def all_same(items):
            return len(set(items)) < 2
        
        # Any other layer keeps the format of its inputs, which must all agree;
        # an Add layer gets one extra bit.
        assert all_same([l.y_frac for l in layer.prev])
        layer.y_frac = layer.prev[0].y_frac
        layer.y_bits = layer.prev[0].y_bits + 1 if isinstance(layer, Add) else layer.prev[0].y_bits
    
    '''
    Calculate and store y_int
    '''
    if not (isinstance(layer, Activation) or isinstance(layer, AveragePooling2D)): # skip Keras Activation
        y_int = y * 2** layer.y_frac
        # On failure the assertion message is the name of the offending layer.
        assert (y_int == y_int.astype(int)).all(), layer.name
        y_int = y_int.astype(int)
        layer.y_int = y_int


1 q_activation
2 q_conv2d_batchnorm
3 q_activation_1
4 max_pooling2d
5 q_activation_2
6 q_conv2d_batchnorm_2
7 q_activation_3
8 q_conv2d_batchnorm_3
9 q_activation_4
10 q_conv2d_batchnorm_4
11 q_conv2d_batchnorm_1
12 add
13 q_activation_5
14 q_conv2d_batchnorm_5
15 q_activation_6
16 q_conv2d_batchnorm_6
17 q_activation_7
18 q_conv2d_batchnorm_7
19 q_activation_8
20 add_1
21 q_activation_9
22 q_conv2d_batchnorm_8
23 q_activation_10
24 q_conv2d_batchnorm_9
25 q_activation_11
26 q_conv2d_batchnorm_10
27 q_activation_12
28 add_2
29 q_activation_13
30 q_conv2d_batchnorm_12
31 q_activation_14
32 q_conv2d_batchnorm_13
33 q_activation_15
34 q_conv2d_batchnorm_14
35 q_conv2d_batchnorm_11
36 add_3
37 q_activation_16
38 q_conv2d_batchnorm_15
39 q_activation_17
40 q_conv2d_batchnorm_16
41 q_activation_18
42 q_conv2d_batchnorm_17
43 q_activation_19
44 add_4
45 q_activation_20
46 q_conv2d_batchnorm_18
47 q_activation_21
48 q_conv2d_batchnorm_19
49 q_activation_22
50 q_conv2d_batchnorm_20
51 q_activa

## Bundling

Group the layers into a list of dicts, to be made into bundles

In [7]:
# Walk the model and group each conv/dense layer with the layers that follow it
# (up to, but excluding, the next conv/dense layer) into one dict per bundle.
# The dicts are later expanded into Bundle(**qb, bundles=...) calls.
q_bundles = [] # (conv_dense, act, (add_input_bundle, add_act), maxpool)
# Maps the output-tensor name of an Add layer to the index of the bundle that reached it first.
q_adds = {}

# Index of the bundle currently being built; incremented once per conv/dense layer.
i = -1
for layer in model.layers:
    if isinstance(layer, QDense) or isinstance(layer, QConv2D) or isinstance(layer, QConv2DBatchnorm):

        bundle = {
            'type':'dense' if isinstance(layer, QDense) else 'conv', 
            'strides': None, 'add_bundle_i': None, 
            'flatten': None, 'softmax': None, 'last_layer_name': None, 'prev_layer_name': layer.prev[0].name,
            'quant_details': None, 'act_details': None, 'pool_details': None,
            }

        # Each tensor entry is [integer array, total bits, fractional bits].
        bundle['x'] = [layer.prev[0].y_int, layer.prev[0].y_bits, layer.prev[0].y_frac]
        bundle['w'] = [layer.k_int        , layer.k_bits        , layer.k_frac        ]
        bundle['b'] = [layer.b_int        , layer.b_bits        , layer.b_frac        ]
        bundle['y'] = [layer.y_int        , layer.y_bits        , layer.y_frac        ]  

        # Strides are only recorded when they differ from (1,1).
        if hasattr(layer, 'strides') and not np.all(layer.strides == (1,1)):
            bundle['strides'] = tuple(layer.strides)

        i+=1
        n_layer = layer
        next_layers = layer.next
        # Follow the chain while there is exactly one successor and it is not a conv/dense layer.
        while len(next_layers) == 1 and not (isinstance(next_layers[0], QDense) or isinstance(next_layers[0], QConv2D) or isinstance(next_layers[0], QConv2DBatchnorm)):
            
            prev_layer = n_layer
            n_layer = next_layers[0]

            if isinstance(n_layer, QActivation):
                # quantized_bits is recorded as a re-quantization step, anything else must be a relu variant.
                if isinstance(n_layer.quantizer, quantized_bits):
                    bundle['quant_details'] = {'bits': n_layer.y_bits, 'frac': n_layer.y_frac}
                else:
                    if 'relu' in str(n_layer.quantizer.__class__): 
                        bundle['act_details'] = {'type': 'relu', 'slope': n_layer.quantizer.negative_slope, 'bits': n_layer.y_bits, 'frac': n_layer.y_frac}
                    else:
                        raise Exception(n_layer.name, n_layer.quantizer.__class__, 'Only relu is supported yet')

            elif isinstance(n_layer, Add):
                key = n_layer.output.name

                def chain_bundle(j):
                    # Record bundle j as the residual input; the Add must be followed by a
                    # QActivation and no activation may have been recorded before the Add.
                    bundle['add_bundle_i'] = j
                    assert isinstance(n_layer.next[0], QActivation)
                    assert bundle['act_details'] is None

                if key in q_adds:
                    chain_bundle(q_adds[key])

                else: # met Add layer first time
                    '''
                    Check if other input of Add layer belongs to previously created bundle
                    '''
                    found = False
                    for add_prev in n_layer.prev:
                        if add_prev.name != prev_layer.name: # skip immediate above layer
                            for j, qb in enumerate(q_bundles):
                                if qb['last_layer_name'] == add_prev.name:
                                    chain_bundle(j)
                                    found = True
                    if not found:
                        # The other branch is not bundled yet: remember this bundle for the Add
                        # and end the current bundle at the layer before the Add.
                        q_adds[key] = i
                        n_layer = prev_layer
                        break

            elif isinstance(n_layer, MaxPooling2D):
                bundle['pool_details'] = {'type': 'max', 'size':tuple(n_layer.pool_size), 'strides':tuple(n_layer.strides), 'bits': n_layer.y_bits, 'frac': n_layer.y_frac}
                # A QActivation directly after the pooling is stepped over and becomes the current layer.
                if isinstance(n_layer.next[0], QActivation):
                    next_layers = next_layers[0].next
                    prev_layer = n_layer
                    n_layer = next_layers[0]

            elif isinstance(n_layer, QAveragePooling2D):
                assert isinstance(n_layer.next[0], QActivation), "Quantized_bits should follow AveragePooling"
                bundle['pool_details'] = {'type': 'avg', 'size':tuple(n_layer.pool_size), 'strides':tuple(n_layer.strides), 'bits': n_layer.y_bits, 'frac': n_layer.y_frac}
                # Step over the mandatory QActivation that follows the average pooling.
                next_layers = next_layers[0].next
                prev_layer = n_layer
                n_layer = next_layers[0]

            elif isinstance(n_layer, Flatten):
                # The Flatten layer object itself is stored (used as a truthy flag by Bundle).
                bundle['flatten'] = n_layer

            elif isinstance(n_layer, Activation):
                if n_layer.activation.__name__ == 'softmax':
                    bundle['softmax'] = True
                else:
                    raise Exception('Only softmax is supported among non-quantized activations')

            else:
                print(n_layer.name, 'was not added to bundle')

            # Advance to the successors of the layer that was just consumed.
            next_layers = next_layers[0].next

        # The bundle output is the (float) output and format of the last layer absorbed into it.
        bundle['last_layer_name'] = (n_layer if n_layer else layer).name
        bundle['o_arr'          ] = (n_layer if n_layer else layer).y
        bundle['o_frac'         ] = (n_layer if n_layer else layer).y_frac
        bundle['o_bits'         ] = (n_layer if n_layer else layer).y_bits
        q_bundles += [bundle]
        print(i, bundle['last_layer_name'])



0 q_activation_2
1 q_activation_3
2 q_activation_4
3 q_conv2d_batchnorm_4
4 q_activation_5
5 q_activation_6
6 q_activation_7
7 q_activation_9
8 q_activation_10
9 q_activation_11
10 q_activation_13
11 q_activation_14
12 q_activation_15
13 q_conv2d_batchnorm_14
14 q_activation_16
15 q_activation_17
16 q_activation_18
17 q_activation_20
18 q_activation_21
19 q_activation_22
20 q_activation_24
21 q_activation_25
22 q_activation_26
23 q_activation_28
24 q_activation_29
25 q_activation_30
26 q_conv2d_batchnorm_27
27 q_activation_31
28 q_activation_32
29 q_activation_33
30 q_activation_35
31 q_activation_36
32 q_activation_37
33 q_activation_39
34 q_activation_40
35 q_activation_41
36 q_activation_43
37 q_activation_44
38 q_activation_45
39 q_activation_47
40 q_activation_48
41 q_activation_49
42 q_activation_51
43 q_activation_52
44 q_activation_53
45 q_conv2d_batchnorm_46
46 q_activation_54
47 q_activation_55
48 q_activation_56
49 q_activation_58
50 q_activation_59
51 q_activation_60
52 fla

## Bundle

In [8]:
import numpy as np
from collections import namedtuple

class Bundle:
    """A conv/dense layer together with the post-processing fused after it.

    Each of ``x`` (input), ``w`` (weights), ``b`` (bias) and ``y`` (raw layer
    output) is a three-element list ``[int_array, bits, frac]``.  The raw
    output of the layer is post-processed in this fixed order: bias add,
    striding, re-quantization, residual add, ReLU, pooling, flatten, softmax.
    """

    def __init__(self, type, strides, add_bundle_i, flatten, softmax, bundles, last_layer_name, prev_layer_name, x, w, b, y, quant_details, act_details, pool_details, o_arr, o_bits, o_frac):
        """Store the bundle description and derive its shape parameters.

        Args:
            type: 'conv' or anything else (treated as dense).
            strides: (SH, SW) tuple, or None for unit strides.
            add_bundle_i: index into ``bundles`` of the residual input, or None.
            flatten: truthy if the output is flattened to 2-D.
            softmax: truthy if a softmax ends the bundle.
            bundles: the bundles created so far (searched for the predecessor).
            last_layer_name: name of the last layer absorbed into this bundle.
            prev_layer_name: name of the layer feeding this bundle.
            x, w, b, y: ``[array, bits, frac]`` lists (``b[0]`` may be None).
            quant_details: {'bits', 'frac'} of a re-quantization step, or None.
            act_details: {'type', 'slope', 'bits', 'frac'} of a ReLU, or None.
            pool_details: {'type', 'size', 'strides', 'bits', 'frac'}, or None.
            o_arr, o_bits, o_frac: expected bundle output and its format.
        """
        self.type = type
        self.last_layer_name = last_layer_name
        self.softmax = softmax
        self.strides = strides

        # Find the bundle whose last layer feeds this one (the last match wins).
        self.prev_bundle_i, self.prev_bundle = None, None
        for i, bundle in enumerate(bundles):
            if bundle.last_layer_name == prev_layer_name:
                self.prev_bundle_i, self.prev_bundle = i, bundle

        # Compare against None explicitly: index 0 is a valid residual source.
        self.add_bundle = bundles[add_bundle_i] if add_bundle_i is not None else None
        self.flatten = flatten

        self.x = x
        self.w = w
        self.b = b
        self.y = y
        self.f = flatten
        self.quant_details = quant_details
        self.act_details = act_details
        self.pool_details = pool_details

        # Bundle output: a softmax output is recorded with bits=1, frac=0.
        if softmax:
            self.o_arr, self.o_bits, self.o_frac = o_arr, 1, 0
        else:
            self.o_arr, self.o_bits, self.o_frac = o_arr, o_bits, o_frac

        if self.type == 'conv':
            self.KH, self.KW, self.CI, self.CO = self.w[0].shape
            self.XN, self.XH, self.XW, self.CI = self.x[0].shape
            self.XN, self.YH, self.YW, _       = self.y[0].shape
            self.SH                            = self.XH//self.YH
            self.SW                            = self.XW//self.YW
            self.RAM_WEIGHTS                   = self.KH*self.CI
            # NOTE(review): np.ceil of an integer floor-division is a no-op and
            # `KH != 0` is always true for a real kernel -- confirm the intended formula.
            self.RAM_EDGES                     = self.CI* self.XW* int(np.ceil(self.XH//self.XN-1)) if self.KH != 0 else 0
        else:
            self.CI, self.CO = self.w[0].shape
            self.XH, self.CI = self.x[0].shape
            self.SH = self.SW = self.XN = self.KH = self.KW = self.XW = self.YW = 1
            self.YH = self.XH
            self.RAM_WEIGHTS = 0 #self.KH*self.CI # need to update
            self.RAM_EDGES = 0

    def process(self, function, x_arr):
        """Apply ``function(x_arr, weights)`` and post-process its result.

        Args:
            function: callable computing the raw layer output from input and weights.
            x_arr: integer input array.

        Returns:
            The post-processed bundle output (also stored in ``self.out``).
        """
        out_arr = function(x_arr, self.w[0])
        return self.post_process(out_arr)


    def post_process(self, out_arr):
        """Run the fused post-processing steps on a raw layer output.

        ``out_arr`` is updated in place by the bias and residual additions.
        The result is stored in ``self.out`` and returned.
        """

        def quantize(x, bits, frac):
            """Drop ``frac`` fractional bits, round and saturate to ``bits`` signed bits."""
            x = x.astype(np.float32)
            x /= 2 ** frac
            x = np.around(x)
            x = np.clip(x, -2**(bits-1), 2**(bits-1)-1)
            x = x.astype(int)
            return x

        # Fractional bits of a product are the sum of the operands' fractional bits.
        out_frac = self.x[2] + self.w[2]

        if self.b[0] is not None:
            b_arr, b_bits, b_frac = self.b
            # Align the bias to the accumulator's binary point before adding.
            out_arr += b_arr * 2** (out_frac - b_frac)

        if self.strides:
            SH, SW = self.strides
            N, XH, XW, C = out_arr.shape
            YH, YW = XH//SH, XW//SW
            # Keep the last element of every SH x SW window.
            out_arr = out_arr.reshape(N, YH, SH, YW, SW, C)
            out_arr = out_arr[:,:,-1,:,-1,:]

        if self.quant_details:
            q_frac = self.quant_details['frac']
            out_arr = quantize(x=out_arr, bits=self.quant_details['bits'], frac=out_frac-q_frac)
            # After dropping (out_frac - q_frac) bits the value has q_frac fractional bits.
            out_frac = q_frac

        if self.add_bundle:
            # The residual operand is the most recent output of the add bundle.
            a_arr, a_bits, a_frac = self.add_bundle.out, self.add_bundle.o_bits, self.add_bundle.o_frac
            out_arr += a_arr * 2** (out_frac - a_frac)

        if self.act_details:
            frac, bits = self.act_details['frac'], self.act_details['bits']

            if self.act_details['type'] == 'relu':
                out_arr = out_arr/2**(out_frac-frac)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1)

                # Leaky ReLU: negative values are scaled by `slope`.
                out_arr = np.maximum(out_arr * self.act_details['slope'], out_arr)
                out_arr = np.around(out_arr)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1).astype(int)

                out_frac = frac

            else:
                raise Exception('Only relu is supported yet')

        if self.pool_details:
            if self.pool_details['type'] == 'max':
                # NOTE(review): window 3x3, stride 2 and one trailing pad row/column are
                # hard-coded; pool_details['size'] / ['strides'] are not consulted.
                stride, window, pad = 2, 3, 1
                N, H, W, C = out_arr.shape
                pooled = np.zeros((N, (H+pad)//stride, (W+pad)//stride, C))
                for i in range(pooled.shape[1]):
                    for j in range(pooled.shape[2]):
                        p, q = i*stride, j*stride
                        # Slicing clips the window at the bottom/right border.
                        pooled[:,i,j,:] = out_arr[:, p:p+window, q:q+window, :].max(axis=(1,2))
                out_arr = pooled.astype(int)

            elif self.pool_details['type'] == 'avg':
                assert self.pool_details['size'] == self.pool_details['strides']
                KH, KW = self.pool_details['size']
                N, H, W, C = out_arr.shape
                out_arr = out_arr.reshape(N, H//KH, KH, W//KW, KW, C).mean(axis=(2,4))

                bits = self.pool_details['bits']
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1)
                out_arr = np.around(out_arr)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1).astype(int)

        if self.flatten:
            out_arr = out_arr.reshape(out_arr.shape[0],-1)

        if self.softmax:
            out_arr = out_arr / 2**out_frac
            # Normalise every sample on its own: per-row max for stability, per-row sum.
            exp = np.exp(out_arr - out_arr.max(axis=-1, keepdims=True))
            out_arr = exp / exp.sum(axis=-1, keepdims=True)

        self.out = out_arr
        return out_arr


    @staticmethod
    def get_compile_params(bundles, ROWS, COLS):
        """Derive compile-time hardware parameters covering all ``bundles``.

        Args:
            bundles: objects exposing x, KW, KH, SW, SH, CI, XW, XH, XN,
                RAM_WEIGHTS and RAM_EDGES.
            ROWS, COLS: dimensions of the processing array.

        Returns:
            A 'Compile' namedtuple holding every local of this function except
            ``bundles`` and the ``clog2`` helper.
        """

        def clog2(x):
            return int(np.ceil(np.log2(x)))

        IN_BITS               = 64
        CONFIG_BEATS          = 1
        X_BITS = K_BITS       = max([b.x[1] for b in bundles])
        KW_MAX                = max([b.KW   for b in bundles])
        KH_MAX                = max([b.KH   for b in bundles])
        SW_MAX                = max([b.SW   for b in bundles])
        SH_MAX                = max([b.SH   for b in bundles])
        CI_MAX                = max([b.CI   for b in bundles])
        XW_MAX                = max([b.XW   for b in bundles])
        XH_MAX                = max([b.XH   for b in bundles])
        XN_MAX                = max([b.XN   for b in bundles])
        BRAM_WEIGHTS_DEPTH    = max([b.RAM_WEIGHTS + CONFIG_BEATS for b in bundles])
        RAM_EDGES_DEPTH       = max([b.RAM_EDGES                  for b in bundles])

        # L_MAX is a block count and X_PAD a row count (reorder_x_q2e_conv uses
        # c.X_PAD as a number of rows), so neither is a bit width.
        L_MAX                 = int(np.ceil(XH_MAX/ROWS))
        X_PAD                 = KH_MAX//2
        BITS_KW2              = clog2((KW_MAX+1)/2)
        BITS_KH2              = clog2((KH_MAX+1)/2)
        BITS_SW               = clog2(SW_MAX)
        BITS_SH               = clog2(SH_MAX)
        BITS_CIN_MAX          = clog2(CI_MAX)
        BITS_COLS_MAX         = clog2(XW_MAX)
        BITS_BLOCKS_MAX       = clog2( L_MAX)
        BITS_XN_MAX           = clog2(XN_MAX)
        BITS_BRAM_WEIGHTS_ADDR= clog2(BRAM_WEIGHTS_DEPTH)

        # Every local defined above becomes a field; do not add scratch locals here.
        params = locals()
        params = {k:params[k] for k in params if not ('__' in k or k in ['bundles', 'params', 'clog2'])}
        c = namedtuple('Compile', params)(**params)
        return c

    def export (self):
        """Compute runtime parameters, headers and the reordered weights.

        Reads the notebook-level global ``c`` (compile configuration).  Dense
        bundles have their ``w``, ``x`` and ``y`` arrays reshaped in place to
        4-D so they can be handled like a 1x1 convolution.
        """

        # The ndim guard makes a second export() call on a dense bundle harmless.
        if self.type != 'conv' and self.w[0].ndim == 2:
            print('Conv -> Dense Reshape')
            CI, CO = self.w[0].shape
            XN, _ = self.x[0].shape
            self.w[0] = self.w[0].reshape(1,1,CI,CO) # (CI,CO) -> (KH,KW,CI,CO)
            self.x[0] = self.x[0].reshape(XN,1,1,CI) # (XN,CI) -> (XN, XH, XW, CI)
            self.y[0] = self.y[0].reshape(XN,1,1,CO) # (XN,CO) -> (XN, YH, YW, CO)

        self.c = c
        self.r = self.get_runtime_params(self.c, self.w[0], self.x[0], self.y[0])
        self.r = self.create_headers(self.c, self.r)

        print(self.r)
        self.check_sparsity(self.w[0], self.x[0])

        self.we = self.reorder_w_q2e_conv(self.w[0], self.c, self.r)
        self.ye_exp_shape = (self.r.IT, self.r.XN, self.r.L, self.r.XW*self.r.CO_PRL, c.ROWS)
        self.ye_hw = np.zeros(self.ye_exp_shape)
        self.num_t = self.we.shape[0] # iterations

        # Stored as plain dicts from here on.
        self.r = self.r._asdict()
        self.c = self.c._asdict()

    @staticmethod
    def get_runtime_params(c, w, x, y):
        """Derive per-bundle runtime parameters from the 4-D tensors.

        Args:
            c: compile configuration exposing COLS, ROWS and CONFIG_BEATS.
            w: weights of shape (KH, KW, CI, CO).
            x: input of shape (XN, XH, XW, CI).
            y: output; only its height and width are read.

        Returns:
            A 'Runtime' namedtuple holding every local of this function except
            ``w``, ``x``, ``y`` and ``c``.
        """

        SW = SH = 1 # for bundle
        KH, KW, CI, CO = w.shape
        print('weights initial (KH, KW, CI, CO) =', w.shape)

        CO_PRL         = c.COLS * SW // KW                        # SW cols are processed in parallel
        EG             = int(np.floor( c.COLS / (KW + SW - 1)))   # elastic groups
        IT             = int(np.ceil( CO / (SW*EG)))              # iterations needed
        CO_PAD         = IT * CO_PRL                              # output cols padded

        print(f'{KH=}, {KW=}, {CI=}, {CO=}, {CO_PRL=}, {EG=}, {IT=}, {CO_PAD}')

        XN, XH, XW, CI = x.shape
        print('initial (XN, XH, XW, CI)=', x.shape)
        SH_OUT, SW_OUT = x.shape[1]//y.shape[1], x.shape[2]//y.shape[2]

        LH     = c.ROWS*SH              # Block height
        L      = int(np.ceil(XH/LH))    # Blocks
        XH_PAD = LH*L
        BRAM_WEIGHTS_ADDR_MAX  = c.CONFIG_BEATS + SW*KH*CI-1

        # Pack all local variables into a namedtuple (no scratch locals above).
        params = locals()
        params = {k:params[k] for k in params if not ('__' in k or k in ['w', 'x', 'y', 'c', 'params'])}
        print (params)
        r = namedtuple('Runtime', params)(**params)
        return r


    @staticmethod
    def create_headers(c, r):
        """Build the weight and input configuration headers.

        Fields are packed LSB-first into an ``IN_BITS``-wide integer which is
        then split into words, least-significant word first.

        Returns:
            A 'Runtime' namedtuple with the fields of ``r`` plus ``w_config``
            and ``x_config`` (binary strings), ``w_config_words`` (int8 array
            of shape (IT, words)) and ``x_config_words`` (list of ints).
        """
        def pack_bits(arr):
            sum_width = 0
            packed = 0
            for val, width in arr:
                packed |= val << sum_width
                sum_width += width
            return packed

        def split_words(value, word_bits):
            # Shift/mask on the integer itself; slicing the '0b...' string would
            # let the two prefix characters displace the top two bits.
            n_words = -(-c.IN_BITS // word_bits)
            mask = (1 << word_bits) - 1
            return [(value >> (word_bits*i)) & mask for i in range(n_words)]

        # Both headers must fit into IN_BITS.
        assert c.IN_BITS >= c.BITS_KW2 + c.BITS_CIN_MAX + c.BITS_COLS_MAX + c.BITS_BLOCKS_MAX + c.BITS_XN_MAX + c.BITS_BRAM_WEIGHTS_ADDR
        assert c.IN_BITS >= c.BITS_KH2 + c.BITS_CIN_MAX + c.BITS_COLS_MAX + c.BITS_BLOCKS_MAX

        # Weights config
        w_config_int = pack_bits([
            (r.KW//2, c.BITS_KW2),
            (r.CI-1 , c.BITS_CIN_MAX),
            (r.XW-1 , c.BITS_COLS_MAX),
            (r.L -1 , c.BITS_BLOCKS_MAX),
            (r.XN-1 , c.BITS_XN_MAX),
            (r.BRAM_WEIGHTS_ADDR_MAX, c.BITS_BRAM_WEIGHTS_ADDR)
        ])
        w_config = format(w_config_int, f'#0{c.IN_BITS}b')
        # Go through int64 so values above 127 wrap to int8 on every NumPy version.
        w_config_words = np.array(split_words(w_config_int, c.K_BITS), dtype=np.int64).astype(np.int8)
        w_config_words = np.repeat(w_config_words[np.newaxis,...],repeats=r.IT,axis=0)

        # Input config
        x_config_int = pack_bits([
            (r.KH//2, c.BITS_KH2),
            (r.CI-1 , c.BITS_CIN_MAX),
            (r.XW-1 , c.BITS_COLS_MAX),
            (r.L -1 , c.BITS_BLOCKS_MAX),
        ])
        x_config = format(x_config_int, f'#0{c.IN_BITS}b')
        x_config_words = split_words(x_config_int, c.X_BITS)

        d = {'w_config':w_config, 'w_config_words':w_config_words, 'x_config':x_config, 'x_config_words': x_config_words}
        n = namedtuple('Runtime', d)(**d)
        r = namedtuple("Runtime", r._fields + n._fields)(*(r + n))
        return r


    @staticmethod
    def check_sparsity(w, x):
        """Print the fraction of zeros in ``w`` and ``x`` and the implied product statistics.

        The combined figures treat the zero patterns of ``w`` and ``x`` as independent.
        """
        w_sparse = (w==0).sum()/w.size
        x_sparse = (x==0).sum()/x.size

        p_both_zero = x_sparse * w_sparse
        p_only_one_zero = (1-x_sparse) * w_sparse  +  (1-w_sparse) * x_sparse
        p_neither_zero = (1-x_sparse) * (1-w_sparse)
        zero_result = 1-p_neither_zero

        print(f'''
        w_sparsity   : {w_sparse*100:.2f}%
        x_sparsity   : {x_sparse*100:.2f}%

        both_zero    : {p_both_zero*100:.2f}%
        only_one_zero: {p_only_one_zero*100:.2f}%
        neither_zero : {p_neither_zero*100:.2f}%
        zero_result  : {zero_result*100:.2f}%
        ''')


    @staticmethod
    def reorder_w_q2e_conv(w, c, r):
        """Reorder (KH, KW, CI, CO) weights into one flat row per iteration.

        Each row starts with the weight config words, followed by
        ``CONFIG_BEATS`` zero beats and the weights laid out as (CI*KH, COLS).
        """

        w = np.pad(w, ((0,0),(0,0),(0,0),(0,r.CO_PAD-r.CO)))        # (KH, KW, CI, CO_PAD)
        print(w.shape, (r.KH, r.KW, r.CI, r.IT, r.CO_PRL))
        w = w.reshape(r.KH, r.KW, r.CI, r.IT, r.CO_PRL)             # (KH, KW, CI, IT, CO_PRL)
        w = np.flip(w, axis=4)
        w = w.transpose(0,2,3,4,1)                                  # (KH, CI, IT, CO_PRL, KW)

        w = w.reshape  (r.KH, r.CI, r.IT, r.CO_PRL*r.KW)            # (KH, CI, IT, CO_PRL*KW)
        w = np.pad(w, ((0,0),(0,0),(0,0),(0,c.COLS-r.CO_PRL*r.KW))) # (KH, CI, IT, c.COLS)
        w = w.transpose(2,1,0,3)                                    # (IT, CI, KH, c.COLS)
        w = w.reshape (r.IT, r.CI*r.KH, c.COLS)                     # (IT, CI*KH, c.COLS)

        w = np.pad(w, ((0,0),(c.CONFIG_BEATS,0),(0,0)))             # (IT, c.CONFIG_BEATS+CI*KH, c.COLS)
        w = w.reshape (r.IT, (r.CI*r.KH+c.CONFIG_BEATS)*c.COLS)     # (IT, (CI*KH+c.CONFIG_BEATS)*c.COLS)

        w = np.concatenate([r.w_config_words, w], axis=1)           # (IT, words + (CI*KH+c.CONFIG_BEATS)*c.COLS)
        # Integer division keeps the expected shape a tuple of ints.
        assert w.shape == (r.IT, c.IN_BITS//c.K_BITS + (r.CI*r.KH+c.CONFIG_BEATS)*c.COLS)
        return w


    @staticmethod
    def reorder_x_q2e_conv(x, c, r):
        """Reorder an (XN, XH, XW, CI) input into a flat stream with header.

        Rows are split into blocks of ``ROWS``; each block is extended by
        ``X_PAD`` rows taken from the start of the next block (zeros for the
        last block).  The input config words are prepended.
        """
        print('input initial (XN, XH, XW, CI)=', x.shape)

        x = np.pad(x, ((0,0),(0,r.XH_PAD-r.XH),(0,0),(0,0)))   # (XN, L*HL , XW, CI)
        x = x.reshape  (r.XN, r.L, r.LH, r.XW, r.CI)               # (XN, L, HL, XW, CI)

        zeros = np.zeros((r.XN,r.L,c.ROWS+c.X_PAD,r.XW,r.CI),x.dtype)  # (XN,L,c.ROWS+X_PAD,XW,CI)
        zeros[:,:,:c.ROWS,:,:] = x

        # Fill the bottom rows of each block from the next block
        for l in range(r.L):
            if l == r.L-1:
                zeros[:,l, c.ROWS: ,:,:] = np.zeros((r.XN,c.X_PAD,r.XW,r.CI),x.dtype)
            else:
                zeros[:,l, c.ROWS: ,:,:] = x[:,l+1,:c.X_PAD,:,:]

        x = zeros                  # (XN,L,c.ROWS+X_PAD,XW,CI)
        x = x.transpose(0,1,3,4,2) # (XN,L,XW,CI,c.ROWS+X_PAD)

        x = x.reshape((r.XN*r.L*r.XW*r.CI*(c.ROWS+c.X_PAD)))
        x = np.concatenate([np.array(r.x_config_words, dtype=np.uint8), x.flatten()])
        assert x.shape == (c.IN_BITS//c.X_BITS + r.XN*r.L*r.XW*r.CI*(c.ROWS+c.X_PAD),)
        return x


    @staticmethod
    def reorder_y_q2e_conv(y, c, r):
        """Reorder an (XN, XH, XW, CO) output to (IT, XN, L, XW*CO_PRL, ROWS).

        The last ``KW//2+1`` columns of each block are stored with their
        column and CO_PRL axes swapped.  Returns None (after printing a
        message) when the bundle has an output stride.
        """
        if r.SH_OUT != 1:
            print("Striding not yet supported")
            return None

        y = np.pad(y, ((0,0),(0,r.LH*r.L-r.XH),(0,0),(0,r.CO_PAD-r.CO)))     # (XN, L*HL , XW, CO_PAD)
        y = y.reshape((r.XN, r.L, c.ROWS, r.XW, r.CO_PAD))                   # (XN,L,c.ROWS,XW,CO_PAD)
        y = y.reshape((r.XN, r.L, c.ROWS, r.XW, r.IT, r.CO_PRL))             # (XN,L,c.ROWS,XW,IT,CO_PRL)
        y = y.transpose(4,0,1,3,5,2)                                         # (IT,XN,L,XW,CO_PRL,c.ROWS)

        assert y.shape == (r.IT,r.XN,r.L,r.XW,r.CO_PRL,c.ROWS)

        y_w_last = y[:,:,:,-(r.KW//2+1):,:,:]
        y_w_last = y_w_last.transpose(0,1,2,4,3,5).reshape(r.IT,r.XN,r.L,(r.KW//2+1)*r.CO_PRL,c.ROWS)

        y = y.reshape(r.IT,r.XN,r.L,r.XW*r.CO_PRL,c.ROWS)
        y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:] = y_w_last
        return y

    @staticmethod
    def reorder_y_e2q_conv(y, c, r):
        """Inverse of ``reorder_y_q2e_conv``: back to (XN, XH, XW, CO).

        NOTE(review): ``y.reshape`` may return a view, in which case the
        in-place fix-up of the last columns also modifies the caller's array.
        """
        y = y.reshape(r.IT,r.XN,r.L,r.XW*r.CO_PRL,c.ROWS)

        # Undo the axis swap applied to the last KW//2+1 columns.
        y_w_last = y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:]
        y_w_last = y_w_last.reshape(r.IT,r.XN,r.L,r.CO_PRL,(r.KW//2+1),c.ROWS)
        y_w_last = y_w_last.transpose(0,1,2,4,3,5)   #(r.IT,r.XN,r.L,(r.KW//2+1),r.CO_PRL,c.ROWS)
        y_w_last = y_w_last.reshape(r.IT,r.XN,r.L,(r.KW//2+1)*r.CO_PRL,c.ROWS)

        y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:] = y_w_last

        y = y.reshape(r.IT,r.XN,r.L,r.XW,r.CO_PRL,c.ROWS)
        y = y.transpose(1,2,5,3,0,4)
        y = y.reshape((r.XN, r.L*c.ROWS, r.XW, r.CO_PAD))
        # Strip the row and channel padding.
        y = y[:,:r.XH,:,:r.CO]

        return y

    @staticmethod
    def reorder_y_e2e_conv(y, c, r):
        """Placeholder: not implemented, returns None."""
        pass

    @staticmethod
    def reorder_y_e2e_conv2dense(y, c, r):
        """Placeholder: not implemented, returns None."""
        pass

In [9]:
# Instantiate the bundles one by one: every Bundle receives the list built so far,
# which it searches for its predecessor and its residual (add) source.
bundles = []
for qb in q_bundles:
    bundles.append(Bundle(**qb, bundles=bundles))

def conv(x,w):
    """Convolve input `x` with kernel `w` using 'same' padding and return a NumPy array."""
    result = tf.keras.backend.conv2d(x, w, padding='same')
    return result.numpy()


# Check the final bundle on its own recorded input (index -1 instead of a hard-coded 53).
bundle = bundles[-1]
out = bundle.process(function=conv if bundle.type=='conv' else (lambda x, w : x @ w), x_arr=bundle.x[0])
expected = model.get_layer(bundle.last_layer_name).y * 2**bundle.o_frac


# Integer outputs must match exactly; a float (softmax) output is compared by predicted class.
# np.issubdtype does not depend on the platform's default integer width.
if np.issubdtype(out.dtype, np.integer):
    print(bundle.last_layer_name, np.all(out == expected))
else:
    print(bundle.last_layer_name, np.all(np.argmax(expected, axis=-1) == np.argmax(out, axis=-1)))

activation True


## Chained Bundle Check

In [10]:
# Feed every bundle the output computed by its predecessor bundle and compare the
# result against the output recorded from the Keras model.
xq = bundles[0].x[0]  # integer input of the first bundle

for i, bundle in enumerate(bundles):
    if i == 0:
        bundle.chained_input = xq
    else:
        bundle.chained_input = bundle.prev_bundle.chained_output

    out = bundle.chained_output = bundle.process(function=conv if bundle.type=='conv' else (lambda x, w : x @ w), x_arr = bundle.chained_input)

    expected = model.get_layer(bundle.last_layer_name).y * 2**bundle.o_frac
    # Integer outputs must match exactly; a float (softmax) output is compared by predicted class.
    if np.issubdtype(out.dtype, np.integer):
        print(i, bundle.last_layer_name, np.all(out == expected))
    else:
        print(i, bundle.last_layer_name, np.all(np.argmax(expected, axis=-1) == np.argmax(out, axis=-1)))

# Use a dedicated name: the global `x` holds the random network input and must not be
# overwritten, otherwise re-running the validation cell would use the wrong tensor.
chained_final = out

np.argmax(chained_final, axis=1)

0 q_activation_2 True
1 q_activation_3 True
2 q_activation_4 True
3 q_conv2d_batchnorm_4 True
4 q_activation_5 True
5 q_activation_6 True
6 q_activation_7 True
7 q_activation_9 True
8 q_activation_10 True
9 q_activation_11 True
10 q_activation_13 True
11 q_activation_14 True
12 q_activation_15 True
13 q_conv2d_batchnorm_14 True
14 q_activation_16 True
15 q_activation_17 True
16 q_activation_18 True
17 q_activation_20 True
18 q_activation_21 True
19 q_activation_22 True
20 q_activation_24 True
21 q_activation_25 True
22 q_activation_26 True
23 q_activation_28 True
24 q_activation_29 True
25 q_activation_30 True
26 q_conv2d_batchnorm_27 True
27 q_activation_31 True
28 q_activation_32 True
29 q_activation_33 True
30 q_activation_35 True
31 q_activation_36 True
32 q_activation_37 True
33 q_activation_39 True
34 q_activation_40 True
35 q_activation_41 True
36 q_activation_43 True
37 q_activation_44 True
38 q_activation_45 True
39 q_activation_47 True
40 q_activation_48 True
41 q_activation_

array([1, 1, 1, 9, 1, 9, 1, 1], dtype=int64)

In [11]:
# Compute runtime parameters, headers and reordered weights for every bundle.
for bundle in bundles:
    bundle.export()
    

weights initial (KH, KW, CI, CO) = (3, 3, 3, 64)
KH=3, KW=3, CI=3, CO=64, CO_PRL=8, EG=8, IT=8, 64
initial (XN, XH, XW, CI)= (8, 32, 32, 3)
{'SW': 1, 'SH': 1, 'KH': 3, 'KW': 3, 'CI': 3, 'CO': 64, 'CO_PRL': 8, 'EG': 8, 'IT': 8, 'CO_PAD': 64, 'XN': 8, 'XH': 32, 'XW': 32, 'SH_OUT': 2, 'SW_OUT': 2, 'LH': 8, 'L': 4, 'XH_PAD': 32, 'BRAM_WEIGHTS_ADDR_MAX': 9}
Runtime(SW=1, SH=1, KH=3, KW=3, CI=3, CO=64, CO_PRL=8, EG=8, IT=8, CO_PAD=64, XN=8, XH=32, XW=32, SH_OUT=2, SW_OUT=2, LH=8, L=4, XH_PAD=32, BRAM_WEIGHTS_ADDR_MAX=9, w_config='0b00000000000000000000000000000000010010111111111100000000010001', w_config_words=array([[ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   0],
       [ 17, -64,  -1,  18,   0,   0,   0,   

## Independent Bundle Check

In [12]:
print(len(bundles))

# Run every bundle on its own recorded input (no hard-coded bundle count).
for i, bundle in enumerate(bundles):
    out = bundle.process(function=conv if bundle.type=='conv' else (lambda x, w : x @ w), x_arr=bundle.x[0])
    expected = model.get_layer(bundle.last_layer_name).y * 2**bundle.o_frac
    # Integer outputs must match exactly; a float (softmax) output is compared by predicted class.
    if np.issubdtype(out.dtype, np.integer):
        print(bundle.last_layer_name, np.all(out == expected))
    else:
        print(bundle.last_layer_name, np.all(np.argmax(expected, axis=-1) == np.argmax(out, axis=-1)))

54
q_activation_2 True
q_activation_3 True
q_activation_4 True
q_conv2d_batchnorm_4 True
q_activation_5 True
q_activation_6 True
q_activation_7 True
q_activation_9 True
q_activation_10 True
q_activation_11 True
q_activation_13 True
q_activation_14 True
q_activation_15 True
q_conv2d_batchnorm_14 True
q_activation_16 True
q_activation_17 True
q_activation_18 True
q_activation_20 True
q_activation_21 True
q_activation_22 True
q_activation_24 True
q_activation_25 True
q_activation_26 True
q_activation_28 True
q_activation_29 True
q_activation_30 True
q_conv2d_batchnorm_27 True
q_activation_31 True
q_activation_32 True
q_activation_33 True
q_activation_35 True
q_activation_36 True
q_activation_37 True
q_activation_39 True
q_activation_40 True
q_activation_41 True
q_activation_43 True
q_activation_44 True
q_activation_45 True
q_activation_47 True
q_activation_48 True
q_activation_49 True
q_activation_51 True
q_activation_52 True
q_activation_53 True
q_conv2d_batchnorm_46 True
q_activation_54

In [13]:
import pickle
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("../models/bundles.pickle", "wb") as bundles_file:
    pickle.dump(bundles, bundles_file)

In [14]:
# Display the compile-time configuration namedtuple loaded earlier from ../compile.pickle.
c

Compile(X_BITS=8, K_BITS=8, Y_BITS=32, ROWS=8, COLS=24, KW_MAX=11, CI_MAX=2048, XW_MAX=32, XH_MAX=32, XN_MAX=16, IN_BITS=64, OUT_BITS=64, RAM_WEIGHTS_DEPTH=2049, RAM_EDGES_DEPTH=288, VALID_PROB=100, READY_PROB=1, KH_MAX=11, L_MAX=4, CONFIG_BEATS=1, X_PAD=5, BITS_KW2=3, BITS_KH2=3, BITS_CIN_MAX=11, BITS_COLS_MAX=5, BITS_BLOCKS_MAX=2, BITS_XN_MAX=4, BITS_BRAM_WEIGHTS_ADDR=12)