In [1]:
from qkeras import *
from tensorflow.keras.layers import Input, AveragePooling2D, Flatten, Softmax, Add, MaxPooling2D
import numpy as np
from collections import namedtuple
import pickle

from pynq import Overlay
from pynq import allocate

## Bundle

In [2]:
import numpy as np
from collections import namedtuple

class Bundle:
    def __init__(self, type, strides, add_bundle_i, flatten, softmax, bundles, last_layer_name, prev_layer_name, x, w, b, y, quant_details, act_details, pool_details, o_arr, o_bits, o_frac):

        self.type = type        
        self.last_layer_name = last_layer_name
        self.softmax = softmax
        self.strides = strides

        '''
        Find prev bundle
        '''
        self.prev_bundle_i, self.prev_bundle = None, None
        for i, bundle in enumerate(bundles):
            if bundle.last_layer_name == prev_layer_name:
                self.prev_bundle_i, self.prev_bundle  = i, bundle

        self.add_bundle  = bundles[add_bundle_i] if add_bundle_i else None
        self.flatten = flatten

        self.x = x
        self.w = w
        self.b = b
        self.y = y
        self.f = flatten
        # self.quant = quant
        self.quant_details = quant_details
        self.act_details = act_details
        self.pool_details = pool_details

        '''
        Bundle output
        '''
        if softmax:
            self.o_arr, self.o_bits, self.o_frac = o_arr, 1, 0
        else:
            self.o_arr, self.o_bits, self.o_frac = o_arr, o_bits, o_frac


        if self.type == 'conv':
            self.KH, self.KW, self.CI, self.CO = self.w[0].shape
            self.XN, self.XH, self.XW, self.CI = self.x[0].shape
            self.XN, self.YH, self.YW, _       = self.y[0].shape
            self.SH                            = self.XH//self.YH
            self.SW                            = self.XW//self.YW
            self.RAM_WEIGHTS                   = self.KH*self.CI
            self.RAM_EDGES                     = self.CI* self.XW* int(np.ceil(self.XH//self.XN-1)) if self.KH != 0 else 0
        else:
            self.CI, self.CO = self.w[0].shape
            self.XH, self.CI = self.x[0].shape
            self.SH = self.SW = self.XN = self.KH = self.KW = self.XW = self.YW = 1
            self.YH = self.XH
            self.RAM_WEIGHTS = 0 #self.KH*self.CI # need to update
            self.RAM_EDGES = 0

    def process(self, function, x_arr):
        x_bits, x_frac = self.x[1:]
        w_arr, w_bits, w_frac = self.w

        out_arr = function(x_arr, self.w[0])
        return self.post_process(out_arr)


    def post_process(self, out_arr):

        def quantize(x, bits, frac):
            x = x.astype(np.float32)
            x /= 2 ** frac
            x = np.around(x)
            x = np.clip(x, -2**(bits-1), 2**(bits-1)-1)
            x = x.astype(int)
            return x

        x_bits, x_frac = self.x[1:]
        w_bits, w_frac = self.w[1:]
        out_bits, out_frac = x_bits + w_bits, x_frac + w_frac

        if self.b[0] is not None:
            b_arr, b_bits, b_frac = self.b
            out_arr += b_arr * 2** (out_frac - b_frac)

        if self.strides:
            SH, SW = self.strides
            N, XH, XW, C = out_arr.shape
            YH, YW = XH//SH, XW//SW
            out_arr = out_arr.reshape(N, YH, SH, YW, SW, C)
            out_arr = out_arr[:,:,-1,:,-1,:]

        if self.quant_details:
            out_arr = quantize(x=out_arr, bits=self.quant_details['bits'], frac=out_frac-self.quant_details['frac'])
            out_frac = out_frac-self.quant_details['frac']
            out_bits = self.quant_details['bits']

        if self.add_bundle:
            a_arr, a_bits, a_frac = self.add_bundle.out, self.add_bundle.o_bits, self.add_bundle.o_frac
            out_arr += a_arr * 2** (out_frac - a_frac)

        if self.act_details:
            frac, bits = self.act_details['frac'], self.act_details['bits']

            if self.act_details['type'] == 'relu':
                out_arr = out_arr/2**(out_frac-frac)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1)

                out_arr = np.maximum(out_arr * self.act_details['slope'], out_arr)
                out_arr = np.around(out_arr)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1).astype(int)

                out_frac, out_bits = frac, bits

            else:
                raise Exception('Only relu is supported yet')

        if self.pool_details:
            if self.pool_details['type'] == 'max':
                import math
                Stride = 2

                def findMax(InArray, p, q):
                    results = np.zeros((InArray.shape[0], InArray.shape[3]))
                    results -= math.inf
                    for i in range(p, p+3):
                        for j in range(q, q+3):
                            if i >=0 and j>=0 and i < InArray.shape[1] and j < InArray.shape[2]:
                                cand = InArray[:,i,j,:]
                                results = np.maximum(results, cand)
                    return results
                def HotFixMaxPool2D(InArray):
                    pad = 1
                    inShape = InArray.shape
                    assert len(inShape) == 4
                    OutArray = np.zeros((inShape[0], (inShape[1]+pad)//Stride, (inShape[2]+pad)//Stride, inShape[3]))
                    for i in range(OutArray.shape[1]):
                        for j in range(OutArray.shape[2]):
                            # p, q = i*Stride-1, j*Stride-1
                            p, q = i*Stride, j*Stride
                            OutArray[:,i,j,:] = findMax(InArray, p, q)
                    return OutArray
                
                out_arr = HotFixMaxPool2D(out_arr).astype(int)

            elif self.pool_details['type'] == 'avg':
                assert self.pool_details['size'] == self.pool_details['strides']
                KH, KW = self.pool_details['size']
                N, H, W, C = out_arr.shape
                out_arr = out_arr.reshape(N, H//KH, KH, W//KW, KW, C).mean(axis=(2,4))

                bits = self.pool_details['bits']
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1)
                out_arr = np.around(out_arr)
                out_arr = np.clip(out_arr,-2**(bits-1), 2**(bits-1)-1).astype(int)
            
        if self.flatten:
            out_arr = out_arr.reshape(out_arr.shape[0],-1)

        if self.softmax:
            out_arr = out_arr / 2**out_frac
            exp = np.exp(out_arr - out_arr.max())
            out_arr = exp/np.sum(exp, axis=1)[0]
        
        self.out = out_arr
        return out_arr
    

    @staticmethod
    def get_compile_params(bundles, ROWS, COLS):

        def clog2(x):
            return int(np.ceil(np.log2(x)))
        
        IN_BITS               = 64
        CONFIG_BEATS          = 1
        X_BITS = K_BITS       = max([b.x[1] for b in bundles])
        KW_MAX                = max([b.KW   for b in bundles])
        KH_MAX                = max([b.KH   for b in bundles])
        SW_MAX                = max([b.SW   for b in bundles])
        SH_MAX                = max([b.SH   for b in bundles])
        CI_MAX                = max([b.CI   for b in bundles])
        XW_MAX                = max([b.XW   for b in bundles])
        XH_MAX                = max([b.XH   for b in bundles])
        XN_MAX                = max([b.XN   for b in bundles])
        BRAM_WEIGHTS_DEPTH    = max([b.RAM_WEIGHTS + CONFIG_BEATS for b in bundles])
        RAM_EDGES_DEPTH       = max([b.RAM_EDGES                  for b in bundles])
        
        L_MAX                 = clog2(XH_MAX//ROWS)
        X_PAD                 = clog2(KH_MAX//2)
        BITS_KW2              = clog2((KW_MAX+1)/2)
        BITS_KH2              = clog2((KH_MAX+1)/2)
        BITS_SW               = clog2(SW_MAX)
        BITS_SH               = clog2(SH_MAX)
        BITS_CIN_MAX          = clog2(CI_MAX)
        BITS_COLS_MAX         = clog2(XW_MAX)
        BITS_BLOCKS_MAX       = clog2( L_MAX)
        BITS_XN_MAX           = clog2(XN_MAX)
        BITS_BRAM_WEIGHTS_ADDR= clog2(BRAM_WEIGHTS_DEPTH)

        params = locals()
        params = {k:params[k] for k in params if not ('__' in k or k in ['bundles', 'params', 'clog2'])}
        c = namedtuple('Compile', params)(**params)
        return c

    def export (self):

        if self.type != 'conv':
            print('Conv -> Dense Reshape')
            CI, CO = self.w[0].shape
            XN, _ = self.x[0].shape
            self.w[0] = self.w[0].reshape(1,1,CI,CO) # (CI,CO) -> (KH,KW,CI,CO)
            self.x[0] = self.x[0].reshape(XN,1,1,CI) # (XN,CI) -> (XN, XH, XW, CI)
            self.y[0] = self.y[0].reshape(XN,1,1,CO) # (XN,CI) -> (XN, XH, XW, CI)
        
        self.c = c
        self.r = self.get_runtime_params(self.c, self.w[0], self.x[0], self.y[0])
        self.r = self.create_headers(self.c, self.r)

        print(self.r)
        self.check_sparsity(self.w[0], self.x[0])

        self.we = self.reorder_w_q2e_conv(self.w[0], self.c, self.r)
        self.ye_exp_shape = (self.r.IT, self.r.XN, self.r.L, self.r.XW*self.r.CO_PRL, c.ROWS)
        self.ye_hw = np.zeros(self.ye_exp_shape)
        self.num_t = self.we.shape[0] # iterations

        self.r = self.r._asdict()
        self.c = self.c._asdict()

    @staticmethod
    def get_runtime_params(c, w, x, y):
        """Compute the per-bundle runtime parameters from the tensor shapes.

        Args:
            c: Compile parameters; ``c.COLS``, ``c.ROWS`` and
                ``c.CONFIG_BEATS`` are read.
            w: 4-D weight array, unpacked as (KH, KW, CI, CO).
            x: 4-D input array, unpacked as (XN, XH, XW, CI).
            y: Output array; only ``y.shape[1]`` and ``y.shape[2]`` are read.

        Returns:
            A 'Runtime' namedtuple whose fields are this function's local
            variables (collected with locals()): SW, SH, KH, KW, CI, CO,
            CO_PRL, EG, IT, CO_PAD, XN, XH, XW, SH_OUT, SW_OUT, LH, L, XH_PAD
            and BRAM_WEIGHTS_ADDR_MAX. Adding or renaming a local changes the
            result and the dict printed below.
        """

        SW = SH = 1 # for bundle
        KH, KW, CI, CO = w.shape
        print('weights initial (KH, KW, CI, CO) =', w.shape)

        CO_PRL         = c.COLS * SW // KW                        # SW cols are processed in parallel
        EG             = int(np.floor( c.COLS / (KW + SW - 1)))   # elastic groups
        IT             = int(np.ceil( CO / (SW*EG)))              # iterations needed
        CO_PAD         = IT * CO_PRL                              # output cols padded

        print(f'{KH=}, {KW=}, {CI=}, {CO=}, {CO_PRL=}, {EG=}, {IT=}, {CO_PAD}')

        XN, XH, XW, CI = x.shape
        print('initial (XN, XH, XW, CI)=', x.shape)
        # Output stride inferred from the input/output spatial ratio.
        SH_OUT, SW_OUT = x.shape[1]//y.shape[1], x.shape[2]//y.shape[2]

        LH     = c.ROWS*SH              # Block height
        L      = int(np.ceil(XH/LH))    # Blocks
        XH_PAD = LH*L                   # Input height rounded up to whole blocks
        BRAM_WEIGHTS_ADDR_MAX  = c.CONFIG_BEATS + SW*KH*CI-1

        '''
        Pack all local variables into a namedtuple
        '''
        params = locals()
        # Drop the arguments and this dict itself; everything else is a field.
        params = {k:params[k] for k in params if not ('__' in k or k in ['w', 'x', 'y', 'c', 'params'])}
        print (params)
        r = namedtuple('Runtime', params)(**params)
        return r


    @staticmethod
    def create_headers(c, r):
        '''
        Create headers
        '''
        def pack_bits(arr):
            sum_width = 0
            packed = 0
            for val, width in arr:
                packed |= val << sum_width
                sum_width += width
            return packed
        
        ''' Weights Config'''
        w_config = pack_bits([
            (r.KW//2, c.BITS_KW2),
            (r.CI-1 , c.BITS_CIN_MAX),
            (r.XW-1 , c.BITS_COLS_MAX),
            (r.L -1 , c.BITS_BLOCKS_MAX),
            (r.XN-1 , c.BITS_XN_MAX),
            (r.BRAM_WEIGHTS_ADDR_MAX, c.BITS_BRAM_WEIGHTS_ADDR)
        ])
        w_config = format(w_config, f'#0{c.IN_BITS}b')
        w_config_words = [int(w_config[i:i+c.K_BITS], 2) for i in range(0, len(w_config), c.K_BITS)]
        w_config_words.reverse()
        w_config_words = np.array(w_config_words,dtype=np.int8)
        w_config_words = np.repeat(w_config_words[np.newaxis,...],repeats=r.IT,axis=0)

        '''Input Config'''
        x_config = pack_bits([
            (r.KH//2, c.BITS_KH2),
            (r.CI-1 , c.BITS_CIN_MAX),
            (r.XW-1 , c.BITS_COLS_MAX),
            (r.L -1 , c.BITS_BLOCKS_MAX),
        ])
        assert c.IN_BITS >= c.BITS_KW2 + c.BITS_CIN_MAX + c.BITS_COLS_MAX + c.BITS_BLOCKS_MAX

        x_config = format(x_config, f'#0{c.IN_BITS}b')
        x_config_words = [int(x_config[i:i+c.X_BITS], 2) for i in range(0, len(x_config), c.X_BITS)]
        x_config_words.reverse()

        d = {'w_config':w_config, 'w_config_words':w_config_words, 'x_config':x_config, 'x_config_words': x_config_words}
        n = namedtuple('Runtime', d)(**d)
        r = namedtuple("Runtime", r._fields + n._fields)(*(r + n))
        return r


    @staticmethod
    def check_sparsity(w, x):
        w_sparse = (w==0).sum()/w.size
        x_sparse = (x==0).sum()/x.size

        p_both_zero = x_sparse * w_sparse
        p_only_one_zero = (1-x_sparse) * w_sparse  +  (1-w_sparse) * x_sparse
        p_neither_zero = (1-x_sparse) * (1-w_sparse)
        zero_result = 1-p_neither_zero

        print(f'''
        w_sparsity   : {w_sparse*100:.2f}%
        x_sparsity   : {x_sparse*100:.2f}%

        both_zero    : {p_both_zero*100:.2f}%
        only_one_zero: {p_only_one_zero*100:.2f}%
        neither_zero : {p_neither_zero*100:.2f}%
        zero_result  : {zero_result*100:.2f}%
        ''')


    @staticmethod
    def reorder_w_q2e_conv(w, c, r):

        w = np.pad(w, ((0,0),(0,0),(0,0),(0,r.CO_PAD-r.CO)))        # (KH, KW, CI, CO_PAD)
        print(w.shape, (r.KH, r.KW, r.CI, r.IT, r.CO_PRL))
        w = w.reshape(r.KH, r.KW, r.CI, r.IT, r.CO_PRL)             # (KH, KW, CI, IT, CO_PRL)
        w = np.flip(w, axis=4)
        w = w.transpose(0,2,3,4,1)                                  # (KH, CI, IT, CO_PRL, KW)

        w = w.reshape  (r.KH, r.CI, r.IT, r.CO_PRL*r.KW)            # (KH, CI, IT, CO_PRL*KW)
        w = np.pad(w, ((0,0),(0,0),(0,0),(0,c.COLS-r.CO_PRL*r.KW))) # (KH, CI, IT, c.COLS)
        w = w.transpose(2,1,0,3)                                    # (IT, CI, KH, c.COLS)
        w = w.reshape (r.IT, r.CI*r.KH, c.COLS)                       # (IT, CI*KH, c.COLS)
        
        w = np.pad(w, ((0,0),(c.CONFIG_BEATS,0),(0,0)))             # (IT, c.CONFIG_BEATS+CI*KH, c.COLS)
        w = w.reshape (r.IT, (r.CI*r.KH+c.CONFIG_BEATS)*c.COLS)     # (IT, (CI*KH+c.CONFIG_BEATS)*c.COLS)

        w = np.concatenate([r.w_config_words, w], axis=1)             # (IT, 8 + CI*KH*c.COLS)
        assert w.shape == (r.IT, c.IN_BITS/c.K_BITS + (r.CI*r.KH+c.CONFIG_BEATS)*c.COLS)
        return w


    @staticmethod
    def reorder_x_q2e_conv(x, c, r):
        print('input initial (XN, XH, XW, CI)=', x.shape)

        x = np.pad(x, ((0,0),(0,r.XH_PAD-r.XH),(0,0),(0,0)))   # (XN, L*HL , XW, CI)
        x = x.reshape  (r.XN, r.L, r.LH, r.XW, r.CI)               # (XN, L, HL, XW, CI)

        zeros = np.zeros((r.XN,r.L,c.ROWS+c.X_PAD,r.XW,r.CI),x.dtype)  # (XN,L,c.ROWS+X_PAD,XW,CI)
        zeros[:,:,:c.ROWS,:,:] = x

        ''' Fill bot rows from next '''
        for l in range(r.L):
            if l == r.L-1:
                zeros[:,l, c.ROWS: ,:,:] = np.zeros((r.XN,c.X_PAD,r.XW,r.CI),x.dtype)
            else:
                zeros[:,l, c.ROWS: ,:,:] = x[:,l+1,:c.X_PAD,:,:]

        x = zeros                  # (XN,L,c.ROWS+X_PAD,XW,CI)
        x = x.transpose(0,1,3,4,2) # (XN,L,XW,CI,c.ROWS+X_PAD)

        x = x.reshape((r.XN*r.L*r.XW*r.CI*(c.ROWS+c.X_PAD)))
        x = np.concatenate([np.array(r.x_config_words, dtype=np.uint8), x.flatten()])
        assert x.shape == (c.IN_BITS/c.X_BITS + r.XN*r.L*r.XW*r.CI*(c.ROWS+c.X_PAD),)
        return x


    @staticmethod
    def reorder_y_q2e_conv(y, c, r):
        YH, YW = r.XH_PAD//r.SH_OUT, r.XW//r.SW_OUT

        if r.SH_OUT != 1:
            print("Striding not yet supported")
            return None

        y = np.pad(y, ((0,0),(0,r.LH*r.L-r.XH),(0,0),(0,r.CO_PAD-r.CO)))     # (XN, L*HL , XW, CO_PAD)
        y = y.reshape((r.XN, r.L, c.ROWS, r.XW, r.CO_PAD))                   # (XN,L,c.ROWS,XW,CO_PAD)
        y = y.reshape((r.XN, r.L, c.ROWS, r.XW, r.IT, r.CO_PRL))             # (XN,L,c.ROWS,XW,IT,CO_PRL)
        y = y.transpose(4,0,1,3,5,2)                                         # (IT,XN,L,XW,CO_PRL,c.ROWS)

        assert y.shape == (r.IT,r.XN,r.L,r.XW,r.CO_PRL,c.ROWS)

        y_w_last = y[:,:,:,-(r.KW//2+1):,:,:]
        y_w_last = y_w_last.transpose(0,1,2,4,3,5).reshape(r.IT,r.XN,r.L,(r.KW//2+1)*r.CO_PRL,c.ROWS)

        y = y.reshape(r.IT,r.XN,r.L,r.XW*r.CO_PRL,c.ROWS)
        y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:] = y_w_last
        return y
    
    @staticmethod
    def reorder_y_e2q_conv(y, c, r):
        y = y.reshape(r.IT,r.XN,r.L,r.XW*r.CO_PRL,c.ROWS)

        y_w_last = y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:]
        y_w_last = y_w_last.reshape(r.IT,r.XN,r.L,r.CO_PRL,(r.KW//2+1),c.ROWS)
        y_w_last = y_w_last.transpose(0,1,2,4,3,5)   #(r.IT,r.XN,r.L,(r.KW//2+1),r.CO_PRL,c.ROWS)
        y_w_last = y_w_last.reshape(r.IT,r.XN,r.L,(r.KW//2+1),r.CO_PRL,c.ROWS)
        y_w_last = y_w_last.reshape(r.IT,r.XN,r.L,(r.KW//2+1)*r.CO_PRL,c.ROWS)
        
        y[:,:,:,-(r.KW//2+1)*r.CO_PRL:,:] = y_w_last

        y = y.reshape(r.IT,r.XN,r.L,r.XW,r.CO_PRL,c.ROWS)
        y = y.transpose(1,2,5,3,0,4)
        y = y.reshape((r.XN, r.L*c.ROWS, r.XW, r.CO_PAD))
        y = y[:,:r.XH,:,:r.CO]

        return y

    @staticmethod
    def reorder_y_e2e_conv(y, c, r):
        pass

    @staticmethod
    def reorder_y_e2e_conv2dense(y, c, r):
        pass

In [3]:
# Load the list of exported bundles.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('bundles.pickle', 'rb') as f:
    bundles = pickle.load(f)
    
# Bundle.export() stores r and c as plain dicts (via _asdict()); rebuild the
# namedtuples so the reorder_* helpers can use attribute access (r.XN, c.ROWS).
for bundle in bundles:
    bundle.r = namedtuple('Runtime', bundle.r)(**bundle.r)
    bundle.c = namedtuple('Compile', bundle.c)(**bundle.c)

2023-07-25 06:35:37.849219: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 9437184 exceeds 10% of free system memory.
2023-07-25 06:35:39.378255: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 9437184 exceeds 10% of free system memory.
2023-07-25 06:35:40.455834: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 9437184 exceeds 10% of free system memory.
2023-07-25 06:35:58.061640: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 8388608 exceeds 10% of free system memory.


Path to site packages: /usr/local/share/pynq-venv/lib64/python3.10/site-packages

In [4]:
# Load the overlay and keep handles to the DMA channels used by the inference
# loop below.
myOverlay = Overlay('design_1.bit')
y_recv = myOverlay.dma_weights_out.recvchannel  # receive side: engine output
x_send = myOverlay.dma_pixels.sendchannel       # send side: input pixels
w_send = myOverlay.dma_weights_out.sendchannel  # send side: weights (same DMA as y_recv)

In [5]:
# Run every bundle in order: conv bundles on the engine through DMA, dense
# bundles on the CPU. Each bundle's post-processed output is compared with
# its stored reference output (bundle.o_arr).
xq = bundles[0].x[0]

for ib, bundle in enumerate(bundles):
    
    # The first bundle takes the network input; the others take the output
    # of the bundle that precedes them.
    x_arr = xq if ib==0 else bundle.prev_bundle.chained_output
    
    if bundle.type == 'conv':
        
        bundle.xe = bundle.reorder_x_q2e_conv(x_arr, bundle.c, bundle.r)
        
        '''
        RUN ENGINE
        '''
        w_buf = allocate(shape=(bundle.we[0].shape), dtype=np.int8)
        x_buf = allocate(shape=(bundle.xe.shape), dtype=np.int8)
        y_buf = allocate(shape=bundle.ye_exp_shape[1:], dtype=np.int32)

        # One pass per weight iteration; the same input is streamed each time.
        for t in range(bundle.num_t):
            w_buf[:] = bundle.we[t][:]
            x_buf[:] = bundle.xe[:]
            y_buf[:] = 0
            w_buf.flush()
            x_buf.flush()
            y_buf.flush()

            # Arm the receive channel before any data is sent.
            y_recv.transfer(y_buf)
            w_send.transfer(w_buf)
            w_send.wait()
            x_send.transfer(x_buf)
            x_send.wait()
            # The send channels finishing does not mean the output has been
            # written back; wait for the receive channel before reading y_buf.
            y_recv.wait()
            y_buf.invalidate()

            bundle.ye_hw[t] = y_buf

        bundle.ye_hwr = bundle.reorder_y_e2q_conv(bundle.ye_hw, bundle.c, bundle.r)

        
    else:
        '''
        DENSE on CPU
        '''
        print(x_arr.shape, bundle.w[0].shape)
        # Weights were reshaped to (1, 1, CI, CO) by export(); undo that here.
        bundle.ye_hwr = x_arr @ bundle.w[0].reshape(bundle.w[0].shape[2],bundle.w[0].shape[3])
    
    out = bundle.out = bundle.chained_output = bundle.post_process(bundle.ye_hwr)
    
    if bundle.softmax:
        # Classification output: compare predicted classes only.
        out = out.reshape(out.shape[0], out.shape[-1])
        print(ib, np.all(np.argmax(bundle.o_arr, axis=-1) == np.argmax(out, axis=-1)))
    else:
        # Convert fixed point to real values and print the summed absolute error.
        out_f = out/2**bundle.o_frac
        print(ib, 'ERROR ------------------ ', np.sum(np.abs(bundle.o_arr - out_f)), '-------------------\n')       

    

input initial (XN, XH, XW, CI)= (8, 32, 32, 3)
0 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
1 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
2 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
3 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
4 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 256)
5 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
6 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
7 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 256)
8 ERROR ------------------  0.0 -------------------

input initial (XN, XH, XW, CI)= (8, 8, 8, 64)
9 ERROR ------------------  0.0 -------------------

input i