In [1]:
import numpy as np
from dataclasses import dataclass

In [2]:
# Convolution geometry for this notebook.
# Weights: 3x3 kernel window, 8 output channels, 64 input channels.
DATA_W_H = 3
DATA_W_W = 3
DATA_W_OC = 8
DATA_W_IC = 64

# Input feature map: 34x34 positions, 64 channels.
DATA_IFM_H = 34
DATA_IFM_W = 34
DATA_IFM_IC = 64

# Output feature map: 32x32 positions (34 - 3 + 1 per axis), 8 channels.
DATA_OFM_H = 32
DATA_OFM_W = 32
DATA_OFM_OC = 8

In [3]:
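# One-off generation of the test vectors, kept commented out so the saved .npy files stay fixed.
# Input values are rounded to multiples of 2**-5 in [-4, 3.96875]; kernel values to multiples of 2**-7 in [-1, 1 - 2**-7].
# NOTE: the generator is unseeded, so re-running these lines would produce different data; conv_out.npy is not produced here.
# conv_input_wl8_fl5 = np.round(np.random.uniform(low=-4, high=3.96875, size=(1, 64, 34, 34))*2**5)/2**5
# conv_kernel_wl8_fl7 = np.round(np.random.uniform(low=-1, high=1-1/2**7, size=(64, 3, 3, 8))*2**7)/2**7
# np.save("conv_input_wl8_fl5.npy", conv_input_wl8_fl5)
# np.save("conv_kernel_wl8_fl7.npy", conv_kernel_wl8_fl7)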

In [4]:
def getData():
    """Load the input feature map, the kernel and the stored output from disk.

    The three arrays are read with ``np.load`` from ``.npy`` files looked up
    relative to the current working directory, in the order listed below.

    Returns:
        tuple: ``(data_ifm, data_ker, data_ofm)``.
    """
    file_names = (
        "conv_input_wl8_fl5.npy",
        "conv_kernel_wl8_fl7.npy",
        "conv_out.npy",
    )
    loaded = tuple(np.load(name) for name in file_names)
    return loaded

In [9]:
def baseline(data_ifm, data_ker):
    """Reference stride-1, no-padding convolution written as a plain loop nest.

    Args:
        data_ifm: input feature map indexed as ``[batch, channel, row, col]``;
            only batch index 0 is read.
        data_ker: kernel indexed as ``[in_channel, ky, kx, out_channel]``.

    Returns:
        numpy.ndarray of shape ``(1, out_rows, out_cols, out_channels)``
        (channels moved last), with ``out_rows = rows - ky + 1`` and
        ``out_cols = cols - kx + 1``.

    Side effects:
        Prints the input, kernel and output shapes.
    """
    _, ifm_c, ifm_n, ifm_m = data_ifm.shape
    ker_ic, ker_y, ker_x, ker_oc = data_ker.shape

    # Size each spatial axis from its own extent; deriving both from the row
    # count is only correct when the input and the kernel are square.
    ofm_n = ifm_n - ker_y + 1
    ofm_m = ifm_m - ker_x + 1
    ofm_c = ker_oc

    print("ifm shape: {}*{}*{}\n".format(ifm_c, ifm_n, ifm_m))
    print("ker shape: {}*{}*{}*{}\n".format(ker_ic, ker_y, ker_x, ker_oc))
    print("ofm shape: {}*{}*{}\n".format(ofm_c, ofm_n, ofm_m))

    # Accumulate channel-first, then move channels last on return.
    data_ofm = np.zeros((1, ofm_c, ofm_n, ofm_m))

    for oc in range(ofm_c):
        for nn in range(ofm_n):
            for mm in range(ofm_m):
                for ic in range(ker_ic):
                    for yy in range(ker_y):
                        for xx in range(ker_x):
                            data_ofm[0, oc, nn, mm] += (
                                data_ker[ic, yy, xx, oc] * data_ifm[0, ic, nn + yy, mm + xx]
                            )

    return np.transpose(data_ofm, (0, 2, 3, 1))

In [10]:
def opuOrig(data_ifm, data_ker):
    """Stride-1, no-padding convolution with the kernel tap as the outer loop.

    For every output channel and kernel tap ``(yy, xx)``, each input channel's
    weight is applied to all output positions before moving to the next tap.

    Args:
        data_ifm: input feature map indexed as ``[batch, channel, row, col]``;
            only batch index 0 is read.
        data_ker: kernel indexed as ``[in_channel, ky, kx, out_channel]``.

    Returns:
        numpy.ndarray of shape ``(1, out_rows, out_cols, out_channels)``
        (channels moved last), with ``out_rows = rows - ky + 1`` and
        ``out_cols = cols - kx + 1``.
    """
    _, ifm_c, ifm_n, ifm_m = data_ifm.shape
    ker_ic, ker_y, ker_x, ker_oc = data_ker.shape

    # Size each spatial axis from its own extent; deriving both from the row
    # count is only correct when the input and the kernel are square.
    ofm_n = ifm_n - ker_y + 1
    ofm_m = ifm_m - ker_x + 1
    ofm_c = ker_oc

    data_opu = np.zeros((1, ofm_c, ofm_n, ofm_m))

    for oc in range(ker_oc):
        for yy in range(ker_y):
            for xx in range(ker_x):
                # compute all ifm positions corresponding to kernel[yy][xx]
                for ic in range(ker_ic):
                    # ic could be computed in parallel
                    # One weight scales the whole shifted input window. Each
                    # output element still receives the same multiply-then-add
                    # sequence as an explicit (nn, mm) sweep would give it.
                    window = data_ifm[0, ic, yy:yy + ofm_n, xx:xx + ofm_m]
                    data_opu[0, oc] += data_ker[ic, yy, xx, oc] * window

    return np.transpose(data_opu, (0, 2, 3, 1))

In [11]:
def ddrInit(data_ifm, data_ker):
    """Flatten the feature map and kernel into the 1-D buffers used by the OPU model.

    Args:
        data_ifm: 4-D input feature map.
        data_ker: 4-D kernel.

    Returns:
        tuple ``(ddr_ifm, ddr_ker)`` of 1-D arrays (copies):

        * ``ddr_ifm`` is ``data_ifm`` with axes reordered to ``(0, 2, 3, 1)``
          (axis 1 moved last), flattened in C order.
        * ``ddr_ker`` is ``data_ker`` with axes reordered to ``(1, 2, 3, 0)``
          (axis 0 moved last), flattened in C order.
    """
    ddr_ifm = np.transpose(data_ifm, (0, 2, 3, 1)).flatten()
    ddr_ker = np.transpose(data_ker, (1, 2, 3, 0)).flatten()

    return ddr_ifm, ddr_ker

In [12]:
def MU(FM, W1, W2):
    """Multiply unit: one feature-map operand times two weights.

    Returns:
        tuple: ``(FM * W1, FM * W2)``.
    """
    prod_w1 = FM * W1
    prod_w2 = FM * W2
    return prod_w1, prod_w2

In [13]:
def PE(pe_FM, pe_W1, pe_W2):
    """Processing element: 16 multiply units feeding two binary adder trees.

    Lane ``i`` multiplies ``pe_FM[i]`` by ``pe_W1[i]`` and by ``pe_W2[i]``
    through ``MU``. The 16 products for each weight set are then summed
    pairwise over four levels (16 -> 8 -> 4 -> 2 -> 1).

    Args:
        pe_FM: at least 16 feature-map values; only the first 16 are read.
        pe_W1: at least 16 weights for the first result.
        pe_W2: at least 16 weights for the second result.

    Returns:
        tuple: ``(sum_i pe_FM[i]*pe_W1[i], sum_i pe_FM[i]*pe_W2[i])``.
    """
    lanes = 16
    tree_w1 = np.zeros(lanes)
    tree_w2 = np.zeros(lanes)
    for lane in range(lanes):
        tree_w1[lane], tree_w2[lane] = MU(pe_FM[lane], pe_W1[lane], pe_W2[lane])

    # Each pass adds neighbours (2k, 2k+1), halving the width until one value
    # remains. This is the same pairing order as four explicit adder levels.
    while tree_w1.size > 1:
        tree_w1 = tree_w1[0::2] + tree_w1[1::2]
        tree_w2 = tree_w2[0::2] + tree_w2[1::2]

    return tree_w1[0], tree_w2[0]

In [14]:
def ComputationUnit(cu_FM, cu_W1, cu_W2):
    """Computation unit: 16 PEs whose outputs are reduced in groups of four.

    PE ``p`` receives elements ``16*p .. 16*p+15`` of each operand. Two adder
    levels then combine adjacent PEs (16 -> 8 -> 4), so every returned value
    covers a 64-element segment of the operands.

    Args:
        cu_FM: feature-map operand; elements ``0..255`` are read.
        cu_W1: weights for the first four outputs; elements ``0..255`` are read.
        cu_W2: weights for the last four outputs; elements ``0..255`` are read.

    Returns:
        numpy.ndarray of length 8: four segment sums for ``cu_W1`` followed by
        four segment sums for ``cu_W2``.
    """
    pe_res0 = np.zeros(16)
    pe_res1 = np.zeros(16)
    for pe in range(16):
        lo, hi = pe * 16, pe * 16 + 16
        pe_res0[pe], pe_res1[pe] = PE(cu_FM[lo:hi], cu_W1[lo:hi], cu_W2[lo:hi])

    # First adder level: neighbouring PEs (2k, 2k+1) -> 8 partial sums.
    add0_lv3 = pe_res0[0::2] + pe_res0[1::2]
    add1_lv3 = pe_res1[0::2] + pe_res1[1::2]

    # Second adder level: 8 -> 4. The tree stops here because these four
    # values per weight set are exactly what is returned.
    fm_w0_64 = add0_lv3[0::2] + add0_lv3[1::2]
    fm_w1_64 = add1_lv3[0::2] + add1_lv3[1::2]

    return np.concatenate([fm_w0_64, fm_w1_64])

In [15]:
def opuConv(ddr_ifm, ddr_ker):
    """Convolve the flattened buffers one kernel tap at a time via ComputationUnit.

    The geometry is fixed: 3x3 kernel, 34-wide input with 64 channels, and a
    32x32 output with 8 channels. The index arithmetic expects the layouts
    produced by ``ddrInit``: ``ddr_ifm`` as ``[row][col][channel]`` and
    ``ddr_ker`` as ``[ky][kx][out_channel][in_channel]``.

    Args:
        ddr_ifm: 1-D input feature-map buffer.
        ddr_ker: 1-D kernel buffer.

    Returns:
        numpy.ndarray of length ``32*32*8`` laid out ``[row][col][out_channel]``,
        accumulated over the nine kernel taps.
    """
    ker_h, ker_w = 3, 3
    ifm_w, ifm_c = 34, 64
    ofm_h, ofm_w, ofm_c = 32, 32, 8
    tap_len = ofm_c * ifm_c      # 512 weights belong to one kernel tap
    half_tap = tap_len // 2      # 256 weights per ComputationUnit weight port
    copies = ofm_c // 2          # the input vector is repeated once per output channel on a port

    ddr_ofm = np.zeros(ofm_h * ofm_w * ofm_c)

    for blk_ker_n in range(ker_h):
        for blk_ker_m in range(ker_w):
            blk_ker_num = (blk_ker_n * ker_w + blk_ker_m) * tap_len
            blk_ker = ddr_ker[blk_ker_num: blk_ker_num + tap_len]
            # The split does not depend on the output position, so do it once per tap.
            ker_port0, ker_port1 = blk_ker[:half_tap], blk_ker[half_tap:]
            for blk_ofm_n in range(ofm_h):
                for blk_ofm_m in range(ofm_w):
                    blk_ifm_num = ((blk_ofm_n + blk_ker_n) * ifm_w + (blk_ofm_m + blk_ker_m)) * ifm_c
                    blk_ofm_num = (blk_ofm_n * ofm_w + blk_ofm_m) * ofm_c

                    blk_ifm = np.tile(ddr_ifm[blk_ifm_num: blk_ifm_num + ifm_c], copies)
                    cu_Res = ComputationUnit(blk_ifm, ker_port0, ker_port1)
                    ddr_ofm[blk_ofm_num: blk_ofm_num + ofm_c] += cu_Res

    return ddr_ofm

In [16]:
def opuConvTest(ddr_ifm, ddr_ker):
    """Like the tap-by-tap convolution, but keep each kernel tap's contribution separate.

    The geometry is fixed: 3x3 kernel, 34-wide input with 64 channels, and a
    32x32 output with 8 channels. The index arithmetic expects the layouts
    produced by ``ddrInit``: ``ddr_ifm`` as ``[row][col][channel]`` and
    ``ddr_ker`` as ``[ky][kx][out_channel][in_channel]``.

    Args:
        ddr_ifm: 1-D input feature-map buffer.
        ddr_ker: 1-D kernel buffer.

    Returns:
        numpy.ndarray of shape ``(3, 3, 32*32*8)``; entry ``[ky][kx]`` holds the
        partial output (laid out ``[row][col][out_channel]``) of that kernel
        tap alone, with no accumulation across taps.
    """
    ker_h, ker_w = 3, 3
    ifm_w, ifm_c = 34, 64
    ofm_h, ofm_w, ofm_c = 32, 32, 8
    tap_len = ofm_c * ifm_c      # 512 weights belong to one kernel tap
    half_tap = tap_len // 2      # 256 weights per ComputationUnit weight port
    copies = ofm_c // 2          # the input vector is repeated once per output channel on a port

    ddr_ofm = np.zeros((ker_h, ker_w, ofm_h * ofm_w * ofm_c))

    for blk_ker_n in range(ker_h):
        for blk_ker_m in range(ker_w):
            blk_ker_num = (blk_ker_n * ker_w + blk_ker_m) * tap_len
            blk_ker = ddr_ker[blk_ker_num: blk_ker_num + tap_len]
            # The split does not depend on the output position, so do it once per tap.
            ker_port0, ker_port1 = blk_ker[:half_tap], blk_ker[half_tap:]
            for blk_ofm_n in range(ofm_h):
                for blk_ofm_m in range(ofm_w):
                    blk_ifm_num = ((blk_ofm_n + blk_ker_n) * ifm_w + (blk_ofm_m + blk_ker_m)) * ifm_c
                    blk_ofm_num = (blk_ofm_n * ofm_w + blk_ofm_m) * ofm_c

                    blk_ifm = np.tile(ddr_ifm[blk_ifm_num: blk_ifm_num + ifm_c], copies)
                    cu_Res = ComputationUnit(blk_ifm, ker_port0, ker_port1)
                    ddr_ofm[blk_ker_n, blk_ker_m, blk_ofm_num: blk_ofm_num + ofm_c] = cu_Res

    return ddr_ofm

In [17]:
def verify(data_ofm, data_res):
    """Return the mean squared difference between two arrays.

    The result is 0.0 when the arrays match element-wise. The variance of the
    difference is deliberately not used: it is also 0.0 when one array is the
    other plus a constant offset, which would hide a biased result.

    Args:
        data_ofm: expected values (numpy array).
        data_res: values under test, broadcast-compatible with ``data_ofm``.

    Returns:
        numpy.float64: ``mean((data_ofm - data_res) ** 2)``.
    """
    diff = data_ofm - data_res
    return np.mean(np.square(diff))

In [18]:
# Load the stored tensors once; later cells reuse these names.
data_ifm, data_ker, data_ofm = getData()

# Unpack the tensor extents (the leading axis of data_ifm is ignored).
_, ifm_c, ifm_n, ifm_m = data_ifm.shape
ker_ic, ker_y, ker_x, ker_oc = data_ker.shape

# Output extent of a stride-1, no-padding convolution, computed per axis so
# the column count does not silently reuse the row count.
ofm_n = ifm_n - ker_y + 1
ofm_m = ifm_m - ker_x + 1
ofm_c = ker_oc

In [19]:
# Run both loop-nest implementations on the loaded tensors.
# baseline() also prints the ifm/ker/ofm shapes shown below.
ofm_base = baseline(data_ifm, data_ker)
ofm_opuOrig = opuOrig(data_ifm, data_ker)

ifm shape: 64*34*34

ker shape: 64*3*3*8

ofm shape: 8*32*32



In [16]:
# Debugging aid: enables automatic pdb on exceptions (see the message below).
# NOTE(review): leftover from interactive debugging; consider removing before sharing.
%pdb on

Automatic pdb calling has been turned ON


In [20]:
# Flatten the feature map and kernel into the 1-D buffers consumed by opuConv / opuConvTest.
ddr_ifm, ddr_ker = ddrInit(data_ifm, data_ker)

In [21]:
# Full convolution through the ComputationUnit model (accumulated over all nine kernel taps).
ddr_ofm = opuConv(ddr_ifm, ddr_ker)

In [19]:
# Per-tap partial results, shape (3, 3, 32*32*8), kept separate for debugging.
ddr_ofm_test = opuConvTest(ddr_ifm, ddr_ker)

In [20]:
# Pairwise comparison of the three implementations; each line prints verify()'s score for one pair.
print(verify(ddr_ofm, ofm_base.flatten()))
print(verify(ddr_ofm, ofm_opuOrig.flatten()))
print(verify(ofm_base.flatten(), ofm_opuOrig.flatten()))

0.0
0.0
0.0


In [21]:
# Same axis reorderings that ddrInit applies, but kept multi-dimensional (not flattened)
# so individual positions can be indexed directly.
data_ifm_tran = np.transpose(data_ifm, (0, 2, 3, 1))
data_ker_tran = np.transpose(data_ker, (1, 2, 3, 0))

In [22]:
# Kernel tap (0, 0): elements 8..15 of the flat output, i.e. the 8 values at
# output position (row 0, col 1) given the row*32*8 + col*8 layout.
ddr_ofm_test[0][0][8:16]

array([-11.89892578,   7.40649414,   5.32202148,   2.45703125,
         1.86401367,  27.46411133,   6.77148438,  -6.59716797])

In [23]:
# Shape check: the tap-(0, 0) weights broadcast against one input position's vector.
(data_ker_tran[0][0]*data_ifm_tran[0][0][0]).shape

(8, 64)

In [24]:
# Recompute every kernel tap's partial output directly from the reordered
# views, one output value at a time, for comparison with opuConvTest.
tmp_full = np.zeros((3, 3, 32*32*8))
for yy, xx, nn, mm, oc in np.ndindex(3, 3, 32, 32, 8):
    ifm_vec = data_ifm_tran[0, nn + yy, mm + xx]
    ker_vec = data_ker_tran[yy, xx, oc]
    tmp_full[yy, xx, nn*32*8 + mm*8 + oc] = np.sum(ifm_vec * ker_vec)

In [25]:
# Element-wise exact (==) comparison of the direct recomputation with opuConvTest's result.
# NOTE(review): this displays a truncated boolean array; a single all-elements check would be easier to read.
tmp_full == ddr_ofm_test

array([[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]]])

In [26]:
# Pick one kernel tap (row 0, col 1) to inspect by hand.
blk_ker_n = 0
blk_ker_m = 1

In [27]:
# Slice the selected tap's 8*64 weights out of the flat kernel buffer,
# using the same offset formula as opuConv / opuConvTest.
blk_ker_num = (blk_ker_n*3+blk_ker_m)*8*64
blk_ker = ddr_ker[blk_ker_num: blk_ker_num+8*64]

In [28]:
# data_ker_tran[0][1].flatten() == blk_ker

In [29]:
# Tap (0, 0) applied at input position (0, 0): multiply the (8, 64) weight block by the
# 64-element input vector and sum over axis 1, giving one value per row of the block.
tmp = np.sum(data_ker_tran[0][0]*data_ifm_tran[0][0][0], axis=1)
display(tmp)

array([  0.796875  ,  -5.18603516, -15.1394043 ,  -5.32104492,
         0.98925781, -19.68798828,  31.37573242, -22.77661133])