# Broadcast Add

In [5]:
import numpy as np
import tvm
from tvm import te

In [6]:
def broadcast_add(shape1, shape2):
    """Build the TE expression for a 2-D broadcasting element-wise add.

    shape1, shape2 : 2-element shapes of the two operands. For every
        dimension the extents must be equal, or one of them must be 1.

    Returns the two placeholders (named 'A' and 'B') and the compute
    tensor (named 'c') holding A + B.
    """
    assert len(shape1) == 2 and len(shape2) == 2, "Only 2 Dimension is supported"

    for dim1, dim2 in zip(shape1, shape2):
        assert dim1 == dim2 or dim1 == 1 or dim2 == 1, "Shape doesn't fit broadcasting shape"

    lhs = te.placeholder(shape1, name='A')
    rhs = te.placeholder(shape2, name='B')

    # Example: (3, 1) + (3, 4) -> output extents (3, 4).
    # Per dimension, take shape1's extent when shape2's extent is 1,
    # otherwise take shape2's extent.
    out_shape = tuple(dim1 if dim2 == 1 else dim2
                      for dim1, dim2 in zip(shape1, shape2))

    def _pick(shape, x, y):
        # An operand dimension of extent 1 is always read at index 0.
        row = 0 if shape[0] == 1 else x
        col = 0 if shape[1] == 1 else y
        return (row, col)

    def f(x, y):
        return lhs[_pick(shape1, x, y)] + rhs[_pick(shape2, x, y)]

    result = te.compute(out_shape, f, name='c')

    return lhs, rhs, result

In [7]:
# Example: add a (3, 1) operand to a (3, 4) operand.
m = 3
n = 4
shape1 = (m, 1)
shape2 = (m, n)
A, B, C = broadcast_add(shape1, shape2)
s = te.create_schedule(C.op)
# NOTE: only A and B are passed to lower() here, while build() below also
# receives C. Presumably this is why the printed IR shows `c` as an internal
# allocation rather than a function argument -- TODO confirm.
print(tvm.lower(s, [A, B], simple_mode=True))
mod = tvm.build(s, [A, B, C])

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [3], []),
             B: Buffer(B_2: Pointer(float32), float32, [12], [])}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [3, 1], []), B_1: B_3: Buffer(B_2, float32, [3, 4], [])} {
  allocate(c: Pointer(global float32), float32, [12]), storage_scope = global;
  for (x: int32, 0, 3) {
    for (y: int32, 0, 4) {
      let cse_var_1: int32 = ((x*4) + y)
      c_1: Buffer(c, float32, [12], [], align=32)[cse_var_1] = (A[x] + B[cse_var_1])
    }
  }
}




# Matrix Multiplication

In [10]:
def matmul(n, m, l):
    """Build the TE expression for the matrix product C = A B.

    A : n x l placeholder
    B : l x m placeholder
    C : n x m compute tensor, summing A[x, k] * B[k, y] over k in [0, l)

    Returns the tuple (A, B, C).
    """
    lhs = te.placeholder((n, l), name='A')
    rhs = te.placeholder((l, m), name='B')
    # Reduction axis over the shared inner dimension.
    inner = te.reduce_axis((0, l), name='k')

    def dot(x, y):
        return te.sum(lhs[x, inner] * rhs[inner, y], axis=inner)

    product = te.compute((n, m), dot, name='C')
    return lhs, rhs, product

In [11]:
# Example: 100 x 100 times 100 x 100 matrix multiplication.
n = 100
A, B, C = matmul(n, n, n)
s = te.create_schedule(C.op)
# NOTE: only A and B are passed to lower(); build() below also receives C.
print(tvm.lower(s, [A, B], simple_mode=True))
mod = tvm.build(s, [A, B, C])

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [10000], []),
             B: Buffer(B_2: Pointer(float32), float32, [10000], [])}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [100, 100], []), B_1: B_3: Buffer(B_2, float32, [100, 100], [])} {
  allocate(C: Pointer(global float32), float32, [10000]), storage_scope = global;
  for (x: int32, 0, 100) {
    for (y: int32, 0, 100) {
      C_1: Buffer(C, float32, [10000], [])[((x*100) + y)] = 0f32
      for (k: int32, 0, 100) {
        let cse_var_2: int32 = (x*100)
        let cse_var_1: int32 = (cse_var_2 + y)
        C_1[cse_var_1] = (C_1[cse_var_1] + (A[(cse_var_2 + k)]*B[((k*100) + y)]))
      }
    }
  }
}




#### C

C_1: Buffer(C, float32, [10000], [])[((x*100) + y)] = 0f32
The total number of elements is 10000 (100 × 100), so a flat buffer of that size is allocated; the row stride is 100, which is why element (x, y) is stored at index x*100 + y.

# Convolution

In [None]:
import numpy as np
import tvm
from tvm import te

The convolution (CONV) operator is one of the most expensive and popular operators in neural networks.

#### Padding

As a prerequisite to convolution, let’s first implement padding, which surrounds the target tensor with a “shell” of padding values. The padding values are normally 0.

If the matrix height (i.e. number of rows) is $n_h$ and its width (i.e. number of columns) is $n_w$, then we pad $p_h$ rows of 0s on the top and bottom, and $p_w$ columns of 0s on the left and right, making its height and width $n_h + 2p_h$ and $n_w + 2p_w$, respectively.

- We assume the last two dimensions are rows and columns; 0s are padded only on these two dimensions.

```
X.shape = (Sample_N, C, N, M)
X.shape[-2] = N = Row (height)
X.shape[-1] = M = Columns (width)
```

In [12]:
def padding(X, ph, pw, val=0):
    """Pad the last two dimensions of X with a constant value.

    X : tensor with at least 2 dimensions; only the last two are padded
    ph, pw : padding added on each side of the second-to-last and last
        dimension respectively
    val : padding value, default 0

    Returns a compute tensor named 'PaddedX' whose last two extents are
    enlarged by 2 * ph and 2 * pw.
    """
    assert len(X.shape) >= 2

    leading = X.shape[0: -2]
    rows, cols = X.shape[-2], X.shape[-1]
    padded_shape = (*leading, rows + ph * 2, cols + pw * 2)

    def _fill(*idx):
        r, c = idx[-2], idx[-1]
        # True when (r, c) lies in the border region added around X.
        in_border = te.any(r < ph, r >= rows + ph, c < pw, c >= cols + pw)
        # Inside the border-free region, read X at the index shifted back
        # by the padding offsets; leading indices are passed through as-is.
        source = idx[:-2] + (r - ph, c - pw)
        return te.if_then_else(in_border, val, X[source])

    return te.compute(padded_shape, _fill, name='PaddedX')

Final X value: i[:-2]+ (i[-2]-ph, i[-1]-pw)

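Here, `i[:-2]` keeps all the leading indices (everything except the last two) unchanged, and the tuple `(i[-2]-ph, i[-1]-pw)` appends the row and column indices shifted back by the padding offsets, so that the padded position maps to the corresponding element of the original `X`.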

In [13]:
# Example: pad a (2, 3, 4) tensor with 1 row and 2 columns on each side,
# giving an output of shape (2, 5, 8).
A = te.placeholder((2,3,4))
B = padding(A, 1, 2)
s = te.create_schedule(B.op)
mod = tvm.build(s, [A, B])

# Input of all ones and an uninitialised output buffer of the padded shape.
a = tvm.nd.array(np.ones((2,3,4), dtype='float32'))
b = tvm.nd.array(np.empty((2,5,8), dtype='float32'))
mod(a, b)
print(b)

[[[0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 1. 1. 1. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0.]]]


#### General Convolution Formula

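With input size $n_h \times n_w$, kernel size $k_h \times k_w$, padding $p_h, p_w$ and strides $s_h, s_w$, the output shape is

$$\lfloor (n_h - k_h + 2p_h)/s_h + 1 \rfloor \times \lfloor (n_w - k_w + 2p_w)/s_w + 1 \rfloor$$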



In [14]:
def conv_out_size(n, k, p, s):
    """Return the output extent of a convolution along one dimension.

    n : input size
    k : kernel size
    p : padding applied on each side
    s : stride

    Computes floor((n - k + 2p) / s) + 1.
    """
    span = n - k + 2 * p
    steps = span // s
    return steps + 1

In [16]:
def conv(oc, ic, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """Convolution

    oc, ic : output and input channels
    nh, nw : input height and width
    kh, kw : kernel height and width
    ph, pw : height and width padding sizes, default 0
    sh, sw : height and width strides, default 1

    Returns (X, K, Y, PaddedX) where X is the (ic, nh, nw) input
    placeholder, K is the (oc, ic, kh, kw) kernel placeholder, Y is the
    (oc, oh, ow) output tensor, and PaddedX is the padded input (or X
    itself when no padding is requested).
    """

    # Reduction axes: each output value sums over the input channels and
    # over the kernel's height and width.
    ric = te.reduce_axis((0, ic), name='ric')
    rkh = te.reduce_axis((0, kh), name='rkh')
    rkw = te.reduce_axis((0, kw), name='rkw')

    # output height and width
    oh = conv_out_size(nh, kh, ph, sh)
    ow = conv_out_size(nw, kw, pw, sw)

    # Padding
    X = te.placeholder((ic, nh, nw), name='X')
    K = te.placeholder((oc, ic, kh, kw), name='K')
    # Pad whenever either dimension asks for it. oh and ow above already
    # account for ph and pw, so skipping the padding when only one of them
    # is non-zero would make the compute below index past the bounds of X.
    PaddedX = padding(X, ph, pw) if (ph != 0 or pw != 0) else X

    Y = te.compute(
        (oc, oh, ow),
        lambda c, i, j: te.sum(
            PaddedX[ric, i*sh+rkh, j*sw+rkw] * K[c, ric, rkh, rkw],
            axis=[ric, rkh, rkw]),
        name='Y'
    )

    return X, K, Y, PaddedX

In [17]:
def get_conv_data(oc, ic, n, k, p=0, s=1, constructor=None):
    """Return a random 3-D data tensor, a random 4-D kernel tensor and an
    uninitialised 3-D output tensor with shapes given by the arguments.

    oc, ic : output and input channels
    n : input height and width
    k : kernel height and width
    p : padding size, default 0
    s : stride, default 1
    constructor : user-defined tensor constructor; when given, it is
        applied to each of the three arrays before they are returned

    The NumPy global random seed is reset to 0 on every call, so the
    generated data and weights are the same for the same arguments.
    """
    np.random.seed(0)
    data_shape = (ic, n, n)
    weight_shape = (oc, ic, k, k)
    data = np.random.normal(size=data_shape).astype('float32')
    weight = np.random.normal(size=weight_shape).astype('float32')

    side = conv_out_size(n, k, p, s)
    out = np.empty((oc, side, side), dtype='float32')

    if not constructor:
        return data, weight, out
    return tuple(constructor(arr) for arr in (data, weight, out))

In [18]:
# Schedule a conv function with the parameters below:
# 4 output channels, 6 input channels, 12 x 12 input, 3 x 3 kernel,
# padding 1 and stride 1.
oc, ic, n, k, p, s = 4, 6, 12, 3, 1, 1
# The fourth return value (PaddedX) is not needed here and is discarded.
X, K, Y, _ = conv(oc, ic, n, n, k, k, p, p, s, s)

sch = te.create_schedule(Y.op)
mod = tvm.build(sch, [X, K, Y])
print(tvm.lower(sch, [X, K, Y], simple_mode=True))

# Wrap the NumPy arrays with tvm.nd.array and run the compiled module;
# `out` is passed in the position of Y.
data, weight, out = get_conv_data(oc, ic, n, k, p, s, tvm.nd.array)
mod(data, weight, out)

@main = primfn(X_1: handle, K_1: handle, Y_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {X: Buffer(X_2: Pointer(float32), float32, [864], []),
             K: Buffer(K_2: Pointer(float32), float32, [216], []),
             Y: Buffer(Y_2: Pointer(float32), float32, [576], [])}
  buffer_map = {X_1: X, K_1: K, Y_1: Y}
  preflattened_buffer_map = {X_1: X_3: Buffer(X_2, float32, [6, 12, 12], []), K_1: K_3: Buffer(K_2, float32, [4, 6, 3, 3], []), Y_1: Y_3: Buffer(Y_2, float32, [4, 12, 12], [])} {
  allocate(PaddedX: Pointer(global float32), float32, [1176]), storage_scope = global {
    for (i0: int32, 0, 6) {
      for (i1: int32, 0, 14) {
        for (i2: int32, 0, 14) {
          PaddedX_1: Buffer(PaddedX, float32, [1176], [])[(((i0*196) + (i1*14)) + i2)] = @tir.if_then_else(((((i1 < 1) || (13 <= i1)) || (i2 < 1)) || (13 <= i2)), 0f32, X[((((i0*144) + (i1*12)) + i2) - 13)], dtype=float32)
        }
      }
    }
    

In [19]:
import mxnet as mx

def get_conv_data_mxnet(oc, ic, n, k, p, s, ctx='cpu'):
    """Return the convolution test tensors as MXNet NDArrays.

    oc, ic, n, k, p, s : forwarded unchanged to get_conv_data
    ctx : name of the attribute looked up on the `mx` module and called
        to obtain the context, default 'cpu'

    Returns (data, weight, bias, out). `data` and `out` get an extra
    leading axis; `bias` is a zero vector of length out.shape[1].
    """
    device = getattr(mx, ctx)()

    def to_ndarray(arr):
        return mx.nd.array(arr, ctx=device)

    data, weight, out = get_conv_data(oc, ic, n, k, p, s, to_ndarray)
    # Add a leading axis to the data and output tensors.
    data = data.expand_dims(axis=0)
    out = out.expand_dims(axis=0)
    bias = mx.nd.zeros(out.shape[1], ctx=device)
    return data, weight, bias, out

# Save to the d2ltvm package.
def conv_mxnet(data, weight, bias, out, k, p, s):
    """Call mx.nd.Convolution with a square kernel, stride and padding.

    data, weight, bias : inputs passed positionally to mx.nd.Convolution
    out : array passed as the `out=` argument; its shape[1] is used as
        the number of filters
    k, p, s : kernel size, padding and stride, each used for both
        spatial dimensions
    """
    conv_args = dict(
        kernel=(k, k),
        stride=(s, s),
        pad=(p, p),
        num_filter=out.shape[1],
        out=out,
    )
    mx.nd.Convolution(data, weight, bias, **conv_args)

# Build the MXNet inputs with the same parameters and run the MXNet
# convolution into out_mx. Note that this rebinds `data` and `weight`,
# which previously held the TVM arrays; `out` (the TVM result) is untouched.
data, weight, bias, out_mx = get_conv_data_mxnet(oc, ic, n, k, p, s)
conv_mxnet(data, weight, bias, out_mx, k, p, s)

In [20]:
# Check the TVM result `out` against the MXNet result; out_mx[0] drops the
# leading axis that get_conv_data_mxnet added.
np.testing.assert_allclose(out_mx[0].asnumpy(), out.asnumpy(), atol=1e-5)