In [None]:
import os
import sys
# Make the directory four levels above the current working directory importable.
# NOTE(review): this relies on the kernel's working directory being the notebook's
# own directory; presumably the target is the ARIES repository root — confirm.
cur_dir = os.getcwd()
aries_path = cur_dir + "/../../../../"
sys.path.append(aries_path)
# The star import is the only visible source of the names used unqualified below
# (np, aries, Schedule, task_kernel, task_tile, task_top, float32).
from frontend import *
# get_ipython is used later to read back the source text of executed cells.
from IPython import get_ipython

## 🧮 Vector Addition Example

In this example, we perform element-wise addition of two 256-element vectors `A` and `B` to produce a result vector `C`, where:

```
C[i] = A[i] + B[i], for i in 0 to 255
```

<img src="../images/vadd.png" alt="Vector Adder" width="400"/>

### ARIES Tile Programming for Vector Addition

We divide each vector into **8 grids**, each containing **32 elements**, and map them onto tile computations on AI Engine-based hardware.

<img src="../images/vadd_grid.png" alt="Vector Adder" width="400"/>

In [None]:
# Vector Add: C[i0] = A[i0] + B[i0]
I = 256  # total vector length
TI = 32  # number of elements processed per tile
grid = (I // TI, ) # grid must be a tuple; here a single dimension of I // TI = 8 tiles

In [None]:
# Single-tile compute kernel: element-wise sum of two TI-element float32 tiles.
# The result is written into TileC; nothing is returned.
@task_kernel()
def kernel_add(TileA: float32[TI], TileB: float32[TI], TileC: float32[TI]):
    # Scalar loop over every element of the tile.
    for i0 in range(0, TI):
        TileC[i0] = TileA[i0] + TileB[i0]

### Describe data movement between L3 (External memory) <-> L1 (AIE Local Memory)
Loads and stores support **hyper-rectangular** data slicing.

<img src="../images/vadd_tile.png" alt="Vector Adder" width="800"/>

In [None]:
# Tile-level task: each instance loads one TI-element slice of A and B,
# runs kernel_add on it, and stores the result into the same slice of C.
@task_tile()
def vadd(A: float32[I], B: float32[I], C: float32[I], **kwargs):
    # Tile coordinate taken from kwargs — presumably this instance's index in the
    # launch grid (0 .. I // TI - 1); confirm against aries.tile_ranks.
    i = aries.tile_ranks(**kwargs)
    
    # Tile buffers holding TI float32 elements each.
    # NOTE(review): L1_A and L1_B are rebound by aries.load below; whether these
    # two declarations are still required by the ARIES frontend is not visible here.
    L1_A = aries.buffer((TI, ), "float32")
    L1_B = aries.buffer((TI, ), "float32")
    L1_C = aries.buffer((TI, ), "float32")
    
    # Index range of this tile along the single I dimension: i*TI up to (i+1)*TI
    # (presumably end-exclusive, like np.arange — confirm).
    ti = aries.arange(i*TI, (i+1)*TI)  # I tile range
    
    # Move data between L3 and L1
    L1_A = aries.load(A, (ti, ))
    L1_B = aries.load(B, (ti, ))
    # Compute on the tile; the kernel writes its result into L1_C.
    kernel_add(L1_A, L1_B, L1_C)
    # Write the computed tile back into the matching slice of C.
    aries.store(L1_C, C, (ti, ))

In [None]:
# Top-level task: invokes vadd over `grid` on the full-length vectors and
# returns whatever that invocation yields (used later as the task to schedule).
@task_top()
def top(A: float32[I], B: float32[I], C: float32[I]):
    # NOTE(review): the local name `gemm_vadd` looks like a leftover from a GEMM
    # example; it is left unchanged because this cell's source text is handed to
    # sch.build and the frontend's handling of a rename is not visible here.
    gemm_vadd = vadd[grid](A, B, C)
    return gemm_vadd

In [None]:
# Get the input cells that contain the decorators.
# IPython's `In` list is indexed by execution count, so the slice [2:6] selects
# executions 2-5. That matches the constants, kernel_add, vadd and top cells only
# when the notebook is run top to bottom on a fresh kernel (Restart & Run All).
cell_codes = get_ipython().user_ns["In"][2:6]
# Join them into one string, with a newline between each cell
all_code = "\n".join(cell_codes)

# Fail early if the slice did not capture the decorated definitions (cells run out
# of order, re-run, or skipped) instead of handing unrelated source to the build.
# The "@" is prepended at runtime so this cell's own text can never satisfy the
# check if it happens to fall inside the captured slice.
_required_decorators = ["@" + _name for _name in ("task_kernel", "task_tile", "task_top")]
_missing_decorators = [_dec for _dec in _required_decorators if _dec not in all_code]
if _missing_decorators:
    raise RuntimeError(
        "Captured cell source is missing "
        + ", ".join(_missing_decorators)
        + "; restart the kernel and run all cells in order."
    )

In [None]:
# Reproducible host-side test vectors: float32 arrays of length I
np.random.seed(0)
A = np.random.rand(I).astype(np.float32)
B = np.random.rand(I).astype(np.float32)
C = np.zeros(I, dtype=np.float32)

# Execute on CPU; the returned task object is what gets scheduled later
vadd_task = top(A, B, C)

# NumPy reference result that C is compared against
golden_C = A + B
print("ARIES vadd output matches golden reference:", np.allclose(C, golden_C))

# Generate files for on-board test
aries.gen_sim([A, B, golden_C])

In [None]:
# Create a schedule for the task and apply the scheduling primitives
sch = Schedule(vadd_task)
sch.to("VCK190")
# NOTE(review): the widths are presumably in bits, and the parallel factor applies
# to the single grid dimension — confirm against the ARIES Schedule API.
sch.ioWidth(vadd_task, 128)
sch.axiWidth(vadd_task, 512)
sch.parallel(vadd_task, [4])

# Project output directory and ARIES template directory
prj_dir = f"{cur_dir}/project_vadd"
temp_dir = f"{aries_path}/templates"

# Generate Initial MLIR and ARIES Opts
sch.build(all_code, prj_dir, temp_dir)

In [None]:
# View the MLIR code generated by the ARIES IR builder from the captured
# cell source (all_code)
sch.print_mlir(all_code)

In [None]:
# Optional: by also passing a Versal image path, users can run the compilation
# from this Jupyter cell. Replace the example path below with your own.
# NOTE(review): the fourth argument "hw_emu" presumably selects the build target — confirm.
# versal_image_path = '/home/arclab/research/'
# sch.compile(aries_path, prj_dir, versal_image_path, "hw_emu")

# Default invocation: no Versal image path is supplied.
sch.compile(aries_path, prj_dir)