In [None]:
import os
import sys
# Put the ARIES root on the import path. It is taken to be four directory
# levels above the current working directory; the `frontend` import and the
# `templates` lookup further down both go through this path.
cur_dir = os.getcwd()
aries_path = f"{cur_dir}/../../../../"
sys.path.append(aries_path)
from frontend import *
from IPython import get_ipython

## 🧮 GEMM Example

In this example, we perform a General Matrix Multiplication (GEMM) on input matrices `A` and `B` to compute the result matrix `C`. The algorithm follows the standard GEMM formula:

```
C[i, j] += A[i, k] * B[k, j]
```

In [None]:
# GEMM: C[i0, j0] += A[i0, k0] * B[k0, j0]
# Full problem size: A is I x K, B is K x J and C is I x J.
I, J, K = 256, 256, 256
# Tile size along each dimension: the tile task works on a TI x TK tile of A,
# a TK x TJ tile of B and a TI x TJ tile of C.
TI, TJ, TK = 32, 32, 32
# Number of tiles along (I, J, K). Floor division is exact here because the
# sizes above are multiples of the tile sizes (256 / 32 = 8 tiles per axis).
grid = (I // TI, J // TJ, K // TK)  # grid must be a tuple

In [None]:
# Single-tile GEMM kernel: overwrites TileC with the product TileA x TileB.
#   TileA: TI x TK input tile
#   TileB: TK x TJ input tile
#   TileC: TI x TJ output tile, written in place (nothing is returned)
# Documented with comments rather than a docstring: the text of this cell is
# later collected from the input history and passed to sch.build(), so the
# function body is kept free of extra statements.
@task_kernel()
def kernel_gemm(TileA: float32[TI, TK], 
                TileB: float32[TK, TJ], 
                TileC: float32[TI, TJ]):
    for i0 in range(0, TI):
        for j0 in range(0, TJ):
            # Reset the output element so the k0 loop accumulates from zero;
            # whatever TileC held on entry is discarded.
            TileC[i0, j0] = float32(0)
            for k0 in range(0, TK):
                TileC[i0, j0] += TileA[i0, k0] * TileB[k0, j0]

In ARIES, the computation is structured using a tiled programming model:

<img src="../images/gemm.png" alt="GEMM" width="600"/>

In [None]:
# Tile-level task: handles the (i, j, k) tile of C = A x B.
# It loads the matching TI x TK tile of A and TK x TJ tile of B, runs
# kernel_gemm on them and stores the TI x TJ result into C with accstore.
# Documented with comments rather than a docstring: the text of this cell is
# later collected from the input history and passed to sch.build().
@task_tile()
def gemm(A: float32[I, K], B: float32[K, J], 
         C: float32[I, J], **kwargs):
    # Tile coordinates of this task instance along I, J and K
    # (presumably supplied through kwargs by the gemm[grid] launch -- confirm).
    i, j, k = aries.tile_ranks(**kwargs)
    
    # Tile-sized float32 buffers. L1_C receives the kernel output below;
    # L1_A and L1_B are rebound by the aries.load calls further down.
    L1_A = aries.buffer((TI, TK), "float32")
    L1_B = aries.buffer((TK, TJ), "float32")
    L1_C = aries.buffer((TI, TJ), "float32")
    
    ############# Fill this part #################
    ti = aries.arange(i*TI, (i+1)*TI)  # I tile range: aries.arange(start, stop)
    tk = aries.arange(k*TK, (k+1)*TK)  # K tile range
    ############# Fill this part #################
    tj = aries.arange(j*TJ, (j+1)*TJ)  # J tile range
    
    # Fetch the A[ti, tk] and B[tk, tj] tiles selected by the ranges above.
    L1_A = aries.load(A, (ti, tk))
    L1_B = aries.load(B, (tk, tj))
    # kernel_gemm zeroes L1_C and fills it with L1_A x L1_B.
    kernel_gemm(L1_A, L1_B, L1_C)
    # Presumably accumulates L1_C into C[ti, tj], so that the partial products
    # of the different k tiles add up -- confirm against the ARIES accstore docs.
    aries.accstore(L1_C, C, (ti, tj))

In [None]:
# Top-level task: launches the gemm tile task over `grid` on A, B and C and
# returns the launch result, which is later handed to Schedule().
# Documented with comments rather than a docstring: the text of this cell is
# later collected from the input history and passed to sch.build().
@task_top()
def top(A: float32[I, K], B: float32[K, J], C: float32[I, J]):
    # grid is (I // TI, J // TJ, K // TK); presumably one gemm instance is
    # created per tile coordinate -- confirm against the ARIES docs.
    gemm_task = gemm[grid](A, B, C)
    return gemm_task

In [None]:
# Get the input cells that contain the decorators.
# `In` is IPython's input history indexed by execution count, so this slice is
# only right after "Restart Kernel -> Run All": In[2]..In[5] are then the
# size/grid constants, kernel_gemm, gemm and top cells, in that order.
cell_codes = get_ipython().user_ns["In"][2:6]
# Join them into one string, with a newline between each cell
all_code = "\n".join(cell_codes)

# Fail fast when the notebook was run out of order (or cells were re-run) and
# the slice picked up other cells; otherwise sch.build() would silently be
# handed the wrong source text.
missing_decorators = [
    marker
    for marker in ("@task_kernel", "@task_tile", "@task_top")
    if marker not in all_code
]
if missing_decorators:
    raise RuntimeError(
        "In[2:6] does not contain "
        + ", ".join(missing_decorators)
        + "; restart the kernel and run all cells in order."
    )

In [None]:
# Reproducible random float32 inputs and a zero-initialised float32 output
np.random.seed(0)
A = np.random.rand(I, K).astype(np.float32)
B = np.random.rand(K, J).astype(np.float32)
C = np.zeros((I, J), dtype=np.float32)

# Run the tiled program on the CPU; its result is expected in C
gemm_task = top(A, B, C)

# Golden reference computed directly with NumPy
D = A @ B

# Report whether the tiled result matches the golden reference
print(np.allclose(C, D))

# Generate files for on-board test (inputs plus the golden output)
aries.gen_sim([A, B, D])

In [None]:
# Apply schedulings
# Wrap the task returned by top() in a Schedule and select "VCK190" as target.
sch = Schedule(gemm_task)
sch.to("VCK190")

# This is used in MLIR-AIE Auto Vectorizer for single AIE optimization
# Both primitives use the same factor (8).
sch.aieUnroll(factor=8) # To guarantee memory alignment
sch.aieVector(factor=8) # To translate to AIE vector instructions

In [None]:
# Project output directory (inside the current working directory) and the
# ARIES template directory. aries_path already ends with "/", so temp_dir
# contains a doubled slash, exactly as before.
prj_dir = f"{cur_dir}/project_gemm_hw_emu"
temp_dir = f"{aries_path}/templates"

# Generate Initial MLIR and ARIES Opts, then compile the generated project
sch.build(all_code, prj_dir, temp_dir)
sch.compile(aries_path, prj_dir)