In [1]:
from __future__ import annotations
from exo import *
from exo.libs.memories import DRAM_STATIC
from exo.platforms.x86 import *
from exo.syntax import *

In [3]:
# Reference single-precision matrix multiply written as an Exo procedure:
# accumulates the product of A (M x K) and B (K x N) into C (M x N).
# C is updated with `+=`, so its incoming contents are added to, not overwritten.
@proc
def SGEMM(
    M: size,
    N: size,
    K: size,
    A: f32[M, K],
    B: f32[K, N],
    C: f32[M, N]
):
    # Preconditions: every dimension is at least 1 ...
    assert M >= 1
    assert N >= 1
    assert K >= 1
    # ... and each matrix has stride 1 along dimension 1 (its last index).
    assert stride(A, 1) == 1
    assert stride(B, 1) == 1
    assert stride(C, 1) == 1

    # Plain triple loop nest. The loops `i` and `j` are referred to by name by
    # the `split` calls in the GEBP scheduling cell, so do not rename them.
    for i in par(0, M):
        for j in par(0, N):
            for k in par(0, K):
                C[i, j] += A[i, k] * B[k, j]
# Show the C source generated for the unscheduled procedure.
print(SGEMM.c_code_str())


#include <stdint.h>
#include <stdbool.h>

// Compiler feature macros adapted from Hedley (public domain)
// https://github.com/nemequ/hedley

#if defined(__has_builtin)
#  define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
#else
#  define EXO_HAS_BUILTIN(builtin) (0)
#endif

#if EXO_HAS_BUILTIN(__builtin_assume)
#  define EXO_ASSUME(expr) __builtin_assume(expr)
#elif EXO_HAS_BUILTIN(__builtin_unreachable)
#  define EXO_ASSUME(expr) \
      ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
#else
#  define EXO_ASSUME(expr) ((void)(expr))
#endif

typedef struct c_code_str_Context { 

} c_code_str_Context;


// SGEMM(
//     M : size,
//     N : size,
//     K : size,
//     A : f32[M,K]  @DRAM,
//     B : f32[K,N]  @DRAM,
//     C : f32[M,N]  @DRAM
// )
void SGEMM( c_code_str_Context *ctxt, int_fast32_t M, int_fast32_t N, int_fast32_t K, float* A, float* B, float* C );


static int _floor_div(int num, int quot) {
  int off = (num>=0)? 0 : quot-1;
  return (num-off)/quot;
}

static 

In [63]:
# Blocking parameters consumed by the GEBP schedule in the next cell.
# M_c and K_c are the values GEBP fixes M and K to via partial_eval.
M_c, K_c = 64, 64
# N_r is the tile size applied to the `i` loop (which ranges over M) and
# M_r is the tile size applied to the `j` loop (which ranges over N).
# NOTE(review): the names read as if swapped relative to the dimensions they
# tile -- confirm the intent before renaming anything.
N_r, M_r = 4, 16

# Renamed copy of SGEMM with A, B and C each passed through
# set_window(..., True) -- presumably turning them into window arguments so the
# kernel can be applied to sub-blocks of larger matrices; confirm in the Exo docs.
sgemm_win = (SGEMM.rename('sgemm_win')
             .set_window('A', True).set_window('B', True).set_window('C', True))

In [73]:
# GEBP: multiply an M_c x K_c block of A by a K_c x N panel of B, calling the
# microkernel on each tile.
#
# The microkernel is sgemm_win with its leading size arguments fixed
# positionally to N_r and M_r (presumably M=N_r and N=M_r, following SGEMM's
# argument order -- TODO confirm how partial_eval maps positional arguments).
microkernel = (sgemm_win
                .rename('microkernel')
                .partial_eval(N_r,M_r)
                .simplify())
GEBP = (SGEMM
            .rename("GEBP")
            # Fix M and K to the block sizes; N stays a runtime size, as the
            # generated signature printed below shows (GEBP(N, A, B, C)).
            .partial_eval(M=M_c)
            .partial_eval(K=K_c)
            # Tile the block of A and the panel of B: `i` is split by N_r into
            # (io, ii) and `j` by M_r into (jo, ji). The tail mode name suggests
            # leftover iterations go into a guarded remainder -- confirm in Exo docs.
            .split('i', N_r, ['io', 'ii'], tail='cut_and_guard')
            .split('j', M_r, ['jo', 'ji'], tail='cut_and_guard')
            # Handle the edge case: fission after the `jo` loop, lifted through
            # 2 enclosing loops (n_lifts=2) -- presumably to separate the
            # full-tile nest from the guarded remainder; TODO confirm.
            .fission_after('for jo in _: _', n_lifts=2)
            # Reorder so that `jo` sits outside `ii` (presumably giving the order
            # io, jo, ii, ji, so the inner nest has the microkernel's shape and
            # replace_all below can match it -- TODO confirm).
            .reorder('ii','jo')
            #.unroll('io') #Can't unroll loops that lack a constant bound
            # Substitute calls to the microkernel wherever its body matches.
            .replace_all(microkernel)
            .simplify()

)
# Show the C source generated for the scheduled procedure.
print(GEBP.c_code_str())


#include <stdint.h>
#include <stdbool.h>

// Compiler feature macros adapted from Hedley (public domain)
// https://github.com/nemequ/hedley

#if defined(__has_builtin)
#  define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin)
#else
#  define EXO_HAS_BUILTIN(builtin) (0)
#endif

#if EXO_HAS_BUILTIN(__builtin_assume)
#  define EXO_ASSUME(expr) __builtin_assume(expr)
#elif EXO_HAS_BUILTIN(__builtin_unreachable)
#  define EXO_ASSUME(expr) \
      ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
#else
#  define EXO_ASSUME(expr) ((void)(expr))
#endif

struct exo_win_2f32{
    float *data;
    int_fast32_t strides[2];
};
typedef struct c_code_str_Context { 

} c_code_str_Context;


// GEBP(
//     N : size,
//     A : f32[64,64]  @DRAM,
//     B : f32[64,N]  @DRAM,
//     C : f32[64,N]  @DRAM
// )
void GEBP( c_code_str_Context *ctxt, int_fast32_t N, float* A, float* B, float* C );


static int _floor_div(int num, int quot) {
  int off = (num>=0)? 0 : quot-1;
  return (num-off)/quot;
}

s