# BOAST Interactive Tutorial
## Symmetric Result DGEMM
Based on the work of Eric Bainville

In [None]:
require 'BOAST'
include BOAST

In [None]:
# Select C as the language BOAST generates code in.
set_lang(C)
# Make generated array indexing start at 0 (the kernel below indexes from 0).
set_array_start(0)

In [None]:
# Generates a BOAST kernel that accumulates a tile_size_x x tile_size_y tile
# of vector products and reduces it into the scalar array +c+.
#
# Two procedures are emitted into the kernel:
#
# * <tt>inner_block_<vl>_<tsx>_<tsy></tt> (local, inline): loads the +tsy+
#   vectors of +a_inner+ and the +tsx+ vectors of +b_inner+ into temporaries
#   and performs <tt>res[j][i] = res[j][i] + a[i] * b[j]</tt> for every pair.
# * <tt>inner_<vl>_<tsx>_<tsy>_<unroll></tt> (the kernel entry point): zeroes
#   the vector accumulators, calls the inner block for every index in
#   0..n-1 (+unroll+ calls per loop iteration), then sums the +vl+ lanes of
#   each accumulator into <tt>c[j, k]</tt>.
#
# NOTE(review): the loop advances by +unroll+ and its body touches indices
# i .. i+unroll-1, so +n+ is presumably expected to be a multiple of
# +unroll+ -- confirm with callers.
#
# @param opts [Hash] generation options, merged over the defaults
# @option opts [Integer] :vector_length (2) lanes per vector variable
# @option opts [Integer] :tile_size_x   (4) number of +b+ vectors per block
# @option opts [Integer] :tile_size_y   (2) number of +a+ vectors per block
# @option opts [Integer] :unroll        (4) inner-block calls per loop iteration
# @return [CKernel] kernel whose +procedure+ is the outer +inner_...+ procedure
def inner_kernel(opts={})
  default = {vector_length: 2, tile_size_x: 4, tile_size_y:2, unroll: 4}
  opts = default.merge(opts)
  vl = opts[:vector_length]
  tsx = opts[:tile_size_x]
  tsy = opts[:tile_size_y]
  unroll = opts[:unroll]

  # Parameters of the inlined block: one column of a, one column of b and
  # the tsx x tsy grid of vector accumulators.
  a_inner = Real :a_inner, vector_length: vl, dim: [Dim(tsy)], dir: :in
  b_inner = Real :b_inner, vector_length: vl, dim: [Dim(tsx)], dir: :in
  res_inner = tsx.times.collect { |k|
    tsy.times.collect { |l|
      Real :"res_inner_#{k}_#{l}", vector_length: vl, dir: :in_out
    }
  }

  p_inn = Procedure( :"inner_block_#{vl}_#{tsx}_#{tsy}", [a_inner, b_inner] + res_inner.flatten, local: true, inline: true  ) {
    tmp_a = tsy.times.collect { |l|
      Real :"tmpa_#{l}", vector_length: vl
    }
    tmp_b = tsx.times.collect { |l|
      Real :"tmpb_#{l}", vector_length: vl
    }
    decl *tmp_a
    decl *tmp_b
    tsy.times { |i|
      pr tmp_a[i] === a_inner[i]
      tsx.times { |j|
        # Each b vector is loaded once, during the first pass over a.
        pr tmp_b[j] === b_inner[j] if i.zero?
        # `===` is BOAST's assignment operator. A plain Ruby `=` would only
        # overwrite the Ruby array element and emit a no-op expression
        # statement instead of the accumulation.
        pr res_inner[j][i] === res_inner[j][i] + tmp_a[i] * tmp_b[j]
      }
    }
  }

  nvec = Int :n, dir: :in
  a = Real :a, vector_length: vl, dim: [Dim(tsy), Dim(nvec)], dir: :in
  b = Real :b, vector_length: vl, dim: [Dim(tsx), Dim(nvec)], dir: :in
  c = Real :c, dim: [Dim(tsx), Dim(tsy)], dir: :in_out

  p_outer = Procedure( :"inner_#{vl}_#{tsx}_#{tsy}_#{unroll}", [nvec, a, b, c]) {
    tmp_res = tsx.times.collect { |k|
      tsy.times.collect { |l|
        Real :"tmpres_#{k}_#{l}", vector_length: vl
      }
    }
    i = Int :i
    decl *tmp_res.flatten
    decl i
    tmp_res.flatten.each { |tmp|
      pr tmp.set 0.0
    }
    # BOAST For bounds are inclusive and arrays start at 0, so the last
    # valid index is n - 1; iterating up to n would read past a and b.
    pr For(i, 0, nvec - 1, step: unroll) {
      unroll.times { |j|
        pr p_inn.call(a[0, i+j].address, b[0, i+j].address, *tmp_res.flatten)
      }
    }
    # Horizontal reduction: sum the vl lanes of each accumulator into c.
    tsy.times { |k|
      tsx.times { |j|
        pr c[j,k] === vl.times.collect { |l| tmp_res[j][k][l] }.reduce(:+)
      }
    }
  }

  kernel = CKernel::new(:includes => "immintrin.h") {
    pr p_inn
    pr p_outer
  }
  kernel.procedure = p_outer
  kernel
end

In [None]:
# Generate the kernel with the default options
# (vector_length: 2, tile_size_x: 4, tile_size_y: 2, unroll: 4).
k = inner_kernel

In [None]:
# Uncomment the lines below for extra diagnostics while building
# (presumably verbose output and keeping the generated source -- confirm
# against the BOAST documentation).
#set_verbose(true)
#set_debug_source(true)
# Build the generated kernel.
k.build