diff --git a/Project.toml b/Project.toml
index 7b2e11e..20eb595 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,14 +1,15 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
-version = "0.2.20"
+version = "0.3.0"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
+Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
 ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
@@ -16,9 +17,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 ArrayInterface = "3.1.14"
 IfElse = "0.1"
 LoopVectorization = "0.12.34"
+ManualMemory = "0.1.1"
+Polyester = "0.3.5"
 Static = "0.2"
-StrideArraysCore = "0.1.11"
-ThreadingUtilities = "0.4"
+ThreadingUtilities = "0.4.6"
 VectorizationBase = "0.20.16"
 julia = "1.6"
 
diff --git a/src/Octavian.jl b/src/Octavian.jl
index 4954b98..933cb1c 100644
--- a/src/Octavian.jl
+++ b/src/Octavian.jl
@@ -8,14 +8,12 @@ using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_ns
 using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
 using ArrayInterface: size, strides, offsets, indices, axes
 using IfElse: ifelse
-
+using Polyester
 using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
     roundtostaticint, floortostaticint
-using StrideArraysCore: MemoryBuffer
+using ManualMemory: MemoryBuffer, load, store!
 
-using ThreadingUtilities:
-    _atomic_add!, _atomic_load, _atomic_store!,    
-    launch, wait, load, store!
+using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait
 
 export StaticInt
 export matmul!
diff --git a/src/block_sizes.jl b/src/block_sizes.jl
index 4d7756f..7d0587f 100644
--- a/src/block_sizes.jl
+++ b/src/block_sizes.jl
@@ -220,15 +220,15 @@ end
 Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
 """
 @inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
-    _nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
-    mᵣ, nᵣ = matmul_params(Val(T))
-    Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
+  _nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
+  mᵣ, nᵣ = matmul_params(Val(T))
+  Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
+  nspawn = div_fast(_nspawn, Miter)
+  if (nspawn ≤ 1) & (Miter < _nspawn)
+    # rebalance Miter
+    Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
     nspawn = div_fast(_nspawn, Miter)
-    if (nspawn ≤ 1) & (Miter < _nspawn)
-        # rebalance Miter
-        Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
-        nspawn = div_fast(_nspawn, Miter)
-    end
-    Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
+  end
+  Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
 end
 
diff --git a/src/funcptrs.jl b/src/funcptrs.jl
index 9a03f9e..fade6ef 100644
--- a/src/funcptrs.jl
+++ b/src/funcptrs.jl
@@ -1,103 +1,101 @@
 
-
 struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end
 function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
-    offset, C = load(p, TC, 2*sizeof(UInt))
-    offset, A = load(p, TA, offset)
-    offset, B = load(p, TB, offset)
-    offset, α = load(p, Α, offset)
-    offset, β = load(p, Β, offset)
-    offset, M = load(p, Md, offset)
-    offset, K = load(p, Kd, offset)
-    offset, N = load(p, Nd, offset)
-    _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
-    nothing
+  offset, C = load(p, TC, 2*sizeof(UInt))
+  offset, A = load(p, TA, offset)
+  offset, B = load(p, TB, offset)
+  offset, α = load(p, Α, offset)
+  offset, β = load(p, Β, offset)
+  offset, M = load(p, Md, offset)
+  offset, K = load(p, Kd, offset)
+  offset, N = load(p, Nd, offset)
+  _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
+  nothing
 end
 @inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
 @inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
-    if M*K < first_cache_size(Val(T)) * R₂Default()
-        packaloopmul!(C, A, B, α, β, M, K, N)
-        return
-    else
-        matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
-        return
-    end
+  if M*K < first_cache_size(Val(T)) * R₂Default()
+    packaloopmul!(C, A, B, α, β, M, K, N)
+    return
+  else
+    matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
+    return
+  end
 end
 call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} = _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
 
 struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end
 function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
-    offset, C = load(p, TC, 2*sizeof(UInt))
-    offset, A = load(p, TA, offset)
-    offset, B = load(p, TB, offset)
-    offset, α = load(p, Α, offset)
-    offset, β = load(p, Β, offset)
-    offset, M = load(p, Md, offset)
-    offset, K = load(p, Kd, offset)
-    offset, N = load(p, Nd, offset)
-    offset, atomicp = load(p, Ptr{UInt32}, offset)
-    offset, bcachep = load(p, BCP, offset)
-    offset, id = load(p, ID, offset)
-    offset, total_ids = load(p, TT, offset)
-    sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
-    nothing
+  offset, C = load(p, TC, 2*sizeof(UInt))
+  offset, A = load(p, TA, offset)
+  offset, B = load(p, TB, offset)
+  offset, α = load(p, Α, offset)
+  offset, β = load(p, Β, offset)
+  offset, M = load(p, Md, offset)
+  offset, K = load(p, Kd, offset)
+  offset, N = load(p, Nd, offset)
+  offset, atomicp = load(p, Ptr{UInt32}, offset)
+  offset, bcachep = load(p, BCP, offset)
+  offset, id = load(p, ID, offset)
+  offset, total_ids = load(p, TT, offset)
+  sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
+  nothing
 end
 
 @generated function cfuncpointer(::T) where {T}
-    precompile(T(), (Ptr{UInt},))
-    quote
-        $(Expr(:meta,:inline))
-        @cfunction($(T()), Cvoid, (Ptr{UInt},))
-    end
+  precompile(T(), (Ptr{UInt},))
+  quote
+    $(Expr(:meta,:inline))
+    @cfunction($(T()), Cvoid, (Ptr{UInt},))
+  end
 end
 
 @inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
-    offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
-    offset = store!(p, C, offset)
-    offset = store!(p, A, offset)
-    offset = store!(p, B, offset)
-    offset = store!(p, α, offset)
-    offset = store!(p, β, offset)
-    offset = store!(p, M, offset)
-    offset = store!(p, K, offset)
-    offset = store!(p, N, offset)
-    nothing
+  offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
+  offset = store!(p, C, offset)
+  offset = store!(p, A, offset)
+  offset = store!(p, B, offset)
+  offset = store!(p, α, offset)
+  offset = store!(p, β, offset)
+  offset = store!(p, M, offset)
+  offset = store!(p, K, offset)
+  offset = store!(p, N, offset)
+  nothing
 end
 
 @inline function setup_syncmul!(
-    p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
-    ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
+  p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
+  ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
 ) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
-    offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
-    offset = store!(p, C, offset)
-    offset = store!(p, A, offset)
-    offset = store!(p, B, offset)
-    offset = store!(p, α, offset)
-    offset = store!(p, β, offset)
-    offset = store!(p, M, offset)
-    offset = store!(p, K, offset)
-    offset = store!(p, N, offset)
-    offset = store!(p, ap,  offset)
-    offset = store!(p, bcp, offset)
-    offset = store!(p, id,  offset)
-    offset = store!(p, tt,  offset)
-    nothing
+  offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
+  offset = store!(p, C, offset)
+  offset = store!(p, A, offset)
+  offset = store!(p, B, offset)
+  offset = store!(p, α, offset)
+  offset = store!(p, β, offset)
+  offset = store!(p, M, offset)
+  offset = store!(p, K, offset)
+  offset = store!(p, N, offset)
+  offset = store!(p, ap,  offset)
+  offset = store!(p, bcp, offset)
+  offset = store!(p, id,  offset)
+  offset = store!(p, tt,  offset)
+  nothing
 end
 
-function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P}
-    launch(tid, C, A, B, α, β, M, K, N, Val{P}()) do p, C, A, B, α, β, M, K, N, VP
-        setup_matmul!(p, C, A, B, α, β, M, K, N, VP)
-    end
+@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
+  launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
 end
-function launch_thread_mul!(
-    C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
+@inline function launch_thread_mul!(
+  C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
 ) where {W₁,W₂,R₁,R₂}
-    launch(tid+one(tid), C, A, B, α, β, M, K, N, ap, bcp, tid, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt
-        setup_syncmul!(
-            p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt,
-            StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
-        )
-    end
+  launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt
+    Base.@_inline_meta
+    setup_syncmul!(
+      p, C, A, B, α, β, M, K, N, ap, bcp, id, tt,
+      StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
+    )
+  end
 end
 
 
diff --git a/src/global_constants.jl b/src/global_constants.jl
index 2f65b54..c9e5445 100644
--- a/src/global_constants.jl
+++ b/src/global_constants.jl
@@ -73,7 +73,7 @@ bcache_count() = VectorizationBase.num_cache(second_cache())
 const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
 const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
 
-if Sys.WORD_SIZE ≤ 32
-    const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
+@static if Sys.WORD_SIZE ≤ 32
+  const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
 end
 
diff --git a/src/init.jl b/src/init.jl
index 8673d3d..dce3d26 100644
--- a/src/init.jl
+++ b/src/init.jl
@@ -1,45 +1,45 @@
 function __init__()
-    init_acache()
-    init_bcache()
-    nt = init_num_tasks()
-    if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
-        msg = string(
-            "Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
-            "$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
-            "For the best performance, you should start Julia with at least $(num_cores()) threads.",
-        )
-        @warn msg
-    end
-    reseet_bcache_lock!()
+  init_acache()
+  init_bcache()
+  nt = init_num_tasks()
+  if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
+    msg = string(
+      "Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
+      "$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
+      "For the best performance, you should start Julia with at least $(num_cores()) threads.",
+    )
+    @warn msg
+  end
+  reseet_bcache_lock!()
 end
 
 function init_bcache()
-    if bcache_count() ≢ Zero()
-        BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
-    end
-    nothing
+  if bcache_count() ≢ Zero()
+    BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
+  end
+  nothing
 end
 
-if Sys.WORD_SIZE ≤ 32
-    function init_acache()
-        ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
-        nothing
-    end
+@static if Sys.WORD_SIZE ≤ 32
+  function init_acache()
+    ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
+    nothing
+  end
 else
-    init_acache() = nothing
+  init_acache() = nothing
 end
 
 function init_num_tasks()
-    num_tasks = _read_environment_num_tasks()::Int
-    OCTAVIAN_NUM_TASKS[] = num_tasks
+  num_tasks = _read_environment_num_tasks()::Int
+  OCTAVIAN_NUM_TASKS[] = num_tasks
 end
 
 function _read_environment_num_tasks()
-    environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
-    nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
-    if isempty(environment_variable)
-        return nt
-    else
-        return min(parse(Int, environment_variable)::Int, nt)
-    end
+  environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
+  nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
+  if isempty(environment_variable)
+    return nt
+  else
+    return min(parse(Int, environment_variable)::Int, nt)
+  end
 end
diff --git a/src/matmul.jl b/src/matmul.jl
index f413dcc..52604b0 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -296,134 +296,147 @@ end
 end
 
 # This funciton is sort of a `pun`. It splits aggressively (it does a lot of "splitin'"), which often means it will split-N.
-function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, nspawn, ::Val{PACK}) where {T, Mc, PACK}
-    Mᵣ, Nᵣ = matmul_params(Val(T))
-    W = pick_vector_width(T)
-    MᵣW = Mᵣ*W
-    _Mblocks, Nblocks = divide_blocks(Val(T), M, cld_fast(N, Nᵣ), nspawn, W)
-    Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W)
-    # Nblocks = min(N, _Nblocks)
-    Nbsize, Nrem = divrem_fast(N, Nblocks)
-
-    _nspawn = Mblocks * Nblocks
-    Mbsize_Mrem, Mbsize_ = promote(Mbsize +     W, Mbsize)
-    Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize)
-
-    let _A = A, _B = B, _C = C, n = 0, tnum = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One()
-        while true
-            nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1
-            let _A = _A, _C = _C, __Mblocks = __Mblocks
-                while __Mblocks != 0
-                    msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1
-                    launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, (tnum += 1), Val{PACK}())
-                    _A = gesp(_A, (msize, Zero()))
-                    _C = gesp(_C, (msize, Zero()))
-                end
-                if Nblocks != 0
-                    launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, (tnum += 1), Val{PACK}())
-                else
-                    call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}())
-                    waitonmultasks(CloseOpen(One(), _nspawn))
-                    return
-                end
-            end
-            _B = gesp(_B, (Zero(), nsize))
-            _C = gesp(_C, (Zero(), nsize))
+function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, threads, ::Val{PACK}) where {T, Mc, PACK}
+  Mᵣ, Nᵣ = matmul_params(Val(T))
+  W = pick_vector_width(T)
+  MᵣW = Mᵣ*W
+  _Mblocks, Nblocks = divide_blocks(Val(T), M, cld_fast(N, Nᵣ), threads.i%Int + 1, W)
+  Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W)
+  # Nblocks = min(N, _Nblocks)
+  Nbsize, Nrem = divrem_fast(N, Nblocks)
+
+  _nspawn = Mblocks * Nblocks
+  Mbsize_Mrem, Mbsize_ = promote(Mbsize +     W, Mbsize)
+  Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize)
+  (tnum, tuu) = Polyester.initial_state(threads)
+  let _A = A, _B = B, _C = C, n = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One()
+    while true
+      nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1
+      let _A = _A, _C = _C, __Mblocks = __Mblocks
+        while __Mblocks != 0
+          msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1
+          (tnum, tuu) = Polyester.iter(tnum, tuu)
+          launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, tnum, Val{PACK}())
+          _A = gesp(_A, (msize, Zero()))
+          _C = gesp(_C, (msize, Zero()))
         end
+        if Nblocks != 0
+          (tnum, tuu) = Polyester.iter(tnum, tuu)
+          launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, tnum, Val{PACK}())
+        else
+          call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}())
+          waitonmultasks(threads, _nspawn)
+          return
+        end
+      end
+      _B = gesp(_B, (Zero(), nsize))
+      _C = gesp(_C, (Zero(), nsize))
     end
+  end
 end
 
 function __matmul!(
     C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, nthread
 ) where {T}
-    Mᵣ, Nᵣ = matmul_params(Val(T))
-    W = pick_vector_width(T)
-    Mc, Kc, Nc = block_sizes(Val(T))
-    MᵣW = Mᵣ*W
-
-    # Not taking the fast path
-    # But maybe we don't want to thread anyway
-    # Maybe this is nested, or we have ≤ 1 threads
-    nt = _nthreads()
-    _nthread = nthread === nothing ? nt : min(nt, nthread)
-    if _nthread < 2
-        matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N)
-        return
-    end
-    # We are threading, but how many threads?
-    nspawn = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
-        clamp(div_fast(M * N, StaticInt{128}() * W), 1, _nthread)
-    else
-        clamp(div_fast(M * N, StaticInt{256}() * W), 1, _nthread)
-    end
-    # nkern = cld_fast(M * N,  MᵣW * Nᵣ)
-    
-    # Approach:
-    # Check if we don't want to pack A,
-    #    if not, aggressively subdivide
-    # if so, check if we don't want to pack B
-    #    if not, check if we want to thread `N` loop anyway
-    #       if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N`
-    #       if not, only thread along `M`. These don't need syncing, as we're not packing `B`
-    #    if so, `matmul_pack_A_and_B!`
-    #
-    # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then
-    # if 2M/nspawn is less than it, we don't don't `A`
-    # First check is: do we just want to split aggressively?
-    mᵣ, nᵣ = matmul_params(Val(T))
-    if dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((num_cores() ≥ StaticInt(8)) ? max(nspawn,8) : 8) ≥ N)
-        # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing
-        # Should probably handle that with a smarter splitting function...
-        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{false}())
-    elseif (bcache_count() === Zero()) || ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * R₂Default()) ≥ K * N) : (firstbytestride(B) ≤ 1600)))
-        matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{true}())
-    else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!`
-        matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, nspawn, W₁Default(), W₂Default(), R₁Default(), R₂Default())
-    end
-    nothing
+  Mᵣ, Nᵣ = matmul_params(Val(T))
+  W = pick_vector_width(T)
+  Mc, Kc, Nc = block_sizes(Val(T))
+  MᵣW = Mᵣ*W
+
+  # Not taking the fast path
+  # But maybe we don't want to thread anyway
+  # Maybe this is nested, or we have ≤ 1 threads
+  nt = _nthreads()
+  _nthread = nthread === nothing ? nt : min(nt, nthread)
+  if _nthread < 2
+    @label SINGLETHREAD
+    matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N)
+    return
+  end
+  # We are threading, but how many threads?
+  _nrequest = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)
+    clamp(div_fast(M * N, StaticInt{128}() * W), 0, _nthread-1)
+  else
+    clamp(div_fast(M * N, StaticInt{256}() * W), 0, _nthread-1)
+  end
+  # nkern = cld_fast(M * N,  MᵣW * Nᵣ)
+  threads, torelease = Polyester.request_threads(Threads.threadid()%UInt32, _nrequest)
+  nrequest = threads.i
+  iszero(nrequest) && @goto SINGLETHREAD
+  nspawn = nrequest + 1
+  # Approach:
+  # Check if we don't want to pack A,
+  #    if not, aggressively subdivide
+  # if so, check if we don't want to pack B
+  #    if not, check if we want to thread `N` loop anyway
+  #       if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N`
+  #       if not, only thread along `M`. These don't need syncing, as we're not packing `B`
+  #    if so, `matmul_pack_A_and_B!`
+  #
+  # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then
+  # if 2M/nspawn is less than it, we don't don't `A`
+  # First check is: do we just want to split aggressively?
+  mᵣ, nᵣ = matmul_params(Val(T))
+  if dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((num_cores() ≥ StaticInt(8)) ? max(nspawn,8) : 8) ≥ N)
+    # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing
+    # Should probably handle that with a smarter splitting function...
+    matmulsplitn!(C, A, B, α, β, Mc, M, K, N, threads, Val{false}())
+  elseif (bcache_count() === Zero()) || ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * R₂Default()) ≥ K * N) : (firstbytestride(B) ≤ 1600)))
+    matmulsplitn!(C, A, B, α, β, Mc, M, K, N, threads, Val{true}())
+  else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!`
+    matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, threads, W₁Default(), W₂Default(), R₁Default(), R₂Default())
+  end
+  Polyester.free_threads!(torelease)
+  nothing
 end
 
 
 # If tasks is [0,1,2,3] (e.g., `CloseOpen(0,4)`), it will wait on `MULTASKS[i]` for `i = [1,2,3]`.
-function waitonmultasks(tasks)
-    for tid ∈ tasks
-        wait(tid)
-    end
+function waitonmultasks(threads, nthread)
+  # for (_,tid) ∈ threads
+  #   wait(tid)
+  # end
+  (tnum, tuu) = Polyester.initial_state(threads)
+  for _ ∈ CloseOpen(One(), nthread)
+    (tnum, tuu) = Polyester.iter(tnum, tuu)
+    wait(tnum)
+  end
 end
 
 @inline allocref(::StaticInt{N}) where {N} = Ref{NTuple{N,UInt8}}()
 function matmul_pack_A_and_B!(
-    C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N,
-    tospawn::Int, ::StaticFloat64{W₁}, ::StaticFloat64{W₂}, ::StaticFloat64{R₁}, ::StaticFloat64{R₂}#, ::Val{1}
-) where {T,W₁,W₂,R₁,R₂}
-    W = pick_vector_width(T)
-    mᵣ, nᵣ = matmul_params(Val(T))
-    mᵣW = mᵣ * W
-    # atomicsync = Ref{NTuple{16,UInt}}()
-    Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, W) # M is guaranteed to be > W because of `W ≥ M` condition for `jmultsplitn!`...
-    atomicsync = allocref((StaticInt{1}()+num_cores())*cache_linesize())
-    p = align(reinterpret(Ptr{UInt32}, Base.unsafe_convert(Ptr{UInt8}, atomicsync)))
-    GC.@preserve atomicsync begin
-        for i ∈ CloseOpen(_to_spawn)
-            _atomic_store!(reinterpret(Ptr{UInt64}, p) + i*cache_linesize(), 0x0000000000000000)
-        end
-        Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize)
-        u_to_spawn = _to_spawn % UInt
-        tid = 0
-        bc = _use_bcache()
-        bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc))
-        last_id = _to_spawn - One()
-        for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay.
-            Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_)
-            launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, m % UInt, u_to_spawn, StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}())
-            A = gesp(A, (Mblock, Zero()))
-            C = gesp(C, (Mblock, Zero()))
-        end
-        sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
-        waitonmultasks(CloseOpen(One(), _to_spawn))
+  C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N,
+  threads, ::StaticFloat64{W₁}, ::StaticFloat64{W₂}, ::StaticFloat64{R₁}, ::StaticFloat64{R₂}#, ::Val{1}
+  ) where {T,W₁,W₂,R₁,R₂}
+  W = pick_vector_width(T)
+  mᵣ, nᵣ = matmul_params(Val(T))
+  mᵣW = mᵣ * W
+  # atomicsync = Ref{NTuple{16,UInt}}()
+  Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, threads.i%Int + 1, W) # M is guaranteed to be > W because of `W ≥ M` condition for `jmultsplitn!`...
+  atomicsync = allocref((StaticInt{1}()+num_cores())*cache_linesize())
+  p = align(reinterpret(Ptr{UInt32}, Base.unsafe_convert(Ptr{UInt8}, atomicsync)))
+  GC.@preserve atomicsync begin
+    for i ∈ CloseOpen(_to_spawn)
+      store!(reinterpret(Ptr{UInt64}, p) + i*cache_linesize(), 0x0000000000000000)
     end
-    _free_bcache!(bc)
-    return
+    Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize)
+    u_to_spawn = _to_spawn % UInt
+    (tnum, tuu) = Polyester.initial_state(threads)
+    bc = _use_bcache()
+    bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc))
+    last_id = _to_spawn - One()
+    for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay.
+      Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_)
+      (tnum, tuu) = Polyester.iter(tnum, tuu)
+      launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, tnum, m % UInt, u_to_spawn, StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}())
+      A = gesp(A, (Mblock, Zero()))
+      C = gesp(C, (Mblock, Zero()))
+    end
+    sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
+    waitonmultasks(threads, _to_spawn)
+  end
+  _free_bcache!(bc)
+  return
 end
 
 function sync_mul!(
@@ -504,7 +517,7 @@ function sync_mul!(
   nothing
 end
 
-function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN, contig_axis) where {T}
+function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, _, __) where {T}
   @tturbo for m ∈ indices((A,y),1)
     yₘ = zero(T)
     for n ∈ indices((A,x),(2,1))
@@ -514,7 +527,7 @@ function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α
   end
   return y
 end
-function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN) where {T}
+function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, _) where {T}
   @turbo for m ∈ indices((A,y),1)
     yₘ = zero(T)
     for n ∈ indices((A,x),(2,1))
diff --git a/src/memory_buffer.jl b/src/memory_buffer.jl
index 569ce7d..f4dc1aa 100644
--- a/src/memory_buffer.jl
+++ b/src/memory_buffer.jl
@@ -1,12 +1,15 @@
 
-if Sys.WORD_SIZE == 32
-    @inline function first_cache_buffer(::Val{T}) where {T}
-        reinterpret(Ptr{T}, ACACHEPTR[] + Threads.threadid() * first_cache_size() - first_cache_size())
-    end
+@inline function first_cache_buffer(::Val{T}) where {T}
+  first_cache_buffer(Val{T}(), first_cache_size(Val(T)))
+end
+@static if Sys.WORD_SIZE == 32
+  @inline function first_cache_buffer(::Val{T}, N) where {T}
+    reinterpret(Ptr{T}, ACACHEPTR[] + ((Threads.threadid()-1) * N) * static_sizeof(T))
+  end
 else
-    @inline function first_cache_buffer(::Val{T}) where {T}
-        MemoryBuffer{T}(undef, first_cache_size(Val(T)))
-    end
+  @inline function first_cache_buffer(::Val{T}, ::StaticInt{N}) where {T,N}
+    MemoryBuffer{N,T}(undef)
+  end
 end
 
 BCache(i::Integer) = BCache(BCACHEPTR[]+cld_fast(second_cache_size()*i, Threads.nthreads()), i % UInt)
@@ -16,20 +19,20 @@ BCache(::Nothing) = BCache(BCACHEPTR[], nothing)
 @inline Base.unsafe_convert(::Type{Ptr{T}}, b::BCache) where {T} = Base.unsafe_convert(Ptr{T}, b.p)
 
 function _use_bcache()
-    while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt)
-        pause()
-    end
-    return BCache(nothing)
+  while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt)
+    pause()
+  end
+  return BCache(nothing)
 end
 @inline _free_bcache!(b::BCache{Nothing}) = reseet_bcache_lock!()
 
 _use_bcache(::Nothing) = _use_bcache()
 function _use_bcache(i)
-    f = one(UInt) << i
-    while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt)
-        pause()
-    end
-    BCache(i)
+  f = one(UInt) << i
+  while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt)
+    pause()
+  end
+  BCache(i)
 end
 _free_bcache!(b::BCache{UInt}) = (Threads.atomic_xor!(BCACHE_LOCK, one(UInt) << b.i); nothing)
 
diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl
index 4093a50..47f9595 100644
--- a/test/matmul_coverage.jl
+++ b/test/matmul_coverage.jl
@@ -3,23 +3,26 @@ k_values  = [10, 20, 50, 100, 150, 200]
 m_values  = [10, 20, 50, 100, 150, 200]
 
 function matmul_pack_ab!(C, A, B)
-    M, N = size(C); K = size(B,1)
-    zc, za, zb = Octavian.zstridedpointer.((C,A,B))
-    nspawn = min(Threads.nthreads(), Octavian.num_cores())
-    GC.@preserve C A B begin
-        if nspawn > 1
-            Octavian.matmul_pack_A_and_B!(
-                zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, nspawn,
-                Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()
-            )
-        else
-            Octavian.matmul_st_pack_A_and_B!(
-                zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N,
-                Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0
-            )
-        end
+  M, N = size(C); K = size(B,1)
+  zc, za, zb = Octavian.zstridedpointer.((C,A,B))
+  nspawn = min(Threads.nthreads(), Octavian.num_cores())
+  GC.@preserve C A B begin
+    if nspawn > 1
+      threads, torelease = Octavian.Polyester.request_threads(Threads.threadid(), nspawn-1)
+      @assert threads.i < Threads.nthreads()
+      Octavian.matmul_pack_A_and_B!(
+        zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, threads,
+        Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()
+      )
+      Octavian.Polyester.free_threads!(torelease)
+    else
+      Octavian.matmul_st_pack_A_and_B!(
+        zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N,
+        Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0
+      )
     end
-    C
+  end
+  C
 end
 
 testset_name_suffix = "(coverage)"