diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 87f7485..804912c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,27 +11,26 @@ defaults: run: shell: bash jobs: - test: - name: Julia ${{ matrix.version }}/${{ matrix.threads }} threads/${{ matrix.os }}/${{ matrix.arch }}/${{ github.event_name }} + coverage: + name: coverage=true/Julia ${{ matrix.version }}/${{ matrix.threads }} threads/${{ matrix.os }}/${{ matrix.arch }}/${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: arch: - x64 - - x86 + eltype: + - Float64 + - Float32 + - Int64 + - Int32 os: - ubuntu-latest - - windows-latest - - macOS-latest threads: - '1' - '3' # GitHub runners have 2 cores, so `NUM_CORES+1` is 3 version: - '1' # automatically expands to the latest stable 1.x release of Julia - exclude: - - os: macOS-latest - arch: x86 # 32-bit Julia binaries are not available on macOS steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 @@ -51,24 +50,35 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 with: - coverage: false + coverage: true env: JULIA_NUM_THREADS: ${{ matrix.threads }} - coverage: - name: coverage=true/Julia ${{ matrix.version }}/${{ matrix.threads }} threads/${{ matrix.os }}/${{ matrix.arch }}/${{ github.event_name }} + JULIA_TEST_ELTYPE: ${{ matrix.eltype }} + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v1 + with: + file: lcov.info + test: + name: Julia ${{ matrix.version }}/${{ matrix.threads }} threads/${{ matrix.os }}/${{ matrix.arch }}/${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: arch: - x64 + - x86 os: - ubuntu-latest + - windows-latest + - macOS-latest threads: - '1' - '3' # GitHub runners have 2 cores, so `NUM_CORES+1` is 3 version: - '1' # automatically expands to the latest stable 1.x release of Julia + exclude: + - os: macOS-latest + arch: x86 # 32-bit Julia binaries are not available on macOS steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 @@ -88,13 +98,9 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 with: - coverage: true + coverage: false env: JULIA_NUM_THREADS: ${{ matrix.threads }} - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 - with: - file: lcov.info docs: name: Documentation runs-on: ubuntu-latest diff --git a/Project.toml b/Project.toml index 32177fa..2cb7e7a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.3.3" +version = "0.3.4" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" @@ -16,12 +16,12 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" [compat] ArrayInterface = "3.1.14" IfElse = "0.1" -LoopVectorization = "0.12.34" +LoopVectorization = "0.12.86" ManualMemory = "0.1.1" -PolyesterWeave = "0.1" +PolyesterWeave = "0.1.1" Static = "0.2, 0.3" ThreadingUtilities = "0.4.6" -VectorizationBase = "0.21.5" +VectorizationBase = "0.21.15" julia = "1.6" [extras] diff --git a/src/complex_matmul.jl b/src/complex_matmul.jl index 6ca0f7f..dc92bb1 100644 --- a/src/complex_matmul.jl +++ b/src/complex_matmul.jl @@ -52,7 +52,8 @@ for AT in [:AbstractVector, :AbstractMatrix] # to avoid ambiguity error η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1)) (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -)) - @tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2) + # @tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2) + @turbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2) Cmn_re = zero(T) Cmn_im = zero(T) for k ∈ indices((A, B), (3, 1)) @@ -130,4 +131,4 @@ for AT in [:AbstractVector, :AbstractMatrix] # to avoid ambiguity error _C end end -end \ No newline at end of file +end diff --git a/src/matmul.jl b/src/matmul.jl index 7573a08..ea88805 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -360,7 +360,7 @@ function __matmul!( clamp(div_fast(M * N, StaticInt{256}() * W), 0, _nthread-1) end # nkern = cld_fast(M * N, MᵣW * Nᵣ) - threads, torelease = PolyesterWeave.__request_threads(_nrequest % UInt32, PolyesterWeave.worker_pointer()) + threads, torelease = PolyesterWeave.__request_threads(_nrequest % UInt32, PolyesterWeave.worker_pointer(), nothing) # _threads, _torelease = PolyesterWeave.request_threads(Threads.threadid()%UInt32, _nrequest) nrequest = threads.i diff --git a/test/_matmul.jl b/test/_matmul.jl index d9028d1..074e543 100644 --- a/test/_matmul.jl +++ b/test/_matmul.jl @@ -3,209 +3,130 @@ # `n_values` # `k_values` # `m_values` -for T ∈ (ComplexF32, ComplexF64, Complex{Int64}, Complex{Int32}) - @time @testset "Matrix Multiply $T $(testset_name_suffix)" begin - for n ∈ n_values - for k ∈ k_values - for m ∈ m_values - A = rand(T, m, k); - B = rand(T, k, n); - b = rand(T, k); - - Are = real.(A); - Bre = real.(B); - bre = real.(b); - - A′ = permutedims(A)' - B′ = permutedims(B)' - AB = A * B; - A′B = A′*B; - AB′ = A*B′; - A′B′= A′*B′; - Ab = A*b; - A′b = A′*b; +function test_complex(::Type{TE}, m_values, k_values, n_values, testset_name_suffix) where {TE} + T = Complex{TE} + @time @testset "Matrix Multiply $T $(testset_name_suffix)" begin + for n ∈ n_values + for k ∈ k_values + for m ∈ m_values + A = rand(T, m, k); + B = rand(T, k, n); + b = rand(T, k); - AreB = Are*B; - ABre = A*Bre; - Areb = Are*b; - Abre = A*bre; + Are = real.(A); + Bre = real.(B); + bre = real.(b); - @info "" T n k m - @test @time(Octavian.matmul(A, B)) ≈ AB - @test @time(Octavian.matmul(A, Bre)) ≈ ABre - @test @time(Octavian.matmul(Are, B)) ≈ AreB - @test @time(Octavian.matmul(A′, B)) ≈ A′B - @test @time(Octavian.matmul(A, B′)) ≈ AB′ - @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ + A′ = permutedims(A)' + B′ = permutedims(B)' + AB = A * B; + A′B = A′*B; + AB′ = A*B′; + A′B′= A′*B′; + Ab = A*b; + A′b = A′*b; - @test @time(Octavian.matmul(A, b)) ≈ Ab - @test transpose(@time(Octavian.matmul(transpose(b), transpose(A)))) ≈ Ab - @test @time(Octavian.matmul(A, bre)) ≈ Abre - @test @time(Octavian.matmul(Are, b)) ≈ Areb - @test @time(Octavian.matmul(A′, b)) ≈ A′b - @test transpose(@time(Octavian.matmul(transpose(b), transpose(A′)))) ≈ A′b + AreB = Are*B; + ABre = A*Bre; + Areb = Are*b; + Abre = A*bre; - @test @time(Octavian.matmul_serial(A, B)) ≈ AB - @test @time(Octavian.matmul_serial(A, Bre)) ≈ ABre - @test @time(Octavian.matmul_serial(Are, B)) ≈ AreB - @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B - @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ - @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ + @info "" T n k m + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A, Bre)) ≈ ABre + @test @time(Octavian.matmul(Are, B)) ≈ AreB + @test @time(Octavian.matmul(A′, B)) ≈ A′B + @test @time(Octavian.matmul(A, B′)) ≈ AB′ + @test @time(Octavian.matmul(A′, B′)) ≈ A′B′ - @test @time(Octavian.matmul_serial(A, b)) ≈ Ab - @test transpose(@time(Octavian.matmul_serial(transpose(b), transpose(A)))) ≈ Ab - @test @time(Octavian.matmul_serial(A, bre)) ≈ Abre - @test @time(Octavian.matmul_serial(Are, b)) ≈ Areb - @test @time(Octavian.matmul_serial(A′, b)) ≈ A′b - @test transpose(@time(Octavian.matmul_serial(transpose(b), transpose(A′)))) ≈ A′b + @test @time(Octavian.matmul(A, b)) ≈ Ab + @test transpose(@time(Octavian.matmul(transpose(b), transpose(A)))) ≈ Ab + @test @time(Octavian.matmul(A, bre)) ≈ Abre + @test @time(Octavian.matmul(Are, b)) ≈ Areb + @test @time(Octavian.matmul(A′, b)) ≈ A′b + @test transpose(@time(Octavian.matmul(transpose(b), transpose(A′)))) ≈ A′b - C = Matrix{T}(undef, n, m)' - @test @time(Octavian.matmul!(C, A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A, Bre)) ≈ ABre + @test @time(Octavian.matmul_serial(Are, B)) ≈ AreB + @test @time(Octavian.matmul_serial(A′, B)) ≈ A′B + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB′ + @test @time(Octavian.matmul_serial(A′, B′)) ≈ A′B′ - C1 = rand(T, m, n) - C2 = copy(C1) - α, β = T(1 - 2im), T(3 + 4im) - @test @time(Octavian.matmul!(C1, A, B, α, β)) ≈ Octavian.matmul!(C2, A, B, α, β) - end - end - end - end - testset_name_suffix === "(coverage)" && break -end + @test @time(Octavian.matmul_serial(A, b)) ≈ Ab + @test transpose(@time(Octavian.matmul_serial(transpose(b), transpose(A)))) ≈ Ab + @test @time(Octavian.matmul_serial(A, bre)) ≈ Abre + @test @time(Octavian.matmul_serial(Are, b)) ≈ Areb + @test @time(Octavian.matmul_serial(A′, b)) ≈ A′b + @test transpose(@time(Octavian.matmul_serial(transpose(b), transpose(A′)))) ≈ A′b -@time @testset "Matrix Multiply Float64 $(testset_name_suffix)" begin - T = Float64 - for n ∈ n_values - for k ∈ k_values - for m ∈ m_values - A = rand(T, m, k) - B = rand(T, k, n) - b = rand(T, k) - A′ = permutedims(A)' - B′ = permutedims(B)' - AB = A * B; - Ab = A * b; - @info "" T n k m - @test @time(Octavian.matmul(A, B)) ≈ AB - @test @time(Octavian.matmul(A′, B)) ≈ AB - @test @time(Octavian.matmul(A, B′)) ≈ AB - @test @time(Octavian.matmul(A′, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A, B)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B)) ≈ AB - @test @time(Octavian.matmul_serial(A, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B′)) ≈ AB - @test @time(Octavian.matmul(A, b)) ≈ Ab - @test @time(Octavian.matmul(A′, b)) ≈ Ab - @test @time(Octavian.matmul(b', A'))' ≈ Ab - @test @time(Octavian.matmul(b', A′'))' ≈ Ab - @test @time(Octavian.matmul_serial(A, b)) ≈ Ab - @test @time(Octavian.matmul_serial(A′, b)) ≈ Ab - @test @time(Octavian.matmul_serial(b', A'))' ≈ Ab - @test @time(Octavian.matmul_serial(b', A′'))' ≈ Ab - end - end - end - m = k = n = max(8Octavian.OCTAVIAN_NUM_TASKS[], 400) - A = rand(T, m, k); - B = rand(T, k, n); - A′ = permutedims(A)'; - B′ = permutedims(B)'; - AB = A * B; - @test matmul_pack_ab!(similar(AB), A, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A, B′) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B′) ≈ AB -end + C = Matrix{T}(undef, n, m)' + @test @time(Octavian.matmul!(C, A, B)) ≈ AB -@time @testset "Matrix Multiply Float32 $(testset_name_suffix)" begin - T = Float32 - for n ∈ n_values - for k ∈ k_values - for m ∈ m_values - A = rand(T, m, k) - B = rand(T, k, n) - A′ = permutedims(A)' - B′ = permutedims(B)' - AB = A * B; - @info "" T n k m - @test @time(Octavian.matmul(A, B)) ≈ AB - @test @time(Octavian.matmul(A′, B)) ≈ AB - @test @time(Octavian.matmul(A, B′)) ≈ AB - @test @time(Octavian.matmul(A′, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A, B)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B)) ≈ AB - @test @time(Octavian.matmul_serial(A, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B′)) ≈ AB - end + C1 = rand(T, m, n) + C2 = copy(C1) + α, β = T(1 - 2im), T(3 + 4im) + @test @time(Octavian.matmul!(C1, A, B, α, β)) ≈ Octavian.matmul!(C2, A, B, α, β) end + end end - m = k = n = max(8Octavian.OCTAVIAN_NUM_TASKS[], 400) - A = rand(T, m, k); - B = rand(T, k, n); - A′ = permutedims(A)'; - B′ = permutedims(B)'; - AB = A * B; - @test matmul_pack_ab!(similar(AB), A, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A, B′) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B′) ≈ AB + end end -@time @testset "Matrix Multiply Int32 $(testset_name_suffix)" begin - T = Int32 - for n ∈ n_values - for k ∈ k_values - for m ∈ m_values - A = rand(T, m, k) - B = rand(T, k, n) - A′ = permutedims(A)' - B′ = permutedims(B)' - AB = A * B; - @info "" T n k m - @test @time(Octavian.matmul(A, B)) ≈ AB - @test @time(Octavian.matmul(A′, B)) ≈ AB - @test @time(Octavian.matmul(A, B′)) ≈ AB - @test @time(Octavian.matmul(A′, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A, B)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B)) ≈ AB - @test @time(Octavian.matmul_serial(A, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B′)) ≈ AB - end - end +function matmul_pack_ab!(C, A, B) + M, N = size(C); K = size(B,1) + zc, za, zb = Octavian.zstridedpointer.((C,A,B)) + nspawn = min(Threads.nthreads(), Octavian.num_cores()) + GC.@preserve C A B begin + if nspawn > 1 + threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing) + @assert threads.i < Threads.nthreads() + Octavian.matmul_pack_A_and_B!( + zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, threads, + Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default() + ) + Octavian.PolyesterWeave.free_threads!(torelease) + else + Octavian.matmul_st_pack_A_and_B!( + zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, + Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0 + ) end - m = k = n = max(8Octavian.OCTAVIAN_NUM_TASKS[], 400) - A = rand(T, m, k); - B = rand(T, k, n); - A′ = permutedims(A)'; - B′ = permutedims(B)'; - AB = A * B; - @test matmul_pack_ab!(similar(AB), A, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A, B′) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B) ≈ AB - @test matmul_pack_ab!(similar(AB), A′, B′) ≈ AB + end + C end -@time @testset "Matrix Multiply Int64 $(testset_name_suffix)" begin - T = Int64 +function test_real(::Type{T}, m_values, k_values, n_values, testset_name_suffix) where {T} + @time @testset "Matrix Multiply $T $(testset_name_suffix)" begin for n ∈ n_values - for k ∈ k_values - for m ∈ m_values - A = rand(T, m, k) - B = rand(T, k, n) - A′ = permutedims(A)' - B′ = permutedims(B)' - AB = A * B; - @info "" T n k m - @test @time(Octavian.matmul(A, B)) ≈ AB - @test @time(Octavian.matmul(A′, B)) ≈ AB - @test @time(Octavian.matmul(A, B′)) ≈ AB - @test @time(Octavian.matmul(A′, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A, B)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B)) ≈ AB - @test @time(Octavian.matmul_serial(A, B′)) ≈ AB - @test @time(Octavian.matmul_serial(A′, B′)) ≈ AB - end + for k ∈ k_values + for m ∈ m_values + A = rand(T, m, k) + B = rand(T, k, n) + b = rand(T, k) + A′ = permutedims(A)' + B′ = permutedims(B)' + AB = A * B; + Ab = A * b; + @info "" T n k m + @test @time(Octavian.matmul(A, B)) ≈ AB + @test @time(Octavian.matmul(A′, B)) ≈ AB + @test @time(Octavian.matmul(A, B′)) ≈ AB + @test @time(Octavian.matmul(A′, B′)) ≈ AB + @test @time(Octavian.matmul_serial(A, B)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B)) ≈ AB + @test @time(Octavian.matmul_serial(A, B′)) ≈ AB + @test @time(Octavian.matmul_serial(A′, B′)) ≈ AB + @test @time(Octavian.matmul(A, b)) ≈ Ab + @test @time(Octavian.matmul(A′, b)) ≈ Ab + @test @time(Octavian.matmul(b', A'))' ≈ Ab + @test @time(Octavian.matmul(b', A′'))' ≈ Ab + @test @time(Octavian.matmul_serial(A, b)) ≈ Ab + @test @time(Octavian.matmul_serial(A′, b)) ≈ Ab + @test @time(Octavian.matmul_serial(b', A'))' ≈ Ab + @test @time(Octavian.matmul_serial(b', A′'))' ≈ Ab end + end end m = k = n = max(8Octavian.OCTAVIAN_NUM_TASKS[], 400) A = rand(T, m, k); @@ -217,6 +138,7 @@ end @test matmul_pack_ab!(similar(AB), A, B′) ≈ AB @test matmul_pack_ab!(similar(AB), A′, B) ≈ AB @test matmul_pack_ab!(similar(AB), A′, B′) ≈ AB + end end @time @testset "zero-sized-matrices" begin @@ -229,3 +151,4 @@ end @test Octavian.matmul(randn(2,0), randn(0,2)) == zeros(2, 2) @test Octavian.matmul!(ones(2,2),randn(2,0), randn(0,2), 1.0, 2.0) == ones(2, 2) .* 2 end + diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl index 9abdc8a..80fb63c 100644 --- a/test/matmul_coverage.jl +++ b/test/matmul_coverage.jl @@ -2,29 +2,24 @@ n_values = [1, 10, 20, 50, 100, 150, 200] k_values = [10, 20, 50, 100, 150, 200] m_values = [10, 20, 50, 100, 150, 200] -function matmul_pack_ab!(C, A, B) - M, N = size(C); K = size(B,1) - zc, za, zb = Octavian.zstridedpointer.((C,A,B)) - nspawn = min(Threads.nthreads(), Octavian.num_cores()) - GC.@preserve C A B begin - if nspawn > 1 - threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer()) - @assert threads.i < Threads.nthreads() - Octavian.matmul_pack_A_and_B!( - zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, threads, - Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default() - ) - Octavian.PolyesterWeave.free_threads!(torelease) - else - Octavian.matmul_st_pack_A_and_B!( - zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, - Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0 - ) - end - end - C -end -testset_name_suffix = "(coverage)" include("_matmul.jl") +typ = get(ENV, "JULIA_TEST_ELTYPE", "ALL") +types = if typ == "Float64" + DataType[Float64] +elseif typ == "Float32" + DataType[Float32] +elseif typ == "Int64" + DataType[Int64] +elseif typ == "Int32" + DataType[Int32] +else + DataType[Float64, Float32, Int64, Int32] +end +testset_name_suffix = "(coverage)" +for T ∈ types + @time test_complex(T, m_values, k_values, n_values, testset_name_suffix) + @time test_real(T, m_values, k_values, n_values, testset_name_suffix) +end + diff --git a/test/matmul_main.jl b/test/matmul_main.jl index f6d2005..6190aa2 100644 --- a/test/matmul_main.jl +++ b/test/matmul_main.jl @@ -4,4 +4,8 @@ m_values = [200, 300, 400] testset_name_suffix = "(main)" -include("_matmul.jl") +for T ∈ (Float64,Float32,Int64,Int32) + @time test_complex(T, m_values, k_values, n_values, testset_name_suffix) + @time test_real(T, m_values, k_values, n_values, testset_name_suffix) +end +