In [83]:
### My implementaion of simple matrix multiplication (O(n^3))
function simple_matrix_multiplication(A::Matrix{T}, B::Matrix{T}) where T<:AbstractFloat
    n, m = size(A)
    m2, p = size(B)
    
    C = zeros(T, n, p)
    
    @inbounds @simd for i in 1:n
        @inbounds @simd for j in 1:p
            @inbounds @simd for k in 1:m
                C[i, j] += A[i, k] * B[k, j]
            end
        end
    end
    
    return C
end


simple_matrix_multiplication (generic function with 1 method)

In [97]:
### My implementaion of simple matrix multiplication (O(n^3))
function simple_matrix_multiplication(A::Matrix{T}, B::Matrix{T}) where T<:AbstractFloat
    n, m = size(A)
    m2, p = size(B)
    
    C = zeros(T, n, p)
    
    @inbounds @simd for i in 1:n
        @inbounds @simd for j in 1:p
            #@inbounds @simd for k in 1:m
                C[i, j] = sum(@view A[i, :] .* @view B[:, j])
                #C[i, j]=mapreduce(+, *,  A[i, :],  B[:, j])
            #end
        end
    end
    
    return C
end


simple_matrix_multiplication (generic function with 1 method)

In [96]:
mapreduce(+, *,  [1,2,3,4], [10,20,30,40])

351384

In [5]:
#padding the matrix to make it n*n where n=2^k, this is the only suitable size for Strassen algorithm
function pad_matrix(M1::Matrix{T}, M2::Matrix{T} ) where T<:AbstractFloat
    n=2^ceil(Int, log2(max(size(M1)[1], size(M1)[2], size(M2)[2])))
    padded_M1 = zeros(T, n, n)
    original_size = size(M1)
    padded_M1[1:original_size[1], 1:original_size[2]] = M1

    padded_M2 = zeros(T, n, n)
    original_size = size(M2)
    padded_M2[1:original_size[1], 1:original_size[2]] = M2

    return padded_M1, padded_M2
end

pad_matrix (generic function with 1 method)

In [52]:
#example A and B for multipllication
n = 128
m=128
k=128
A = rand(Float32, n, m);
B = rand(Float32, m, k);

In [98]:
@btime C = simple_matrix_multiplication(A, B);

  109.082 ms (114690 allocations: 28.81 MiB)


In [11]:
padded_A, padded_B = pad_matrix(A, B)
@time C_padded = strassen(padded_A, padded_B)
C = C_padded[1:size(A)[1], 1:size(B)[2]];

  0.075309 seconds (627.46 k allocations: 57.024 MiB, 39.51% gc time)


In [14]:
padded_A, padded_B = pad_matrix(A, B)
@time C_padded = hybrid_strassen(padded_A, padded_B)
C = C_padded[1:size(A)[1], 1:size(B)[2]];

  0.000236 seconds (34 allocations: 580.047 KiB)


In [17]:
@time C_padded = hybrid_strassen_gpu(padded_A, padded_B)
C = C_padded[1:size(A)[1], 1:size(B)[2]];

  0.003339 seconds (899 allocations: 313.234 KiB)


In [57]:
@time C=A*B;

  0.000285 seconds (2 allocations: 64.047 KiB)


In [24]:
@code_llvm A*B

[90m;  @ C:\workdir\usr\share\julia\stdlib\v1.10\LinearAlgebra\src\matmul.jl:111 within `*`[39m
[90m; Function Attrs: uwtable[39m
[95mdefine[39m [95mnonnull[39m [33m{[39m[33m}[39m[0m* [93m@"julia_[39m[0m*[0m_4139"[33m([39m[33m{[39m[33m}[39m[0m* [95mnoundef[39m [95mnonnull[39m [95malign[39m [33m16[39m [95mdereferenceable[39m[33m([39m[33m40[39m[33m)[39m [0m%0[0m, [33m{[39m[33m}[39m[0m* [95mnoundef[39m [95mnonnull[39m [95malign[39m [33m16[39m [95mdereferenceable[39m[33m([39m[33m40[39m[33m)[39m [0m%1[33m)[39m [0m#0 [33m{[39m
[91mL72:[39m
  [0m%gcframe45 [0m= [96m[1malloca[22m[39m [33m[[39m[33m3[39m [0mx [33m{[39m[33m}[39m[0m*[33m][39m[0m, [95malign[39m [33m16[39m
  [0m%gcframe45.sub [0m= [96m[1mgetelementptr[22m[39m [95minbounds[39m [33m[[39m[33m3[39m [0mx [33m{[39m[33m}[39m[0m*[33m][39m[0m, [33m[[39m[33m3[39m [0mx [33m{[39m[33m}[39m[0m*[33m][39m[0m* [0m%gcframe4

In [25]:
@simd

LoadError: LoadError: MethodError: no method matching var"@simd"(::LineNumberNode, ::Module)

Closest candidates are:
  var"@simd"(::LineNumberNode, ::Module, !Matched::Any, !Matched::Any)
   @ Base simdloop.jl:131
  var"@simd"(::LineNumberNode, ::Module, !Matched::Any)
   @ Base simdloop.jl:127

in expression starting at c:\Users\babazadehmorteza\Desktop\MthClub\jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_X16sZmlsZQ==.jl:1

In [29]:
begin 
    import Pkg
    Pkg.add("SIMD")
    using SIMD
end

[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `C:\Users\babazadehmorteza\Desktop\MthClub\Project.toml`
  [90m[fdea26ae] [39m[92m+ SIMD v3.5.0[39m
[32m[1m    Updating[22m[39m `C:\Users\babazadehmorteza\Desktop\MthClub\Manifest.toml`
  [90m[fdea26ae] [39m[92m+ SIMD v3.5.0[39m


In [63]:
begin
    b = ones(10_000)
    a = 2 .* ones(10_000)
    @btime for i in 1:10_000
        a[i] *= b[i]
    end
end

  1.151 ms (58467 allocations: 913.55 KiB)


In [62]:
begin
    b = ones(10_000)
    a = 2 .* ones(10_000)
    @btime for i in 1:10_000
        a[i] += b[i]
    end
end

  1.133 ms (58467 allocations: 913.55 KiB)
