In [83]:
"""
    simple_matrix_multiplication(A, B)

Return the matrix product `A * B` computed with the textbook triple loop
(O(n·m·p) scalar multiplications for an n×m matrix `A` and an m×p matrix `B`).

# Arguments
- `A::Matrix{T}`: left operand of size n×m.
- `B::Matrix{T}`: right operand of size m×p, with the same element type as `A`.

# Returns
A freshly allocated n×p `Matrix{T}`.

# Throws
- `DimensionMismatch` if the number of columns of `A` differs from the number of
  rows of `B`.
"""
function simple_matrix_multiplication(A::Matrix{T}, B::Matrix{T}) where T<:AbstractFloat
    n, m = size(A)
    m2, p = size(B)
    # The loops below run under @inbounds, so the shapes must be validated first;
    # otherwise a mismatch would silently read outside of B.
    m == m2 || throw(DimensionMismatch(
        "A has dimensions ($n, $m) but B has dimensions ($m2, $p)"))

    C = zeros(T, n, p)

    # Julia arrays are column-major, so the innermost loop walks down a column of
    # both C and A. Each C[i, j] still accumulates its terms in increasing k order.
    @inbounds for j in 1:p
        for k in 1:m
            b = B[k, j]
            # Iterations over i touch distinct entries of C, so they are independent
            # and safe to vectorise.
            @simd for i in 1:n
                C[i, j] += A[i, k] * b
            end
        end
    end

    return C
end


simple_matrix_multiplication (generic function with 1 method)

In [111]:
"""
    simple_matrix_multiplication1(A, B)

Return the matrix product `A * B`, computing every entry as a dot product of a
row of `A` with a column of `B` via `mapreduce`.

# Arguments
- `A::Matrix{T}`: left operand of size n×m.
- `B::Matrix{T}`: right operand of size m×p, with the same element type as `A`.

# Returns
A freshly allocated n×p `Matrix{T}`.

# Throws
- `DimensionMismatch` if the number of columns of `A` differs from the number of
  rows of `B`.
"""
function simple_matrix_multiplication1(A::Matrix{T}, B::Matrix{T}) where T<:AbstractFloat
    n, m = size(A)
    m2, p = size(B)
    m == m2 || throw(DimensionMismatch(
        "A has dimensions ($n, $m) but B has dimensions ($m2, $p)"))

    C = zeros(T, n, p)

    # j is the outer loop so that C is filled column by column (column-major storage).
    for j in 1:p, i in 1:n
        # Dot product: map each k to the product A[i, k] * B[k, j], then reduce with +.
        # Indexing through k avoids copying a row and a column for every entry, and
        # `init` keeps the reduction well-defined when the inner dimension is zero.
        C[i, j] = mapreduce(k -> A[i, k] * B[k, j], +, 1:m; init=zero(T))
    end

    return C
end


simple_matrix_multiplication1 (generic function with 1 method)

In [5]:
"""
    pad_matrix(M1, M2)

Embed `M1` and `M2` in the top-left corner of two zero-filled n×n matrices, where
n is the smallest power of two that is at least as large as every dimension of
both inputs (and at least 1). Square power-of-two operands are the shape that a
recursive Strassen multiplication, which halves the matrices at every level,
can work on directly.

# Arguments
- `M1::Matrix{T}`, `M2::Matrix{T}`: matrices to pad; they share the element type `T`.

# Returns
A tuple `(padded_M1, padded_M2)` of n×n `Matrix{T}` values. The inputs are not
modified.
"""
function pad_matrix(M1::Matrix{T}, M2::Matrix{T}) where T<:AbstractFloat
    # Consider all four extents so that neither input can overflow the padded
    # square; the floor of 1 keeps empty inputs from reaching nextpow with 0.
    largest = max(size(M1, 1), size(M1, 2), size(M2, 1), size(M2, 2), 1)
    # Integer arithmetic: no rounding concerns from a floating-point log2.
    n = nextpow(2, largest)

    padded_M1 = zeros(T, n, n)
    padded_M1[axes(M1, 1), axes(M1, 2)] = M1

    padded_M2 = zeros(T, n, n)
    padded_M2[axes(M2, 1), axes(M2, 2)] = M2

    return padded_M1, padded_M2
end

pad_matrix (generic function with 1 method)

In [52]:
# Example operands for the multiplication experiments: A is n×m and B is m×k,
# both filled with uniformly random Float32 values.
n, m, k = 128, 128, 128
A = rand(Float32, n, m);
B = rand(Float32, m, k);

In [108]:
# Time the naive triple-loop product of the example operands and keep the result in C.
# Note: the first call in a session also measures compilation; rerun for a steady figure.
C = @time simple_matrix_multiplication(A, B);

  0.031414 seconds (147.46 k allocations: 16.813 MiB, 28.51% gc time)


In [11]:
# Pad both operands with pad_matrix, time `strassen` on the padded pair, then crop
# the result back to the shape of the unpadded product (rows of A × columns of B).
padded_A, padded_B = pad_matrix(A, B)
C_padded = @time strassen(padded_A, padded_B)
C = C_padded[1:size(A, 1), 1:size(B, 2)];

  0.075309 seconds (627.46 k allocations: 57.024 MiB, 39.51% gc time)


In [14]:
# Same experiment as for `strassen`, but timing `hybrid_strassen` on the padded
# operands; the result is cropped to rows of A × columns of B.
padded_A, padded_B = pad_matrix(A, B)
C_padded = @time hybrid_strassen(padded_A, padded_B)
C = C_padded[1:size(A, 1), 1:size(B, 2)];

  0.000236 seconds (34 allocations: 580.047 KiB)


In [17]:
# Time `hybrid_strassen_gpu` on the padded operands produced by an earlier cell,
# then crop the result to rows of A × columns of B.
C_padded = @time hybrid_strassen_gpu(padded_A, padded_B)
C = C_padded[1:size(A, 1), 1:size(B, 2)];

  0.003339 seconds (899 allocations: 313.234 KiB)


In [57]:
# Baseline: time Julia's built-in matrix product on the same operands.
C = @time A * B;

  0.000285 seconds (2 allocations: 64.047 KiB)
