In [70]:
# Algoritmo base
# https://www.cs.utexas.edu/~flame/Notes/NotesOnCholReal.pdf

# Alterações feitas:

# A matriz L = zero(A) está sendo criada fora da função
# As alterações estão sendo feitas na própria matriz A
# Utilizar .= e .-=, operações in-place, otimizaram a alocação

In [11]:
using(LinearAlgebra)
using(BenchmarkTools)

In [81]:
##### Nível 2

# Algoritmo do livro (applied numerical linear algebra)
function  Cholesky_factorize(A::Matrix{Float64})

    # Mateiz de zeros (2 alloc)
    L = zero(A)
    
    # Número de linhas (1 alloc)
    m = size(A, 1)
    
    # Loop principal
    @inbounds for j in 1:m
        
        # Somatório de Ljk ao quadrado
        sumLjk = 0
        @inbounds for k in 1:j-1
            sumLjk += L[j, k]*L[j, k]
        end

        # Alterações na diagonal de L
        @inbounds L[j, j] = sqrt(A[j, j] - sumLjk)

        # Segundo loop principal
        @inbounds for i in j+1:m
            
            # Somatório do produto Lik e Ljk
            sumLikLjk = 0
            @inbounds @simd for k in 1:j-1
                sumLikLjk += L[i, k]*L[j, k]
            end

            # Alterações
            @inbounds L[i, j] = (A[i, j] - sumLikLjk)/L[j, j]
        end
    end

    return L
end

Cholesky_factorize (generic function with 3 methods)

In [83]:
# TESTANDO NÍVEL 2 - TEMPO E ALOCAÇÃO

n = 1500
X = randn(n, n)
A = X*X'


@btime Cholesky_factorize($A)
@btime cholesky($A)

print("------------------------------------------------")

  3.882 s (3 allocations: 17.17 MiB)
  241.328 ms (3 allocations: 17.17 MiB)
------------------------------------------------

In [68]:
### Nível 3 ######

function Cholesky_by_blocks(A::Matrix{Float64}, block_size::Int)
    # Número de linhas
    n = size(A)[1]
    for j in 1:block_size:n
        b = min(n - j + 1, block_size)
        # Realizando as alterações na própria matriz A, poupa alocações
        A[j:j+b-1, j:j+b-1] .= cholesky(A[j:j+b-1, j:j+b-1]).L
        A[j+b:n, j:j+b-1] /= A[j:j+b-1, j:j+b-1]'
        A[j+b:n, j+b:n] .-= A[j+b:n, j:j+b-1]*A[j+b:n, j:j+b-1]'

    end
    return tril(A)
end

Cholesky_by_blocks (generic function with 1 method)

In [73]:
# TESTANDO NÍVEL 3 - ALOCAÇÃO E TEMPO - TESTE 1
n = 1200
X = randn(n, n)
size_block = 512
A = X*X'

@time Cholesky_factorize(A)
@time cholesky(A)
@time Cholesky_by_blocks(A, size_block) # Essa deve ficar por último pois altera A
print("----------------------------------------------------------------------------------")

  1.358177 seconds (3 allocations: 10.986 MiB, 0.09% gc time)
  0.133569 seconds (4 allocations: 10.986 MiB)
  0.509151 seconds (88 allocations: 52.504 MiB, 52.06% gc time)
----------------------------------------------------------------------------------

In [695]:
# TESTANDO NÍVEL 3 - ALOCAÇÃO E TEMPO - TESTE 2
n = 3000
X = randn(n, n)
size_block = 2048
A = X*X'

@time Cholesky_factorize(A)
@time cholesky(A)
@time Cholesky_by_blocks(A, size_block) # Essa deve ficar por último pois altera A
print("----------------------------------------------------------------------------------")

 78.980518 seconds (3 allocations: 68.665 MiB)
  2.625674 seconds (4 allocations: 68.665 MiB)
 38.679976 seconds (49 allocations: 273.614 MiB, 8.69% gc time)
----------------------------------------------------------------------------------

In [696]:
# TESTANDO NÍVEL 3 - ALOCAÇÃO E TEMPO - TESTE 3
n = 5000
X = randn(n, n)
size_block = 2048
A = X*X'

@time Cholesky_factorize(A)
@time cholesky(A)
@time Cholesky_by_blocks(A, size_block) # Essa deve ficar por último pois altera A
print("----------------------------------------------------------------------------------")

550.065713 seconds (3 allocations: 190.735 MiB, 0.00% gc time)
 23.421346 seconds (4 allocations: 190.735 MiB, 46.87% gc time)
104.517186 seconds (79 allocations: 848.131 MiB, 21.14% gc time)
----------------------------------------------------------------------------------

In [18]:
# ERROS ! ! !
n = 500
X = randn(n, n)
size_block = 224

A = X*X'
B = copy(A)

L1 = cholesky(A).L
L2 = Cholesky_factorize(A)
L3 = Cholesky_by_blocks(A, size_block)


display(norm(B - L1*L1'))
display(norm(B - L2*L2'))
display(norm(B - L3*L3'))

3.59036593153747e-12

4.051936509396488e-12

2.269092769490994e-12

In [19]:
#### COMPARAÇÃO DOS RESULTADOS
X = randn(4, 4)
A =X*X'

display(Cholesky_factorize(A))
display(cholesky(A).L)
display(Cholesky_by_blocks(A, 2))

4×4 Matrix{Float64}:
  1.34659    0.0        0.0       0.0
 -0.900193   1.92343    0.0       0.0
 -1.19725   -1.5        1.92819   0.0
 -0.896621   0.782927  -0.414207  0.508874

4×4 LowerTriangular{Float64, Matrix{Float64}}:
  1.34659     ⋅          ⋅         ⋅ 
 -0.900193   1.92343     ⋅         ⋅ 
 -1.19725   -1.5        1.92819    ⋅ 
 -0.896621   0.782927  -0.414207  0.508874

4×4 Matrix{Float64}:
  1.34659    0.0        0.0       0.0
 -0.900193   1.92343    0.0       0.0
 -1.19725   -1.5        1.92819   0.0
 -0.896621   0.782927  -0.414207  0.508874

In [22]:
n = 10

a = randn(n, 1)
b = randn(n, 1)

function prod(a::Matrix{Float64}, b::Matrix{Float64})
        
    soma = 0
    l = size(a, 1)
        
    for i in 1:l
        soma += a[i]*b[i]
    end

    return soma
end

function prodzip(a::Matrix{Float64}, b::Matrix{Float64})
    
    soma = 0
    for (x, y) in zip(a, b)
        soma += x*y
    end

    return soma

end


@time sum(a.*b)
@time prod(a, b)
@time prodzip(a, b)


  0.000019 seconds (4 allocations: 208 bytes)
  0.000014 seconds
  0.000014 seconds


-0.2167463749787647

In [387]:
A = rand(5, 5)

function acessarA(A::Matrix{Float64})

    for x in A[1, 1:3]
        x
    end
end

@time A[1, 1:3]
@time acessarA(A)


  0.000701 seconds (2 allocations: 80 bytes)
  0.000015 seconds (1 allocation: 48 bytes)


In [31]:
using LinearAlgebra, BenchmarkTools

function cholesky_fast!(A::AbstractMatrix{Float64})
    m = size(A,1)
    L = zeros(m,m)
    for j in 1:m
        sumLjk = 0.0
        @inbounds for k in 1:j-1
            # quadrado mais rápido que ^2
            sumLjk += L[j,k] * L[j,k]
        end
        @inbounds L[j,j] = sqrt(A[j,j] - sumLjk)

        for i in j+1:m
            sumLikLjk = 0.0
            @inbounds @simd for k in 1:j-1
                sumLikLjk += L[i,k] * L[j,k]
            end
            @inbounds L[i,j] = (A[i,j] - sumLikLjk) / L[j,j]
        end
    end
    return L
end

# Teste de velocidade
n = 800
X = rand(n,n)
A = X * X'

println("Meu Cholesky otimizado:")
@btime cholesky_fast!($A)
println("Função nativa:")
@btime cholesky($A)

Meu Cholesky otimizado:
  298.873 ms (3 allocations: 4.88 MiB)
Função nativa:
  33.513 ms (3 allocations: 4.88 MiB)


Cholesky{Float64, Matrix{Float64}}
U factor:
800×800 UpperTriangular{Float64, Matrix{Float64}}:
 16.827  12.014   12.307    12.3746   …  12.6437     12.0436     12.138
   ⋅     10.7504   4.84113   3.979        5.26709     4.75349     4.65579
   ⋅       ⋅       9.77081   2.55134      2.46614     2.69496     2.39028
   ⋅       ⋅        ⋅        9.49232      2.25599     2.31876     2.4085
   ⋅       ⋅        ⋅         ⋅           1.37098     1.83028     1.35359
   ⋅       ⋅        ⋅         ⋅       …   1.4251      1.25697     1.25465
   ⋅       ⋅        ⋅         ⋅           0.931095    0.799382    1.13898
   ⋅       ⋅        ⋅         ⋅           0.939443    0.8732      0.688567
   ⋅       ⋅        ⋅         ⋅           0.690619    0.459858    0.762936
   ⋅       ⋅        ⋅         ⋅           0.664876    1.0653      0.836598
   ⋅       ⋅        ⋅         ⋅       …   0.612049    0.354188    0.915732
   ⋅       ⋅        ⋅         ⋅           0.742169    0.898187    1.00676
   ⋅       ⋅   

In [77]:
using LinearAlgebra
import/using .BLAS

# 1) refatore a sua Cholesky para operar *in-place* sobre qualquer StridedMatrix
function cholesky_inplace!(B::StridedMatrix{Float64})
    m = size(B,1)
    @assert size(B,2)==m
    for j in 1:m
        s = 0.0
        @inbounds @simd for k in 1:j-1
            s += B[j,k]*B[j,k]
        end
        @inbounds B[j,j] = sqrt(B[j,j] - s)
        for i in j+1:m
            t = 0.0
            @inbounds @simd for k in 1:j-1
                t += B[i,k]*B[j,k]
            end
            @inbounds B[i,j] = (B[i,j] - t)/B[j,j]
        end
    end
    return LowerTriangular(B)
end

# 2) Cholesky *blocked*, sem nenhuma alocação extra fora do BLAS
function cholesky_by_blocks!(A::Matrix{Float64}, block_size::Int=64)
    n = size(A,1)
    for j in 1:block_size:n
        b = min(block_size, n-j+1)

        # bloco diagonal ─ fatoração in-place
        Bj = @view A[j:j+b-1, j:j+b-1]
        cholesky_inplace!(Bj)  # escreve Ljj de volta em Bj

        # resolve A[j+b:end, j:j+b-1] ← A21 * inv(Ljj)'
        # (note: trsm! faz um triangular solve in-place)
        BLAS.trsm!('R','L','T','N',
                   1.0,
                   Bj,
                   @view A[j+b:end, j:j+b-1])

        # atualização do trailing block: A22 -= A21 * A21'
        BLAS.gemm!('N','T',
                   -1.0,
                   @view A[j+b:end, j:j+b-1],
                   @view A[j+b:end, j:j+b-1],
                   1.0,
                   @view A[j+b:end, j+b:end])
    end

    return LowerTriangular(A)
end


LoadError: ArgumentError: Package / not found in current path.
- Run `import Pkg; Pkg.add("/")` to install the / package.

In [74]:
n = 800
X = randn(n n)
A = X*X'

cholesky_by_blocks(A, 512)



LoadError: ParseError:
[90m# Error @ [0;0m]8;;file://C:/Users/lucas/OneDrive/UFRJ/Cursos/Matemática Aplicada/5º Período/Álgebra Linear Computacional/In[74]#2:13\[90mIn[74]:2:13[0;0m]8;;\
n = 800
X = randn(n [48;2;120;70;70mn[0;0m)
[90m#           ╙ ── [0;0m[91mExpected `)`[0;0m