From 30747910546f2280e453d21582423df6cd7c5efe Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Tue, 11 Mar 2025 06:28:01 -0700 Subject: [PATCH 1/2] Initial implementation of lazy JLLs for LinearAlgebra This alters CompilerSupportLibraries_jll, OpenBLAS_jll and libblastrampoline_jll to use `LazyLibrary` objects and thereby be loaded only upon first `dlopen()` or `ccall()` to the individual library objects. Note that this is one of the more complicated cases, as `libblastrampoline` must have OpenBLAS_jll added as a dynamic dependency (as it does not actually have it listed in its shared object headers) and also has some on-load callbacks that must be invoked. This must be paired with the appropriate base Julia changes [0]. [0] https://github.com/JuliaLang/julia/pull/57719 --- src/LinearAlgebra.jl | 22 ++++++++++++---------- src/blas.jl | 4 +++- src/lbt.jl | 8 ++++++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/LinearAlgebra.jl b/src/LinearAlgebra.jl index 2c76dcdc..7f45370c 100644 --- a/src/LinearAlgebra.jl +++ b/src/LinearAlgebra.jl @@ -820,25 +820,27 @@ function versioninfo(io::IO=stdout) return nothing end -function __init__() - try - verbose = parse(Bool, get(ENV, "LBT_VERBOSE", "false")) - BLAS.lbt_forward(OpenBLAS_jll.libopenblas_path; clear=true, verbose) - BLAS.check() - catch ex - Base.showerror_nostdio(ex, "WARNING: Error during initialization of module LinearAlgebra") - end +function lbt_openblas_onload_callback() + # We don't use `BLAS.lbt_forward()` here because we don't want to take a lock on the config cache. 
+ verbose = parse(Bool, get(ENV, "LBT_VERBOSE", "false")) + BLAS.lbt_forward_ccall(OpenBLAS_jll.libopenblas_path; clear=true, verbose) + BLAS.check() + # register a hook to disable BLAS threading Base.at_disable_library_threading(() -> BLAS.set_num_threads(1)) # https://github.com/xianyi/OpenBLAS/blob/c43ec53bdd00d9423fc609d7b7ecb35e7bf41b85/README.md#setting-the-number-of-threads-using-environment-variables if !haskey(ENV, "OPENBLAS_NUM_THREADS") && !haskey(ENV, "GOTO_NUM_THREADS") && !haskey(ENV, "OMP_NUM_THREADS") @static if Sys.isapple() && Base.BinaryPlatforms.arch(Base.BinaryPlatforms.HostPlatform()) == "aarch64" - BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint))) + nthreads = max(1, @ccall(jl_effective_threads()::Cint)) else - BLAS.set_num_threads(max(1, @ccall(jl_effective_threads()::Cint) ÷ 2)) + nthreads = max(1, @ccall(jl_effective_threads()::Cint) ÷ 2) end + BLAS.lbt_set_num_threads(nthreads) end end +# If users want to lazily load a different BLAS, they'd need to either change this call, or +# clear the datastructures modified by this call and call it again with their own. +libblastrampoline_jll.add_dependency!(OpenBLAS_jll, libopenblas, lbt_openblas_onload_callback) end # module LinearAlgebra diff --git a/src/blas.jl b/src/blas.jl index a8ce39ef..38af4928 100644 --- a/src/blas.jl +++ b/src/blas.jl @@ -162,7 +162,9 @@ get_num_threads()::Int = lbt_get_num_threads() function check() # TODO: once we have bitfields of the BLAS functions that are actually forwarded, # ensure that we have a complete set here (warning on an incomplete BLAS implementation) - config = get_config() + # We don't use `get_config()` here because we are invoked in the onload callback and + # we don't want to take any locks. + config = LBTConfig(unsafe_load(ccall((:lbt_get_config, libblastrampoline), Ptr{lbt_config_t}, ()))) # Ensure that one of our loaded libraries satisfies our interface requirement interface = USE_BLAS64 ? 
:ilp64 : :lp64 diff --git a/src/lbt.jl b/src/lbt.jl index 81d10f93..77ddc580 100644 --- a/src/lbt.jl +++ b/src/lbt.jl @@ -228,10 +228,14 @@ function lbt_set_num_threads(nthreads) return ccall((:lbt_set_num_threads, libblastrampoline), Cvoid, (Int32,), nthreads) end +function lbt_forward_ccall(path::AbstractString; clear::Bool = false, verbose::Bool = false, suffix_hint::Union{String,Nothing} = nothing) + return ccall((:lbt_forward, libblastrampoline), Int32, (Cstring, Int32, Int32, Cstring), + path, clear ? 1 : 0, verbose ? 1 : 0, something(suffix_hint, C_NULL)) +end + function lbt_forward(path::AbstractString; clear::Bool = false, verbose::Bool = false, suffix_hint::Union{String,Nothing} = nothing) _clear_config_with() do - return ccall((:lbt_forward, libblastrampoline), Int32, (Cstring, Int32, Int32, Cstring), - path, clear ? 1 : 0, verbose ? 1 : 0, something(suffix_hint, C_NULL)) + lbt_forward_ccall(path; clear, verbose, suffix_hint) end end From 5514f15df75e68a2fd4dafca37b9dde69f7d3385 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Wed, 12 Mar 2025 07:46:54 -0700 Subject: [PATCH 2/2] Before running any `@allocations` tests, run the workload This ensures that our LazyLibrary code (which makes one allocation) does not fail these tests. Some of these `@allocations` checks had already included warm-up code; this simply expands that to the rest that I could find. 
--- test/matmul.jl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/matmul.jl b/test/matmul.jl index afa730f1..86c75ae5 100644 --- a/test/matmul.jl +++ b/test/matmul.jl @@ -325,6 +325,10 @@ end @test 0 == @allocations mul!(C, At, Bt) end # syrk/herk + mul!(C, transpose(A), A) + mul!(C, adjoint(A), A) + mul!(C, A, transpose(A)) + mul!(C, A, adjoint(A)) @test 0 == @allocations mul!(C, transpose(A), A) @test 0 == @allocations mul!(C, adjoint(A), A) @test 0 == @allocations mul!(C, A, transpose(A)) @@ -334,6 +338,7 @@ end Ac = complex(A) for t in (identity, adjoint, transpose) Bt = t(B) + mul!(Cc, Ac, Bt) @test 0 == @allocations mul!(Cc, Ac, Bt) end end @@ -356,6 +361,9 @@ end A = rand(-10:10, n, n) B = ones(Float64, n, n) C = zeros(Float64, n, n) + mul!(C, A, B) + mul!(C, A, transpose(B)) + mul!(C, adjoint(A), B) @test 0 == @allocations mul!(C, A, B) @test 0 == @allocations mul!(C, A, transpose(B)) @test 0 == @allocations mul!(C, adjoint(A), B)