In [1]:
using LinearAlgebra, SparseArrays, FFTW, BenchmarkTools, Test
using CUDA, CUDA.CUSPARSE, CUDA.CUFFT

In [2]:
"""
    my_kernel(a)

Device kernel that stores `42` into `a` at the position given by the calling thread's
`threadIdx().x`. There is no bounds guard, so the launch is expected to use at most
`length(a)` threads.
"""
function my_kernel(a)
    a[threadIdx().x] = 42
    return nothing
end

# Allocate an uninitialized 5-element Int array as a CuArray; the kernel below overwrites
# every element, so the initial contents do not matter.
a = CuArray{Int}(undef, 5);
# Launch one thread per element (no `blocks` keyword is given). The trailing `;`
# suppresses the notebook's display of the launch result.
@cuda threads=length(a) my_kernel(a);

In [3]:
# Display `a` after the kernel launch; every element is expected to be 42.
a

5-element CuArray{Int64, 1, CUDA.DeviceMemory}:
 42
 42
 42
 42
 42

In [11]:
# Deliberately un-annotated helper used to demonstrate a GPU compilation failure.
# `f` is only forwarded into the broadcast (it is never called directly here), which is
# the situation in which Julia may skip specializing on a function-typed argument. The
# launch in this cell then fails with "unsupported dynamic function invocation"
# (see this cell's error output).
function my_inner_kernel!(f, t) # does not specialize
    t .= f.(t)
end

"""
    my_outer_kernel(f, a)

Device kernel: the thread with x-index `i` passes row `i` of `a` (as a view) together
with `f` to `my_inner_kernel!`. Always returns `nothing`.
"""
function my_outer_kernel(f, a)
    row = @view a[threadIdx().x, :]
    my_inner_kernel!(f, row)
    return nothing
end

# 2×2 matrix of random Ints created with `CUDA.rand`.
a = CUDA.rand(Int, (2,2))
# Plain identity function, passed into the kernel as `f`.
id(x) = x

# One thread per row of `a`. With the un-annotated `my_inner_kernel!` defined in this
# cell, compilation of this launch fails (see the InvalidIRError output below).
@cuda threads=size(a, 1) my_outer_kernel(id, a)

LoadError: InvalidIRError: compiling MethodInstance for my_outer_kernel(::typeof(id), ::CuDeviceMatrix{Int64, 1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to my_inner_kernel!(f, t) @ Main In[11]:1)
Stacktrace:
 [1] my_outer_kernel
   @ ./In[11]:7
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl

In [8]:
"""
    my_inner_kernel!(f::F, t::T) where {F,T}

Apply `f` to every element of `t` in place and return `t`.

The `F` and `T` type parameters make Julia specialize this method on both arguments,
which is what allows the call to be resolved statically when it is made from a GPU
kernel.
"""
function my_inner_kernel!(f::F, t::T) where {F,T}
    @. t = f(t)
    return t
end

"""
    my_outer_kernel(f, a)

Device kernel: each thread takes the row of `a` selected by its `threadIdx().x` as a
view and forwards it, with `f`, to `my_inner_kernel!`. Always returns `nothing`.
"""
function my_outer_kernel(f, a)
    tid = threadIdx().x
    my_inner_kernel!(f, view(a, tid, :))
    return nothing
end

# 2×2 matrix of random Ints created with `CUDA.rand`.
a = CUDA.rand(Int, (2,2))

# Plain identity function, passed into the kernel as `f`.
id(x) = x

# One thread per row of `a`; with the specialized `my_inner_kernel!` this compiles and
# the cell displays the resulting `CUDA.HostKernel`.
@cuda threads=size(a, 1) my_outer_kernel(id, a)

CUDA.HostKernel for my_outer_kernel(typeof(id), CuDeviceMatrix{Int64, 1})

In [12]:
"""
    reverse_kernel(a::CuDeviceArray{T}) where T

Reverse a 2-element device array in place through a `CuStaticSharedArray` buffer whose
length is fixed at 2. Written for a launch with exactly 2 threads.
"""
function reverse_kernel(a::CuDeviceArray{T}) where T
    tid = threadIdx().x
    scratch = CuStaticSharedArray(T, 2)
    # Mirror the position: thread 1 fills slot 2, thread 2 fills slot 1.
    scratch[3 - tid] = a[tid]
    # Every thread must have stored its element before any thread reads the buffer back.
    sync_threads()
    a[tid] = scratch[tid]
    return nothing
end

# Upload a 2-element vector; its length must match the fixed buffer size of 2 used by
# the kernel.
a = cu([1,2])

# Two threads, one per element.
@cuda threads = 2 reverse_kernel(a)

CUDA.HostKernel for reverse_kernel(CuDeviceVector{Int64, 1})

In [16]:
"""
    reverse_kernel(a::CuDeviceArray{T}) where T

Reverse `a` in place through a `CuDynamicSharedArray` buffer of `length(a)` elements.
Each thread handles the element at its own `threadIdx().x`, so the launch needs
`length(a)` threads and a `shmem` size large enough for `length(a)` values of type `T`.
"""
function reverse_kernel(a::CuDeviceArray{T}) where T
    tid = threadIdx().x
    n = length(a)
    scratch = CuDynamicSharedArray(T, n)
    # Store each element at its mirrored position.
    scratch[n - tid + 1] = a[tid]
    # Every thread must have stored its element before any thread reads the buffer back.
    sync_threads()
    a[tid] = scratch[tid]
    return nothing
end

# Upload a 4-element vector to reverse.
a = cu([1,2,3, 5])

# One thread per element; `shmem` reserves `sizeof(a)` bytes of dynamic shared memory,
# i.e. room for one buffer element per element of `a`.
@cuda threads=length(a) shmem=sizeof(a) reverse_kernel(a)

CUDA.HostKernel for reverse_kernel(CuDeviceVector{Int64, 1})

In [17]:
# Display `a` after the reversal kernel; the output lists the elements in reverse order.
a

4-element CuArray{Int64, 1, CUDA.DeviceMemory}:
 5
 3
 2
 1

In [None]:
using 

In [26]:
# Cholesky factorization of `A`.
# NOTE(review): `A` is not defined in any cell visible here (the preceding cell is an
# incomplete `using` line). Judging by the output it is a 3×3 Float32 CuArray, presumably
# symmetric positive definite — confirm and restore its definition.
cholesky(A)

Cholesky{Float32, CuArray{Float32, 2, CUDA.DeviceMemory}}
U factor:
3×3 UpperTriangular{Float32, CuArray{Float32, 2, CUDA.DeviceMemory}}:
 0.685254  0.386212  0.527034
  ⋅        0.977934  0.542067
  ⋅         ⋅        0.182822

In [32]:
# Random sparse vector of length 100 with nonzero density 0.3, built on the host.
x = sprand(100, 0.3);
# Convert it to a CUSPARSE sparse vector.
cx = CuSparseVector(x);
# Stored nonzero values; the output shows them as a `CuArray{Float64}`.
nonzeros(cx)

33-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 0.6039031468344143
 0.07820569791269638
 0.39737596504279893
 0.8366767710843347
 0.1355349047603367
 0.620027224469006
 0.7493612531248238
 0.9509819490258438
 0.3396524595929371
 0.07685341787368771
 0.5526647315131051
 0.4115848747669809
 0.4660764773388031
 ⋮
 0.8752954546330955
 0.6817879737018145
 0.3107973631356781
 0.4076293888082748
 0.5730931196097949
 0.3213292733791735
 0.09249037240516944
 0.419741901565465
 0.2801374248225166
 0.8489863552701344
 0.7639768606072798
 0.7149204562461796

In [68]:
# 2048×2048 Float32 test matrix on the host ...
d1 = rand(Float32, 2048, 2048);
# ... and a CuArray built from the same data, so both FFT benchmarks see identical input.
c1 = CuArray(d1);

In [69]:
# CPU baseline: FFT of the host matrix `d1`.
# `d1` is interpolated with `$` so the benchmark does not include the cost of accessing
# a non-constant global, matching how the GPU benchmark interpolates `$c1`.
@benchmark fft($d1)

BenchmarkTools.Trial: 26 samples with 1 evaluation.
 Range (min … max):  115.318 ms … 233.668 ms  ┊ GC (min … max): 0.00% … 1.47%
 Time  (median):     187.126 ms               ┊ GC (median):    1.84%
 Time  (mean ± σ):   195.700 ms ±  36.154 ms  ┊ GC (mean ± σ):  1.59% ± 0.57%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂[39m [39m▅[34m [39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m 
  [39m█[39m▅[39m▁[39m▁

In [71]:
# GPU FFT of `c1`. `CUDA.@sync` waits for the device work to finish, so each sample
# covers the complete transform rather than only the asynchronous launch; `$c1`
# interpolates the global into the benchmark expression.
@benchmark CUDA.@sync fft($c1)

BenchmarkTools.Trial: 6485 samples with 1 evaluation.
 Range (min … max):  433.504 μs …  11.076 ms  ┊ GC (min … max): 0.00% … 18.69%
 Time  (median):     741.453 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   765.796 μs ± 767.649 μs  ┊ GC (mean ± σ):  1.96% ±  1.84%

  [39m█[39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▇[34m▇[39m[39m▄[32m▇[39m[39m█[39m▄[39m▂[39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[

In [59]:
# 1024 Float32 values on the GPU. They are drawn with `CUDA.rand` instead of being left
# `undef`, because later cells read `carr1` (squares, cosine, sum) and uninitialized
# memory would make those results arbitrary.
carr1 = CUDA.rand(Float32, 1024)
# Zero vector with the same type and shape as `carr1`. `similar` allocates without
# copying the source data, which `fill!` would overwrite anyway.
carr2 = fill!(similar(carr1), 0f0)
# The filled array must compare equal to the built-in zeros constructor.
@test carr2 == CUDA.zeros(Float32, 1024)

[32m[1mTest Passed[22m[39m

In [None]:
# Elementwise sum of squares. Every operator is dotted so the whole expression fuses into
# one broadcast instead of first materializing `carr1.^2` and `carr2.^2` as temporaries.
# NOTE(review): the results are only meaningful if `carr1` holds initialized data —
# confirm against the cell that creates it.
carr3 = carr1 .^ 2 .+ carr2 .^ 2
# Elementwise cosine via `map`.
carr4 = map(cos, carr1)
# Sum of all elements via a `+` reduction.
carr5 = reduce(+, carr1)

1024×1024 Matrix{Float32}:
 0.202309   0.439262   0.132544   …  0.0932927  0.496841   0.0942984
 0.972338   0.725799   0.775524      0.500368   0.357494   0.280119
 0.481463   0.0768188  0.874136      0.785356   0.0594708  0.0391255
 0.611652   0.168059   0.771702      0.522558   0.734388   0.582099
 0.952296   0.359975   0.803226      0.841998   0.147775   0.947803
 0.635537   0.420779   0.359215   …  0.557018   0.121318   0.0227185
 0.286994   0.605701   0.343228      0.744424   0.340557   0.829776
 0.348642   0.509614   0.412459      0.0766146  0.0522597  0.0356306
 0.937924   0.244475   0.623697      0.480176   0.622869   0.979609
 0.64576    0.903165   0.345828      0.300507   0.204809   0.665933
 0.618522   0.777219   0.858111   …  0.0282814  0.20614    0.75639
 0.162471   0.490091   0.661842      0.976594   0.499811   0.94055
 0.961984   0.987511   0.184654      0.618866   0.831588   0.480281
 ⋮                                ⋱                        
 0.676752   0.135848   0.75

In [61]:
# Display `carr2`; every entry is expected to be 0.0 after the earlier `fill!`.
carr2

1024-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [67]:
# Draw a 128×128 Float32 matrix on the host with `rand`, then convert it to a CuArray.
carr0 = CuArray(rand(Float32, 128, 128))

128×128 CuArray{Float32, 2, CUDA.DeviceMemory}:
 0.614408   0.545875   0.716857   …  0.25018    0.304857   0.99422
 0.432157   0.148661   0.060947      0.0701835  0.15738    0.26359
 0.691122   0.743682   0.736573      0.179693   0.290273   0.639974
 0.196913   0.967706   0.506799      0.189907   0.142643   0.243997
 0.760219   0.935825   0.721442      0.688403   0.844589   0.125919
 0.709927   0.51387    0.211541   …  0.726804   0.346087   0.563484
 0.528632   0.684303   0.616334      0.721476   0.0159537  0.233643
 0.57822    0.205926   0.999921      0.57385    0.818043   0.345943
 0.226337   0.0525993  0.558896      0.868044   0.869828   0.584088
 0.373114   0.545261   0.765772      0.963495   0.879001   0.215512
 0.435306   0.0942882  0.308403   …  0.274051   0.844993   0.312775
 0.0454069  0.292291   0.942017      0.269561   0.989988   0.942471
 0.750616   0.76125    0.438698      0.244673   0.99518    0.662555
 ⋮                                ⋱  ⋮                     
 0.398871 

In [73]:
# Small integer vector as a CuArray.
c1 = CuArray([1,2,3,4,5])
# Logical (Bool-mask) indexing: keeps the elements where the mask is `true`
# (positions 1, 4 and 5 here).
c2 = c1[[true, false, false, true, true]]

3-element CuArray{Int64, 1, CUDA.DeviceMemory}:
 1
 4
 5

In [74]:
# Indices of all odd elements; the output shows the result is itself a CuArray.
findall(isodd, c1)

3-element CuArray{Int64, 1, CUDA.DeviceMemory}:
 1
 3
 5

In [75]:
# Index of the first odd element.
findfirst(isodd, c1)

1

In [76]:
# Minimum value together with its index, returned as a `(value, index)` tuple.
findmin(c1)

(1, 1)

In [77]:
# Keep the index of the first odd element so its type can be inspected in the next cell.
d = findfirst(isodd, c1)

1

In [78]:
# Check the type of the search result; the output shows a plain `Int64`, not an array.
typeof(d)

Int64

In [82]:
# The values 1 through 6, converted to Int32, as a CuArray.
c2 = CuArray{Int32}(collect(1:6))
# The same six elements arranged as a 2×3 matrix.
c3 = reshape(c2, 2, 3)
# Elements 2 through 4 of `c2` as a view; the output displays it as a 3-element CuArray.
c4 = view(c2, 2:4)

3-element CuArray{Int32, 1, CUDA.DeviceMemory}:
 2
 3
 4

In [86]:
c1 = CuArray{Int32}([1,2,3,4])
# Reading or writing single elements of a CuArray from the host is "scalar indexing".
# CUDA.jl only tolerates it (with a warning) in interactive sessions and rejects it
# otherwise, so the intent is made explicit with `CUDA.@allowscalar`. The block still
# evaluates to the value of its last expression (5), so the cell output is unchanged.
CUDA.@allowscalar begin
    c1[3] = 4
    c1[3] += 1
end

5

In [85]:
# Display `c1`.
# NOTE(review): the recorded output (third element 4) carries execution count 85, i.e. it
# was produced before the `+= 1` cell (count 86) was last run — re-run to refresh it.
c1

4-element CuArray{Int32, 1, CUDA.DeviceMemory}:
 1
 2
 4
 4

In [87]:
# Turn on cuBLAS API logging. Per the cuBLAS `cublasLoggerConfigure` signature the
# arguments are (logIsOn, logToStdOut, logToStdErr, logFileName): logging on, not to
# stdout, to stderr, no log file.
# Later cuBLAS calls then emit `I! cuBLAS ...` trace lines (visible in the output of the
# timing cell further down); presumably this also adds overhead to those calls.
CUBLAS.cublasLoggerConfigure(1, 0, 1, C_NULL)

In [95]:
# Two random Float32 host vectors of length 1024 ...
x, y = rand(Float32, 1024), rand(Float32, 1024)
# ... and CuArrays built from the same data for the cuBLAS timing.
cx, cy = CuArray(x), CuArray(y)

(Float32[0.6862528, 0.3502003, 0.17327243, 0.5727967, 0.4321698, 0.21588409, 0.9317812, 0.8133021, 0.031039774, 0.90765685  …  0.662232, 0.59176946, 0.30912292, 0.87857586, 0.46198666, 0.24350399, 0.83844346, 0.025021374, 0.8581258, 0.8255516], Float32[0.049223542, 0.5396586, 0.5479738, 0.2955312, 0.49956048, 0.74897486, 0.8889911, 0.932449, 0.007093966, 0.13516706  …  0.345146, 0.31396365, 0.10546261, 0.41614407, 0.45485222, 0.12701416, 0.9572982, 0.7359793, 0.031462073, 0.94465625])

In [100]:
# Time the cuBLAS dot product of `cx` and `cy`; the output shows the elapsed time as a
# Float32.
# The element count passed to `CUBLAS.dot` is the full vector length, so the call
# computes the whole dot product and is comparable to the CPU `dot(x, y)` timing.
# A hard-coded count of 2 would only cover the first two of the 1024 elements.
CUDA.@elapsed CUBLAS.dot(length(cx), cx, cy)
    

I! cuBLAS (v12.3) function cublasStatus_t cublasGetProperty(libraryPropertyType, int*) called:
i!  type: type=SOME TYPE; val=0
i!  value: type=int; val=POINTER (IN HEX:0x0x7f1eca905b90)
i! Time: 2024-11-07T11:12:28 elapsed from start 188.566667 minutes or 11314.000000 seconds
i!Process=29499; Thread=139776632226496; GPU=0; Handle=POINTER (IN HEX:0x(nil))
i! COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
I! cuBLAS (v12.3) function cublasStatus_t cublasGetProperty(libraryPropertyType, int*) called:
i!  type: type=SOME TYPE; val=1
i!  value: type=int; val=POINTER (IN HEX:0x0x7f1eca905bb0)
i! Time: 2024-11-07T11:12:28 elapsed from start 188.566667 minutes or 11314.000000 seconds
i!Process=29499; Thread=139776632226496; GPU=0; Handle=POINTER (IN HEX:0x(nil))
i! COMPILED WITH: GNU GCC/G++ / 6.3.1 20170216 (Red Hat 6.3.1-3)
I! cuBLAS (v12.3) function cublasStatus_t cublasGetProperty(libraryPropertyType, int*) called:
i!  type: type=SOME TYPE; val=2
i!  value: type=int; val=POIN

0.000444384f0

In [102]:
# CPU reference: seconds taken by one `dot` call over the full host vectors.
# NOTE(review): a single `@elapsed` run can include compilation time on first use;
# `@benchmark` from BenchmarkTools (already loaded) would give a steadier figure.
@elapsed dot(x, y)

9.375e-6