# Dependences

In [28]:
using CUDA
using DelimitedFiles

R_Agg = 6
r_max = 3.0

3.0

# idx

In [29]:
euclidean(points,i,j) = sqrt((points[i,1]-points[j,1])^2+(points[i,2]-points[j,2])^2+(points[i,3]-points[j,3])^2)

function dist_kernel!(idx, points ,r_max)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    if i <= size(points, 1) && j <= size(points, 1)
        if euclidean(points,i,j) < r_max
            idx[i, j] = i
        else
            idx[i, j] = 0
        end 
    end
    return nothing
end

dist_kernel! (generic function with 1 method)

In [30]:
r_max=3.0
X = Float64.(readdlm("init/$(R_Agg).xyz")[3:end,2:end]) |> cu
idx = Int32.(zeros(size(X, 1), size(X, 1))) |> cu

println("------------------------ SIZE ------------------------")
println("r_max         = $(r_max)")
println("Size X        = $(size(X))")
println("Size idx      = $(size(idx))")

println("--------------------- SIZE CUDA ----------------------")
threads =(32,32)
blocks  =cld.(size(X,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks dist_kernel!(idx, X ,r_max)
println("------------------------ IDX -------------------------")
idx_real = idx
display(idx)


------------------------ SIZE ------------------------
r_max         = 3.0
Size X        = (156, 3)
Size idx      = (156, 156)
--------------------- SIZE CUDA ----------------------
Threads = (32, 32)
Blocks  = (5, 5)
--------------------- CUDA SOLVER --------------------
  0.119789 seconds (71.71 k CPU allocations: 5.199 MiB)
------------------------ IDX -------------------------


156×156 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   1  0  0   1   1   1   0   0  …    0    0    0    0    0    0    0
  2   2   2  0  0   0   0   0   0   2       0    0    0    0    0    0    0
  3   3   3  0  0   0   0   0   0   0       0    0    0    0    0    0    0
  0   0   0  4  4   4   4   0   0   0       0    0    0    0    0    0    0
  0   0   0  5  5   0   5   5   0   0       0    0    0    0    0    0    0
  6   0   0  6  0   6   6   0   0   6  …    0    0    0    0    0    0    0
  7   0   0  7  7   7   7   7   0   0       0    0    0    0    0    0    0
  8   0   0  0  8   0   8   8   8   0       0    0    0    0    0    0    0
  0   0   0  0  0   0   0   9   9   0       0    0    0    0    0    0    0
  0  10   0  0  0  10   0   0   0  10       0    0    0    0    0    0    0
 11  11  11  0  0  11  11   0   0  11  …    0    0    0    0    0    0    0
 12  12  12  0  0   0  12  12   0   0       0    0    0    0    0    0    0
  0   0  13  0  0   0   0  13  13   0 

# reduce

In [31]:
function reduce_kernel(idx,idx_red,idx_sum)
    j  = (blockIdx().y-1) * blockDim().y + threadIdx().y
    # cache = @cuStaticSharedMem(Int32, blockDim().y)

    if j <= size(idx,1)
        for i = 1:size(idx,1)
            if idx[i,j] != 0
                idx_sum[j] += 1
                idx_red[idx_sum[j],j] = i
            end
        end
    end
    
    return nothing
end

reduce_kernel (generic function with 1 method)

## test

In [32]:
N = 22
idx_test = repeat(collect(1:N), inner=(1,N)).*rand((0,1),N,N) |> cu
idx_sum  = Int32.(fill(0,1,size(idx_test, 1))) |> cu
idx_red  = Int32.(fill(0,size(idx_test, 1),size(idx_test, 1))) |> cu


println("------------------------ SIZE ------------------------")
println("Size idx_test = $(size(idx_test))")
println("Size idx_sum  = $(size(idx_sum))")
println("Size idx_red  = $(size(idx_red))")
println("------------------------ IDX -------------------------")
display(idx_test)
println("--------------------- SIZE CUDA ----------------------")
threads=(1,1024)
blocks=cld.(size(idx_test,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_test,idx_red,idx_sum)
println("------------------ REDUCED IDX -----------------------")
idx_red

------------------------ SIZE ------------------------
Size idx_test = (22, 22)
Size idx_sum  = (1, 22)
Size idx_red  = (22, 22)
------------------------ IDX -------------------------


22×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
  0   0   0   1   1   1   1   0   0  …   0   0   1   1   0   1   0   1   0
  2   2   0   0   0   0   0   0   2      0   0   2   2   0   0   2   2   2
  3   3   3   0   0   3   0   3   3      0   0   3   3   3   0   3   3   3
  0   4   0   4   4   4   4   4   0      4   0   0   0   4   0   4   4   4
  0   5   0   0   0   0   0   0   0      5   0   0   5   5   5   5   0   5
  6   0   6   0   0   6   0   6   6  …   0   0   6   6   6   6   0   0   0
  7   7   7   0   7   7   7   7   7      7   0   0   0   7   0   7   0   7
  0   0   8   8   8   8   0   8   8      8   0   8   0   8   8   0   8   8
  0   0   0   0   9   0   9   0   9      9   9   9   9   0   0   0   0   9
  0  10  10   0   0   0   0  10   0      0   0   0   0   0   0   0  10  10
  0  11   0   0   0  11   0   0  11  …  11  11   0   0  11  11   0  11  11
 12   0  12  12  12  12  12  12  12      0  12  12  12  12   0  12  12   0
 13  13   0  13  13  13  13  13   0     13   0  13  

--------------------- SIZE CUDA ----------------------
Threads = (1, 1024)
Blocks  = (22, 1)
--------------------- CUDA SOLVER --------------------
  0.060529 seconds (52.06 k CPU allocations: 4.174 MiB)
------------------ REDUCED IDX -----------------------


22×22 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  2   2   3   1   1   1   1   3   2  …   4   9   1   1   3   1   2   1   2
  3   3   6   4   4   3   4   4   3      5  11   2   2   4   5   3   2   3
  6   4   7   8   7   4   7   6   6      7  12   3   3   5   6   4   3   4
  7   5   8  12   8   6   9   7   7      8  17   6   5   6   8   5   4   5
 12   7  10  13   9   7  12   8   8      9  20   8   6   7  11   7   8   7
 13  10  12  15  12   8  13  10   9  …  11   0   9   9   8  14  12  10   8
 14  11  14  16  13  11  20  12  11     13   0  12  12  11  16  15  11   9
 16  13  16  17  16  12  22  13  12     14   0  13  14  12  21  17  12  10
 17  14  17  18  18  13   0  14  16     15   0  14  18  13  22  18  13  11
 20  15  18  20  19  14   0  15  18     19   0  15  21  14   0  19  17  13
  0  16  19  21  21  15   0  16  19  …  22   0  18   0  15   0   0  20  14
  0  22  20   0  22  17   0  17  21      0   0   0   0  16   0   0   0  15
  0   0   0   0   0  18   0  22   0      0   0   0  

## real

In [33]:
idx_sum  = Int32.(zeros(1, size(idx_real, 1))) |> cu
idx_red  = Int32.(zeros(size(idx_real, 1), size(idx_real, 1))) |> cu

println("------------------------ SIZE ------------------------")
println("Size idx_real  = $(size(idx_real))")
println("Size idx_sum   = $(size(idx_sum))")
println("Size idx_red   = $(size(idx_red))")
println("------------------------ IDX -------------------------")
display(idx)
println("--------------------- SIZE CUDA ----------------------")
threads=(1,1024)
blocks=cld.(size(idx_real,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_real,idx_red,idx_sum)
display(idx_red)


------------------------ SIZE ------------------------
Size idx_real  = (156, 156)
Size idx_sum   = (1, 156)
Size idx_red   = (156, 156)
------------------------ IDX -------------------------


156×156 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   1  0  0   1   1   1   0   0  …    0    0    0    0    0    0    0
  2   2   2  0  0   0   0   0   0   2       0    0    0    0    0    0    0
  3   3   3  0  0   0   0   0   0   0       0    0    0    0    0    0    0
  0   0   0  4  4   4   4   0   0   0       0    0    0    0    0    0    0
  0   0   0  5  5   0   5   5   0   0       0    0    0    0    0    0    0
  6   0   0  6  0   6   6   0   0   6  …    0    0    0    0    0    0    0
  7   0   0  7  7   7   7   7   0   0       0    0    0    0    0    0    0
  8   0   0  0  8   0   8   8   8   0       0    0    0    0    0    0    0
  0   0   0  0  0   0   0   9   9   0       0    0    0    0    0    0    0
  0  10   0  0  0  10   0   0   0  10       0    0    0    0    0    0    0
 11  11  11  0  0  11  11   0   0  11  …    0    0    0    0    0    0    0
 12  12  12  0  0   0  12  12   0   0       0    0    0    0    0    0    0
  0   0  13  0  0   0   0  13  13   0 

--------------------- SIZE CUDA ----------------------
Threads = (1, 1024)
Blocks  = (156, 1)
--------------------- CUDA SOLVER --------------------
  0.057760 seconds (50.75 k CPU allocations: 4.112 MiB)


156×156 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   1   4   4   1   1   1   8   2  …  126  127  129  130  137  138  141
  2   2   2   5   5   4   4   5   9   6     127  128  130  131  140  141  145
  3   3   3   6   7   6   5   7  13  10     128  132  131  132  141  142  146
  6  10  11   7   8   7   6   8  27  11     131  133  134  135  144  145  148
  7  11  12  21  22  10   7   9  28  14     132  147  135  136  145  146  149
  8  12  13  22  23  11   8  12  32  24  …  136  150  148  149  146  147  150
 11  14  15  24  25  21  11  13  33  29     146  151  149  150  154  154  154
 12  15  16  25  26  24  12  23   8  30     147  127  152  152  155  155  155
 15  18  19  26  27  25  22  26   9  34     149  128  153  153  156  156  156
  1   1   1   4   4  29  25  27  13  35     150  132  129  130  137  138  141
  2   2   2   5   5  30  26  31  27   2  …  151  133  130  131  140  141  145
  3   3   3   6   7  31  30  32  28   6     153  147  131  132  141  142  146
  6  10  11   

In [37]:
sum(idx_red .> 0, dims=1)

1×156 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 59  59  59  56  55  79  89  85  50  64  …  113  103  55  72  90  72  73  79