# Dependencies

In [1]:
using CUDA
using DelimitedFiles

# Value interpolated into the input file name ("../init/$(R_Agg).xyz").
# NOTE(review): presumably the aggregate radius — confirm.
R_Agg = 15
# Distance threshold handed to `dist_kernel!`: pairs closer than this are flagged.
r_max = 4.0
# Number of rows of the compacted index matrix `idx_red`, i.e. how many entries
# can be stored per column. Kept as the last statement, so the cell displays it.
max_row = 60

60

# Functions

In [2]:
"""
    euclidean(points, i, j)

Return the Euclidean distance between rows `i` and `j` of `points`, reading the
first three columns of each row as coordinates.
"""
function euclidean(points, i, j)
    dx = points[i, 1] - points[j, 1]
    dy = points[i, 2] - points[j, 2]
    dz = points[i, 3] - points[j, 3]
    return sqrt(dx^2 + dy^2 + dz^2)
end

"""
    dist_kernel!(idx, points, r_max)

CUDA kernel body. Each thread derives one `(i, j)` pair from its 2-D
block/thread coordinates and writes `i` into `idx[i, j]` when rows `i` and `j`
of `points` are closer than `r_max` (as measured by `euclidean`), and `0`
otherwise. Threads whose coordinates exceed `size(points, 1)` do nothing.
"""
function dist_kernel!(idx, points, r_max)
    # 1-based global thread coordinates along x and y.
    i = threadIdx().x + blockDim().x * (blockIdx().x - 1)
    j = threadIdx().y + blockDim().y * (blockIdx().y - 1)
    npoints = size(points, 1)

    # Surplus threads of the trailing blocks have no pair to process.
    (i > npoints || j > npoints) && return nothing

    idx[i, j] = euclidean(points, i, j) < r_max ? i : 0
    return nothing
end

"""
    reduce_kernel(idx, idx_red, idx_sum)

CUDA kernel body that compacts every column of `idx`. Thread `i` scans column
`i` from the first row to the last; for each nonzero entry it increments the
counter `idx_sum[i]` and stores the entry's row number `j` in the next free row
of column `i` of `idx_red`.

# Arguments
- `idx`: matrix to scan; any nonzero entry counts as a hit.
- `idx_red`: output with one column per column of `idx`. Only rows that receive
  a hit are written; all other entries keep whatever they held before.
- `idx_sum`: per-column hit counter, indexed linearly. Counting continues from
  the value already stored, so it should start at zero for a fresh count.

If a column has more hits than `idx_red` has rows, the surplus hits are still
counted in `idx_sum` but are not stored, instead of indexing past the end of
`idx_red`. `idx_sum[i] > size(idx_red, 1)` therefore flags a truncated column.
"""
function reduce_kernel(idx, idx_red, idx_sum)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

    # `i` addresses a column of `idx`, so bound it by the column count.
    if i <= size(idx, 2)
        capacity = size(idx_red, 1)
        # Keep the running count in a local: only this thread touches slot `i`.
        nhits = idx_sum[i]
        for j in 1:size(idx, 1)
            if idx[j, i] != 0
                nhits += one(nhits)
                # Never write past the last row of `idx_red`.
                if nhits <= capacity
                    idx_red[nhits, i] = j
                end
            end
        end
        idx_sum[i] = nhits
    end

    return nothing
end

reduce_kernel (generic function with 1 method)

# Idx

In [3]:
# Load the coordinates: drop the first two rows and the first column of the
# parsed table and convert the rest to numbers. Note that `cu` stores
# floating-point data as Float32 on the device.
X = Float64.(readdlm("../init/$(R_Agg).xyz")[3:end,2:end]) |> cu
# Pairwise flag matrix, allocated and zeroed directly on the GPU rather than
# building a Float64 host matrix, converting it and uploading it.
idx = CUDA.zeros(Int32, size(X, 1), size(X, 1))

println("------------------------ SIZE ------------------------")
println("r_max         = $(r_max)")
println("Size X        = $(size(X))")
println("Size idx      = $(size(idx))")

println("--------------------- SIZE CUDA ----------------------")
# 2-D launch: one thread per (i, j) pair, rounded up to whole 32x32 blocks.
threads = (32, 32)
blocks  = cld.(size(X, 1), threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
# The first launch in a session also pays for compiling the kernel.
CUDA.@time @cuda threads=threads blocks=blocks dist_kernel!(idx, X, r_max)
println("------------------------ IDX -------------------------")
# `idx_real` is a second name for the same device array, not a copy.
idx_real = idx
display(idx)


------------------------ SIZE ------------------------
r_max         = 4.0
Size X        = (2504, 3)
Size idx      = (2504, 2504)
--------------------- SIZE CUDA ----------------------
Threads = (32, 32)
Blocks  = (79, 79)
--------------------- CUDA SOLVER --------------------
  9.144280 seconds (25.10 M CPU allocations: 1.434 GiB, 5.62% gc time)
------------------------ IDX -------------------------


2504×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   0   1   1   1   1   0   0  …     0     0     0     0     0     0
  2   2   2   0   2   2   2   2   0        0     0     0     0     0     0
  0   3   3   0   0   3   3   3   0        0     0     0     0     0     0
  4   0   0   4   4   0   0   0   4        0     0     0     0     0     0
  5   5   0   5   5   5   0   0   5        0     0     0     0     0     0
  6   6   6   0   6   6   6   0   0  …     0     0     0     0     0     0
  7   7   7   0   0   7   7   7   0        0     0     0     0     0     0
  0   8   8   0   0   0   8   8   0        0     0     0     0     0     0
  0   0   0   9   9   0   0   0   9        0     0     0     0     0     0
  0   0   0  10  10  10   0   0  10        0     0     0     0     0     0
 11   0   0  11  11  11  11   0   0  …     0     0     0     0     0     0
  0  12   0   0  12  12  12  12   0        0     0     0     0     0     0
  0   0  13   0   0  13  13  13   0        0    

# reduce-test

In [4]:
N = 22
# Random test matrix: every entry of row k is either k or 0.
idx_test = repeat(collect(1:N), inner=(1,N)).*rand((0,1),N,N) |> cu
# One counter and one output column per column of `idx_test`, zeroed directly
# on the GPU (no host-side temporary to convert and upload).
idx_sum  = CUDA.zeros(Int32, 1, size(idx_test, 2))
idx_red  = CUDA.zeros(Int32, max_row, size(idx_test, 2))


println("------------------------ SIZE ------------------------")
println("Size idx_test = $(size(idx_test))")
println("Size idx_sum  = $(size(idx_sum))")
println("Size idx_red  = $(size(idx_red))")
println("------------------------ IDX -------------------------")
display(idx_test)
println("--------------------- SIZE CUDA ----------------------")
# 1-D launch: one thread per column, rounded up to whole blocks.
threads = 1024
blocks  = cld.(size(idx_test, 2), threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_test,idx_red,idx_sum)
println("------------------ REDUCED IDX -----------------------")
display(idx_red)
println("------------------------- MAX VALUE(idx_red) ------------------------")
# Entries per column after compaction; computed once and reused for both outputs.
let counts = sum(idx_red .> 0, dims=1)
    display(counts)
    println("Max_Value  = $(findmax(counts))")
end
println("------------------------ MAX VALUE(idx_test) ------------------------")
# Entries per column of the input; should match the counts printed above.
let counts = sum(idx_test .> 0, dims=1)
    display(counts)
    println("Max_Value  = $(findmax(counts))")
end

------------------------ SIZE ------------------------
Size idx_test = (22, 22)
Size idx_sum  = (1, 22)
Size idx_red  = (60, 22)
------------------------ IDX -------------------------


22×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
  0   1   0   1   1   0   1   0   1  …   0   1   0   1   1   1   1   0   1
  0   2   2   0   0   2   0   2   2      0   2   2   2   0   2   2   0   2
  3   0   3   3   3   0   0   0   3      0   3   0   3   0   0   3   3   3
  0   0   0   4   0   4   4   4   0      4   4   0   0   4   4   4   0   0
  0   5   5   5   0   5   0   5   5      0   0   0   0   0   0   0   5   5
  6   6   0   0   6   0   0   6   6  …   6   0   6   0   6   6   0   6   0
  7   7   0   0   0   0   7   7   7      7   7   7   7   0   0   0   0   0
  8   8   8   8   8   8   0   8   8      0   8   0   0   0   0   8   0   8
  0   9   9   0   9   9   9   0   0      0   0   9   0   9   9   9   0   0
  0  10  10  10   0  10  10  10   0      0   0   0   0   0  10   0   0   0
 11   0   0  11  11   0   0  11  11  …   0   0   0  11  11  11  11  11  11
 12   0   0  12   0  12   0  12  12     12   0  12  12   0  12   0  12   0
  0   0   0   0  13   0   0   0  13     13   0  13  

--------------------- SIZE CUDA ----------------------
Threads = 1024
Blocks  = 1
--------------------- CUDA SOLVER --------------------
  0.607707 seconds (2.45 M CPU allocations: 141.302 MiB, 6.28% gc time)
------------------ REDUCED IDX -----------------------


60×22 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  3   1   2   1   1   2   1   2   1  …   4   1   2   1   1   1   1   3   1
  6   2   3   3   3   4   4   4   2      6   2   6   2   4   2   2   5   2
  7   5   5   4   6   5   7   5   3      7   3   7   3   6   4   3   6   3
  8   6   8   5   8   8   9   6   5     12   4   9   7   9   6   4  11   5
 11   7   9   8   9   9  10   7   6     13   7  12  11  11   9   8  12   8
 12   8  10  10  11  10  14   8   7  …  14   8  13  12  13  10   9  16  11
 15   9  14  11  13  12  16  10   8     16  14  14  15  14  11  11  17  13
 16  10  15  12  22  14  17  11  11     17  15  15  18  16  12  14  18  14
 22  14  16  14   0  15  19  12  12     18  16  17  21  17  15  17   0  15
  0  15  17  15   0  16  20  21  13     19  17  19   0  19  16  19   0  17
  0  16  18  16   0   0  22   0  14  …  20  18  20   0  22  17  21   0  18
  0  21  20  18   0   0   0   0  15     22  20   0   0   0  22  22   0  19
  0   0  21  19   0   0   0   0  18      0  22   0  

------------------------- MAX VALUE(idx_red) ------------------------


1×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 9  12  13  14  8  10  11  10  15  9  11  …  12  13  11  9  11  12  12  8  14

Max_Value  = (18, CartesianIndex(1, 12))
------------------------ MAX VALUE(idx_test) ------------------------


1×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 9  12  13  14  8  10  11  10  15  9  11  …  12  13  11  9  11  12  12  8  14

Max_Value  = (18, CartesianIndex(1, 12))


# reduce-real

In [5]:
# One counter and one output column per column of `idx_real`, zeroed directly
# on the GPU (no host-side Float64 temporary to convert and upload).
idx_sum = CUDA.zeros(Int32, 1, size(idx_real, 2))
idx_red = CUDA.zeros(Int32, max_row, size(idx_real, 2))

println("------------------------ SIZE ------------------------")
println("Size idx_real  = $(size(idx_real))")
println("Size idx_sum   = $(size(idx_sum))")
println("Size idx_red   = $(size(idx_red))")
println("------------------------ IDX -------------------------")
# Show the array that is actually handed to the kernel below.
display(idx_real)
println("--------------------- SIZE CUDA ----------------------")
# 1-D launch: one thread per column, rounded up to whole blocks.
threads = 1024
blocks  = cld.(size(idx_real, 2), threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_real,idx_red,idx_sum)
display(idx_red)


------------------------ SIZE ------------------------
Size idx_real  = (2504, 2504)
Size idx_sum   = (1, 2504)
Size idx_red   = (60, 2504)
------------------------ IDX -------------------------


2504×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   0   1   1   1   1   0   0  …     0     0     0     0     0     0
  2   2   2   0   2   2   2   2   0        0     0     0     0     0     0
  0   3   3   0   0   3   3   3   0        0     0     0     0     0     0
  4   0   0   4   4   0   0   0   4        0     0     0     0     0     0
  5   5   0   5   5   5   0   0   5        0     0     0     0     0     0
  6   6   6   0   6   6   6   0   0  …     0     0     0     0     0     0
  7   7   7   0   0   7   7   7   0        0     0     0     0     0     0
  0   8   8   0   0   0   8   8   0        0     0     0     0     0     0
  0   0   0   9   9   0   0   0   9        0     0     0     0     0     0
  0   0   0  10  10  10   0   0  10        0     0     0     0     0     0
 11   0   0  11  11  11  11   0   0  …     0     0     0     0     0     0
  0  12   0   0  12  12  12  12   0        0     0     0     0     0     0
  0   0  13   0   0  13  13  13   0        0    

--------------------- SIZE CUDA ----------------------
Threads = 1024
Blocks  = 3
--------------------- CUDA SOLVER --------------------
  0.490258 seconds (2.05 M CPU allocations: 117.263 MiB, 6.30% gc time)


60×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   2   1   1   1   1   2   4  …  2362  2363  2364  2372  2373  2374
  2   2   3   4   2   2   2   3   5     2363  2364  2365  2373  2374  2375
  4   3   6   5   4   3   3   7   9     2372  2373  2374  2382  2383  2384
  5   5   7   9   5   5   6   8  10     2373  2374  2375  2383  2384  2385
  6   6   8  10   6   6   7  12  15     2374  2375  2376  2384  2385  2386
  7   7  13  11   9   7   8  13  16  …  2383  2384  2385  2391  2392  2393
 11   8  32  15  10  10  11  14  21     2384  2385  2386  2392  2393  2394
 30  12  33  34  11  11  12  19  40     2444  2445  2446  2452  2453  2454
 31  31  37  35  12  12  13  38  41     2445  2446  2447  2453  2454  2455
 35  32  38  40  16  13  14  39  48     2446  2447  2448  2454  2455  2456
 36  36  39  41  35  17  18  44  49  …  2452  2453  2454  2459  2460  2461
 37  37  43  42  36  36  37  45  50     2453  2454  2455  2460  2461  2462
 41  38  44  48  41  37  38  46  56     2454  2455

# test

In [6]:
# Compare the number of entries per column before (`idx`) and after (`idx_red`)
# compaction. Each reduction is computed once and reused; the `let` keeps the
# temporaries out of the notebook's global scope.
let red_counts = sum(idx_red .> 0, dims=1), ref_counts = sum(idx .> 0, dims=1)
    println("------------------------- MAX VALUE(idx_red) ------------------------")
    display(red_counts)
    println("Max_Value  = $(findmax(red_counts))")
    println("--------------------------- MAX VALUE(idx) --------------------------")
    display(ref_counts)
    println("Max_Value  = $(findmax(ref_counts))")

    # Make the comparison explicit instead of relying on reading the two
    # printouts: a mismatch means `idx_red` lost or gained entries.
    if Array(red_counts) != Array(ref_counts)
        error("idx_red does not hold the same number of entries per column as idx")
    end
end

------------------------- MAX VALUE(idx_red) ------------------------


1×2504 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 26  27  25  26  29  30  30  27  26  …  28  26  29  30  30  27  26  27  25

Max_Value  = (55, CartesianIndex(1, 163))
--------------------------- MAX VALUE(idx) --------------------------


1×2504 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 26  27  25  26  29  30  30  27  26  …  28  26  29  30  30  27  26  27  25

Max_Value  = (55, CartesianIndex(1, 163))
