# Dependencies

In [2]:
using CUDA
using DelimitedFiles
# Tag used to build the input file name (`$(R_Agg).xyz`).
R_Agg = 15

# Coordinates: skip the first two rows and the first column of the table (presumably
# the .xyz header lines and the label column — verify against the data file).
# NOTE(review): `cu` stores floating-point arrays as Float32 by default, so the Float64
# conversion does not survive the upload — confirm single precision is intended.
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu
# N×N Int32 buffer, zero-filled directly on the device. This avoids building a host-side
# Float64 matrix, converting it to Int32 and then copying it to the GPU.
idx = CUDA.zeros(Int32, size(X, 1), size(X, 1));

# Program

## Functions

In [31]:
"""
    euclidean(points, i, j)

Return the straight-line distance between rows `i` and `j` of `points`, using the first
three columns of each row as the coordinates.
"""
function euclidean(points, i, j)
    dx = points[i, 1] - points[j, 1]
    dy = points[i, 2] - points[j, 2]
    dz = points[i, 3] - points[j, 3]
    return sqrt(dx^2 + dy^2 + dz^2)
end

"""
    dist_kernel!(idx, points, r_max)

GPU kernel: each thread handles one `(row, col)` pair derived from its x/y block and
thread indices. If the distance between rows `row` and `col` of `points` (as computed by
`euclidean`) is below `r_max`, `idx[row, col]` is set to `row`; otherwise it is set to 0.
Pairs falling outside `size(points, 1)` in either direction are skipped.
"""
function dist_kernel!(idx, points ,r_max)
    row = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    col = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    n = size(points, 1)

    # Threads mapped past the n×n range have nothing to write.
    (row > n || col > n) && return nothing

    idx[row, col] = euclidean(points, row, col) < r_max ? row : 0
    return nothing
end

"""
    reduce_kernel(idx, idx_red, idx_sum)

GPU kernel that compacts the non-zero entries of `idx` column by column. Each thread
handles one column `col` (derived from its x block/thread index) and scans the rows in
order. For every non-zero `idx[row, col]` it increments the column counter and stores
`row` at `idx_red[counter, col]`. The final counter is written to `idx_sum[col]`.

Counting starts from the value already stored in `idx_sum[col]`, so callers should pass a
zeroed `idx_sum` for a fresh reduction. `idx_red` needs at least as many rows as the
largest per-column count; a larger count indexes past the end of `idx_red`.
"""
function reduce_kernel(idx,idx_red,idx_sum)
    col = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # `col` indexes the second dimension of `idx`, so it is bounded by the column count.
    # (Identical to the row count for the square matrices used in this notebook.)
    if col <= size(idx, 2)
        # Keep the running count in a local and store it once at the end, instead of a
        # read-modify-write of `idx_sum[col]` for every hit.
        filled = idx_sum[col]
        for row = 1:size(idx, 1)
            if idx[row, col] != 0
                filled += one(filled)   # `one(filled)` keeps the counter's type stable
                idx_red[filled, col] = row
            end
        end
        idx_sum[col] = filled
    end

    return nothing
end

reduce_kernel (generic function with 1 method)

: 

## Idx

In [4]:
# Distance cutoff passed to dist_kernel!. It was not defined anywhere in the notebook, so
# this cell failed in a fresh session; 2.3 is the value shown in the recorded output.
r_max = 2.3

println("------------------------ SIZE ------------------------")
println("r_max         = $(r_max)")
println("Size X        = $(size(X))")
println("Size idx      = $(size(idx))")

println("--------------------- SIZE CUDA ----------------------")
# 2D launch: one thread per (i, j) pair, with enough blocks to cover size(X, 1) in both
# directions.
threads =(32,32)
blocks  =cld.(size(X,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks dist_kernel!(idx, X ,r_max)
println("------------------------ IDX -------------------------")
# `idx_real` is a second name for the same device array, not a copy.
idx_real = idx
display(idx)


------------------------ SIZE ------------------------
r_max         = 2.3
Size X        = (2504, 3)
Size idx      = (2504, 2504)
--------------------- SIZE CUDA ----------------------
Threads = (32, 32)
Blocks  = (79, 79)
--------------------- CUDA SOLVER --------------------
 21.928154 seconds (25.83 M CPU allocations: 1.473 GiB, 2.74% gc time)
------------------------ IDX -------------------------


2504×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
 1  1  0   0   1   1   0   0   0   0  …     0     0     0     0     0     0
 2  2  2   0   0   2   2   0   0   0        0     0     0     0     0     0
 0  3  3   0   0   0   3   3   0   0        0     0     0     0     0     0
 0  0  0   4   4   0   0   0   4   4        0     0     0     0     0     0
 5  0  0   5   5   5   0   0   0   5        0     0     0     0     0     0
 6  6  0   0   6   6   6   0   0   0  …     0     0     0     0     0     0
 0  7  7   0   0   7   7   7   0   0        0     0     0     0     0     0
 0  0  8   0   0   0   8   8   0   0        0     0     0     0     0     0
 0  0  0   9   0   0   0   0   9   9        0     0     0     0     0     0
 0  0  0  10  10   0   0   0  10  10        0     0     0     0     0     0
 0  0  0   0  11  11   0   0   0  11  …     0     0     0     0     0     0
 0  0  0   0   0  12  12   0   0   0        0     0     0     0     0     0
 0  0  0   0   0   0  13  13   0   0

## reduce-test

In [5]:
N = 22
# Row capacity of the compacted table. It was not defined anywhere in the notebook; 60 is
# the value visible in the recorded output (idx_red is 60×22).
max_row = 60
# Random N×N test matrix: entry (k, j) is either k or 0.
idx_test = repeat(collect(1:N), inner=(1,N)).*rand((0,1),N,N) |> cu
# Zero-filled Int32 buffers allocated directly on the device.
idx_sum  = CUDA.zeros(Int32, 1, size(idx_test, 1))
idx_red  = CUDA.zeros(Int32, max_row, size(idx_test, 1))


println("------------------------ SIZE ------------------------")
println("Size idx_test = $(size(idx_test))")
println("Size idx_sum  = $(size(idx_sum))")
println("Size idx_red  = $(size(idx_red))")
println("------------------------ IDX -------------------------")
display(idx_test)
println("--------------------- SIZE CUDA ----------------------")
# 1D launch: one thread per column of idx_test.
threads=1024
blocks=cld.(size(idx_test,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_test,idx_red,idx_sum)
println("------------------ REDUCED IDX -----------------------")
display(idx_red)
# Per-column non-zero counts of the compacted table and of the input should agree.
# Each is computed once and reused for both the display and the maximum.
println("------------------------- MAX VALUE(idx_red) ------------------------")
counts_red = sum(idx_red .> 0, dims=1)
display(counts_red)
println("Max_Value  = $(findmax(counts_red))")
println("------------------------ MAX VALUE(idx_test) ------------------------")
counts_test = sum(idx_test .> 0, dims=1)
display(counts_test)
println("Max_Value  = $(findmax(counts_test))")

------------------------ SIZE ------------------------
Size idx_test = (22, 22)
Size idx_sum  = (1, 22)
Size idx_red  = (60, 22)
------------------------ IDX -------------------------


22×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
  0   0   0   0   0   1   0   0   0  …   1   0   0   0   1   0   1   1   1
  2   2   2   2   2   2   2   2   0      2   2   0   2   0   0   2   2   2
  0   0   0   0   3   0   3   3   0      0   3   0   3   3   3   3   3   0
  0   0   4   4   4   4   4   0   4      0   0   0   0   0   4   4   4   0
  5   0   0   0   5   0   5   0   5      5   0   0   5   5   0   0   0   5
  0   0   6   0   6   0   0   0   0  …   6   6   6   0   0   6   0   6   0
  0   0   0   7   0   0   7   7   0      0   7   0   7   7   7   7   7   0
  8   0   0   0   0   0   8   0   0      8   0   8   8   8   8   0   0   0
  0   9   9   9   0   9   9   0   9      0   0   9   9   9   0   0   0   9
  0   0  10  10  10  10   0   0   0      0  10  10  10  10   0   0   0  10
 11  11  11  11   0   0   0   0   0  …  11   0   0  11  11   0  11   0  11
  0  12  12   0   0  12  12  12  12      0  12  12  12  12  12  12  12   0
  0   0   0  13  13  13  13   0   0     13   0   0  

--------------------- SIZE CUDA ----------------------
Threads = 1024
Blocks  = 1
--------------------- CUDA SOLVER --------------------
  1.395057 seconds (2.62 M CPU allocations: 150.491 MiB, 4.80% gc time)
------------------ REDUCED IDX -----------------------


60×22 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  2   2   2   2   2   1   2   2   4  …   1   2   6   2   1   3   1   1   1
  5   9   4   4   3   2   3   3   5      2   3   8   3   3   4   2   2   2
  8  11   6   7   4   4   4   7   9      5   6   9   5   5   6   3   3   5
 11  12   9   9   5   9   5  12  12      6   7  10   7   7   7   4   4   9
 14  15  10  10   6  10   7  18  14      8  10  12   8   8   8   7   6  10
 16  16  11  11  10  12   8  19  15  …  11  12  14   9   9  12  11   7  11
 17  18  12  13  13  13   9  21  17     13   0  16  10  10  14  12  12  14
 18  19  15  15  18  18  12  22  19     17   0  21  11  11  15  13  14  15
 20  22  17  18  22  22  13   0   0      0   0  22  12  12  16  16  16  18
 21   0  18  19   0   0  15   0   0      0   0   0  13  14  17  18  17  19
  0   0  19  20   0   0  16   0   0  …   0   0   0  14  17  18  21  19  20
  0   0  21   0   0   0  17   0   0      0   0   0  17  18  19   0   0   0
  0   0   0   0   0   0  22   0   0      0   0   0  

------------------------- MAX VALUE(idx_red) ------------------------


1×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 10  9  12  11  9  9  13  8  8  6  15  9  10  8  6  9  15  15  13  11  11  11

Max_Value  = (15, CartesianIndex(1, 11))
------------------------ MAX VALUE(idx_test) ------------------------


1×22 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 10  9  12  11  9  9  13  8  8  6  15  9  10  8  6  9  15  15  13  11  11  11

Max_Value  = (15, CartesianIndex(1, 11))


## reduce-real

In [6]:
# Zero-filled Int32 buffers allocated directly on the device. `max_row` (row capacity of
# idx_red) must already be defined in the session; the recorded output shows 60.
idx_sum = CUDA.zeros(Int32, 1, size(idx_real, 1))
idx_red = CUDA.zeros(Int32, max_row, size(idx_real, 1))

println("------------------------ SIZE ------------------------")
println("Size idx_real  = $(size(idx_real))")
println("Size idx_sum   = $(size(idx_sum))")
println("Size idx_red   = $(size(idx_red))")
println("------------------------ IDX -------------------------")
# Show the matrix that is actually reduced below. `idx` and `idx_real` only refer to the
# same array until `idx_real` is rebound by another cell.
display(idx_real)
println("--------------------- SIZE CUDA ----------------------")
# 1D launch: one thread per column of idx_real.
threads=1024
blocks=cld.(size(idx_real,1),threads)
println("Threads = $(threads)")
println("Blocks  = $(blocks)")
println("--------------------- CUDA SOLVER --------------------")
CUDA.@time @cuda threads=threads blocks=blocks reduce_kernel(idx_real,idx_red,idx_sum)
display(idx_red)


------------------------ SIZE ------------------------
Size idx_real  = (2504, 2504)
Size idx_sum   = (1, 2504)
Size idx_red   = (60, 2504)
------------------------ IDX -------------------------


2504×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
 1  1  0   0   1   1   0   0   0   0  …     0     0     0     0     0     0
 2  2  2   0   0   2   2   0   0   0        0     0     0     0     0     0
 0  3  3   0   0   0   3   3   0   0        0     0     0     0     0     0
 0  0  0   4   4   0   0   0   4   4        0     0     0     0     0     0
 5  0  0   5   5   5   0   0   0   5        0     0     0     0     0     0
 6  6  0   0   6   6   6   0   0   0  …     0     0     0     0     0     0
 0  7  7   0   0   7   7   7   0   0        0     0     0     0     0     0
 0  0  8   0   0   0   8   8   0   0        0     0     0     0     0     0
 0  0  0   9   0   0   0   0   9   9        0     0     0     0     0     0
 0  0  0  10  10   0   0   0  10  10        0     0     0     0     0     0
 0  0  0   0  11  11   0   0   0  11  …     0     0     0     0     0     0
 0  0  0   0   0  12  12   0   0   0        0     0     0     0     0     0
 0  0  0   0   0   0  13  13   0   0

--------------------- SIZE CUDA ----------------------
Threads = 1024
Blocks  = 3
--------------------- CUDA SOLVER --------------------
  1.165164 seconds (2.29 M CPU allocations: 130.718 MiB, 3.19% gc time)


60×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   2   4   1   1   2   3   4  …  2453  2454  2455  2460  2461  2462
  2   2   3   5   4   2   3   7   9     2454  2455  2456  2461  2462  2463
  5   3   7   9   5   5   6   8  10     2461  2462  2463  2468  2469  2470
  6   6   8  10   6   6   7  13  15     2493  2494  2495  2498  2499  2500
 36   7  38  41  10   7   8  14  49     2494  2495  2496  2499  2500  2501
 42  37  44  49  11  11  12  45  57  …  2498  2499  2500  2502  2502  2503
 43  43  45  50  42  12  13  53  58     2499  2500  2501  2503  2503  2504
  0  44   0   0  50  43  44  54   0     2500  2501  2504     0  2504     0
  0   0   0   0  51  51  52   0   0     2502  2503     0     0     0     0
  0   0   0   0   0  52  53   0   0     2503  2504     0     0     0     0
  0   0   0   0   0   0   0   0   0  …     0     0     0     0     0     0
  0   0   0   0   0   0   0   0   0        0     0     0     0     0     0
  0   0   0   0   0   0   0   0   0        0     0

# test

In [28]:
R_Agg = 15

# The coordinates do not depend on r_max: read and upload them once instead of on every
# step of the sweep.
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu
n_points = size(X, 1)

for r_max=2.0:0.05:4.0

    # Row capacity of idx_red for this cutoff. Each test is only reached when the previous
    # one failed, so the lower bounds are implied.
    idx_red_size = r_max ≤ 2.80 ? 13 :
                   r_max ≤ 3.45 ? 21 :
                   r_max ≤ 3.80 ? 39 :
                   r_max ≤ 4.00 ? 55 :
                   70

    println("------------------------------ r_max = $(r_max)-------------------------------------")
    # Fresh zero-filled device buffers for every cutoff.
    idx_real = CUDA.zeros(Int32, n_points, n_points)
    idx_sum  = CUDA.zeros(Int32, 1, n_points)
    idx_red  = CUDA.zeros(Int32, idx_red_size, n_points)

    # Pairwise pass: 2D launch, one thread per (i, j) pair.
    threads =(32,32)
    blocks  =cld.(size(X,1),threads)
    @cuda threads=threads blocks=blocks dist_kernel!(idx_real, X ,r_max)
    # Compaction pass: 1D launch, one thread per column.
    threads=1024
    blocks=cld.(size(idx_real,1),threads)
    @cuda threads=threads blocks=blocks reduce_kernel(idx_real,idx_red,idx_sum)
    println("Max_Value$(r_max)  = $(findmax(sum(idx_red .> 0, dims=1))[1])")
end


------------------------------ r_max = 2.0-------------------------------------
Max_Value2.0  = 11
------------------------------ r_max = 2.05-------------------------------------
Max_Value2.05  = 13
------------------------------ r_max = 2.1-------------------------------------
Max_Value2.1  = 13
------------------------------ r_max = 2.15-------------------------------------
Max_Value2.15  = 13
------------------------------ r_max = 2.2-------------------------------------
Max_Value2.2  = 13
------------------------------ r_max = 2.25-------------------------------------
Max_Value2.25  = 13
------------------------------ r_max = 2.3-------------------------------------
Max_Value2.3  = 13
------------------------------ r_max = 2.35-------------------------------------
Max_Value2.35  = 13
------------------------------ r_max = 2.4-------------------------------------
Max_Value2.4  = 13
------------------------------ r_max = 2.45-------------------------------------
Max_Value2.45  = 13


In [29]:
# Show the compacted table currently bound to `idx_red` (55 rows in the recorded output).
idx_red

55×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   2   1   1   1   1   2   4  …  2362  2363  2364  2372  2373  2374
  2   2   3   4   2   2   2   3   5     2363  2364  2365  2373  2374  2375
  4   3   6   5   4   3   3   7   9     2372  2373  2374  2382  2383  2384
  5   5   7   9   5   5   6   8  10     2373  2374  2375  2383  2384  2385
  6   6   8  10   6   6   7  12  15     2374  2375  2376  2384  2385  2386
  7   7  13  11   9   7   8  13  16  …  2383  2384  2385  2391  2392  2393
 11   8  32  15  10  10  11  14  21     2384  2385  2386  2392  2393  2394
 30  12  33  34  11  11  12  19  40     2444  2445  2446  2452  2453  2454
 31  31  37  35  12  12  13  38  41     2445  2446  2447  2453  2454  2455
 35  32  38  40  16  13  14  39  48     2446  2447  2448  2454  2455  2456
 36  36  39  41  35  17  18  44  49  …  2452  2453  2454  2459  2460  2461
 37  37  43  42  36  36  37  45  50     2453  2454  2455  2460  2461  2462
 41  38  44  48  41  37  38  46  56     2454  2455