# Dependences

In [98]:
using CUDA
using DelimitedFiles
using Test
using BenchmarkTools

# initial data
R_Agg = 13
r_max = 3.0
n_knn = 50
idx_red_size =  r_max ≤ 2.80 ? 13 :
            2.80 < r_max ≤ 3.45 ? 21 :
            3.45 < r_max ≤ 3.80 ? 39 :
            3.80 < r_max ≤ 4.00 ? 55 :
            70
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu

# kNN
idx      = Int32.(zeros(size(X, 1), size(X, 1))) |> cu;
idx_sum  = Int32.(zeros(1, size(idx, 1))) |> cu
idx_red  = Int32.(zeros(idx_red_size, size(idx, 1))) |> cu
idx_contractile =Int32.(zeros(n_knn,size(X,1))) |> cu

# forces
force = CUDA.zeros(size(X));

# Generic Functions

In [99]:
# force_func
cubic(A,r_max,r_min,dist) = - A * (dist-r_max)^2* (dist-r_min)

# distances
euclidean(points,i,j) = sqrt((points[i,1]-points[j,1])^2+(points[i,2]-points[j,2])^2+(points[i,3]-points[j,3])^2)

euclidean (generic function with 1 method)

# idx

## functions

In [100]:
function dist_kernel!(idx, points ,r_max)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    # Limiting data inside matrix
    if i <= size(points, 1) && j <= size(points, 1)
        if euclidean(points,i,j) < r_max
            idx[i, j] = i
        else
            idx[i, j] = 0
        end 
    end
    return nothing
end

function reduce_kernel(idx,idx_red,idx_sum)
    # Defining Index for kernel
    i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # Limiting data inside matrix
    if i <= size(idx,1)
        # Cleaning idx_sum
        idx_sum[i] = 0
        
        # looping on each row for searching non-zero values
        for j = 1:size(idx,1)
            if idx[j,i] != 0
                idx_sum[i] += 1
                idx_red[idx_sum[i],i] = j
            end
        end
    end
    
    return nothing
end

function index_contractile!(idx_contractile,idx_sum,idx_red)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    # Limiting data inside matrix
    if i <= size(idx_contractile, 1) && j <= size(idx_contractile,2)
        idx_contractile[i,j] = idx_red[rand(1:idx_sum[j]),j]
    end
    return nothing
end

function nearest_neighbors(idx, idx_red, idx_sum, idx_contractile, points ,r_max)
    # Calculating Distance Matrix
    threads =(32,32)
    blocks  =cld.(size(points,1),threads)
    @cuda threads=threads blocks=blocks dist_kernel!(idx, points ,r_max)

    # Reducing Distance Matrix to Nearest Neighbors
    threads=1024
    blocks=cld.(size(idx,1),threads)
    @cuda threads=threads blocks=blocks reduce_kernel(idx,idx_red,idx_sum)

    # Finding index contractile
    threads =(32,32)
    blocks  =cld.(size(X,1),threads)
    @cuda threads=threads blocks=blocks index_contractile!(idx_contractile,idx_sum,idx_red)
end

nearest_neighbors (generic function with 1 method)

## running

In [101]:
println("---------------------------------- VARIABLES ------------------------------------")
println("R_Agg = $(R_Agg) | R_Max = $(r_max) | col_size_idx = $(idx_red_size)")
display(X)
println("------------------------------------ SIZES --------------------------------------")
println("Size → idx     = $(size(idx))")
println("     → idx_sum = $(size(idx_sum))")
println("     → idx_red = $(size(idx_red))")
println("------------------------------------ RESULTS ------------------------------------")
@btime nearest_neighbors(idx, idx_red, idx_sum,idx_contractile, X ,r_max)
println("----------------------------------- IDX RED -------------------------------------")
display(idx_red)
println("----------------------------------- IDX SUM -------------------------------------")
display(idx_sum)
println("------------------------------- IDX CONTRACTILE ---------------------------------")
display(idx_contractile)
println("---------------------------------- Memory Used ----------------------------------")
CUDA.memory_status()  

---------------------------------- VARIABLES ------------------------------------
R_Agg = 13 | R_Max = 3.0 | col_size_idx = 21


1620×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 -0.5  -4.04  -12.25
  1.5  -4.04  -12.25
 -3.5  -2.31  -12.25
 -1.5  -2.31  -12.25
  0.5  -2.31  -12.25
  2.5  -2.31  -12.25
 -2.5  -0.58  -12.25
 -0.5  -0.58  -12.25
  1.5  -0.58  -12.25
  3.5  -0.58  -12.25
 -3.5   1.15  -12.25
 -1.5   1.15  -12.25
  0.5   1.15  -12.25
  ⋮           
  0.5  -1.15   12.25
  2.5  -1.15   12.25
 -2.5   0.58   12.25
 -0.5   0.58   12.25
  1.5   0.58   12.25
  3.5   0.58   12.25
 -3.5   2.31   12.25
 -1.5   2.31   12.25
  0.5   2.31   12.25
  2.5   2.31   12.25
 -0.5   4.04   12.25
  1.5   4.04   12.25

------------------------------------ SIZES --------------------------------------
Size → idx     = (1620, 1620)
     → idx_sum = (1, 1620)
     → idx_red = (21, 1620)
------------------------------------ RESULTS ------------------------------------
  13.273 μs (77 allocations: 4.03 KiB)
----------------------------------- IDX RED -------------------------------------


21×1620 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   3   1   1   2   3   4   5  …  1580  1581  1582  1583  1589  1590
  2   2   4   3   2   5   4   5   6     1581  1582  1583  1584  1590  1591
  4   5   7   4   4   6   7   7   8     1582  1583  1584  1585  1591  1592
  5   6  23   5   5   9   8   8   9     1588  1589  1590  1591  1596  1597
 19  20  29   7   6  10  11   9  10     1589  1590  1591  1592  1597  1598
 24  25  30   8   8  26  12  12  13  …  1595  1596  1597  1598  1601  1602
 25  26  35  24   9  32  30  13  14     1611  1611  1612  1613  1616  1617
 30  31  36  30  25  33  36  31  32     1615  1612  1613  1614  1617  1618
 31  32  37  31  31  38  37  37  38     1616  1615  1616  1617  1619  1619
 32  33   0  36  32  39  43  38  39        0  1616  1617  1618  1620  1620
  0   0   0  37  37  40  44  44  45  …     0  1617  1618  1620     0     0
  0   0   0  38  38   0  45  45  46        0  1619  1619     0     0     0
  0   0   0   0  39   0   0  46  47        0     0

----------------------------------- IDX SUM -------------------------------------


1×1620 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
 10  10  9  12  13  11  12  13  13  10  …  13  13  10  9  12  13  11  10  10

------------------------------- IDX CONTRACTILE ---------------------------------


50×1620 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  5   1  23   4  37  32  11   7  13  …  1616  1611  1612  1583  1616  1619
  5  26  23   7   5  40   7   8  13     1581  1617  1590  1583  1589  1620
 30  20  23   8   5   2  11  38  10     1580  1581  1590  1614  1619  1590
  2   6   7   1   1  39   3  37  47     1580  1611  1619  1613  1620  1602
  5   2  30   5   2  40   7   7   8     1616  1617  1591  1591  1601  1598
 31   6  36  24  37   9  36  31  45  …  1616  1583  1618  1617  1619  1590
  5   6  36   5  38  40  30   7  14     1589  1589  1590  1585  1620  1618
 30  32  23  37   9   6   7  46  38     1581  1581  1590  1613  1590  1591
  2   2  29   1  39  32   3  31  10     1581  1611  1582  1618  1591  1619
 24   5  29  31   9   9  30  13  39     1595  1583  1619  1583  1620  1620
 24  32  23   1   2  39   3  37   6  …  1615  1615  1613  1585  1597  1591
 30  31  36  24  31  32   3  44  10     1581  1616  1584  1620  1596  1591
  1   6  29   3  32  39  36  37  46     1589  1612

---------------------------------- Memory Used ----------------------------------
Effective GPU memory usage: 11.22% (224.562 MiB/1.955 GiB)
No memory pool is in use.

# forces

## functions

In [102]:
function sum_force!(idx,idx_contractile,points,force, force_func,A,r_max,r_min,A_con,t_knn)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    # Limiting data inside matrix
    if i <= size(points, 1) && k <= size(points, 2)

        # Cleaning idx_sum
        force[i,k] = 0
        dist = 0

        # Iterate on each row
        for j=1:size(idx,1)

            # Finding forces
            if idx[j,i] != i && idx[j,i] != 0
                dist = euclidean(points,i,idx[j,i])
                force[i,k] +=force_func(A,r_max,r_min,dist) * points[idx[j,i],k] / dist
            end
            
        end

        # Adding Contrractile Force
        if idx_contractile[t_knn,i] != i && idx_contractile[t_knn,i] != 0
            dist = euclidean(points,i,idx_contractile[t_knn,i])
            force[i,k] += A_con*points[idx_contractile[t_knn,i],k]/dist
        end
        
    end
    return nothing
end

sum_force! (generic function with 1 method)

## running

In [103]:
dt = 0.5
A     = 0.1
A_con = 0.001
r_min = 2.0
r_max = r_max

t_knn = 1

1

In [104]:
# GPU
threads =(100,3)
blocks  =cld.(size(X,1),threads)
@cuda threads=threads blocks=blocks sum_force!(idx_red,idx_contractile, X ,force, cubic,A,r_max,r_min,A_con,Int(t_knn))

CUDA.HostKernel{typeof(sum_force!), Tuple{CuDeviceMatrix{Int32, 1}, CuDeviceMatrix{Int32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}, typeof(cubic), Float64, Float64, Float64, Float64, Int64}}(sum_force!, CuFunction(Ptr{CUDA.CUfunc_st} @0x0000000013cf13e0, CuModule(Ptr{CUDA.CUmod_st} @0x0000000013ddf8d0, CuContext(0x0000000004361960, instance 310f24a47c9247f0))), CUDA.KernelState(Ptr{Nothing} @0x0000000203400000))

## Pending (Test with CPU)

In [105]:
# CPU
X_CPU = Matrix(X)
idx_red_CPU = Matrix(idx_red)
idx_contractile_CPU = Matrix(idx_contractile)
force_CPU = zeros(size((X)))


for j = 1:size(idx_red_CPU,1)
    for i =1:size(X,1)
        if idx_red_CPU[j,i] != 0 && idx_red_CPU[j,i] != i
            dist = euclidean(X_CPU,i,idx_red_CPU[j,i])
            force_CPU[i,:] += cubic.(A,r_max,r_min,dist) .* X_CPU[idx_red_CPU[j,i],:] ./ dist
        end
    end
end

for i =1:size(X,1)
    if idx_contractile_CPU[t_knn,i] != 0 && idx_contractile_CPU[t_knn,i] != i
        dist = euclidean(X_CPU,i,idx_contractile_CPU[t_knn,i])
        force_CPU[i,:] += A_con .* X_CPU[idx_contractile_CPU[t_knn,i],:] ./ dist
    end
end

In [106]:
# testing
@test Matrix(force) ≈ force_CPU atol=0.01

[32m[1mTest Passed[22m[39m

In [107]:
display(force)
display(force_CPU)

1620×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
  0.00180784    0.0118934     0.0266297
 -0.00492285    0.0110295     0.0266351
  0.0107864     0.00629834    0.0322816
  0.00493564    0.00751712    0.0338418
 -0.00210006    0.00675113    0.0290064
 -0.00741904    0.00607679    0.028554
  0.00540399    0.00196914    0.0236526
  0.000181109   0.00110363    0.0236581
 -0.0040431     0.00196914    0.0236526
 -0.0100704     0.00120708    0.0258431
  0.0109036    -0.00520964    0.0318804
  0.00444228   -0.00565993    0.0357698
 -0.00214581   -0.00450937    0.0349612
  ⋮                          
 -0.0021469     0.00623967   -0.0349348
 -0.0075479     0.00526136   -0.030788
  0.0064069    -0.000819668  -0.0244877
  0.000549163  -0.00159824   -0.0260401
 -0.0040431    -0.00196914   -0.0236526
 -0.00907805   -0.00178581   -0.0266862
  0.0112719    -0.00677414   -0.0299018
  0.00368453   -0.00722686   -0.0277114
 -0.00182112   -0.00686657   -0.0266189
 -0.00799023   -0.00711142   -0.0300989
 

1620×3 Matrix{Float64}:
  0.00180784    0.0118934     0.0266297
 -0.00492285    0.0110295     0.0266351
  0.0107864     0.00629834    0.0322816
  0.00493564    0.00751712    0.0338418
 -0.00210006    0.00675113    0.0290064
 -0.00741904    0.00607679    0.028554
  0.00540399    0.00196914    0.0236527
  0.000181109   0.00110363    0.0236581
 -0.0040431     0.00196914    0.0236527
 -0.0100704     0.00120708    0.025843
  0.0109036    -0.00520964    0.0318805
  0.00444228   -0.00565993    0.0357698
 -0.00214581   -0.00450937    0.0349612
  ⋮                          
 -0.0021469     0.00623967   -0.0349348
 -0.0075479     0.00526136   -0.030788
  0.0064069    -0.000819668  -0.0244877
  0.000549164  -0.00159824   -0.0260401
 -0.0040431    -0.00196914   -0.0236527
 -0.00907805   -0.00178581   -0.0266862
  0.0112719    -0.00677414   -0.0299018
  0.00368453   -0.00722686   -0.0277114
 -0.00182112   -0.00686657   -0.0266189
 -0.00799022   -0.00711142   -0.0300989
  0.00080695   -0.0118934    