# Dependences

In [10]:
include("../../src/struct_data.jl")
include("../../src/neighbor.jl")

nearest_neighbors

# Input Data

## Model Parameter

In [11]:
@time model = ModelSet(
    TimeModel(
        tₛᵢₘ  = 10000.0,
        dt    = 0.5,
        nₖₙₙ  = 50,
        nₛₐᵥₑ = 100
    ),
    InputModel(
        outer_ratio = 0.8,
        path_input = "../../data/init/Sphere"
    ),
    OutputModel(
        name_output = "Test_1",
        path_output = ""
    )
)
dump(model)

  0.000000 seconds (4 allocations: 144 bytes)
ModelSet
  Time: TimeModel
    tₛᵢₘ: Float64 10000.0
    dt: Float64 0.5
    nₖₙₙ: Int64 50
    nₛₐᵥₑ: Int64 100
  Input: InputModel
    outer_ratio: Float64 0.8
    path_input: String "../../data/init/Sphere"
  Output: OutputModel
    name_output: String "Test_1"
    path_output: String ""


## Initial Aggregate

In [44]:
@time @start_agg agg = FusionAggregate(
    [
        AggType(
            "HEK", 
            InteractionPar(
                Cubic(1.0,2.0,3.5), 
                ContractilePar(0.01)
            ),
            Float64.(readdlm("../../data/init/Sphere/15.0.xyz")[3:end,2:end]) |> cu
        )
    ], 
    model
)

  0.119977 seconds (48.13 k allocations: 295.484 MiB, 15.05% gc time)


Aggregate(AggType[AggType("HEK", InteractionPar(Cubic{Float64}(1.0, 2.0, 3.5), ContractilePar(0.01)), 15.27f0, Float32[-1.5 -4.62 -13.88; 0.5 -4.62 -13.88; … ; 0.5 4.62 13.88; 2.5 4.62 13.88], CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})], AggIndex([1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  2, 2, 2, 2, 2, 2, 2, 2, 2, 2], ["HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK"  …  "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK", "HEK"]), Float32[-16.77 -4.62 -13.88; -14.77 -4.62 -13.88; … ; 15.77 4.62 13.88; 17.77 4.62 13.88], AggGeometry(Float32[15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27  …  15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27, 15.27], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), AggSimulation(AggParameter(Cubic{CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}(Float32[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.

# Program

In [45]:
points = agg.Position
r_max  = agg.Simulation.Parameter.Force.rₘₐₓ

idx    = agg.Simulation.Neighbor.idx_red
idx_sum = agg.Simulation.Neighbor.idx_sum
dist   = zeros(size(idx)) |> cu

force    = agg.Simulation.Force.F
ParForce = agg.Simulation.Parameter.Force
ParCont  = agg.Simulation.Parameter.Contractile.fₚ
outline  = agg.Geometry.outline
random = zeros(size(force)) |> cu;

In [46]:
function test_kernel!(idx,idx_sum,dist,points,r_max)
    i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

    if i <= size(idx,2)

        # Cleaning index matrices
        idx_sum[i] = 0
        for m = 1:size(idx,1)
            idx[m,i] = 0
        end

        for j = 1:size(points,1)
            d = euclidean(points,i,j) 
            if d < r_max[i]
                if i != j
                    idx_sum[i] += 1
                    idx[idx_sum[i],i] = j
                    dist[idx_sum[i],i] = d
                end
            end
        end

    end
    sync_threads()
    return nothing
end

test_kernel! (generic function with 2 methods)

In [55]:
# Calculating Distance Matrix
threads=(100)
@time @cuda(
    threads=threads,
    blocks=cld.(size(points,1),threads),
    test_kernel!(idx,idx_sum,dist,points,r_max)
)

display(idx_sum)
display(idx)
display(dist)

  0.000064 seconds (35 allocations: 2.062 KiB)


1×5008 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
 19  20  18  19  22  23  23  20  18  …  20  19  22  23  23  20  19  20  18

39×5008 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  2   1   2   1   1   1   1   2   4  …  4877  4878  4879  4887  4888  4889
  4   3   6   5   2   2   2   3   5     4948  4949  4950  4956  4957  4958
  5   5   7   9   4   3   3   7  10     4949  4950  4951  4957  4958  4959
  6   6   8  10   6   5   6  12  15     4950  4951  4952  4958  4959  4960
  7   7  13  11   9   7   8  13  16     4956  4957  4958  4963  4964  4965
 11   8  32  15  10  10  11  14  40  …  4957  4958  4959  4964  4965  4966
 30  12  33  34  11  11  12  19  41     4958  4959  4960  4965  4966  4967
 31  31  37  35  12  12  13  38  48     4959  4960  4961  4966  4967  4968
 35  32  38  40  16  13  14  39  49     4964  4965  4966  4971  4972  4973
 36  36  39  41  35  17  18  44  50     4965  4966  4967  4972  4973  4974
 37  37  43  42  36  36  37  45  56  …  4966  4967  4968  4973  4974  4975
 41  38  44  48  41  37  38  46  57     4972  4973  4974  4976  4977  4978
 42  42  45  49  42  42  43  52  58     4973  4974

39×5008 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 2.0      2.0      2.0      3.46308  …  3.27     3.27     3.27     3.27
 3.46308  2.0      3.46308  2.0         3.46309  3.46309  3.46309  3.46309
 1.99822  3.46308  1.99822  2.00689     2.82719  2.82719  2.82719  2.82719
 1.99822  1.99822  1.99822  2.00689     3.46309  3.46309  3.46309  3.46309
 3.46308  1.99822  3.47     3.46808     3.46313  3.46313  3.46313  3.46313
 3.47     3.46308  3.4654   3.47     …  1.99832  1.99832  1.99832  1.99832
 3.4654   3.47     3.4654   3.45707     1.99832  1.99832  1.99832  1.99832
 3.4654   3.4654   2.82478  3.45707     3.46313  3.46313  3.46313  3.46313
 2.82478  3.4654   1.99484  2.82478     2.82478  2.82478  2.82478  2.82478
 1.99484  2.82478  2.82478  1.99484     1.99484  1.99484  1.99484  1.99484
 2.82478  1.99484  3.46313  2.82478  …  2.82478  2.82478  2.82478  2.82478
 3.46313  2.82478  1.99832  3.46313     3.45707  3.4654   3.4654   3.4654
 1.99832  3.46313  1.99832  1.99832     3.45707  3.4

In [58]:
findmax(idx_sum)

(38, CartesianIndex(1, 121))

In [69]:
dist[:,67]

39-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 3.4564288
 2.8190248
 3.4564288
 1.9954449
 1.9954449
 3.4614737
 2.000625
 2.828869
 3.4654005
 3.46
 3.4630766
 1.9982243
 1.9982243
 ⋮
 3.466194
 2.8346431
 2.008781
 2.8346431
 3.4701154
 3.4701154
 3.2700005
 0.0
 0.0
 0.0
 0.0
 0.0

In [67]:
euclidean(points,121,42)

│ Invocation of getindex resulted in scalar indexing of a GPU array.
│ This is typically caused by calling an iterating implementation of a method.
│ Such implementations *do not* execute on the GPU, but very slowly on the CPU,
│ and therefore are only permitted from the REPL for prototyping purposes.
│ If you did intend to index this array, annotate the caller with @allowscalar.
└ @ GPUArraysCore /home/nicomosty/.julia/packages/GPUArraysCore/lojQM/src/GPUArraysCore.jl:90


2.0064898f0

In [66]:
display(points[121,:]')
display(points[42,:]')

1×3 adjoint(::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) with eltype Float32:
 -16.77  -4.62  -10.61

1×3 adjoint(::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}) with eltype Float32:
 -17.77  -4.04  -12.25

# Later

In [16]:
# println("------------------------ SIZE ------------------------")
# println("r_max         = $(FusionAGG.Type[1].Interaction.Force.rₘₐₓ)")
# println("Size X        = $(size(FusionAGG.Position))")
# println("Size idx      = $(size(FusionAGG.Simulation.Neighbor.idx))")
# println("Size idx_real  = $(size(FusionAGG.Simulation.Neighbor.idx))")
# println("Size idx_sum   = $(size(FusionAGG.Simulation.Neighbor.idx_sum))")
# println("Size idx_red   = $(size(FusionAGG.Simulation.Neighbor.idx_red))")
# @time nearest_neighbors(FusionAGG)
# println("------------------------ IDX -------------------------")
# println("idx")
# display(FusionAGG.Simulation.Neighbor.idx)
# println("idx_red")
# display(FusionAGG.Simulation.Neighbor.idx_red)
# println("idx_sum")
# display(FusionAGG.Simulation.Neighbor.idx_sum)
# println("idx_cont")
# display(FusionAGG.Simulation.Neighbor.idx_cont)

In [17]:
# function reduce_kernel_temp!(idx,idx_red,idx_sum)
#     i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

#     if i <= size(idx,1)

#         # Cleaning
#         idx_sum[i] = 0
#         for m = 1:size(idx_red,1)
#             idx_red[m,i] = 0
#         end

#         for j = 1:size(idx,1)
#             if idx[j,i] != 0
#                 idx_sum[i] += 1
#                 idx_red[idx_sum[i],i] = j
#             end
#         end

#     end
#     # sync_threads()
#     return nothing
# end

In [18]:
# # Reducing Distance Matrix to Nearest Neighbors
# threads = 64
# @time @cuda(
#     threads=threads,
#     blocks=cld.(size(points,1),threads),
#     reduce_kernel_temp!(idx,idx_red,idx_sum)
# )

# display(idx_sum)
# display(idx_red)