In [1]:
using Pkg
Pkg.activate(".")
# Pkg.instantiate()
# Pkg.add("CUDAKernels")

[32m[1m  Activating[22m[39m project at `c:\Users\wenbl13\Desktop\Wenbo_Timing`


In [2]:
using CSV
using CUDA
using Dates
using Tullio
using DataFrames
using Statistics
using CUDAKernels
using LinearAlgebra
using ChainRulesCore
using BenchmarkTools 
using KernelAbstractions
using DistanceTransforms
using FastAI, FastVision, Flux

In [3]:
ks = DistanceTransforms.get_GPU_kernels(Wenbo())
size(ks)

GPU threads = 768.


(11,)

# Loss functions

## Not in loop

In [4]:
# input format = d1 * ... * dn, (n <= 3)
function hausdorff_loss_cpu_Wenbo(ŷ, y)
    # DT
    pred_mask_dtm = DistanceTransforms.transform(ŷ, Wenbo(), 16)
    mask_dtm = DistanceTransforms.transform(y, Wenbo(), 16)
    # Loss
    temp = ((ŷ .- y) .^ 2) .* (pred_mask_dtm .+ mask_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_cpu_Wenbo (generic function with 1 method)

In [5]:
# input format = d1 * ... * dn, (n <= 3)
function hausdorff_loss_gpu_Wenbo(ŷ::CuArray, y::CuArray)
    # DT
    pred_mask_dtm = DistanceTransforms.transform(ŷ, Wenbo(), ks)
    mask_dtm = DistanceTransforms.transform(y, Wenbo(), ks)
    # Loss
    temp = ((ŷ .- y) .^ 2) .* (pred_mask_dtm .+ mask_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_gpu_Wenbo (generic function with 1 method)

In [6]:
# input format = d1 * ... * dn, (n <= 3)
function hausdorff_loss_cpu_Maurer(ŷ, y)
    # DT
    pred_mask_dtm = DistanceTransforms.transform(round.(ŷ), Maurer()) .^ 2
    mask_dtm = DistanceTransforms.transform(y, Maurer()) .^ 2
    # Loss
    temp = ((ŷ .- y) .^ 2) .* (pred_mask_dtm .+ mask_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_cpu_Maurer (generic function with 1 method)

In [7]:
# input format = d1 * ... * dn, (n <= 3)
function hausdorff_loss_gpu_Maurer(ŷ::CuArray, y::CuArray)
    # DT
    pred_mask, mask = ŷ |> cpu, y |> cpu
    pred_mask_dtm = DistanceTransforms.transform(round.(pred_mask), Maurer()) .^ 2 |> gpu
    mask_dtm = DistanceTransforms.transform(mask, Maurer()) .^ 2 |> gpu
    # Loss
    temp = ((ŷ .- y) .^ 2) .* (pred_mask_dtm .+ mask_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_gpu_Maurer (generic function with 1 method)

## In loop

### Wenbo

In [8]:
# input format = d1 * ... * dn * channel * batch, (n <= 3)
function hausdorff_loss_cpu_Wenbo_loop(ŷ, y)
    ŷ_dtm = 0
    y_dtm = 0
    ignore_derivatives() do
        ŷ_dtm = DistanceTransforms.transform(true, ŷ, Wenbo(), 0)
        y_dtm = DistanceTransforms.transform(true, y, Wenbo(), 0)
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_cpu_Wenbo_loop (generic function with 1 method)

In [9]:
# input format = d1 * ... * dn * channel * batch, (n <= 3)
function hausdorff_loss_gpu_Wenbo_loop(ŷ::CuArray, y::CuArray)
    ŷ_dtm = 0
    y_dtm = 0
    ignore_derivatives() do
        ŷ_dtm = DistanceTransforms.transform(true, ŷ, Wenbo(), ks)
        y_dtm = DistanceTransforms.transform(true, y, Wenbo(), ks)
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_gpu_Wenbo_loop (generic function with 1 method)

### Maurer

### 2D

In [10]:
# input format = d1 * d2 * channel * batch
function hausdorff_loss_2d_cpu_Maurer_loop(ŷ, y)
    ŷ_dtm = similar(ŷ)
    y_dtm = similar(y)
    ignore_derivatives() do
        ŷ_rounded = round.(ŷ)
        s = size(ŷ)
        for batch = 1 : s[4]
            for channel = 1: s[3]
                ŷ_dtm[:,:, channel, batch] = DistanceTransforms.transform(ŷ_rounded[:,:, channel, batch], Maurer()) # Maurer
                y_dtm[:,:, channel, batch] = DistanceTransforms.transform(y[:,:, channel, batch], Maurer()) # Maurer
            end
        end
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_2d_cpu_Maurer_loop (generic function with 1 method)

In [11]:
# input format = d1 * d2 * channel * batch
function hausdorff_loss_2d_gpu_Maurer_loop(ŷ::CuArray, y::CuArray)
    ŷ_dtm = similar(ŷ)
    y_dtm = similar(y)
    ignore_derivatives() do
        ŷ_cpu = round.(ŷ) |> cpu
        y_cpu = y |> cpu
        s = size(ŷ)
        for batch = 1 : s[4]
            for channel = 1: s[3]
                ŷ_dtm[:,:, channel, batch] = DistanceTransforms.transform(ŷ_cpu[:,:, channel, batch], Maurer()) # Maurer
                y_dtm[:,:, channel, batch] = DistanceTransforms.transform(y_cpu[:,:, channel, batch], Maurer()) # Maurer
            end
        end
        ŷ_dtm = ŷ_dtm |> gpu
        y_dtm = y_dtm |> gpu
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_2d_gpu_Maurer_loop (generic function with 1 method)

### 3D

In [12]:
# input format = d1 * d2 * d3 * channel * batch
function hausdorff_loss_3d_cpu_Maurer_loop(ŷ, y)
    ŷ_dtm = similar(ŷ)
    y_dtm = similar(y)
    ignore_derivatives() do
        ŷ_rounded = round.(ŷ)
        s = size(ŷ)
        for batch = 1 : s[5]
            for channel = 1: s[4]
                ŷ_dtm[:,:,:, channel, batch] = DistanceTransforms.transform(ŷ_rounded[:,:,:, channel, batch], Maurer()) # Maurer
                y_dtm[:,:,:, channel, batch] = DistanceTransforms.transform(y[:,:,:, channel, batch], Maurer()) # Maurer
            end
        end
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_3d_cpu_Maurer_loop (generic function with 1 method)

In [13]:
# input format = d1 * d2 * channel * batch
function hausdorff_loss_3d_gpu_Maurer_loop(ŷ::CuArray, y::CuArray)
    ŷ_dtm = similar(ŷ)
    y_dtm = similar(y)
    ignore_derivatives() do
        ŷ_cpu = round.(ŷ) |> cpu
        y_cpu = y |> cpu
        s = size(ŷ)
        for batch = 1 : s[5]
            for channel = 1: s[4]
                ŷ_dtm[:,:,:, channel, batch] = DistanceTransforms.transform(ŷ_cpu[:,:,:, channel, batch], Maurer()) # Maurer
                y_dtm[:,:,:, channel, batch] = DistanceTransforms.transform(y_cpu[:,:,:, channel, batch], Maurer()) # Maurer
            end
        end
        ŷ_dtm = ŷ_dtm |> gpu
        y_dtm = y_dtm |> gpu
    end
    temp = ((ŷ .- y) .^ 2) .* (ŷ_dtm .+ y_dtm)
    loss_hd = sum(temp) / length(temp)
    return loss_hd
end

hausdorff_loss_3d_gpu_Maurer_loop (generic function with 1 method)

# Prepare other things for test

## Create 2D U-Net

In [14]:
conv = (stride, in, out) -> Conv((3, 3), in=>out, stride=stride, pad=SamePad())
tran = (stride, in, out) -> ConvTranspose((3, 3), in=>out, stride=stride, pad=SamePad())

conv1 = (in, out) -> Chain(conv(1, in, out), BatchNorm(out, leakyrelu))
conv2 = (in, out) -> Chain(conv(2, in, out), BatchNorm(out, leakyrelu))
conv3 = (in, out) -> Chain(conv(1, in, out), x -> softmax(x; dims = 3))
tran2 = (in, out) -> Chain(tran(2, in, out), BatchNorm(out, leakyrelu))

function unet2D_new(in_chs, lbl_chs)
    # Contracting layers
    l1 = Chain(conv1(in_chs, 8))
    l2 = Chain(l1, conv2(8, 16), conv1(16, 16))
    l3 = Chain(l2, conv2(16, 32), conv1(32, 32))
    l4 = Chain(l3, conv2(32, 64), conv1(64, 64))
    l5 = Chain(l4, conv2(64, 128), conv1(128, 128))

    # Expanding layers
    l6 = Chain(l5, tran2(128, 64))
    l7 = Chain(Parallel(FastVision.Models.catchannels,l4,l6), conv1(128, 64), tran2(64, 32))
    l8 = Chain(Parallel(FastVision.Models.catchannels,l3,l7), conv1(64, 32), tran2(32, 16))
    l9 = Chain(Parallel(FastVision.Models.catchannels,l2,l8), conv1(32, 16), tran2(16, 8))
    l10 = Chain(l9, conv3(8, lbl_chs))
end

unet2D_new (generic function with 1 method)

Test model

In [15]:
x_test = rand(Float32, 96, 96, 1, 1)
m_test = unet2D_new(1, 2)
o = m_test(x_test)
o |> size

(96, 96, 2, 1)

## Create 3D U-Net

In [16]:
conv3d = (stride, in, out) -> Conv((3, 3, 3), in=>out, stride=stride, pad=SamePad())
tran3d = (stride, in, out) -> ConvTranspose((3, 3, 3), in=>out, stride=stride, pad=SamePad())

conv13d = (in, out) -> Chain(conv3d(1, in, out), BatchNorm(out, leakyrelu))
conv23d = (in, out) -> Chain(conv3d(2, in, out), BatchNorm(out, leakyrelu))
conv33d = (in, out) -> Chain(conv3d(1, in, out), x -> softmax(x; dims = 4))
tran23d = (in, out) -> Chain(tran3d(2, in, out), BatchNorm(out, leakyrelu))



function unet3D_new(in_chs, lbl_chs)
    # Contracting layers
    l1 = Chain(conv13d(in_chs, 8))
    l2 = Chain(l1, conv23d(8, 16), conv13d(16, 16))
    l3 = Chain(l2, conv23d(16, 32), conv13d(32, 32))
    l4 = Chain(l3, conv23d(32, 64), conv13d(64, 64))
    l5 = Chain(l4, conv23d(64, 128), conv13d(128, 128))

    # Expanding layers
    l6 = Chain(l5, tran23d(128, 64))
    l7 = Chain(Parallel(FastVision.Models.catchannels,l4,l6), conv13d(128, 64), tran23d(64, 32))
    l8 = Chain(Parallel(FastVision.Models.catchannels,l3,l7), conv13d(64, 32), tran23d(32, 16))
    l9 = Chain(Parallel(FastVision.Models.catchannels,l2,l8), conv13d(32, 16), tran23d(16, 8))
    l10 = Chain(l9, conv33d(8, lbl_chs))
end



unet3D_new (generic function with 1 method)

Test model

In [17]:
x_test = rand(Float32, 96, 96, 96, 1, 1)
m_test = unet3D_new(1, 2)
o = m_test(x_test)
o |> size

(96, 96, 96, 2, 1)

## Create test cases

In [18]:
num_range_2D = range(2^1, 2^9; length = 25)
num_range_3D = range(2^1, 2^7; length = 25)
test_cases_2D = []
test_cases_2D_loop = []
test_cases_3D = []
test_cases_3D_loop = []
sizes_2D = []
sizes_2D_loop = []
sizes_3D = []
sizes_3D_loop = []
for i in num_range_2D
    n = Int(round(i))
    print("2D $n --> ")
    #2D
    push!(sizes_2D, n^2)
    # non loop
    pred_mask = rand(Float32, n, n)
    mask = Float32.(rand([0, 1], n, n))
    push!(test_cases_2D, deepcopy([pred_mask, mask]))
end
println()
for n in 16 : 16 : 512
    print("2DLoop $n --> ")
    #2D
    push!(sizes_2D_loop, n^2)
    # loop
    x = rand(Float32, n, n, 1, 1)
    mask = Float32.(rand([0, 1], n, n, 1, 1))
    push!(test_cases_2D_loop, deepcopy([x, mask]))
end
println()
for i in num_range_3D
    n = Int(round(i))
    print("3D $n --> ")
    #3D
    push!(sizes_3D, n^3)
    # non loop
    pred_mask = rand(Float32, n, n, n)
    mask = Float32.(rand([0, 1], n, n, n))
    push!(test_cases_3D, deepcopy([pred_mask, mask]))
end
println()
for n in 16 : 16 : 128
    print("3DLoop $n --> ")
    #3D
    push!(sizes_3D_loop, n^3)
    # loop
    x = rand(Float32, n, n, n, 1, 1)
    mask = Float32.(rand([0, 1], n, n, n, 1, 1))
    push!(test_cases_3D_loop, deepcopy([x, mask]))
end

2D 2 --> 2D 23 --> 2D 44 --> 2D 66 --> 2D 87 --> 2D 108 --> 2D 130 --> 2D 151 --> 2D 172 --> 2D 193 --> 2D 214 --> 2D 236 --> 2D 257 --> 2D 278 --> 2D 300 --> 2D 321 --> 2D 342 --> 2D 363 --> 2D 384 --> 2D 406 --> 2D 427 --> 2D 448 --> 2D 470 --> 2D 491 --> 2D 512 --> 


2DLoop 16 --> 2DLoop 32 --> 2DLoop 48 --> 2DLoop 64 --> 2DLoop 80 --> 2DLoop 96 --> 2DLoop 112 --> 2DLoop 128 --> 2DLoop 144 --> 2DLoop 160 --> 2DLoop 176 --> 2DLoop 192 --> 2DLoop 208 --> 2DLoop 224 --> 2DLoop 240 --> 2DLoop 256 --> 2DLoop 272 --> 2DLoop 288 --> 2DLoop 304 --> 2DLoop 320 --> 2DLoop 336 --> 2DLoop 352 --> 2DLoop 368 --> 2DLoop 384 --> 2DLoop 400 --> 2DLoop 416 --> 2DLoop 432 --> 2DLoop 448 --> 2DLoop 464 --> 2DLoop 480 --> 2DLoop 496 --> 2DLoop 512 --> 
3D 2 --> 

3D 7 --> 3D 12 --> 3D 18 --> 3D 23 --> 3D 28 --> 3D 34 --> 3D 39 --> 3D 44 --> 3D 49 --> 3D 54 --> 3D 60 --> 3D 65 --> 3D 70 --> 3D 76 --> 3D 81 --> 3D 86 --> 3D 91 --> 3D 96 --> 3D 102 --> 3D 107 --> 3D 112 --> 3D 118 --> 3D 123 --> 

3D 128 --> 


3DLoop 16 --> 3DLoop 32 --> 3DLoop 48 --> 3DLoop 64 --> 3DLoop 80 --> 3DLoop 96 --> 3DLoop 112 --> 3DLoop 128 --> 

# Timing 

## Not loop

### 2D

In [19]:
not_loop_2d_Maurer_cpu_hd_cpu_min = []
not_loop_2d_Maurer_cpu_hd_cpu_std = []

not_loop_2d_Maurer_cpu_hd_gpu_min = []
not_loop_2d_Maurer_cpu_hd_gpu_std = []

not_loop_2d_Wenbo_cpu_hd_cpu_min = []
not_loop_2d_Wenbo_cpu_hd_cpu_std = []

not_loop_2d_Wenbo_gpu_hd_gpu_min = []
not_loop_2d_Wenbo_gpu_hd_gpu_std = []

for (idx, case) in enumerate(test_cases_2D)
    # load case
    pred_mask, mask = case
    # cpu -> gpu
    pred_mask_gpu, mask_gpu = pred_mask |> gpu, mask |> gpu

    #---Start testing---#

    # tfm = Maurer, tfm_device = cpu, hd_device = cpu
    Maurer_cpu_hd_cpu = @benchmark hausdorff_loss_cpu_Maurer($pred_mask, $mask)
    # tfm = Maurer, tfm_device = cpu, hd_device = gpu
    Maurer_cpu_hd_gpu = @benchmark hausdorff_loss_gpu_Maurer($pred_mask_gpu, $mask_gpu)
    # tfm = Wenbo, tfm_device = cpu, hd_device = cpu
    Wenbo_cpu_hd_cpu = @benchmark hausdorff_loss_cpu_Wenbo($pred_mask, $mask)
    # tfm = Wenbo, tfm_device = gpu, hd_device = gpu
    Wenbo_gpu_hd_gpu = @benchmark hausdorff_loss_gpu_Wenbo($pred_mask_gpu, $mask_gpu)

    #---Finished testing---#
    append!(not_loop_2d_Maurer_cpu_hd_cpu_min, BenchmarkTools.minimum(Maurer_cpu_hd_cpu).time)
    append!(not_loop_2d_Maurer_cpu_hd_cpu_std, BenchmarkTools.std(Maurer_cpu_hd_cpu).time)
    
    append!(not_loop_2d_Maurer_cpu_hd_gpu_min, BenchmarkTools.minimum(Maurer_cpu_hd_gpu).time)
    append!(not_loop_2d_Maurer_cpu_hd_gpu_std, BenchmarkTools.std(Maurer_cpu_hd_gpu).time)
    
    append!(not_loop_2d_Wenbo_cpu_hd_cpu_min, BenchmarkTools.minimum(Wenbo_cpu_hd_cpu).time)
    append!(not_loop_2d_Wenbo_cpu_hd_cpu_std, BenchmarkTools.std(Wenbo_cpu_hd_cpu).time)

    append!(not_loop_2d_Wenbo_gpu_hd_gpu_min, BenchmarkTools.minimum(Wenbo_gpu_hd_gpu).time)
    append!(not_loop_2d_Wenbo_gpu_hd_gpu_std, BenchmarkTools.std(Wenbo_gpu_hd_gpu).time)

    println("size = $(sizes_2D[idx]), Maurer_cpu = $(round(not_loop_2d_Maurer_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Maurer_gpu = $(round(not_loop_2d_Maurer_cpu_hd_gpu_min[end]*1e-6, digits = 3))ms, Wenbo_cpu = $(round(not_loop_2d_Wenbo_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Wenbo_gpu = $(round(not_loop_2d_Wenbo_gpu_hd_gpu_min[end]*1e-6, digits = 3))ms")
end

size = 4, Maurer_cpu = 0.0ms, Maurer_gpu = 0.175ms, Wenbo_cpu = 0.034ms, Wenbo_gpu = 0.226ms


size = 529, Maurer_cpu = 0.018ms, Maurer_gpu = 0.271ms, Wenbo_cpu = 0.041ms, Wenbo_gpu = 0.252ms


size = 1936, Maurer_cpu = 0.096ms, Maurer_gpu = 0.45ms, Wenbo_cpu = 0.048ms, Wenbo_gpu = 0.395ms


size = 4356, Maurer_cpu = 0.178ms, Maurer_gpu = 0.632ms, Wenbo_cpu = 0.06ms, Wenbo_gpu = 0.282ms


size = 7569, Maurer_cpu = 0.244ms, Maurer_gpu = 0.638ms, Wenbo_cpu = 0.094ms, Wenbo_gpu = 0.336ms


size = 11664, Maurer_cpu = 0.316ms, Maurer_gpu = 0.752ms, Wenbo_cpu = 0.119ms, Wenbo_gpu = 0.293ms


size = 16900, Maurer_cpu = 0.44ms, Maurer_gpu = 0.947ms, Wenbo_cpu = 0.136ms, Wenbo_gpu = 0.309ms


size = 22801, Maurer_cpu = 0.562ms, Maurer_gpu = 1.143ms, Wenbo_cpu = 0.157ms, Wenbo_gpu = 0.242ms


size = 29584, Maurer_cpu = 0.696ms, Maurer_gpu = 1.396ms, Wenbo_cpu = 0.19ms, Wenbo_gpu = 0.252ms


size = 37249, Maurer_cpu = 0.891ms, Maurer_gpu = 1.606ms, Wenbo_cpu = 0.249ms, Wenbo_gpu = 0.271ms


size = 45796, Maurer_cpu = 1.216ms, Maurer_gpu = 1.991ms, Wenbo_cpu = 0.288ms, Wenbo_gpu = 0.225ms


size = 55696, Maurer_cpu = 1.349ms, Maurer_gpu = 2.36ms, Wenbo_cpu = 0.328ms, Wenbo_gpu = 0.284ms


size = 66049, Maurer_cpu = 1.756ms, Maurer_gpu = 3.197ms, Wenbo_cpu = 0.352ms, Wenbo_gpu = 0.318ms


size = 77284, Maurer_cpu = 1.99ms, Maurer_gpu = 3.726ms, Wenbo_cpu = 0.396ms, Wenbo_gpu = 0.269ms


size = 90000, Maurer_cpu = 2.198ms, Maurer_gpu = 4.012ms, Wenbo_cpu = 0.432ms, Wenbo_gpu = 0.309ms


size = 103041, Maurer_cpu = 2.598ms, Maurer_gpu = 4.461ms, Wenbo_cpu = 0.502ms, Wenbo_gpu = 0.638ms


size = 116964, Maurer_cpu = 3.012ms, Maurer_gpu = 5.626ms, Wenbo_cpu = 0.538ms, Wenbo_gpu = 0.239ms


size = 131769, Maurer_cpu = 3.514ms, Maurer_gpu = 5.763ms, Wenbo_cpu = 0.594ms, Wenbo_gpu = 0.254ms


size = 147456, Maurer_cpu = 3.827ms, Maurer_gpu = 6.244ms, Wenbo_cpu = 0.658ms, Wenbo_gpu = 0.256ms


size = 164836, Maurer_cpu = 4.246ms, Maurer_gpu = 6.886ms, Wenbo_cpu = 0.742ms, Wenbo_gpu = 0.264ms


size = 182329, Maurer_cpu = 4.619ms, Maurer_gpu = 7.381ms, Wenbo_cpu = 0.841ms, Wenbo_gpu = 0.254ms


size = 200704, Maurer_cpu = 5.0ms, Maurer_gpu = 8.096ms, Wenbo_cpu = 0.962ms, Wenbo_gpu = 0.26ms


size = 220900, Maurer_cpu = 5.508ms, Maurer_gpu = 8.818ms, Wenbo_cpu = 1.027ms, Wenbo_gpu = 0.261ms


size = 241081, Maurer_cpu = 5.903ms, Maurer_gpu = 9.609ms, Wenbo_cpu = 1.109ms, Wenbo_gpu = 0.252ms


size = 262144, Maurer_cpu = 6.398ms, Maurer_gpu = 10.389ms, Wenbo_cpu = 1.467ms, Wenbo_gpu = 0.281ms




#### Save Data

In [20]:
df_not_loop_2D = DataFrame(size = sizes_2D,
Maurer_cpu_hd_cpu_min = not_loop_2d_Maurer_cpu_hd_cpu_min,
Maurer_cpu_hd_cpu_std = not_loop_2d_Maurer_cpu_hd_cpu_std,
Maurer_cpu_hd_gpu_min = not_loop_2d_Maurer_cpu_hd_gpu_min,
Maurer_cpu_hd_gpu_std = not_loop_2d_Maurer_cpu_hd_gpu_std,
Wenbo_cpu_hd_cpu_min = not_loop_2d_Wenbo_cpu_hd_cpu_min,
Wenbo_cpu_hd_cpu_std = not_loop_2d_Wenbo_cpu_hd_cpu_std,
Wenbo_gpu_hd_gpu_min = not_loop_2d_Wenbo_gpu_hd_gpu_min,
Wenbo_gpu_hd_gpu_std = not_loop_2d_Wenbo_gpu_hd_gpu_std)

CSV.write("Julia_Loop_Results/HD_2D_Not_Loop_Jan_11.csv", df_not_loop_2D)

"Julia_Loop_Results/HD_2D_Not_Loop_Jan_11.csv"

### 3D

In [21]:
not_loop_3d_Maurer_cpu_hd_cpu_min = []
not_loop_3d_Maurer_cpu_hd_cpu_std = []

not_loop_3d_Maurer_cpu_hd_gpu_min = []
not_loop_3d_Maurer_cpu_hd_gpu_std = []

not_loop_3d_Wenbo_cpu_hd_cpu_min = []
not_loop_3d_Wenbo_cpu_hd_cpu_std = []

not_loop_3d_Wenbo_gpu_hd_gpu_min = []
not_loop_3d_Wenbo_gpu_hd_gpu_std = []

for (idx, case) in enumerate(test_cases_3D)
    # load case
    pred_mask, mask = case
    # cpu -> gpu
    pred_mask_gpu, mask_gpu = pred_mask |> gpu, mask |> gpu

    #---Start testing---#

    # tfm = Maurer, tfm_device = cpu, hd_device = cpu
    Maurer_cpu_hd_cpu = @benchmark hausdorff_loss_cpu_Maurer($pred_mask, $mask)
    # tfm = Maurer, tfm_device = cpu, hd_device = gpu
    Maurer_cpu_hd_gpu = @benchmark hausdorff_loss_gpu_Maurer($pred_mask_gpu, $mask_gpu)
    # tfm = Wenbo, tfm_device = cpu, hd_device = cpu
    Wenbo_cpu_hd_cpu = @benchmark hausdorff_loss_cpu_Wenbo($pred_mask, $mask)
    # tfm = Wenbo, tfm_device = gpu, hd_device = gpu
    Wenbo_gpu_hd_gpu = @benchmark hausdorff_loss_gpu_Wenbo($pred_mask_gpu, $mask_gpu)

    #---Finished testing---#
    append!(not_loop_3d_Maurer_cpu_hd_cpu_min, BenchmarkTools.minimum(Maurer_cpu_hd_cpu).time)
    append!(not_loop_3d_Maurer_cpu_hd_cpu_std, BenchmarkTools.std(Maurer_cpu_hd_cpu).time)
    
    append!(not_loop_3d_Maurer_cpu_hd_gpu_min, BenchmarkTools.minimum(Maurer_cpu_hd_gpu).time)
    append!(not_loop_3d_Maurer_cpu_hd_gpu_std, BenchmarkTools.std(Maurer_cpu_hd_gpu).time)

    append!(not_loop_3d_Wenbo_cpu_hd_cpu_min, BenchmarkTools.minimum(Wenbo_cpu_hd_cpu).time)
    append!(not_loop_3d_Wenbo_cpu_hd_cpu_std, BenchmarkTools.std(Wenbo_cpu_hd_cpu).time)
    
    append!(not_loop_3d_Wenbo_gpu_hd_gpu_min, BenchmarkTools.minimum(Wenbo_gpu_hd_gpu).time)
    append!(not_loop_3d_Wenbo_gpu_hd_gpu_std, BenchmarkTools.std(Wenbo_gpu_hd_gpu).time)

    println("size = $(sizes_3D[idx]), Maurer_cpu = $(round(not_loop_3d_Maurer_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Maurer_gpu = $(round(not_loop_3d_Maurer_cpu_hd_gpu_min[end]*1e-6, digits = 3))ms, Wenbo_cpu = $(round(not_loop_3d_Wenbo_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Wenbo_gpu = $(round(not_loop_3d_Wenbo_gpu_hd_gpu_min[end]*1e-6, digits = 3))ms")
end

size = 8, Maurer_cpu = 0.001ms, Maurer_gpu = 0.247ms, Wenbo_cpu = 0.086ms, Wenbo_gpu = 0.308ms


size = 343, Maurer_cpu = 0.027ms, Maurer_gpu = 0.29ms, Wenbo_cpu = 0.096ms, Wenbo_gpu = 0.297ms




size = 1728, Maurer_cpu = 0.19ms, Maurer_gpu = 0.549ms, Wenbo_cpu = 0.142ms, Wenbo_gpu = 0.349ms


size = 5832, Maurer_cpu = 0.35ms, Maurer_gpu = 0.745ms, Wenbo_cpu = 0.265ms, Wenbo_gpu = 0.58ms


size = 12167, Maurer_cpu = 0.52ms, Maurer_gpu = 1.424ms, Wenbo_cpu = 0.333ms, Wenbo_gpu = 0.433ms


size = 21952, Maurer_cpu = 0.769ms, Maurer_gpu = 1.413ms, Wenbo_cpu = 0.394ms, Wenbo_gpu = 0.345ms


size = 39304, Maurer_cpu = 1.457ms, Maurer_gpu = 2.417ms, Wenbo_cpu = 0.546ms, Wenbo_gpu = 0.326ms


size = 59319, Maurer_cpu = 2.14ms, Maurer_gpu = 3.688ms, Wenbo_cpu = 0.694ms, Wenbo_gpu = 0.294ms


size = 85184, Maurer_cpu = 2.79ms, Maurer_gpu = 4.642ms, Wenbo_cpu = 0.89ms, Wenbo_gpu = 0.291ms


size = 117649, Maurer_cpu = 3.939ms, Maurer_gpu = 6.483ms, Wenbo_cpu = 1.096ms, Wenbo_gpu = 0.292ms


size = 157464, Maurer_cpu = 5.376ms, Maurer_gpu = 8.516ms, Wenbo_cpu = 1.3ms, Wenbo_gpu = 0.284ms


size = 216000, Maurer_cpu = 6.858ms, Maurer_gpu = 10.601ms, Wenbo_cpu = 1.768ms, Wenbo_gpu = 0.3ms


size = 274625, Maurer_cpu = 9.279ms, Maurer_gpu = 13.651ms, Wenbo_cpu = 2.22ms, Wenbo_gpu = 0.421ms


size = 343000, Maurer_cpu = 11.354ms, Maurer_gpu = 15.921ms, Wenbo_cpu = 2.568ms, Wenbo_gpu = 0.471ms


size = 438976, Maurer_cpu = 15.536ms, Maurer_gpu = 19.387ms, Wenbo_cpu = 3.165ms, Wenbo_gpu = 0.629ms


size = 531441, Maurer_cpu = 17.78ms, Maurer_gpu = 23.757ms, Wenbo_cpu = 3.747ms, Wenbo_gpu = 0.769ms


size = 636056, Maurer_cpu = 21.133ms, Maurer_gpu = 34.351ms, Wenbo_cpu = 4.336ms, Wenbo_gpu = 0.841ms


size = 753571, Maurer_cpu = 24.91ms, Maurer_gpu = 32.427ms, Wenbo_cpu = 5.138ms, Wenbo_gpu = 1.046ms


size = 884736, Maurer_cpu = 28.737ms, Maurer_gpu = 37.98ms, Wenbo_cpu = 6.119ms, Wenbo_gpu = 1.083ms


size = 1061208, Maurer_cpu = 35.463ms, Maurer_gpu = 45.21ms, Wenbo_cpu = 6.833ms, Wenbo_gpu = 1.334ms


size = 1225043, Maurer_cpu = 40.645ms, Maurer_gpu = 52.007ms, Wenbo_cpu = 7.654ms, Wenbo_gpu = 1.594ms


size = 1404928, Maurer_cpu = 45.827ms, Maurer_gpu = 58.981ms, Wenbo_cpu = 9.138ms, Wenbo_gpu = 1.775ms


size = 1643032, Maurer_cpu = 54.682ms, Maurer_gpu = 69.173ms, Wenbo_cpu = 10.16ms, Wenbo_gpu = 2.204ms


size = 1860867, Maurer_cpu = 61.571ms, Maurer_gpu = 78.758ms, Wenbo_cpu = 11.611ms, Wenbo_gpu = 2.624ms


size = 2097152, Maurer_cpu = 68.525ms, Maurer_gpu = 87.449ms, Wenbo_cpu = 15.109ms, Wenbo_gpu = 2.988ms


#### Save Data

In [22]:
df_not_loop_3D = DataFrame(size = sizes_3D,
Maurer_cpu_hd_cpu_min = not_loop_3d_Maurer_cpu_hd_cpu_min,
Maurer_cpu_hd_cpu_std = not_loop_3d_Maurer_cpu_hd_cpu_std,
Maurer_cpu_hd_gpu_min = not_loop_3d_Maurer_cpu_hd_gpu_min,
Maurer_cpu_hd_gpu_std = not_loop_3d_Maurer_cpu_hd_gpu_std,
Wenbo_cpu_hd_cpu_min = not_loop_3d_Wenbo_cpu_hd_cpu_min,
Wenbo_cpu_hd_cpu_std = not_loop_3d_Wenbo_cpu_hd_cpu_std,
Wenbo_gpu_hd_gpu_min = not_loop_3d_Wenbo_gpu_hd_gpu_min,
Wenbo_gpu_hd_gpu_std = not_loop_3d_Wenbo_gpu_hd_gpu_std)

CSV.write("Julia_Loop_Results/HD_3D_Not_Loop_Jan_11.csv", df_not_loop_3D)

"Julia_Loop_Results/HD_3D_Not_Loop_Jan_11.csv"

## In loop

### 2D

In [19]:
loop_2d_Maurer_cpu_hd_cpu_min = []
loop_2d_Maurer_cpu_hd_cpu_std = []

loop_2d_Maurer_cpu_hd_gpu_min = []
loop_2d_Maurer_cpu_hd_gpu_std = []

loop_2d_Wenbo_cpu_hd_cpu_min = []
loop_2d_Wenbo_cpu_hd_cpu_std = []

loop_2d_Wenbo_gpu_hd_gpu_min = []
loop_2d_Wenbo_gpu_hd_gpu_std = []

model_org = unet2D_new(1, 2)

for (idx, case) in enumerate(test_cases_2D_loop)
    # load case
    x_org, mask_org = case
    
    #---Start testing---#

    # tfm = Maurer, tfm_device = cpu, hd_device = cpu
    Maurer_cpu_hd_cpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org), deepcopy(mask_org), deepcopy(model_org)
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_2d_cpu_Maurer_loop(pred[:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Maurer_cpu_hd_cpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Maurer, tfm_device = cpu, hd_device = gpu
    Maurer_cpu_hd_gpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org) |> gpu, deepcopy(mask_org) |> gpu, deepcopy(model_org) |> gpu
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_2d_gpu_Maurer_loop(pred[:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Maurer_cpu_hd_gpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Wenbo, tfm_device = cpu, hd_device = cpu
    Wenbo_cpu_hd_cpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org), deepcopy(mask_org), deepcopy(model_org)
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_cpu_Wenbo_loop(pred[:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Wenbo_cpu_hd_cpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Wenbo, tfm_device = gpu, hd_device = gpu
    Wenbo_gpu_hd_gpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org) |> gpu, deepcopy(mask_org) |> gpu, deepcopy(model_org) |> gpu
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_gpu_Wenbo_loop(pred[:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Wenbo_gpu_hd_gpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end

    #---Finished testing---#
    append!(loop_2d_Maurer_cpu_hd_cpu_min, minimum(Maurer_cpu_hd_cpu))
    append!(loop_2d_Maurer_cpu_hd_cpu_std, Statistics.std(Maurer_cpu_hd_cpu))
    
    append!(loop_2d_Maurer_cpu_hd_gpu_min, minimum(Maurer_cpu_hd_gpu))
    append!(loop_2d_Maurer_cpu_hd_gpu_std, Statistics.std(Maurer_cpu_hd_gpu))
    
    append!(loop_2d_Wenbo_cpu_hd_cpu_min, minimum(Wenbo_cpu_hd_cpu))
    append!(loop_2d_Wenbo_cpu_hd_cpu_std, Statistics.std(Wenbo_cpu_hd_cpu))

    append!(loop_2d_Wenbo_gpu_hd_gpu_min, minimum(Wenbo_gpu_hd_gpu))
    append!(loop_2d_Wenbo_gpu_hd_gpu_std, Statistics.std(Wenbo_gpu_hd_gpu))

    println("size = $(sizes_2D_loop[idx]), Maurer_cpu = $(round(loop_2d_Maurer_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Maurer_gpu = $(round(loop_2d_Maurer_cpu_hd_gpu_min[end]*1e-6, digits = 3))ms, Wenbo_cpu = $(round(loop_2d_Wenbo_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Wenbo_gpu = $(round(loop_2d_Wenbo_gpu_hd_gpu_min[end]*1e-6, digits = 3))ms")

end

size = 256, Maurer_cpu = 0.024ms, Maurer_gpu = 1.408ms, Wenbo_cpu = 0.054ms, Wenbo_gpu = 0.757ms


size = 1024, Maurer_cpu = 0.13ms, Maurer_gpu = 1.767ms, Wenbo_cpu = 0.07ms, Wenbo_gpu = 0.36ms


size = 2304, Maurer_cpu = 0.182ms, Maurer_gpu = 1.376ms, Wenbo_cpu = 0.133ms, Wenbo_gpu = 0.696ms


size = 4096, Maurer_cpu = 0.241ms, Maurer_gpu = 1.459ms, Wenbo_cpu = 0.212ms, Wenbo_gpu = 0.744ms


size = 6400, Maurer_cpu = 0.321ms, Maurer_gpu = 1.769ms, Wenbo_cpu = 0.286ms, Wenbo_gpu = 0.717ms


size = 9216, Maurer_cpu = 0.392ms, Maurer_gpu = 1.982ms, Wenbo_cpu = 0.343ms, Wenbo_gpu = 0.365ms


size = 12544, Maurer_cpu = 0.469ms, Maurer_gpu = 1.982ms, Wenbo_cpu = 0.437ms, Wenbo_gpu = 0.526ms


size = 16384, Maurer_cpu = 0.575ms, Maurer_gpu = 2.374ms, Wenbo_cpu = 0.56ms, Wenbo_gpu = 0.755ms


size = 20736, Maurer_cpu = 0.674ms, Maurer_gpu = 2.658ms, Wenbo_cpu = 0.723ms, Wenbo_gpu = 0.434ms


size = 25600, Maurer_cpu = 0.827ms, Maurer_gpu = 1.477ms, Wenbo_cpu = 0.855ms, Wenbo_gpu = 0.455ms


size = 30976, Maurer_cpu = 0.958ms, Maurer_gpu = 1.799ms, Wenbo_cpu = 1.035ms, Wenbo_gpu = 0.395ms


size = 36864, Maurer_cpu = 1.144ms, Maurer_gpu = 2.119ms, Wenbo_cpu = 1.185ms, Wenbo_gpu = 0.524ms


size = 43264, Maurer_cpu = 1.363ms, Maurer_gpu = 2.306ms, Wenbo_cpu = 1.339ms, Wenbo_gpu = 0.636ms


size = 50176, Maurer_cpu = 1.501ms, Maurer_gpu = 2.203ms, Wenbo_cpu = 1.507ms, Wenbo_gpu = 0.636ms


size = 57600, Maurer_cpu = 1.609ms, Maurer_gpu = 2.976ms, Wenbo_cpu = 1.68ms, Wenbo_gpu = 0.684ms


size = 65536, Maurer_cpu = 1.966ms, Maurer_gpu = 2.727ms, Wenbo_cpu = 1.886ms, Wenbo_gpu = 0.676ms


size = 73984, Maurer_cpu = 2.099ms, Maurer_gpu = 3.746ms, Wenbo_cpu = 2.085ms, Wenbo_gpu = 0.734ms


size = 82944, Maurer_cpu = 2.316ms, Maurer_gpu = 3.453ms, Wenbo_cpu = 2.256ms, Wenbo_gpu = 0.865ms


size = 92416, Maurer_cpu = 2.575ms, Maurer_gpu = 3.69ms, Wenbo_cpu = 2.505ms, Wenbo_gpu = 1.039ms


size = 102400, Maurer_cpu = 2.76ms, Maurer_gpu = 4.046ms, Wenbo_cpu = 2.733ms, Wenbo_gpu = 1.143ms


size = 112896, Maurer_cpu = 3.004ms, Maurer_gpu = 4.725ms, Wenbo_cpu = 2.934ms, Wenbo_gpu = 1.145ms


size = 123904, Maurer_cpu = 3.295ms, Maurer_gpu = 4.775ms, Wenbo_cpu = 3.23ms, Wenbo_gpu = 0.75ms


size = 135424, Maurer_cpu = 3.694ms, Maurer_gpu = 4.864ms, Wenbo_cpu = 3.487ms, Wenbo_gpu = 1.237ms


size = 147456, Maurer_cpu = 4.007ms, Maurer_gpu = 5.465ms, Wenbo_cpu = 3.823ms, Wenbo_gpu = 1.275ms


size = 160000, Maurer_cpu = 4.186ms, Maurer_gpu = 6.138ms, Wenbo_cpu = 3.995ms, Wenbo_gpu = 1.253ms


size = 173056, Maurer_cpu = 4.506ms, Maurer_gpu = 5.551ms, Wenbo_cpu = 4.354ms, Wenbo_gpu = 1.285ms


size = 186624, Maurer_cpu = 4.86ms, Maurer_gpu = 6.271ms, Wenbo_cpu = 4.651ms, Wenbo_gpu = 0.888ms


size = 200704, Maurer_cpu = 5.086ms, Maurer_gpu = 6.597ms, Wenbo_cpu = 4.958ms, Wenbo_gpu = 4.349ms


size = 215296, Maurer_cpu = 5.372ms, Maurer_gpu = 7.039ms, Wenbo_cpu = 5.349ms, Wenbo_gpu = 1.364ms


size = 230400, Maurer_cpu = 5.779ms, Maurer_gpu = 7.558ms, Wenbo_cpu = 5.701ms, Wenbo_gpu = 2.173ms


size = 246016, Maurer_cpu = 6.147ms, Maurer_gpu = 7.722ms, Wenbo_cpu = 6.065ms, Wenbo_gpu = 1.45ms


size = 262144, Maurer_cpu = 6.636ms, Maurer_gpu = 8.083ms, Wenbo_cpu = 7.136ms, Wenbo_gpu = 2.386ms


#### Save Data

In [20]:
df_loop_2D = DataFrame(size = sizes_2D_loop,
Maurer_cpu_hd_cpu_min = loop_2d_Maurer_cpu_hd_cpu_min,
Maurer_cpu_hd_cpu_std = loop_2d_Maurer_cpu_hd_cpu_std,
Maurer_cpu_hd_gpu_min = loop_2d_Maurer_cpu_hd_gpu_min,
Maurer_cpu_hd_gpu_std = loop_2d_Maurer_cpu_hd_gpu_std,
Wenbo_cpu_hd_cpu_min = loop_2d_Wenbo_cpu_hd_cpu_min,
Wenbo_cpu_hd_cpu_std = loop_2d_Wenbo_cpu_hd_cpu_std,
Wenbo_gpu_hd_gpu_min = loop_2d_Wenbo_gpu_hd_gpu_min,
Wenbo_gpu_hd_gpu_std = loop_2d_Wenbo_gpu_hd_gpu_std)

CSV.write("Julia_Loop_Results/HD_2D_In_Loop_Jan_11.csv", df_loop_2D)

"Julia_Loop_Results/HD_2D_In_Loop_Jan_11.csv"

### 3D

In [23]:
# loop_3d_Maurer_cpu_hd_cpu_min = []
# loop_3d_Maurer_cpu_hd_cpu_std = []

# loop_3d_Maurer_cpu_hd_gpu_min = []
# loop_3d_Maurer_cpu_hd_gpu_std = []

# loop_3d_Wenbo_cpu_hd_cpu_min = []
# loop_3d_Wenbo_cpu_hd_cpu_std = []

# loop_3d_Wenbo_gpu_hd_gpu_min = []
# loop_3d_Wenbo_gpu_hd_gpu_std = []

# model_org = unet3D_new(1, 2)

for (idx, case) in enumerate(test_cases_3D_loop[4:8])
    # load case
    x_org, mask_org = case
    
    #---Start testing---#

    # tfm = Maurer, tfm_device = cpu, hd_device = cpu
    Maurer_cpu_hd_cpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org), deepcopy(mask_org), deepcopy(model_org)
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_3d_cpu_Maurer_loop(pred[:,:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Maurer_cpu_hd_cpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Maurer, tfm_device = cpu, hd_device = gpu
    Maurer_cpu_hd_gpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org) |> gpu, deepcopy(mask_org) |> gpu, deepcopy(model_org) |> gpu
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_3d_gpu_Maurer_loop(pred[:,:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Maurer_cpu_hd_gpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Wenbo, tfm_device = cpu, hd_device = cpu
    Wenbo_cpu_hd_cpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org), deepcopy(mask_org), deepcopy(model_org)
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_cpu_Wenbo_loop(pred[:,:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Wenbo_cpu_hd_cpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end
    # tfm = Wenbo, tfm_device = gpu, hd_device = gpu
    Wenbo_gpu_hd_gpu = []
    t1 = Dates.Time(now())
    for j =1:1000 # 1000 Evaluations
        # deepcopy to reset grad
        x, mask, model = deepcopy(x_org) |> gpu, deepcopy(mask_org) |> gpu, deepcopy(model_org) |> gpu
        # Model forward
        pred = model(x)
        #---Loss begin---#
        t = @timed begin 
            loss = hausdorff_loss_gpu_Wenbo_loop(pred[:,:,:,2:end,:], mask)
        end
        #---Loss finished---#
        push!(Wenbo_gpu_hd_gpu, t.time*1e9)
        if (Dates.Time(now()) - t1).value > 1e9*300
            break
        end
    end

    #---Finished testing---#
    append!(loop_3d_Maurer_cpu_hd_cpu_min, minimum(Maurer_cpu_hd_cpu))
    append!(loop_3d_Maurer_cpu_hd_cpu_std, Statistics.std(Maurer_cpu_hd_cpu))
    
    append!(loop_3d_Maurer_cpu_hd_gpu_min, minimum(Maurer_cpu_hd_gpu))
    append!(loop_3d_Maurer_cpu_hd_gpu_std, Statistics.std(Maurer_cpu_hd_gpu))
    
    append!(loop_3d_Wenbo_cpu_hd_cpu_min, minimum(Wenbo_cpu_hd_cpu))
    append!(loop_3d_Wenbo_cpu_hd_cpu_std, Statistics.std(Wenbo_cpu_hd_cpu))

    append!(loop_3d_Wenbo_gpu_hd_gpu_min, minimum(Wenbo_gpu_hd_gpu))
    append!(loop_3d_Wenbo_gpu_hd_gpu_std, Statistics.std(Wenbo_gpu_hd_gpu))

    println("size = $(sizes_3D_loop[idx]), Maurer_cpu = $(round(loop_3d_Maurer_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Maurer_gpu = $(round(loop_3d_Maurer_cpu_hd_gpu_min[end]*1e-6, digits = 3))ms, Wenbo_cpu = $(round(loop_3d_Wenbo_cpu_hd_cpu_min[end]*1e-6, digits = 3))ms, Wenbo_gpu = $(round(loop_3d_Wenbo_gpu_hd_gpu_min[end]*1e-6, digits = 3))ms")

end

size = 4096, Maurer_cpu = 9.038ms, Maurer_gpu = 10.16ms, Wenbo_cpu = 7.074ms, Wenbo_gpu = 4.632ms


size = 32768, Maurer_cpu = 16.31ms, Maurer_gpu = 31.973ms, Wenbo_cpu = 11.745ms, Wenbo_gpu = 16.534ms


size = 110592, Maurer_cpu = 29.846ms, Maurer_gpu = 56.551ms, Wenbo_cpu = 22.328ms, Wenbo_gpu = 12.296ms




size = 262144, Maurer_cpu = 46.049ms, Maurer_gpu = 57.85ms, Wenbo_cpu = 32.69ms, Wenbo_gpu = 13.51ms




size = 512000, Maurer_cpu = 125.771ms, Maurer_gpu = 139.761ms, Wenbo_cpu = 63.219ms, Wenbo_gpu = 14.903ms


In [24]:
df_loop_3D = DataFrame(size = sizes_3D_loop,
Maurer_cpu_hd_cpu_min = loop_3d_Maurer_cpu_hd_cpu_min,
Maurer_cpu_hd_cpu_std = loop_3d_Maurer_cpu_hd_cpu_std,
Maurer_cpu_hd_gpu_min = loop_3d_Maurer_cpu_hd_gpu_min,
Maurer_cpu_hd_gpu_std = loop_3d_Maurer_cpu_hd_gpu_std,
Wenbo_cpu_hd_cpu_min = loop_3d_Wenbo_cpu_hd_cpu_min,
Wenbo_cpu_hd_cpu_std = loop_3d_Wenbo_cpu_hd_cpu_std,
Wenbo_gpu_hd_gpu_min = loop_3d_Wenbo_gpu_hd_gpu_min,
Wenbo_gpu_hd_gpu_std = loop_3d_Wenbo_gpu_hd_gpu_std)

CSV.write("Julia_Loop_Results/HD_3D_In_Loop_Jan_11.csv", df_loop_3D)

"Julia_Loop_Results/HD_3D_In_Loop_Jan_11.csv"