From 7ca72c77855fc12c73c2c0ff750fa4a6b8bf4cc0 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Sun, 3 Sep 2023 10:11:42 -0500 Subject: [PATCH] document :radixsort performance (#720) --- src/counts.jl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index d7485fd30..790c44fd6 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -258,12 +258,14 @@ raw counts. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) algorithm to sort the input vector which will generally lead to - shorter running time. However the radix sort algorithm creates a - copy of the input vector and hence uses more RAM. Choose `:dict` - if the amount of available RAM is a limitation. + shorter running time for large `x` with many duplicates. However + the radix sort algorithm creates a copy of the input vector and + hence uses more RAM. Choose `:dict` if the amount of available + RAM is a limitation. - `:dict`: use `Dict`-based method which is generally slower but uses less - RAM and is safe for any data type. + RAM, is safe for any data type, is faster for small arrays, and + is faster when there are not many duplicates. """ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) @@ -430,12 +432,14 @@ raw counts. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) algorithm to sort the input vector which will generally lead to - shorter running time. However the radix sort algorithm creates a - copy of the input vector and hence uses more RAM. Choose `:dict` - if the amount of available RAM is a limitation. + shorter running time for large `x` with many duplicates. However + the radix sort algorithm creates a copy of the input vector and + hence uses more RAM. Choose `:dict` if the amount of available + RAM is a limitation. - `:dict`: use `Dict`-based method which is generally slower but uses less - RAM and is safe for any data type. + RAM, is safe for any data type, is faster for small arrays, and + is faster when there are not many duplicates. """ countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)