From 3767de3d44c1b55c2ce60038f4b81cb7a7d27a65 Mon Sep 17 00:00:00 2001 From: Moelf Date: Thu, 1 Oct 2020 21:57:07 -0400 Subject: [PATCH 01/26] Make countmap support iterators --- src/counts.jl | 7 +++++-- test/counts.jl | 27 ++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 2b017e4b9..3d3d64b00 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -255,7 +255,7 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T +function addcounts!(cm::Dict{T}, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM if radixsort_safe(T) && (alg == :auto || alg == :radixsort) @@ -269,7 +269,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T end """Dict-based addcounts method""" -function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T +function addcounts_dict!(cm::Dict{T}, x) where T for v in x index = ht_keyindex2!(cm, v) if index > 0 @@ -388,6 +388,9 @@ of occurrences. """ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) +countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :dict) +countmap(x, wv::AbstractVector{W}) where {W<:Real} = addcounts!(Dict{eltype(x),W}(), collect(x), wv) +countmap(x, wv) = countmap(collect(x), collect(wv)) """ diff --git a/test/counts.jl b/test/counts.jl index 2fd508327..fb90348ee 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -105,10 +105,31 @@ cm = Dict{Int, Int}() StatsBase.addcounts_dict!(cm,xx) @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +# test countmap for general iterators cm = countmap(x, weights(w)) -@test cm["a"] == 5.5 -@test cm["b"] == 4.5 -@test cm["c"] == 3.5 +cm_itr= countmap(skipmissing(x), weights(w)) +cm_itr2 = countmap(skipmissing(x), skipmissing(w)) +for c in (cm, cm_itr, cm_itr2) + @test c["a"] == 5.5 + @test c["b"] == 4.5 + @test c["c"] == 3.5 +end + +xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"]) +cm_missing = countmap(xx_missing) + +@test cm_missing["a"] == 2 +@test cm_missing["b"] == 2 +@test cm_missing["c"] == 1 + +w_missing = [0.5, 0.5, 0.5, 0.5, 1] +cm_missing2 = countmap(xx_missing, w_missing) +cm_missing3 = countmap(xx_missing, skipmissing(w_missing)) +for c in (cm_missing2, cm_missing3) + @test c["a"] == 1 + @test c["b"] == 1 + @test c["c"] == 1 +end @test cm == countmap(x, w) From 6f411d49a0abc5d339c2a69d45309b7326f488b5 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Fri, 2 Oct 2020 13:35:30 -0400 Subject: [PATCH 02/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 3d3d64b00..4cc14d2bd 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -255,10 +255,10 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -function addcounts!(cm::Dict{T}, x; alg = :auto) where T +function addcounts!(cm::Dict, x; alg = :auto) # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM - if radixsort_safe(T) && (alg == :auto || alg == :radixsort) + if radixsort_safe(eltype(x)) && (alg == :auto || alg == :radixsort) addcounts_radixsort!(cm, x) elseif alg == :radixsort throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead")) From c2b81e28582c4406b93aab0b60c01d4cfad25cbf Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 2 Oct 2020 13:36:56 -0400 Subject: [PATCH 03/26] disallow weights be interator --- src/counts.jl | 1 - test/counts.jl | 12 ++++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 4cc14d2bd..90a4f5712 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -390,7 +390,6 @@ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :dict) countmap(x, wv::AbstractVector{W}) where {W<:Real} = addcounts!(Dict{eltype(x),W}(), collect(x), wv) -countmap(x, wv) = countmap(collect(x), collect(wv)) """ diff --git a/test/counts.jl b/test/counts.jl index fb90348ee..3af7bf0b4 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -108,8 +108,7 @@ StatsBase.addcounts_dict!(cm,xx) # test countmap for general iterators cm = countmap(x, weights(w)) cm_itr= countmap(skipmissing(x), weights(w)) -cm_itr2 = countmap(skipmissing(x), skipmissing(w)) -for c in (cm, cm_itr, cm_itr2) +for c in (cm, cm_itr) @test c["a"] == 5.5 @test c["b"] == 4.5 @test c["c"] == 3.5 @@ -124,12 +123,9 @@ cm_missing = countmap(xx_missing) w_missing = [0.5, 0.5, 0.5, 0.5, 1] cm_missing2 = countmap(xx_missing, w_missing) -cm_missing3 = countmap(xx_missing, skipmissing(w_missing)) -for c in (cm_missing2, cm_missing3) - @test c["a"] == 1 - @test c["b"] == 1 - @test c["c"] == 1 -end +@test cm_missing2["a"] == 1 +@test cm_missing2["b"] == 1 +@test cm_missing2["c"] == 1 @test cm == countmap(x, w) From 5144581a26fd8917e16b89af8d8035e2bf505a5c Mon Sep 17 00:00:00 2001 From: Moelf Date: Fri, 2 Oct 2020 14:47:48 -0400 Subject: [PATCH 04/26] address comments --- test/counts.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/counts.jl b/test/counts.jl index 3af7bf0b4..bda3c955d 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -102,8 +102,10 @@ StatsBase.addcounts_radixsort!(cm,xx2) # testing the Dict-based addcounts cm = Dict{Int, Int}() +cm_itr = Dict{Int, Int}() StatsBase.addcounts_dict!(cm,xx) -@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) +@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) # test countmap for general iterators cm = countmap(x, weights(w)) From 0b3ff6c6061f36ff4149c861b31ddf8c461faae7 Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 11:55:06 -0400 Subject: [PATCH 05/26] make :auto default again for iterators --- src/counts.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 90a4f5712..e53dd1caa 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -318,8 +318,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128, Float32, Float64} "Can the type be safely sorted by radixsort" -radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true -radixsort_safe(::Type) = false +radixsort_safe(::Type{T}) where T = T<:BaseRadixSortSafeTypes function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T last_sx = sx[1] @@ -352,6 +351,9 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T # It seems that sort is inferred in Julia 0.7. return _addcounts_radix_sort_loop!(cm, sx) end +# general iterator form +addcounts_radixsort!(cm::Dict{T}, x) where T = addcounts_radixsort!(cm, collect(x)) + function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) @@ -388,7 +390,7 @@ of occurrences. """ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) -countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :dict) +countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) countmap(x, wv::AbstractVector{W}) where {W<:Real} = addcounts!(Dict{eltype(x),W}(), collect(x), wv) From 5d5ddc71d7f455e2a3b5cf198ebc2ba935570b07 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 13:19:01 -0400 Subject: [PATCH 06/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index e53dd1caa..336e401a7 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -391,7 +391,6 @@ of occurrences. countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) -countmap(x, wv::AbstractVector{W}) where {W<:Real} = addcounts!(Dict{eltype(x),W}(), collect(x), wv) """ From 6e40ec06a1532eaddf6b3e86e9a8c7e01e8aa323 Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 13:26:40 -0400 Subject: [PATCH 07/26] remove iterator + weights combination in countmap --- test/counts.jl | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index bda3c955d..4eb73f039 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -109,12 +109,9 @@ StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) # test countmap for general iterators cm = countmap(x, weights(w)) -cm_itr= countmap(skipmissing(x), weights(w)) -for c in (cm, cm_itr) - @test c["a"] == 5.5 - @test c["b"] == 4.5 - @test c["c"] == 3.5 -end +@test cm["a"] == 5.5 +@test cm["b"] == 4.5 +@test cm["c"] == 3.5 xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"]) cm_missing = countmap(xx_missing) @@ -123,12 +120,6 @@ cm_missing = countmap(xx_missing) @test cm_missing["b"] == 2 @test cm_missing["c"] == 1 -w_missing = [0.5, 0.5, 0.5, 0.5, 1] -cm_missing2 = countmap(xx_missing, w_missing) -@test cm_missing2["a"] == 1 -@test cm_missing2["b"] == 1 -@test cm_missing2["c"] == 1 - @test cm == countmap(x, w) pm = proportionmap(x, weights(w)) From 0933b7143762408b379abad3ab69ed9aca863b8a Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 13:43:11 -0400 Subject: [PATCH 08/26] make iterator radixsort! less allocation --- src/counts.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 336e401a7..8469039e6 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -351,8 +351,12 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T # It seems that sort is inferred in Julia 0.7. return _addcounts_radix_sort_loop!(cm, sx) end -# general iterator form -addcounts_radixsort!(cm::Dict{T}, x) where T = addcounts_radixsort!(cm, collect(x)) + +# fall-back for `x` an iterator +function addcounts_radixsort!(cm::Dict{T}, x) where T + sx = sort!(collect(x), alg = RadixSort) + return _addcounts_radix_sort_loop!(cm, sx) +end function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} From f0cb37077729e5956d8f0940626cd871365b3b7e Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 14:03:17 -0400 Subject: [PATCH 09/26] make iterator countmap faster on small byte types --- src/counts.jl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 8469039e6..a966a9ea1 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -394,8 +394,15 @@ of occurrences. """ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) -countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) - +# fall-back for iterator `x` +function countmap(x) + if eltype(x) <: Union{Bool, UInt8, UInt16, Int8, Int16} + # faster `addcounts!` specialized + addcounts!(Dict{eltype(x),Int}(), collect(x); alg = :auto) + else + addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) + end +end """ proportionmap(x) From 7cc1e4313f45bbbd833c0b92abd29ab89e9f59fd Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 14:33:13 -0400 Subject: [PATCH 10/26] alternative approach --- src/counts.jl | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index a966a9ea1..586c7c3e2 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -255,10 +255,12 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -function addcounts!(cm::Dict, x; alg = :auto) +addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x) +# manual dispatch for `x` being iterator +function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM - if radixsort_safe(eltype(x)) && (alg == :auto || alg == :radixsort) + if radixsort_safe(T) && (alg == :auto || alg == :radixsort) addcounts_radixsort!(cm, x) elseif alg == :radixsort throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead")) @@ -286,14 +288,14 @@ end # faster results and less memory usage. However we still wish to enable others # to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. -function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) cm[true] = get(cm, true, 0) + sumx cm[false] = get(cm, false, 0) + length(x) - sumx cm end -function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} +function _addcounts!(::Type{T}, cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) @inbounds for xi in x @@ -394,15 +396,8 @@ of occurrences. """ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) -# fall-back for iterator `x` -function countmap(x) - if eltype(x) <: Union{Bool, UInt8, UInt16, Int8, Int16} - # faster `addcounts!` specialized - addcounts!(Dict{eltype(x),Int}(), collect(x); alg = :auto) - else - addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) - end -end +countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) + """ proportionmap(x) From 0b8b0f7178436c60481f7b6bd15de58de98dfe65 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:00:14 -0400 Subject: [PATCH 11/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 586c7c3e2..8bfbf898d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -257,7 +257,7 @@ raw counts. """ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x) # manual dispatch for `x` being iterator -function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :auto) where T +function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM if radixsort_safe(T) && (alg == :auto || alg == :radixsort) From e4bc127f2aec9bdaad552fc0f99d435a16784fca Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:00:45 -0400 Subject: [PATCH 12/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 8bfbf898d..4f2496c5f 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -360,7 +360,6 @@ function addcounts_radixsort!(cm::Dict{T}, x) where T return _addcounts_radix_sort_loop!(cm, sx) end - function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} n = length(x) length(wv) == n || throw(DimensionMismatch()) From d618674192e299450a8df0a0e8ff784ea8a1f581 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:01:08 -0400 Subject: [PATCH 13/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 4f2496c5f..6ab8a89fe 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -395,7 +395,7 @@ of occurrences. """ countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) -countmap(x) = addcounts!(Dict{eltype(x),Int}(), x; alg = :auto) +countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) """ From 20b5c2c0d70a350f43744204b13ab27663c77966 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:01:16 -0400 Subject: [PATCH 14/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 6ab8a89fe..65fb017ae 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -295,7 +295,7 @@ function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = cm end -function _addcounts!(::Type{T}, cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} +function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) @inbounds for xi in x From 456bc9f656ac32266edb5ecba8fe10c9b0eb1bbc Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:01:32 -0400 Subject: [PATCH 15/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 65fb017ae..c0483bfe9 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -288,7 +288,7 @@ end # faster results and less memory usage. However we still wish to enable others # to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. -function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) sumx = sum(x) cm[true] = get(cm, true, 0) + sumx cm[false] = get(cm, false, 0) + length(x) - sumx From af528bbe99ec58a14de1b31e68b41eee8773180f Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sat, 3 Oct 2020 15:01:43 -0400 Subject: [PATCH 16/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index c0483bfe9..8fc74965e 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -255,7 +255,8 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x) +addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) + # manual dispatch for `x` being iterator function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster From df3828214d638a2f195feaccd46791997578aa9d Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 15:15:39 -0400 Subject: [PATCH 17/26] add tests for Bool / small byte size iterator --- src/counts.jl | 6 +++--- test/counts.jl | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 8fc74965e..e26aee6aa 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -289,12 +289,13 @@ end # faster results and less memory usage. However we still wish to enable others # to write generic algorithms, therefore the methods below still accept the # `alg` argument but it is ignored. -function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored) sumx = sum(x) cm[true] = get(cm, true, 0) + sumx cm[false] = get(cm, false, 0) + length(x) - sumx cm end +_addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) = _addcounts!(Bool, cm, collect(x); alg = alg) function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) @@ -394,9 +395,8 @@ of occurrences. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM and is safe for any data type. """ -countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg) -countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) +countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) """ diff --git a/test/counts.jl b/test/counts.jl index 4eb73f039..22aae4101 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -129,11 +129,12 @@ pm = proportionmap(x, weights(w)) # testing small bits type bx = [true, false, true, true, false] -@test countmap(bx) == Dict(true => 3, false => 2) +@test countmap(skipmissing(bx)) == countmap(bx) == Dict(true => 3, false => 2) for T in [UInt8, UInt16, Int8, Int16] tx = T[typemin(T), 8, typemax(T), 19, 8] - @test countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) + @test countmap(tx) == countmap(tx_missing) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) end @testset "views" begin From e016e6b061f0874e5cd5e1f2a731c056cec224d5 Mon Sep 17 00:00:00 2001 From: Moelf Date: Sat, 3 Oct 2020 17:36:35 -0400 Subject: [PATCH 18/26] Faster Bool iterator --- src/counts.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index e26aee6aa..bc65e3bc2 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -295,7 +295,19 @@ function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = cm[false] = get(cm, false, 0) + length(x) - sumx cm end -_addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) = _addcounts!(Bool, cm, collect(x); alg = alg) + +#speailized for `Bool` iterator +function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) + sumx = 0 + len = 0 + for i in x + sumx += i + len += 1 + end + cm[true] = get(cm, true, 0) + sumx + cm[false] = get(cm, false, 0) + len - sumx + cm +end function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16} counts = zeros(Int, 2^(8sizeof(T))) From 68cbca141a4184874676b3197ca3e26b7c268ffd Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sun, 4 Oct 2020 11:03:53 -0400 Subject: [PATCH 19/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index bc65e3bc2..170922bdc 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -296,7 +296,7 @@ function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = cm end -#speailized for `Bool` iterator +# specialized for `Bool` iterator function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored) sumx = 0 len = 0 From 7ffa1bb7f92c9b1ef30644c40e50d7f7c05c5318 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Sun, 4 Oct 2020 11:04:09 -0400 Subject: [PATCH 20/26] Update src/counts.jl Co-authored-by: Milan Bouchet-Valat --- src/counts.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 170922bdc..3278fce9d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -257,7 +257,6 @@ raw counts. """ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) -# manual dispatch for `x` being iterator function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM From 397dced50c8bd7b6ff642f796277afff1895a01c Mon Sep 17 00:00:00 2001 From: Moelf Date: Sun, 4 Oct 2020 12:11:38 -0400 Subject: [PATCH 21/26] More tests --- test/counts.jl | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 22aae4101..640472ade 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -90,6 +90,17 @@ pm = proportionmap(x) xx = repeat([6, 1, 3, 1], outer=100_000) cm = countmap(xx) @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +#with iterator +xx_missing = skipmissing(xx) +cm_missing = countmap(xx_missing) +@test cm_missing isa Dict{Int, Int} +@test cm_missing == cm + +cm_any_itr = countmap((i for i in xx)) +@test cm_any_itr[1] == 200_000 +@test cm_any_itr[3] == 100_000 +@test cm_any_itr[6] == 100_000 +@test cm_any_itr isa Dict{Any,Int} #no knowledge about type # testing the radixsort-based addcounts xx = repeat([6, 1, 3, 1], outer=100_000) @@ -99,6 +110,12 @@ StatsBase.addcounts_radixsort!(cm,xx) xx2 = repeat([7, 1, 3, 1], outer=100_000) StatsBase.addcounts_radixsort!(cm,xx2) @test cm == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) +# with iterator +cm_missing = Dict{Int, Int}() +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx)) +@test cm_missing == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +StatsBase.addcounts_radixsort!(cm_missing,skipmissing(xx2)) +@test cm_missing == Dict(1 => 400_000, 3 => 200_000, 6 => 100_000, 7 => 100_000) # testing the Dict-based addcounts cm = Dict{Int, Int}() @@ -106,20 +123,13 @@ cm_itr = Dict{Int, Int}() StatsBase.addcounts_dict!(cm,xx) StatsBase.addcounts_dict!(cm_itr,skipmissing(xx)) @test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) +@test cm_itr isa Dict{Int, Int} -# test countmap for general iterators cm = countmap(x, weights(w)) @test cm["a"] == 5.5 @test cm["b"] == 4.5 @test cm["c"] == 3.5 -xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"]) -cm_missing = countmap(xx_missing) - -@test cm_missing["a"] == 2 -@test cm_missing["b"] == 2 -@test cm_missing["c"] == 1 - @test cm == countmap(x, w) pm = proportionmap(x, weights(w)) @@ -129,12 +139,16 @@ pm = proportionmap(x, weights(w)) # testing small bits type bx = [true, false, true, true, false] -@test countmap(skipmissing(bx)) == countmap(bx) == Dict(true => 3, false => 2) +cm_bx_missing = countmap(skipmissing(bx)) +@test cm_bx_missing == countmap(bx) == Dict(true => 3, false => 2) +@test cm_bx_missing isa Dict{Bool, Int} for T in [UInt8, UInt16, Int8, Int16] tx = T[typemin(T), 8, typemax(T), 19, 8] tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8]) - @test countmap(tx) == countmap(tx_missing) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + cm_tx_missing = countmap(tx_missing) + @test cm_tx_missing == countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1) + @test cm_tx_missing isa Dict{T, Int} end @testset "views" begin From 931e79d4fb37083916c05909c931f8b068f7861c Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Mon, 5 Oct 2020 10:21:49 -0400 Subject: [PATCH 22/26] Update test/counts.jl Co-authored-by: Milan Bouchet-Valat --- test/counts.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 640472ade..5a6d81ad8 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -90,9 +90,9 @@ pm = proportionmap(x) xx = repeat([6, 1, 3, 1], outer=100_000) cm = countmap(xx) @test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000) -#with iterator -xx_missing = skipmissing(xx) -cm_missing = countmap(xx_missing) + +# with iterator +cm_missing = countmap(skipmissing(xx)) @test cm_missing isa Dict{Int, Int} @test cm_missing == cm From b22e376c4bd2fd77610684df7ab8d4d2a8b294b4 Mon Sep 17 00:00:00 2001 From: Jerry Ling Date: Mon, 5 Oct 2020 10:24:35 -0400 Subject: [PATCH 23/26] Update test/counts.jl Co-authored-by: Milan Bouchet-Valat --- test/counts.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 5a6d81ad8..860d7007f 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -97,10 +97,8 @@ cm_missing = countmap(skipmissing(xx)) @test cm_missing == cm cm_any_itr = countmap((i for i in xx)) -@test cm_any_itr[1] == 200_000 -@test cm_any_itr[3] == 100_000 -@test cm_any_itr[6] == 100_000 -@test cm_any_itr isa Dict{Any,Int} #no knowledge about type +@test cm_any_itr isa Dict{Any,Int} # no knowledge about type +@test cm_missing == cm # testing the radixsort-based addcounts xx = repeat([6, 1, 3, 1], outer=100_000) From 99fb67938d69f26371d40caca02bdc2badb60c18 Mon Sep 17 00:00:00 2001 From: Moelf Date: Tue, 6 Oct 2020 11:28:10 -0400 Subject: [PATCH 24/26] Add non-radix iterator countmap test --- test/counts.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/counts.jl b/test/counts.jl index 860d7007f..711393ddc 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -80,6 +80,13 @@ cm = countmap(x) @test cm["a"] == 3 @test cm["b"] == 2 @test cm["c"] == 1 + +# iterator, non-radixsort +cm_missing = countmap(skipmissing(x)) +cm_itr = countmap((i for i in x)) +@test cm_missing == cm +@test cm_itr == cm + pm = proportionmap(x) @test pm["a"] ≈ (1/2) @test pm["b"] ≈ (1/3) From 31ac4de905f131f34b7b297331dc9b2cc327ca98 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 6 Oct 2020 18:20:23 +0200 Subject: [PATCH 25/26] Update test/counts.jl --- test/counts.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/counts.jl b/test/counts.jl index 711393ddc..aba46b99a 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -83,9 +83,10 @@ cm = countmap(x) # iterator, non-radixsort cm_missing = countmap(skipmissing(x)) -cm_itr = countmap((i for i in x)) -@test cm_missing == cm -@test cm_itr == cm +cm_any_itr = countmap((i for i in x)) +@test cm_missing == cm_any_itr == cm +@test cm_missing isa Dict{String, Int} +@test cm_any_itr isa Dict{String, Int} pm = proportionmap(x) @test pm["a"] ≈ (1/2) From 008b1f7346d350850eb84381fc2ee0052fa4176e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 6 Oct 2020 18:23:54 +0200 Subject: [PATCH 26/26] Update counts.jl --- test/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/counts.jl b/test/counts.jl index aba46b99a..9f684df86 100644 --- a/test/counts.jl +++ b/test/counts.jl @@ -86,7 +86,7 @@ cm_missing = countmap(skipmissing(x)) cm_any_itr = countmap((i for i in x)) @test cm_missing == cm_any_itr == cm @test cm_missing isa Dict{String, Int} -@test cm_any_itr isa Dict{String, Int} +@test cm_any_itr isa Dict{Any, Int} pm = proportionmap(x) @test pm["a"] ≈ (1/2)