diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
index 37a727bb9f..550b2b1175 100644
--- a/src/dataframerow/utils.jl
+++ b/src/dataframerow/utils.jl
@@ -23,6 +23,7 @@ end
 function hashrows_col!(h::Vector{UInt},
                        n::Vector{Bool},
                        v::AbstractVector{T},
+                       rp::Nothing,
                        firstcol::Bool) where T
     @inbounds for i in eachindex(h)
         el = v[i]
@@ -37,17 +38,24 @@ end
 # should give the same hash as AbstractVector{T}
 function hashrows_col!(h::Vector{UInt},
                        n::Vector{Bool},
-                       v::AbstractCategoricalVector,
+                       v::AbstractVector,
+                       rp::Any,
                        firstcol::Bool)
-    levs = levels(v)
     # When hashing the first column, no need to take into account previous hash,
     # which is always zero
-    if firstcol
-        hashes = Vector{UInt}(undef, length(levs)+1)
-        hashes[1] = hash(missing)
-        hashes[2:end] .= hash.(levs)
-        @inbounds for (i, ref) in enumerate(v.refs)
-            h[i] = hashes[ref+1]
+    # Also, when the number of values in the pool is more than half the length
+    # of the vector, avoid this path; the 50% threshold is roughly based on benchmarks.
+    if firstcol && 2 * length(rp) < length(v)
+        hashes = Vector{UInt}(undef, length(rp))
+        @inbounds for (i, val) in zip(eachindex(hashes), rp)
+            hashes[i] = hash(val)
+        end
+
+        fi = firstindex(rp)
+        # Here we rely on the fact that `DataAPI.refpool` has a contiguous
+        # block of indices.
+        @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+            h[i] = hashes[ref+1-fi]
         end
     else
         @inbounds for (i, x) in enumerate(v)
@@ -67,7 +75,8 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool)
     rhashes = zeros(UInt, len)
     missings = fill(false, skipmissing ? len : 0)
     for (i, col) in enumerate(cols)
-        hashrows_col!(rhashes, missings, col, i == 1)
+        rp = DataAPI.refpool(col)
+        hashrows_col!(rhashes, missings, col, rp, i == 1)
     end
     return (rhashes, missings)
 end
diff --git a/test/grouping.jl b/test/grouping.jl
index cafb5d7e08..952337b92f 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3173,4 +3173,19 @@ end
               :min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
 end
 
+@testset "hashing of pooled vectors" begin
+    # Test both hashrows calculation paths - the pool length threshold is 50%.
+    for x in ([1:9; fill(1, 101)], [1:100;],
+              [1:9; fill(missing, 101)], [1:99; missing])
+        x1 = PooledArray(x)
+        x2 = categorical(x)
+        @test DataFrames.hashrows((x,), false) ==
+              DataFrames.hashrows((x1,), false) ==
+              DataFrames.hashrows((x2,), false)
+        @test DataFrames.hashrows((x,), true) ==
+              DataFrames.hashrows((x1,), true) ==
+              DataFrames.hashrows((x2,), true)
+    end
+end
+
 end # module
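
Note on the technique: the fast path added above hashes each unique pool value once and then looks hashes up by integer ref code, instead of hashing every element. Below is a minimal standalone sketch of the same idea, assuming `PooledArrays` and `DataAPI` are available; `hash_pooled` is a hypothetical helper name, not part of DataFrames:

    using DataAPI, PooledArrays

    # Hash a pooled vector by hashing each pool value once, then indexing
    # those hashes by the integer ref codes (the idea behind the fast path
    # in hashrows_col! above).
    function hash_pooled(v::AbstractVector)
        rp = DataAPI.refpool(v)          # indexable pool of unique values
        hashes = Vector{UInt}(undef, length(rp))
        for (i, val) in zip(eachindex(hashes), rp)
            hashes[i] = hash(val)        # one hash per pool entry
        end
        # refarray gives integer codes into rp; shift by firstindex because
        # the pool need not be 1-based (pools allowing missing may start at 0)
        fi = firstindex(rp)
        return UInt[hashes[ref + 1 - fi] for ref in DataAPI.refarray(v)]
    end

    x = PooledArray([1, 2, 1, 3, 2, 1])
    @assert hash_pooled(x) == hash.(x)   # matches element-wise hashing

The `2 * length(rp) < length(v)` guard in the patch skips this path when the pool is large relative to the vector, since pre-hashing the pool only pays off when pool values repeat often enough.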