make hashrows_col! not depend on CategoricalArrays.jl (#2518)

JuliaData · Nov 7, 2020 · b9e47e6 · b9e47e6
1 parent 6bc0ae8
commit b9e47e6
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 9 deletions.
diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
@@ -23,6 +23,7 @@ end
 function hashrows_col!(h::Vector{UInt},
                        n::Vector{Bool},
                        v::AbstractVector{T},
+                       rp::Nothing,
                        firstcol::Bool) where T
     @inbounds for i in eachindex(h)
         el = v[i]
@@ -37,17 +38,24 @@ end
 # should give the same hash as AbstractVector{T}
 function hashrows_col!(h::Vector{UInt},
                        n::Vector{Bool},
-                       v::AbstractCategoricalVector,
+                       v::AbstractVector,
+                       rp::Any,
                        firstcol::Bool)
-    levs = levels(v)
     # When hashing the first column, no need to take into account previous hash,
     # which is always zero
-    if firstcol
-        hashes = Vector{UInt}(undef, length(levs)+1)
-        hashes[1] = hash(missing)
-        hashes[2:end] .= hash.(levs)
-        @inbounds for (i, ref) in enumerate(v.refs)
-            h[i] = hashes[ref+1]
+    # Also avoid this path when the number of values in the pool is more than
+    # half the length of the vector; the 50% threshold is roughly based on benchmarks
+    if firstcol && 2 * length(rp) < length(v)
+        hashes = Vector{UInt}(undef, length(rp))
+        @inbounds for (i, v) in zip(eachindex(hashes), rp)
+            hashes[i] = hash(v)
+        end
+
+        fi = firstindex(rp)
+        # here we rely on the fact that `DataAPI.refpool` has a contiguous
+        # block of indices
+        @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+            h[i] = hashes[ref+1-fi]
         end
     else
         @inbounds for (i, x) in enumerate(v)
@@ -67,7 +75,8 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool)
     rhashes = zeros(UInt, len)
     missings = fill(false, skipmissing ? len : 0)
     for (i, col) in enumerate(cols)
-        hashrows_col!(rhashes, missings, col, i == 1)
+        rp = DataAPI.refpool(col)
+        hashrows_col!(rhashes, missings, col, rp, i == 1)
     end
     return (rhashes, missings)
 end

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -3178,4 +3178,19 @@ end
                     :min => min.(df.y, df.z), :max => max.(df.y, df.z), :y => df.y) |> sort
 end
 
+@testset "hashing of pooled vectors" begin
+    # test both hashrow calculation paths - the pool length threshold is 50%
+    for x in ([1:9; fill(1, 101)], [1:100;],
+              [1:9; fill(missing, 101)], [1:99; missing])
+        x1 = PooledArray(x);
+        x2 = categorical(x);
+        @test DataFrames.hashrows((x,), false) ==
+              DataFrames.hashrows((x1,), false) ==
+              DataFrames.hashrows((x2,), false)
+        @test DataFrames.hashrows((x,), true) ==
+              DataFrames.hashrows((x1,), true) ==
+              DataFrames.hashrows((x2,), true)
+    end
+end
+
 end # module