From b7204205682c22e479f17a24a775e1412f32adec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 6 Nov 2020 15:42:07 +0100
Subject: [PATCH 1/3] make hashrows_col! not depend on CategoricalArrays.jl

---
 src/dataframerow/utils.jl | 45 ++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
index 37a727bb9f..b2604070c9 100644
--- a/src/dataframerow/utils.jl
+++ b/src/dataframerow/utils.jl
@@ -35,19 +35,35 @@ function hashrows_col!(h::Vector{UInt},
 end
 
 # should give the same hash as AbstractVector{T}
-function hashrows_col!(h::Vector{UInt},
-                       n::Vector{Bool},
-                       v::AbstractCategoricalVector,
-                       firstcol::Bool)
-    levs = levels(v)
+function hashrows_col_pool!(h::Vector{UInt},
+                            n::Vector{Bool},
+                            v::AbstractVector,
+                            rp::AbstractVector, # this condition is currently met in all implementations, but is not part of the API
+                            firstcol::Bool)
     # When hashing the first column, no need to take into account previous hash,
     # which is always zero
-    if firstcol
-        hashes = Vector{UInt}(undef, length(levs)+1)
-        hashes[1] = hash(missing)
-        hashes[2:end] .= hash.(levs)
-        @inbounds for (i, ref) in enumerate(v.refs)
-            h[i] = hashes[ref+1]
+    # also when there are more than 90% of refs in the pool than the length of the
+    # vector avoid using this path. 90% is picked heuristically
+    if firstcol && length(rp) < 0.9length(v)
+        hashes = Vector{UInt}(undef, length(rp))
+        @inbounds for (i, v) in zip(eachindex(hashes), rp)
+            hashes[i] = hash(v)
+        end
+
+        fi = firstindex(rp)
+        if fi == 1
+            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+                h[i] = hashes[ref]
+            end
+        elseif fi == 0
+            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+                h[i] = hashes[ref+1]
+            end
+        else
+            # currently no implementation of DataAPI.jl interface hits this branch
+            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+                h[i] = hashes[ref+1-fi]
+            end
         end
     else
         @inbounds for (i, x) in enumerate(v)
@@ -67,7 +83,12 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool)
     rhashes = zeros(UInt, len)
     missings = fill(false, skipmissing ? len : 0)
     for (i, col) in enumerate(cols)
-        hashrows_col!(rhashes, missings, col, i == 1)
+        rp = DataAPI.refpool(x)
+        if rp === nothing
+            hashrows_col!(rhashes, missings, col, i == 1)
+        else
+            hashrows_col_pool!(rhashes, missings, col, rp, i == 1)
+        end
     end
     return (rhashes, missings)
 end

From 0d66e90ec572fac8bcba32a830f2247ebb2952a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 6 Nov 2020 17:45:46 +0100
Subject: [PATCH 2/3] Update src/dataframerow/utils.jl

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/dataframerow/utils.jl | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
index b2604070c9..20e858cad5 100644
--- a/src/dataframerow/utils.jl
+++ b/src/dataframerow/utils.jl
@@ -51,19 +51,8 @@ function hashrows_col_pool!(h::Vector{UInt},
         end
 
         fi = firstindex(rp)
-        if fi == 1
-            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
-                h[i] = hashes[ref]
-            end
-        elseif fi == 0
-            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
-                h[i] = hashes[ref+1]
-            end
-        else
-            # currently no implementation of DataAPI.jl interface hits this branch
-            @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
-                h[i] = hashes[ref+1-fi]
-            end
+        @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
+            h[i] = hashes[ref+1-fi]
         end
     else
         @inbounds for (i, x) in enumerate(v)

From 6584636452962747f8b27042f84fd739a8b17051 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 6 Nov 2020 18:38:48 +0100
Subject: [PATCH 3/3] fixes after code review

---
 src/dataframerow/utils.jl | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl
index 20e858cad5..e73f765683 100644
--- a/src/dataframerow/utils.jl
+++ b/src/dataframerow/utils.jl
@@ -23,6 +23,7 @@ end
 function hashrows_col!(h::Vector{UInt},
                        n::Vector{Bool},
                        v::AbstractVector{T},
+                       rp::Nothing,
                        firstcol::Bool) where T
     @inbounds for i in eachindex(h)
         el = v[i]
@@ -35,11 +36,11 @@ function hashrows_col!(h::Vector{UInt},
 end
 
 # should give the same hash as AbstractVector{T}
-function hashrows_col_pool!(h::Vector{UInt},
-                            n::Vector{Bool},
-                            v::AbstractVector,
-                            rp::AbstractVector, # this condition is currently met in all implementations, but is not part of the API
-                            firstcol::Bool)
+function hashrows_col!(h::Vector{UInt},
+                       n::Vector{Bool},
+                       v::AbstractVector,
+                       rp::Any,
+                       firstcol::Bool)
     # When hashing the first column, no need to take into account previous hash,
     # which is always zero
     # also when there are more than 90% of refs in the pool than the length of the
@@ -51,6 +52,8 @@ function hashrows_col_pool!(h::Vector{UInt},
         end
 
         fi = firstindex(rp)
+        # here we rely on the fact that `DataAPI.refpool` supports a continuous
+        # block of indices
         @inbounds for (i, ref) in enumerate(DataAPI.refarray(v))
             h[i] = hashes[ref+1-fi]
         end
@@ -72,12 +75,8 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool)
     rhashes = zeros(UInt, len)
     missings = fill(false, skipmissing ? len : 0)
     for (i, col) in enumerate(cols)
-        rp = DataAPI.refpool(x)
-        if rp === nothing
-            hashrows_col!(rhashes, missings, col, i == 1)
-        else
-            hashrows_col_pool!(rhashes, missings, col, rp, i == 1)
-        end
+        rp = DataAPI.refpool(col)
+        hashrows_col!(rhashes, missings, col, rp, i == 1)
     end
     return (rhashes, missings)
 end