From b7204205682c22e479f17a24a775e1412f32adec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Nov 2020 15:42:07 +0100 Subject: [PATCH 1/3] make hashrows_col! not depend on CategoricalArrays.jl --- src/dataframerow/utils.jl | 45 ++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index 37a727bb9f..b2604070c9 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -35,19 +35,35 @@ function hashrows_col!(h::Vector{UInt}, end # should give the same hash as AbstractVector{T} -function hashrows_col!(h::Vector{UInt}, - n::Vector{Bool}, - v::AbstractCategoricalVector, - firstcol::Bool) - levs = levels(v) +function hashrows_col_pool!(h::Vector{UInt}, + n::Vector{Bool}, + v::AbstractVector, + rp::AbstractVector, # this condition is currently met in all implementations, but is not part of the API + firstcol::Bool) # When hashing the first column, no need to take into account previous hash, # which is always zero - if firstcol - hashes = Vector{UInt}(undef, length(levs)+1) - hashes[1] = hash(missing) - hashes[2:end] .= hash.(levs) - @inbounds for (i, ref) in enumerate(v.refs) - h[i] = hashes[ref+1] + # also when there are more than 90% of refs in the pool than the length of the + # vector avoid using this path. 90% is picked heuristically + if firstcol && length(rp) < 0.9length(v) + hashes = Vector{UInt}(undef, length(rp)) + @inbounds for (i, v) in zip(eachindex(hashes), rp) + hashes[i] = hash(v) + end + + fi = firstindex(rp) + if fi == 1 + @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) + h[i] = hashes[ref] + end + elseif fi == 0 + @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) + h[i] = hashes[ref+1] + end + else + # currently no implementation of DataAPI.jl interface hits this branch + @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) + h[i] = hashes[ref+1-fi] + end end else @inbounds for (i, x) in enumerate(v) @@ -67,7 +83,12 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool) rhashes = zeros(UInt, len) missings = fill(false, skipmissing ? len : 0) for (i, col) in enumerate(cols) - hashrows_col!(rhashes, missings, col, i == 1) + rp = DataAPI.refpool(x) + if rp === nothing + hashrows_col!(rhashes, missings, col, i == 1) + else + hashrows_col_pool!(rhashes, missings, col, rp, i == 1) + end end return (rhashes, missings) end From 0d66e90ec572fac8bcba32a830f2247ebb2952a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Nov 2020 17:45:46 +0100 Subject: [PATCH 2/3] Update src/dataframerow/utils.jl Co-authored-by: Milan Bouchet-Valat --- src/dataframerow/utils.jl | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index b2604070c9..20e858cad5 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -51,19 +51,8 @@ function hashrows_col_pool!(h::Vector{UInt}, end fi = firstindex(rp) - if fi == 1 - @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) - h[i] = hashes[ref] - end - elseif fi == 0 - @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) - h[i] = hashes[ref+1] - end - else - # currently no implementation of DataAPI.jl interface hits this branch - @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) - h[i] = hashes[ref+1-fi] - end + @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) + h[i] = hashes[ref+1-fi] end else @inbounds for (i, x) in enumerate(v) From 6584636452962747f8b27042f84fd739a8b17051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 6 Nov 2020 18:38:48 +0100 Subject: [PATCH 3/3] fixes after code review --- src/dataframerow/utils.jl | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index 20e858cad5..e73f765683 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -23,6 +23,7 @@ end function hashrows_col!(h::Vector{UInt}, n::Vector{Bool}, v::AbstractVector{T}, + rp::Nothing, firstcol::Bool) where T @inbounds for i in eachindex(h) el = v[i] @@ -35,11 +36,11 @@ function hashrows_col!(h::Vector{UInt}, end # should give the same hash as AbstractVector{T} -function hashrows_col_pool!(h::Vector{UInt}, - n::Vector{Bool}, - v::AbstractVector, - rp::AbstractVector, # this condition is currently met in all implementations, but is not part of the API - firstcol::Bool) +function hashrows_col!(h::Vector{UInt}, + n::Vector{Bool}, + v::AbstractVector, + rp::Any, + firstcol::Bool) # When hashing the first column, no need to take into account previous hash, # which is always zero # also when there are more than 90% of refs in the pool than the length of the @@ -51,6 +52,8 @@ function hashrows_col_pool!(h::Vector{UInt}, end fi = firstindex(rp) + # here we rely on the fact that `DataAPI.refpool` supports a continuous + # block of indices @inbounds for (i, ref) in enumerate(DataAPI.refarray(v)) h[i] = hashes[ref+1-fi] end @@ -72,12 +75,8 @@ function hashrows(cols::Tuple{Vararg{AbstractVector}}, skipmissing::Bool) rhashes = zeros(UInt, len) missings = fill(false, skipmissing ? len : 0) for (i, col) in enumerate(cols) - rp = DataAPI.refpool(x) - if rp === nothing - hashrows_col!(rhashes, missings, col, i == 1) - else - hashrows_col_pool!(rhashes, missings, col, rp, i == 1) - end + rp = DataAPI.refpool(col) + hashrows_col!(rhashes, missings, col, rp, i == 1) end return (rhashes, missings) end