Commit 18de384: Merge b46ed00 into 4b58684

nalimilan committed Sep 21, 2020
2 parents 4b58684 + b46ed00
Showing 4 changed files with 234 additions and 56 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -35,7 +35,7 @@ test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"]

[compat]
julia = "1"
CategoricalArrays = "0.8"
CategoricalArrays = "0.8.3"
Compat = "2.2, 3"
DataAPI = "1.2"
InvertedIndices = "1"
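# Note: in Pkg [compat] syntax, "0.8.3" is a caret specifier allowing [0.8.3, 0.9.0);
# the bump presumably picks up a CategoricalArrays release with DataAPI.refpool support.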
158 changes: 115 additions & 43 deletions src/dataframerow/utils.jl
@@ -94,10 +94,20 @@ isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
# 4) whether groups are already sorted
# Optional `groups` vector is set to the group indices of each row (starting at 1)
# With skipmissing=true, rows with missing values are assigned index 0.
row_group_slots(cols::Tuple{Vararg{AbstractVector}},
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} =
row_group_slots(cols, DataAPI.refpool.(cols), hash, groups, skipmissing, sort)
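# Illustrative sketch of how the dispatch argument behaves (assumes PooledArrays and
# CategoricalArrays versions with DataAPI support; outputs in comments are approximate).
# Pooled columns return an AbstractVector pool, plain vectors return `nothing`, so a
# tuple containing any `nothing` cannot match the optimized NTuple{N,<:AbstractVector}
# method defined further down and the generic hash-based fallback below is used:
#
#     using DataAPI, PooledArrays, CategoricalArrays
#     DataAPI.refpool(PooledArray(["a", "b", "a"]))   # pool of unique values
#     DataAPI.refpool(categorical(["a", "b", "a"]))   # a CategoricalRefPool, an AbstractVector
#     DataAPI.refpool(["a", "b", "a"])                # nothing: no pool available
#     DataAPI.refarray(PooledArray(["a", "b", "a"]))  # integer codes indexing into the pool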

# Generic fallback method based on open addressing hash table
function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
refpools::Any,
hash::Val = Val(true),
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool}
@assert groups === nothing || length(groups) == length(cols[1])
rhashes, missings = hashrows(cols, skipmissing)
# inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
@@ -140,70 +150,133 @@ function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
return ngroups, rhashes, gslots, false
end
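# Worked illustration (a sketch, not part of the internal API surface) of the `groups`
# contract documented above, using the exported `groupindices` accessor, which exposes
# the same per-row assignment with `missing` in place of the internal 0:
#
#     using DataFrames
#     df = DataFrame(k = ["a", "b", "a", missing], v = 1:4)
#     gd = groupby(df, :k, skipmissing=true)
#     groupindices(gd)   # e.g. [1, 2, 1, missing]: the row with a missing key gets no group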

nlevels(x::PooledArray) = length(x.pool)
nlevels(x) = length(levels(x))

function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}},
# Optimized method for arrays for which DataAPI.refpool is defined and returns an AbstractVector
function row_group_slots(cols::NTuple{N,<:AbstractVector},
refpools::NTuple{N,<:AbstractVector},
hash::Val{false},
groups::Union{Vector{Int}, Nothing} = nothing,
skipmissing::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
skipmissing::Bool = false,
sort::Bool = false)::Tuple{Int, Vector{UInt}, Vector{Int}, Bool} where N
# Computing neither hashes nor groups isn't very useful,
# and this method needs to allocate a groups vector anyway
@assert groups !== nothing && all(col -> length(col) == length(groups), cols)

refpools = map(DataAPI.refpool, cols)
refs = map(DataAPI.refarray, cols)
missinginds = map(refpools) do refpool
eltype(refpool) >: Missing ?
something(findfirst(ismissing, refpool), lastindex(refpool)+1) : lastindex(refpool)+1
end

# If skipmissing=true, rows with missings all go to group 0,
# which will be removed by downstream functions
ngroupstup = map(cols) do c
nlevels(c) + (!skipmissing && eltype(c) >: Missing)
ngroupstup = map(refpools, missinginds) do refpool, missingind
len = length(refpool)
if skipmissing && missingind <= lastindex(refpool)
return len - 1
else
return len
end
end
ngroups = prod(ngroupstup)
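# Worked example (illustrative): for two key columns whose pools hold 3 and 4 values,
# with skipmissing=true and missing present only in the second pool,
# ngroupstup == (3, 4 - 1) and ngroups == prod((3, 3)) == 9 candidate combinations,
# many of which may turn out to be empty.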

# Fall back to hashing if there would be too many empty combinations.
# Fall back to hashing if there would be too many empty combinations
# or if the pools do not contain only unique values
# The first check ensures the computation of ngroups did not overflow.
# The rationale for the 2 threshold is that while the fallback method is always slower,
# it allocates a hash table of size length(groups) instead of the remap vector
# of size ngroups (i.e. the number of possible combinations) in this method:
# so it makes sense to allocate more memory for better performance,
# but it needs to remain reasonable compared with the size of the data frame.
if prod(Int128.(ngroupstup)) > typemax(Int) || ngroups > 2 * length(groups)
anydups = !all(allunique, refpools)
if prod(Int128.(ngroupstup)) > typemax(Int) ||
ngroups > 2 * length(groups) ||
anydups
# In the simplest case, we can work directly with the reference codes
newcols = (skipmissing && any(refpool -> eltype(refpool) >: Missing, refpools)) ||
sort ||
anydups ? cols : refs
return invoke(row_group_slots,
Tuple{Tuple{Vararg{AbstractVector}}, Val,
Union{Vector{Int}, Nothing}, Bool},
cols, hash, groups, skipmissing)
Tuple{Tuple{Vararg{AbstractVector}}, Any, Val,
Union{Vector{Int}, Nothing}, Bool, Bool},
newcols, refpools, hash, groups, skipmissing, sort)
end
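# The `invoke` above forces the generic hash-based method even though the argument
# types would otherwise dispatch back to this optimized method. A minimal sketch of
# that mechanism on a toy function (names are illustrative):
#
#     f(x::Integer) = "integer method"
#     f(x::Number) = "number method"
#     f(1)                         # "integer method" (ordinary dispatch)
#     invoke(f, Tuple{Number}, 1)  # "number method" (forces the less specific method)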

seen = fill(false, ngroups)
# Compute vector mapping missing to -1 if skipmissing=true
refmaps = map(cols) do col
nlevs = nlevels(col)
refmap = collect(-1:(nlevs-1))
# First value in refmap is only used by CategoricalArray
# (corresponds to ref 0, i.e. missing values)
refmap[1] = skipmissing ? -1 : nlevs
if col isa PooledArray{>: Missing} && skipmissing
missingind = get(col.invpool, missing, 0)
if missingind > 0
refmap[missingind+1] = -1
refmap[missingind+2:end] .-= 1
end
end
refmap
end
strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)::NTuple{N,Int}
@inbounds for i in eachindex(groups)
local refs
let i=i # Workaround for julia#15276
refs = map(c -> c.refs[i], cols)
firstinds = map(firstindex, refpools)
if sort
nminds = map(refpools, missinginds) do refpool, missingind
missingind > lastindex(refpool) ?
eachindex(refpool) : setdiff(eachindex(refpool), missingind)
end
vals = map((m, r, s) -> m[r+1] * s, refmaps, refs, strides)
j = sum(vals) + 1
# x < 0 happens with -1 in refmap, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
if skipmissing
sorted = all(issorted(view(refpool, nmind))
for (refpool, nmind) in zip(refpools, nminds))
else
seen[j] = true
sorted = all(issorted, refpools)
end
else
sorted = false
end
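# Illustrative check behind `sorted`: when every pool is already in sorted order
# (ignoring a skipped missing entry), increasing reference codes already correspond
# to increasing key values, so the refs can be used directly and the remapping below
# is skipped:
#
#     issorted(["A", "B", "X"])   # true:  no refmap needed
#     issorted(["B", "A", "X"])   # false: build a refmap from sortperm below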
if sort && !sorted
# Compute vector mapping missing to -1 if skipmissing=true
refmaps = map(cols, refpools, missinginds, nminds) do col, refpool, missingind, nmind
refmap = collect(0:length(refpool)-1)
if skipmissing
fi = firstindex(refpool)
if missingind <= lastindex(refpool)
refmap[missingind-fi+1] = -1
refmap[missingind-fi+2:end] .-= 1
end
if sort
perm = sortperm(view(refpool, nmind))
invpermute!(view(refmap, nmind .- fi .+ 1), perm)
end
elseif sort
# collect is needed for CategoricalRefPool
invpermute!(refmap, sortperm(collect(refpool)))
end
refmap
end
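# Worked example (illustrative): for a pool ["B", "A", "X"] with sort=true and no
# missing values, sortperm(["B", "A", "X"]) == [2, 1, 3], so after invpermute! the
# refmap sends ref 1 ("B") to 1, ref 2 ("A") to 0 and ref 3 ("X") to 2, i.e. each
# code is remapped to its key's zero-based sorted rank:
#
#     refmap = collect(0:2)
#     invpermute!(refmap, sortperm(["B", "A", "X"]))
#     refmap   # [1, 0, 2]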
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(c -> c[i], refs)
end
vals = map((m, r, s, fi) -> m[r-fi+1] * s, refmaps, refs_i, strides, firstinds)
j = sum(vals) + 1
# x < 0 happens with -1 in refmap, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
else
seen[j] = true
end
groups[i] = j
end
else
@inbounds for i in eachindex(groups)
local refs_i
let i=i # Workaround for julia#15276
refs_i = map(refs, missinginds) do ref, missingind
r = Int(ref[i])
if skipmissing
return r == missingind ? -1 : (r > missingind ? r-1 : r)
else
return r
end
end
end
vals = map((r, s, fi) -> (r-fi) * s, refs_i, strides, firstinds)
j = sum(vals) + 1
# x < 0 happens with -1, which corresponds to missing
if skipmissing && any(x -> x < 0, vals)
j = 0
else
seen[j] = true
end
groups[i] = j
end
groups[i] = j
end
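# Worked example (illustrative): with pools of sizes (3, 4), strides == (4, 1), so a
# row whose adjusted zero-based refs are (2, 3) lands in group 2*4 + 3*1 + 1 == 12,
# the same arithmetic as linear indexing into an array of group slots:
#
#     ngroupstup = (3, 4)
#     strides = (cumprod(collect(reverse(ngroupstup)))[end-1:-1:1]..., 1)   # (4, 1)
#     sum((2, 3) .* strides) + 1                                            # 12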
if !all(seen) # Compress group indices to remove unused ones
oldngroups = ngroups
@@ -220,8 +293,7 @@ function row_group_slots(cols::NTuple{N,<:Union{CategoricalVector,PooledVector}}
# To catch potential bugs inducing unnecessary computations
@assert oldngroups != ngroups
end
sorted = all(col -> col isa CategoricalVector, cols)
return ngroups, UInt[], Int[], sorted
return ngroups, UInt[], Int[], sort
end


@@ -267,7 +339,7 @@ end
function group_rows(df::AbstractDataFrame)
groups = Vector{Int}(undef, nrow(df))
ngroups, rhashes, gslots, sorted =
row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false)
row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true), groups, false, false)
rperm, starts, stops = compute_indices(groups, ngroups)
return RowGroupDict(df, rhashes, gslots, groups, rperm, starts, stops)
end
3 changes: 2 additions & 1 deletion src/groupeddataframe/splitapplycombine.jl
@@ -160,7 +160,8 @@ function groupby(df::AbstractDataFrame, cols;

groups = Vector{Int}(undef, nrow(df))
ngroups, rhashes, gslots, sorted =
row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false), groups, skipmissing)
row_group_slots(ntuple(i -> sdf[!, i], ncol(sdf)), Val(false),
groups, skipmissing, sort)

gd = GroupedDataFrame(df, copy(_names(sdf)), groups, nothing, nothing, nothing, ngroups, nothing,
Threads.ReentrantLock())
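A user-level sketch of the code path this change enables (assumes PooledArrays is available): with sort=true the optimized method now handles ordering through the reference pools, so groups come back ordered by key value even when the pool itself is unsorted.

using DataFrames, PooledArrays

df = DataFrame(k = PooledArray(["b", "a", "b", "c"]), v = 1:4)
gd = groupby(df, :k, sort=true)
[key.k for key in keys(gd)]   # expected ["a", "b", "c"]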
127 changes: 116 additions & 11 deletions test/grouping.jl
@@ -594,6 +594,122 @@ end
end
end

@testset "grouping arrays that allow missing without missings" begin
xv = ["A", "B", "B", "B", "A", "B", "A", "A"]
yv = ["B", "A", "A", "B", "A", "B", "A", "A"]
xvars = (xv,
categorical(xv),
levels!(categorical(xv), ["A", "B", "X"]),
levels!(categorical(xv), ["X", "B", "A"]),
_levels!(PooledArray(xv), ["A", "B"]),
_levels!(PooledArray(xv), ["B", "A", "X"]),
_levels!(PooledArray(xv), ["X", "A", "B"]))
yvars = (yv,
categorical(yv),
levels!(categorical(yv), ["A", "B", "X"]),
levels!(categorical(yv), ["B", "X", "A"]),
_levels!(PooledArray(yv), ["A", "B"]),
_levels!(PooledArray(yv), ["A", "B", "X"]),
_levels!(PooledArray(yv), ["B", "A", "X"]))
for x in xvars, y in yvars,
fx in (identity, allowmissing),
fy in (identity, allowmissing)
df = DataFrame(Key1 = fx(x), Key2 = fy(y), Value = 1:8)

@testset "sort=false, skipmissing=false" begin
gd = groupby_checked(df, :Key1)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]),
])

gd = groupby_checked(df, [:Key1, :Key2])
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
end

@testset "sort=false, skipmissing=true" begin
gd = groupby_checked(df, :Key1, skipmissing=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6])
])

gd = groupby_checked(df, [:Key1, :Key2], skipmissing=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
end

@testset "sort=true, skipmissing=false" begin
gd = groupby_checked(df, :Key1, sort=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6]),
])
@test issorted(vcat(gd...), :Key1)

gd = groupby_checked(df, [:Key1, :Key2], sort=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6]),
])
@test issorted(vcat(gd...), [:Key1, :Key2])
end

@testset "sort=true, skipmissing=true" begin
gd = groupby_checked(df, :Key1, sort=true, skipmissing=true)
@test length(gd) == 2
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2=["B", "A", "A", "A"], Value=[1, 5, 7, 8]),
DataFrame(Key1="B", Key2=["A", "A", "B", "B"], Value=[2, 3, 4, 6])
])
@test issorted(vcat(gd...), :Key1)

gd = groupby_checked(df, [:Key1, :Key2], sort=true, skipmissing=true)
@test length(gd) == 4
@test isequal_unordered(gd, [
DataFrame(Key1="A", Key2="A", Value=[5, 7, 8]),
DataFrame(Key1="A", Key2="B", Value=1),
DataFrame(Key1="B", Key2="A", Value=[2, 3]),
DataFrame(Key1="B", Key2="B", Value=[4, 6])
])
@test issorted(vcat(gd...), [:Key1, :Key2])
end
end
end
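# Companion sketch (illustrative, assumes PooledArrays): the user-visible effect of
# skipmissing when the pooled key actually contains missing values:
#
#     df = DataFrame(Key = PooledArray(["A", missing, "B"]), Value = 1:3)
#     length(groupby(df, :Key))                     # 3: missing forms its own group
#     length(groupby(df, :Key, skipmissing=true))   # 2: the missing row is dropped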

@testset "grouping refarray with fallback" begin
# The high number of categories compared to the number of rows triggers the use
# of the fallback grouping method
for x in ([3, 1, 2], [3, 1, missing])
df = DataFrame(x=categorical(x, levels=10000:-1:1),
x2=categorical(x, levels=3:-1:1),
y=[1, 2, 3])
for skipmissing in (true, false)
@test groupby(df, :x, sort=true, skipmissing=skipmissing) ≅
      groupby(df, :x, sort=true, skipmissing=skipmissing)
@test isequal_unordered(groupby(df, :x, skipmissing=skipmissing),
collect(AbstractDataFrame, groupby(df, :x, skipmissing=skipmissing)))
end
end
end
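# Illustrative arithmetic for why the fallback triggers here: the :x pool has 10_000
# candidate levels while the frame has only 3 rows, so ngroups is roughly 10_000,
# which exceeds 2 * length(groups) == 6, and row_group_slots falls back to hashing.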

@testset "grouping with three keys" begin
# We need many rows so that optimized CategoricalArray method is used
xv = rand(["A", "B", missing], 100)
@@ -632,17 +748,6 @@ end
dfs = [groupby_checked(dfb, [:Key1, :Key2, :Key3], sort=true, skipmissing=true)...]
@test isequal_unordered(gd, dfs)
@test issorted(vcat(gd...), [:Key1, :Key2, :Key3])

# This is an implementation detail but it allows checking
# that the optimized method is used
if df.Key1 isa CategoricalVector &&
df.Key2 isa CategoricalVector &&
df.Key3 isa CategoricalVector
@test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true) ≅
      groupby_checked(df, [:Key1, :Key2, :Key3], sort=false)
@test groupby_checked(df, [:Key1, :Key2, :Key3], sort=true, skipmissing=true) ≅
      groupby_checked(df, [:Key1, :Key2, :Key3], sort=false, skipmissing=true)
end
end
end
