Skip to content

Commit

Permalink
Use copy! in joins to preserve levels of CategoricalArray columns (#1266
Browse files Browse the repository at this point in the history
)
  • Loading branch information
cjprybol authored and nalimilan committed Nov 21, 2017
1 parent 2d29d2a commit ff2729b
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 11 deletions.
18 changes: 7 additions & 11 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
_similar = kind == :inner ? similar : similar_missing
for (i, col) in enumerate(columns(joiner.dfl))
cols[i] = _similar(col, nrow)
fillcolumn!(cols[i], col, all_orig_left_ixs)
copy!(cols[i], view(col, all_orig_left_ixs))
end
for (i, col) in enumerate(columns(dfr_noon))
cols[i+ncleft] = _similar(col, nrow)
fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs)
copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
Expand All @@ -89,20 +89,13 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
# need to be taken from the right
for (on_col_ix, on_col) in enumerate(joiner.on_cols)
# fix the result of the rightjoin by taking the nonmissing values from the right table
offset = nrow - length(rightonly_ixs.orig)
fillcolumn!(res[on_col], joiner.dfr_on[on_col_ix], rightonly_ixs.orig, offset)
offset = nrow - length(rightonly_ixs.orig) + 1
copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
end
end
return res
end

"""
    fillcolumn!(dfcol, refcol, indices, offset=0)

Copy `refcol[indices[j]]` into `dfcol[j + offset]` for every position `j` of `indices`.

# Arguments
- `dfcol`: destination vector, modified in place.
- `refcol`: source vector the values are read from.
- `indices`: positions in `refcol` to read, in the order they are written to `dfcol`.
- `offset`: added to the (1-based) position within `indices` to obtain the destination
  position in `dfcol`; defaults to `0`.

# Returns
`nothing`.

# Throws
- `BoundsError`: if any entry of `indices` is not a valid index of `refcol`, or if
  `offset + 1 : offset + length(indices)` does not fit in `dfcol`. The check happens
  before anything is written, so `dfcol` is left untouched on failure.
"""
function fillcolumn!(dfcol::AbstractVector, refcol::AbstractVector,
                     indices::AbstractVector{<:Integer}, offset::Integer=0)
    # Validate both index ranges once, up front, so the `@inbounds` loop below is
    # provably safe instead of trusting the caller.
    checkbounds(refcol, indices)
    checkbounds(dfcol, (offset + 1):(offset + length(indices)))
    @inbounds for (j, k) in enumerate(indices)
        dfcol[j + offset] = refcol[k]
    end
    return nothing
end

# map the indices of the left and right joined tables
# to the indices of the rows in the resulting table
# if `nothing` is given, the corresponding map is not built
Expand Down Expand Up @@ -222,6 +215,9 @@ join(df1::AbstractDataFrame,
For the three join operations that may introduce missing values (`:outer`, `:left`,
and `:right`), all columns of the returned data table will support missing values.
When merging `on` categorical columns that differ in the ordering of their levels, the
ordering of the left `DataFrame` takes precedence over the ordering of the right `DataFrame`.
### Result
* `::DataFrame` : the joined DataFrame
Expand Down
46 changes: 46 additions & 0 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -325,4 +325,50 @@ module TestJoin
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
end

@testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
    # `c` is a categorical column that is not a join key; its levels are reordered
    # so that any join that rebuilds them from the values would be detected.
    plain = DataFrame(a = [1, 2, 3], b = ["a", "b", "c"])
    withcat = DataFrame(b = ["a", "b", "c"], c = CategoricalVector(["a", "b", "b"]))
    levels!(withcat[:c], ["b", "a"])
    expected = ["b", "a"]

    # inner join is checked with the categorical table on either side
    @test levels(join(plain, withcat, on=:b, kind=:inner)[:c]) == expected
    @test levels(join(withcat, plain, on=:b, kind=:inner)[:c]) == expected
    for kind in (:left, :right, :outer)
        @test levels(join(plain, withcat, on=:b, kind=kind)[:c]) == expected
    end
    # semi join only keeps the left table's columns, so the categorical table goes first
    @test levels(join(withcat, plain, on=:b, kind=:semi)[:c]) == expected
end

@testset "maintain CategoricalArray levels ordering on join - ordering conflicts" begin
    # Both tables carry a categorical key column `b`, but with conflicting level orders.
    A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
    levels!(A[:b], ["d", "c", "b", "a"])
    B = DataFrame(b = CategoricalVector(["a", "b", "c"]), c = [5, 6, 7])

    from_A = ["d", "c", "b", "a"]
    from_B = ["a", "b", "c"]
    merged_B_first = ["a", "b", "d", "c"]

    # (left table, right table, join kind, expected levels of the joined `b` column)
    cases = (
        (A, B, :inner, from_A),
        (B, A, :inner, from_B),
        (A, B, :left, from_A),
        (B, A, :left, from_B),
        (A, B, :right, from_A),
        (B, A, :right, merged_B_first),
        (B, A, :outer, merged_B_first),
        (A, B, :outer, from_A),
        (A, B, :semi, from_A),
        (B, A, :semi, from_B),
    )
    for (lhs, rhs, kind, expected) in cases
        @test levels(join(lhs, rhs, on=:b, kind=kind)[:b]) == expected
    end
end

@testset "maintain CategoricalArray levels ordering on join - left is categorical" begin
    # Only `A` has a categorical key column; `B`'s key column is a plain string vector.
    A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
    levels!(A[:b], ["d", "c", "b", "a"])
    B = DataFrame(b = ["a", "b", "c"], c = [5, 6, 7])

    A_first = ["d", "c", "b", "a"]
    abc = ["a", "b", "c"]
    abcd = ["a", "b", "c", "d"]

    # default join kind
    @test levels(join(A, B, on=:b)[:b]) == A_first
    @test levels(join(B, A, on=:b)[:b]) == abc

    # For every explicit kind, `A` on the left always yields `A`'s level order;
    # with `B` on the left the expected levels depend on the kind.
    for (kind, B_first) in ((:inner, abc), (:left, abc), (:right, abcd),
                            (:outer, abcd), (:semi, abc))
        @test levels(join(A, B, on=:b, kind=kind)[:b]) == A_first
        @test levels(join(B, A, on=:b, kind=kind)[:b]) == B_first
    end
end
end

0 comments on commit ff2729b

Please sign in to comment.