Skip to content

Commit

Permalink
Merge de9a640 into 902e131
Browse files Browse the repository at this point in the history
  • Loading branch information
cjprybol committed Oct 26, 2017
2 parents 902e131 + de9a640 commit a0921a4
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,13 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
for (i, col) in enumerate(columns(joiner.dfl))
cols[i] = _similar(col, nrow)
fillcolumn!(cols[i], col, all_orig_left_ixs)
isa(col, CategoricalArray) && levels!(cols[i], levels(col))
end
for (i, col) in enumerate(columns(dfr_noon))
cols[i+ncleft] = _similar(col, nrow)
fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs)
permute!(cols[i+ncleft], right_perm)
isa(col, CategoricalArray) && levels!(cols[i+ncleft], levels(col))
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))

Expand All @@ -91,6 +93,15 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
# fix the result of the rightjoin by taking the nonnull values from the right table
offset = nrow - length(rightonly_ixs.orig)
fillcolumn!(res[on_col], joiner.dfr_on[on_col_ix], rightonly_ixs.orig, offset)
if isa(res[on_col], CategoricalArray) &&
isa(joiner.dfr_on[on_col_ix], CategoricalArray)
newlevels, ordered =
CategoricalArrays.mergelevels(isordered(res[on_col]),
levels(res[on_col]),
levels(joiner.dfr_on[on_col_ix]))
ordered!(res[on_col], ordered)
levels!(res[on_col], newlevels)
end
end
end
return res
Expand Down Expand Up @@ -222,6 +233,9 @@ join(df1::AbstractDataFrame,
For the three join operations that may introduce missing values (`:outer`, `:left`,
and `:right`), all columns of the returned data table will be nullable.
When merging `on` categorical columns that differ in the ordering of their levels, the
ordering of the left `DataFrame` takes precedence over the ordering of the right `DataFrame`.
### Result
* `::DataFrame` : the joined DataFrame
Expand Down
46 changes: 46 additions & 0 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -325,4 +325,50 @@ module TestJoin
@test all(isa.(o(on).columns,
[CategoricalVector{Union{T, Null}} for T in (Int, Float64)]))
end

@testset "maintain Categorical levels ordering on join - non-`on` cols" begin
    # `c` is a categorical column that is NOT a join key; its levels are set to a
    # non-default order so that any reordering performed by `join` is detectable.
    plain = DataFrame(a = [1, 2, 3], b = ["a", "b", "c"])
    catcol = levels!(categorical(["a", "b", "b"]), ["b", "a"])
    withcat = DataFrame(b = ["a", "b", "c"], c = catcol)
    expected = ["b", "a"]

    # Categorical column coming from the right-hand table.
    for kind in (:inner, :left, :right, :outer)
        @test levels(join(plain, withcat, on=:b, kind=kind)[:c]) == expected
    end

    # Categorical column coming from the left-hand table.
    for kind in (:inner, :semi)
        @test levels(join(withcat, plain, on=:b, kind=kind)[:c]) == expected
    end
end

@testset "maintain Categorical levels ordering on join - conflicting ordering" begin
    # Both tables join on a categorical `b`, but `rev` stores its levels reversed
    # while `fwd` keeps the order produced by the `CategoricalVector` constructor.
    rev = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
    levels!(rev[:b], ["d", "c", "b", "a"])
    fwd = DataFrame(b = CategoricalVector(["a", "b", "c"]), c = [5, 6, 7])

    # With `rev` on the left, its level order is expected for every join kind.
    for kind in (:inner, :left, :right, :outer, :semi)
        @test levels(join(rev, fwd, on=:b, kind=kind)[:b]) == ["d", "c", "b", "a"]
    end

    # With `fwd` on the left, these kinds are expected to yield only its own levels.
    for kind in (:inner, :left, :semi)
        @test levels(join(fwd, rev, on=:b, kind=kind)[:b]) == ["a", "b", "c"]
    end

    # For these kinds the result is expected to also carry "d", which only `rev` has,
    # placed after the left table's levels.
    for kind in (:right, :outer)
        @test levels(join(fwd, rev, on=:b, kind=kind)[:b]) == ["a", "b", "c", "d"]
    end
end

@testset "maintain Categorical levels ordering on join - only 1 is categorical" begin
    # `catdf` joins on a categorical `b` with reversed levels; `strdf` joins on a
    # plain string vector `b`.
    catdf = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
    levels!(catdf[:b], ["d", "c", "b", "a"])
    strdf = DataFrame(b = ["a", "b", "c"], c = [5, 6, 7])

    # Calls without an explicit `kind` keyword.
    @test levels(join(catdf, strdf, on=:b)[:b]) == ["d", "c", "b", "a"]
    @test levels(join(strdf, catdf, on=:b)[:b]) == ["a", "b", "c"]

    # Categorical table on the left: its level order is expected for every join kind.
    for kind in (:inner, :left, :right, :outer, :semi)
        @test levels(join(catdf, strdf, on=:b, kind=kind)[:b]) == ["d", "c", "b", "a"]
    end

    # Plain-string table on the left.
    for kind in (:inner, :left, :semi)
        @test levels(join(strdf, catdf, on=:b, kind=kind)[:b]) == ["a", "b", "c"]
    end

    # For these kinds the result is expected to also carry "d", which only `catdf` has.
    for kind in (:right, :outer)
        @test levels(join(strdf, catdf, on=:b, kind=kind)[:b]) == ["a", "b", "c", "d"]
    end
end
end

0 comments on commit a0921a4

Please sign in to comment.