From ccd30da1abb04ba07b5a6a6d7fd7616ab55a55f4 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Tue, 17 Oct 2017 12:51:01 -0700 Subject: [PATCH 1/4] Ensure levels are maintained when joining df's with Categorical cols --- src/abstractdataframe/join.jl | 2 ++ test/join.jl | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 3df256e601..880f559e94 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -76,11 +76,13 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, for (i, col) in enumerate(columns(joiner.dfl)) cols[i] = _similar(col, nrow) fillcolumn!(cols[i], col, all_orig_left_ixs) + isa(col, CategoricalArray) && levels!(cols[i], levels(col)) end for (i, col) in enumerate(columns(dfr_noon)) cols[i+ncleft] = _similar(col, nrow) fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs) permute!(cols[i+ncleft], right_perm) + isa(col, CategoricalArray) && levels!(cols[i+ncleft], levels(col)) end res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon))) diff --git a/test/join.jl b/test/join.jl index 62b7e53850..8ee360b45c 100644 --- a/test/join.jl +++ b/test/join.jl @@ -325,4 +325,16 @@ module TestJoin @test all(isa.(o(on).columns, [CategoricalVector{Union{T, Null}} for T in (Int, Float64)])) end + + @testset "maintain Categorical levels ordering on join" begin + A = DataFrame(a = [1,2,3], b = ["a", "b", "c"]); + B = DataFrame(b = ["a", "b", "c"], c = levels!(categorical(["a", "b", "b"]), ["b", "a"])); + @test levels(join(A, B, on=:b)[:c]) == ["b", "a"] + @test levels(join(B, A, on=:b)[:c]) == ["b", "a"] + @test levels(join(A, B, on=:b, kind=:inner)[:c]) == ["b", "a"] + @test levels(join(A, B, on=:b, kind=:left)[:c]) == ["b", "a"] + @test levels(join(A, B, on=:b, kind=:right)[:c]) == ["b", "a"] + @test levels(join(A, B, on=:b, kind=:outer)[:c]) == ["b", "a"] + @test levels(join(B, A, on=:b, kind = :semi)[:c]) == ["b", "a"] + end end From 
cec3821230aea079df3276e09d233c63d8152339 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 26 Oct 2017 11:38:53 -0700 Subject: [PATCH 2/4] Introduce mergelevels from CategoricalArrays to handle right outer join Add additional tests to assert how levels are handled when DFs have different numbers of levels and when only one of the columns is categorical --- src/abstractdataframe/join.jl | 9 +++++++ test/join.jl | 44 +++++++++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 880f559e94..828c6968f4 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -93,6 +93,15 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, # fix the result of the rightjoin by taking the nonnull values from the right table offset = nrow - length(rightonly_ixs.orig) fillcolumn!(res[on_col], joiner.dfr_on[on_col_ix], rightonly_ixs.orig, offset) + if isa(res[on_col], CategoricalArray) && + isa(joiner.dfr_on[on_col_ix], CategoricalArray) + newlevels, ordered = + CategoricalArrays.mergelevels(isordered(res[on_col]), + levels(res[on_col]), + levels(joiner.dfr_on[on_col_ix])) + ordered!(res[on_col], ordered) + levels!(res[on_col], newlevels) + end end end return res diff --git a/test/join.jl b/test/join.jl index 8ee360b45c..77ea937d1d 100644 --- a/test/join.jl +++ b/test/join.jl @@ -326,15 +326,49 @@ module TestJoin [CategoricalVector{Union{T, Null}} for T in (Int, Float64)])) end - @testset "maintain Categorical levels ordering on join" begin - A = DataFrame(a = [1,2,3], b = ["a", "b", "c"]); - B = DataFrame(b = ["a", "b", "c"], c = levels!(categorical(["a", "b", "b"]), ["b", "a"])); - @test levels(join(A, B, on=:b)[:c]) == ["b", "a"] - @test levels(join(B, A, on=:b)[:c]) == ["b", "a"] + @testset "maintain Categorical levels ordering on join - equal size" begin + A = DataFrame(a = [1, 2, 3], b = ["a", "b", "c"]) + c = 
levels!(categorical(["a", "b", "b"]), ["b", "a"]) + B = DataFrame(b = ["a", "b", "c"], c = c) @test levels(join(A, B, on=:b, kind=:inner)[:c]) == ["b", "a"] + @test levels(join(B, A, on=:b, kind=:inner)[:c]) == ["b", "a"] @test levels(join(A, B, on=:b, kind=:left)[:c]) == ["b", "a"] @test levels(join(A, B, on=:b, kind=:right)[:c]) == ["b", "a"] @test levels(join(A, B, on=:b, kind=:outer)[:c]) == ["b", "a"] @test levels(join(B, A, on=:b, kind = :semi)[:c]) == ["b", "a"] end + + @testset "maintain Categorical levels ordering on join - conflicting ordering" begin + A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"])) + levels!(A[:b], ["d", "c", "b", "a"]) + B = DataFrame(b = CategoricalVector(["a", "b", "c"]), c = [5, 6, 7]) + @test levels(join(A, B, on=:b, kind=:inner)[:b]) == ["d", "c", "b", "a"] + @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"] + @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"] + @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"] + @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"] + @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "c", "d"] + @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "c", "d"] + @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"] + @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"] + @test levels(join(B, A, on=:b, kind = :semi)[:b]) == ["a", "b", "c"] + end + + @testset "maintain Categorical levels ordering on join - only 1 is categorical" begin + A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"])) + levels!(A[:b], ["d", "c", "b", "a"]) + B = DataFrame(b = ["a", "b", "c"], c = [5, 6, 7]) + @test levels(join(A, B, on=:b)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, on=:b)[:b]) + @test levels(join(A, B, on=:b, kind=:inner)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, 
on=:b, kind=:inner)[:b]) + @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, on=:b, kind=:left)[:b]) + @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, on=:b, kind=:right)[:b]) + @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, on=:b, kind=:outer)[:b]) + @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"] + @test_throws MethodError levels(join(B, A, on=:b, kind = :semi)[:b]) + end end From 1d92fa745271d06cc4f93cc31cee42e1aecc789e Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 26 Oct 2017 12:02:37 -0700 Subject: [PATCH 3/4] Update tests w/ new CategoricalArrays & Vector promotion mechanics The first set of tests, which expected `MethodError`s, was written one version behind current; update them to reflect the improved promotion mechanics. Add a brief note to the `join` docstring explaining which level ordering takes precedence when joining on categorical columns. --- src/abstractdataframe/join.jl | 3 +++ test/join.jl | 14 +++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index 828c6968f4..abb0718f65 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -233,6 +233,9 @@ join(df1::AbstractDataFrame, For the three join operations that may introduce missing values (`:outer`, `:left`, and `:right`), all columns of the returned data table will be nullable. 
+When merging `on` categorical columns that differ in the ordering of their levels, the +ordering of the left dataframe will take precedent over the ordering of the right dataframe + ### Result * `::DataFrame` : the joined DataFrame diff --git a/test/join.jl b/test/join.jl index 77ea937d1d..a70964d668 100644 --- a/test/join.jl +++ b/test/join.jl @@ -326,7 +326,7 @@ module TestJoin [CategoricalVector{Union{T, Null}} for T in (Int, Float64)])) end - @testset "maintain Categorical levels ordering on join - equal size" begin + @testset "maintain Categorical levels ordering on join - non-`on` cols" begin A = DataFrame(a = [1, 2, 3], b = ["a", "b", "c"]) c = levels!(categorical(["a", "b", "b"]), ["b", "a"]) B = DataFrame(b = ["a", "b", "c"], c = c) @@ -359,16 +359,16 @@ module TestJoin levels!(A[:b], ["d", "c", "b", "a"]) B = DataFrame(b = ["a", "b", "c"], c = [5, 6, 7]) @test levels(join(A, B, on=:b)[:b]) == ["d", "c", "b", "a"] - @test_throws MethodError levels(join(B, A, on=:b)[:b]) + @test levels(join(B, A, on=:b)[:b]) == ["a", "b", "c"] @test levels(join(A, B, on=:b, kind=:inner)[:b]) == ["d", "c", "b", "a"] - @test_throws MethodError levels(join(B, A, on=:b, kind=:inner)[:b]) + @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"] @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"] - @test_throws MethodError levels(join(B, A, on=:b, kind=:left)[:b]) + @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"] @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"] - @test_throws MethodError levels(join(B, A, on=:b, kind=:right)[:b]) + @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "c", "d"] @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"] - @test_throws MethodError levels(join(B, A, on=:b, kind=:outer)[:b]) + @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "c", "d"] @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"] - @test_throws 
MethodError levels(join(B, A, on=:b, kind = :semi)[:b]) + @test levels(join(B, A, on=:b, kind = :semi)[:b]) == ["a", "b", "c"] end end From de9a64039011d56b637667cf60eab03870ff7827 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 26 Oct 2017 13:05:33 -0700 Subject: [PATCH 4/4] Improve docstring addition explaining categorical array level ordering --- src/abstractdataframe/join.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index abb0718f65..ac02227c91 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -234,7 +234,7 @@ For the three join operations that may introduce missing values (`:outer`, `:lef and `:right`), all columns of the returned data table will be nullable. When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left dataframe will take precedent over the ordering of the right dataframe +ordering of the left `DataFrame` takes precedence over the ordering of the right `DataFrame`. ### Result