Use copy! in joins to preserve levels of CategoricalArray columns (#1266)
JuliaData · Nov 21, 2017 · ff2729b · ff2729b
1 parent 2d29d2a
commit ff2729b
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 11 deletions.
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -75,11 +75,11 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
     _similar = kind == :inner ? similar : similar_missing
     for (i, col) in enumerate(columns(joiner.dfl))
         cols[i] = _similar(col, nrow)
-        fillcolumn!(cols[i], col, all_orig_left_ixs)
+        copy!(cols[i], view(col, all_orig_left_ixs))
     end
     for (i, col) in enumerate(columns(dfr_noon))
         cols[i+ncleft] = _similar(col, nrow)
-        fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs)
+        copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
     res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
@@ -89,20 +89,13 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
         # need to be taken from the right
         for (on_col_ix, on_col) in enumerate(joiner.on_cols)
             # fix the result of the rightjoin by taking the nonmissing values from the right table
-            offset = nrow - length(rightonly_ixs.orig)
-            fillcolumn!(res[on_col], joiner.dfr_on[on_col_ix], rightonly_ixs.orig, offset)
+            offset = nrow - length(rightonly_ixs.orig) + 1
+            copy!(res[on_col], offset, view(joiner.dfr_on[on_col_ix], rightonly_ixs.orig))
         end
     end
     return res
 end
 
-function fillcolumn!(dfcol::AbstractVector{T1}, refcol::AbstractVector{T2},
-                     indices::Vector{Int}, offset::Int=0) where {T1, T2}
-    @inbounds for (j, k) in enumerate(indices)
-        dfcol[j+offset] = refcol[k]
-    end
-end
-
 # map the indices of the left and right joined tables
 # to the indices of the rows in the resulting table
 # if `nothing` is given, the corresponding map is not built
@@ -222,6 +215,9 @@ join(df1::AbstractDataFrame,
 For the three join operations that may introduce missing values (`:outer`, `:left`,
 and `:right`), all columns of the returned data table will support missing values.
 
+When merging `on` categorical columns that differ in the ordering of their levels, the
+ordering of the left `DataFrame` takes precedence over the ordering of the right `DataFrame`.
+
 ### Result
 
 * `::DataFrame` : the joined DataFrame

diff --git a/test/join.jl b/test/join.jl
@@ -325,4 +325,50 @@ module TestJoin
         @test all(isa.(o(on).columns,
                        [CategoricalVector{Union{T, Missing}} for T in (Int, Float64)]))
     end
+
+    @testset "maintain CategoricalArray levels ordering on join - non-`on` cols" begin
+        A = DataFrame(a = [1, 2, 3], b = ["a", "b", "c"])
+        B = DataFrame(b = ["a", "b", "c"], c = CategoricalVector(["a", "b", "b"]))
+        levels!(B[:c], ["b", "a"])
+        @test levels(join(A, B, on=:b, kind=:inner)[:c]) == ["b", "a"]
+        @test levels(join(B, A, on=:b, kind=:inner)[:c]) == ["b", "a"]
+        @test levels(join(A, B, on=:b, kind=:left)[:c]) == ["b", "a"]
+        @test levels(join(A, B, on=:b, kind=:right)[:c]) == ["b", "a"]
+        @test levels(join(A, B, on=:b, kind=:outer)[:c]) == ["b", "a"]
+        @test levels(join(B, A, on=:b, kind = :semi)[:c]) == ["b", "a"]
+    end
+
+    @testset "maintain CategoricalArray levels ordering on join - ordering conflicts" begin
+        A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
+        levels!(A[:b], ["d", "c", "b", "a"])
+        B = DataFrame(b = CategoricalVector(["a", "b", "c"]), c = [5, 6, 7])
+        @test levels(join(A, B, on=:b, kind=:inner)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
+        @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
+        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "d", "c"]
+        @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "d", "c"]
+        @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind = :semi)[:b]) == ["a", "b", "c"]
+    end
+
+    @testset "maintain CategoricalArray levels ordering on join - left is categorical" begin
+        A = DataFrame(a = [1, 2, 3, 4], b = CategoricalVector(["a", "b", "c", "d"]))
+        levels!(A[:b], ["d", "c", "b", "a"])
+        B = DataFrame(b = ["a", "b", "c"], c = [5, 6, 7])
+        @test levels(join(A, B, on=:b)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b)[:b]) == ["a", "b", "c"]
+        @test levels(join(A, B, on=:b, kind=:inner)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:inner)[:b]) == ["a", "b", "c"]
+        @test levels(join(A, B, on=:b, kind=:left)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:left)[:b]) == ["a", "b", "c"]
+        @test levels(join(A, B, on=:b, kind=:right)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:right)[:b]) == ["a", "b", "c", "d"]
+        @test levels(join(A, B, on=:b, kind=:outer)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind=:outer)[:b]) == ["a", "b", "c", "d"]
+        @test levels(join(A, B, on=:b, kind = :semi)[:b]) == ["d", "c", "b", "a"]
+        @test levels(join(B, A, on=:b, kind = :semi)[:b]) == ["a", "b", "c"]
+    end
 end