Merge 2bd3188 into 72a6304

JuliaData · Jun 15, 2018 · 7be0e7b · 7be0e7b
2 parents 72a6304 + 2bd3188
commit 7be0e7b
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 2 deletions.
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -213,7 +213,8 @@ function update_row_maps!(left_table::AbstractDataFrame,
 end
 
 """
-    join(df1, df2; on = Symbol[], kind = :inner, makeunique = false)
+    join(df1, df2; on = Symbol[], kind = :inner, makeunique = false,
+         validate = (false, false))
 
 Join two `DataFrame` objects
 
@@ -247,6 +248,12 @@ Join two `DataFrame` objects
   if `true`, duplicate names will be suffixed with `_i`
   (`i` starting at 1 for the first duplicate).
 
+* `validate` : whether to check that columns passed as the `on` argument
+   define unique keys in each input data frame (according to [`isequal`](@ref)).
+   Can be a tuple or a pair, with the first element indicating whether to
+   run the check for `df1` and the second element for `df2`. An `ArgumentError`
+   is thrown if a check fails. By default no check is performed.
+
 For the three join operations that may introduce missing values (`:outer`, `:left`,
 and `:right`), all columns of the returned data table will support missing values.
 
@@ -280,7 +287,8 @@ join(name, job2, on = :ID => :identifier)
 function Base.join(df1::AbstractDataFrame,
                    df2::AbstractDataFrame;
                    on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
-                   kind::Symbol = :inner, makeunique::Bool=false)
+                   kind::Symbol = :inner, makeunique::Bool=false,
+                   validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false))
     if kind == :cross
         (on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
         return crossjoin(df1, df2, makeunique=makeunique)
@@ -290,6 +298,26 @@ function Base.join(df1::AbstractDataFrame,
 
     joiner = DataFrameJoiner(df1, df2, on)
 
+    # Check merge key validity. Each duplicate-row mask is computed at most once,
+    # and only for the side(s) requested through `validate`.
+    left_dups = validate[1] ? nonunique(joiner.dfl, joiner.left_on) : Bool[]
+    right_dups = validate[2] ? nonunique(joiner.dfr, joiner.right_on) : Bool[]
+    left_invalid = any(left_dups)
+    right_invalid = any(right_dups)
+
+    if left_invalid && right_invalid
+        # Both sides were checked and both failed: report each first duplicate
+        throw(ArgumentError("Merge key(s) are not unique in both df1 and df2. " *
+                            "First duplicate in df1 at row $(findfirst(left_dups)). " *
+                            "First duplicate in df2 at row $(findfirst(right_dups))"))
+    elseif left_invalid
+        throw(ArgumentError("Merge key(s) in df1 are not unique. " *
+                            "First duplicate at row $(findfirst(left_dups))"))
+    elseif right_invalid
+        throw(ArgumentError("Merge key(s) in df2 are not unique. " *
+                            "First duplicate at row $(findfirst(right_dups))"))
+    end
+
     if kind == :inner
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),

diff --git a/test/join.jl b/test/join.jl
@@ -578,4 +578,64 @@ module TestJoin
         @test eltypes(join(l, r, on=:b, kind=:outer, makeunique=true)) ==
             [Union{Int, Missing}, CS, Union{Int, Missing}]
     end
+
+    @testset "test checks of merge key uniqueness" begin
+        @test_throws ArgumentError join(name, job, on=:ID, validate=(false, true))
+        @test_throws ArgumentError join(name, job, on=:ID, validate=(true, true))
+        @test_throws ArgumentError join(job, name, on=:ID, validate=(true, false))
+        @test_throws ArgumentError join(job, name, on=:ID, validate=(true, true))
+        @test_throws ArgumentError join(job, job, on=:ID, validate=(true, true))
+
+        # Joins succeed when the validated side has unique keys or validation is off
+        @test join(name, job, on=:ID, validate=(true, false)) ==  inner
+        @test join(name, job, on=:ID, kind=:inner, validate=(false, false)) == inner
+
+        # Make sure validation passes when a special value appears only once
+        for special in [missing, NaN, 0.0, -0.0]
+            name_w_special = DataFrame(ID = [1, 2, 3, special],
+                                       Name = ["John Doe", "Jane Doe", "Joe Blogs", "Maria Tester"])
+            @test join(name_w_special, job, on=:ID, validate=(true, false)) ==  inner
+
+            # Make sure duplicated special values still raise an exception
+            name_w_special_dups = DataFrame(ID = [1, 2, 3, special, special],
+                                            Name = ["John Doe", "Jane Doe", "Joe Blogs",
+                                                    "Maria Tester", "Jill Jillerson"])
+            @test_throws ArgumentError join(name_w_special_dups, name, on=:ID,
+                                            validate=(true, false))
+        end
+
+        # Check that 0.0 and -0.0 are seen as different keys (`isequal` semantics)
+        name_w_zeros = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
+                                 Name = ["John Doe", "Jane Doe",
+                                         "Joe Blogs", "Maria Tester",
+                                         "Jill Jillerson"])
+        name_w_zeros2 = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
+                                  Name = ["John Doe", "Jane Doe",
+                                          "Joe Blogs", "Maria Tester",
+                                          "Jill Jillerson"],
+                                  Name_1 = ["John Doe", "Jane Doe",
+                                            "Joe Blogs", "Maria Tester",
+                                            "Jill Jillerson"])
+
+        @test join(name_w_zeros, name_w_zeros, on=:ID, validate=(true, true)) ≅ name_w_zeros2
+
+        # Check validation with multiple-column merge keys
+        name_multi = DataFrame(ID1 = [1, 1, 2],
+                               ID2 = ["a", "b", "a"],
+                               Name = ["John Doe", "Jane Doe", "Joe Blogs"])
+        job_multi = DataFrame(ID1 = [1, 2, 2, 4],
+                              ID2 = ["a", "b", "b", "c"],
+                              Job = ["Lawyer", "Doctor", "Florist", "Farmer"])
+        outer_multi = DataFrame(ID1 = [1, 1, 2, 2, 2, 4],
+                                ID2 = ["a", "b", "a", "b", "b", "c"],
+                                Name = ["John Doe", "Jane Doe", "Joe Blogs",
+                                        missing, missing, missing],
+                                Job = ["Lawyer", missing, missing,
+                                       "Doctor", "Florist",  "Farmer"])
+
+         @test join(name_multi, job_multi, on=[:ID1, :ID2], kind=:outer,
+                    validate=(true, false)) ≅ outer_multi
+         @test_throws ArgumentError join(name_multi, job_multi, on=[:ID1, :ID2], kind=:outer,
+                                         validate=(false, true))
+    end
 end