Skip to content

Commit

Permalink
Merge 2bd3188 into 72a6304
Browse files Browse the repository at this point in the history
  • Loading branch information
nickeubank committed Jun 15, 2018
2 parents 72a6304 + 2bd3188 commit 7be0e7b
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 2 deletions.
32 changes: 30 additions & 2 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ function update_row_maps!(left_table::AbstractDataFrame,
end

"""
join(df1, df2; on = Symbol[], kind = :inner, makeunique = false)
join(df1, df2; on = Symbol[], kind = :inner, makeunique = false,
validate = (false, false))
Join two `DataFrame` objects
Expand Down Expand Up @@ -247,6 +248,12 @@ Join two `DataFrame` objects
if `true`, duplicate names will be suffixed with `_i`
(`i` starting at 1 for the first duplicate).
* `validate` : whether to check that columns passed as the `on` argument
define unique keys in each input data frame (according to [`isequal`](@ref)).
Can be a tuple or a pair, with the first element indicating whether to
run the check for `df1` and the second element for `df2`.
By default no check is performed.
For the three join operations that may introduce missing values (`:outer`, `:left`,
and `:right`), all columns of the returned data table will support missing values.
Expand Down Expand Up @@ -280,7 +287,8 @@ join(name, job2, on = :ID => :identifier)
function Base.join(df1::AbstractDataFrame,
df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
kind::Symbol = :inner, makeunique::Bool=false)
kind::Symbol = :inner, makeunique::Bool=false,
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false))
if kind == :cross
(on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
return crossjoin(df1, df2, makeunique=makeunique)
Expand All @@ -290,6 +298,26 @@ function Base.join(df1::AbstractDataFrame,

joiner = DataFrameJoiner(df1, df2, on)

# Check merge key validity.
# `validate[1]` / `validate[2]` select whether the left / right table is checked.
# `nonunique` is evaluated at most once per table; a side that is not being
# validated gets an empty flag vector, for which `any` is `false`.
left_dup_flags = validate[1] ? nonunique(joiner.dfl, joiner.left_on) : Bool[]
right_dup_flags = validate[2] ? nonunique(joiner.dfr, joiner.right_on) : Bool[]
left_invalid = any(left_dup_flags)
right_invalid = any(right_dup_flags)

# Throw an ArgumentError naming the offending table(s) and the row of the
# first duplicated key found in each of them.
if left_invalid && right_invalid
    first_error_df1 = findfirst(left_dup_flags)
    first_error_df2 = findfirst(right_dup_flags)
    throw(ArgumentError("Merge key(s) are not unique in both df1 and df2. " *
                        "First duplicate in df1 at row $first_error_df1. " *
                        "First duplicate in df2 at row $first_error_df2"))
elseif left_invalid
    first_error = findfirst(left_dup_flags)
    throw(ArgumentError("Merge key(s) in df1 are not unique. " *
                        "First duplicate at row $first_error"))
elseif right_invalid
    first_error = findfirst(right_dup_flags)
    throw(ArgumentError("Merge key(s) in df2 are not unique. " *
                        "First duplicate at row $first_error"))
end

if kind == :inner
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
Expand Down
60 changes: 60 additions & 0 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -578,4 +578,64 @@ module TestJoin
@test eltypes(join(l, r, on=:b, kind=:outer, makeunique=true)) ==
[Union{Int, Missing}, CS, Union{Int, Missing}]
end

# Exercises the `validate` keyword of `join`: an ArgumentError must be thrown
# when a validated side has duplicated merge keys, and the join must proceed
# normally when the validated side's keys are unique.
@testset "test checks of merge key uniqueness" begin
    @test_throws ArgumentError join(name, job, on=:ID, validate=(false, true))
    @test_throws ArgumentError join(name, job, on=:ID, validate=(true, true))
    @test_throws ArgumentError join(job, name, on=:ID, validate=(true, false))
    @test_throws ArgumentError join(job, name, on=:ID, validate=(true, true))
    @test_throws ArgumentError join(job, job, on=:ID, validate=(true, true))

    @test join(name, job, on=:ID, validate=(true, false)) == inner
    @test join(name, job, on=:ID, kind=:inner, validate=(false, false)) == inner

    # Make sure ok with various special values
    for special in [missing, NaN, 0.0, -0.0]
        name_w_special = DataFrame(ID = [1, 2, 3, special],
                                   Name = ["John Doe", "Jane Doe", "Joe Blogs", "Maria Tester"])
        @test join(name_w_special, job, on=:ID, validate=(true, false)) == inner

        # Make sure duplicated special values still an exception
        name_w_special_dups = DataFrame(ID = [1, 2, 3, special, special],
                                        Name = ["John Doe", "Jane Doe", "Joe Blogs",
                                                "Maria Tester", "Jill Jillerson"])
        @test_throws ArgumentError join(name_w_special_dups, name, on=:ID,
                                        validate=(true, false))
    end

    # Check 0.0 and -0.0 seen as different
    name_w_zeros = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
                             Name = ["John Doe", "Jane Doe",
                                     "Joe Blogs", "Maria Tester",
                                     "Jill Jillerson"])
    name_w_zeros2 = DataFrame(ID = [1, 2, 3, 0.0, -0.0],
                              Name = ["John Doe", "Jane Doe",
                                      "Joe Blogs", "Maria Tester",
                                      "Jill Jillerson"],
                              Name_1 = ["John Doe", "Jane Doe",
                                        "Joe Blogs", "Maria Tester",
                                        "Jill Jillerson"])

    # Compared with `isequal` so that 0.0 and -0.0 are treated as distinct values
    @test isequal(join(name_w_zeros, name_w_zeros, on=:ID, validate=(true, true)),
                  name_w_zeros2)

    # Check for multiple-column merge keys
    name_multi = DataFrame(ID1 = [1, 1, 2],
                           ID2 = ["a", "b", "a"],
                           Name = ["John Doe", "Jane Doe", "Joe Blogs"])
    job_multi = DataFrame(ID1 = [1, 2, 2, 4],
                          ID2 = ["a", "b", "b", "c"],
                          Job = ["Lawyer", "Doctor", "Florist", "Farmer"])
    outer_multi = DataFrame(ID1 = [1, 1, 2, 2, 2, 4],
                            ID2 = ["a", "b", "a", "b", "b", "c"],
                            Name = ["John Doe", "Jane Doe", "Joe Blogs",
                                    missing, missing, missing],
                            Job = ["Lawyer", missing, missing,
                                   "Doctor", "Florist", "Farmer"])

    # `outer_multi` contains `missing`, so the comparison must use `isequal`
    # (`==` would propagate `missing` instead of returning a Bool)
    @test isequal(join(name_multi, job_multi, on=[:ID1, :ID2], kind=:outer,
                       validate=(true, false)),
                  outer_multi)
    @test_throws ArgumentError join(name_multi, job_multi, on=[:ID1, :ID2], kind=:outer,
                                    validate=(false, true))
end
end

0 comments on commit 7be0e7b

Please sign in to comment.