Skip to content

Commit

Permalink
[BREAKING] add matchmissing kwarg to joins (#2504)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Nov 2, 2020
1 parent 5224175 commit a156ef6
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 42 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Expand Up @@ -44,6 +44,9 @@
* in `describe` the specification of custom aggregation is now `function => name`;
old `name => function` order is now deprecated
([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401))
* in joins passing `NaN` or real or imaginary `-0.0` in `on` column now throws an
error; passing `missing` throws an error unless `matchmissing=:equal` keyword argument
is passed ([#2504](https://github.com/JuliaData/DataFrames.jl/pull/2504))
* `unstack` now produces row and column keys in the order of their first appearance
and has two new keyword arguments `allowmissing` and `allowduplicates`
([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494))
Expand Down
155 changes: 121 additions & 34 deletions src/abstractdataframe/join.jl
Expand Up @@ -19,7 +19,8 @@ struct DataFrameJoiner
right_on::Vector{Symbol}

function DataFrameJoiner(dfl::AbstractDataFrame, dfr::AbstractDataFrame,
on::Union{<:OnType, AbstractVector})
on::Union{<:OnType, AbstractVector},
matchmissing::Symbol)
on_cols = isa(on, AbstractVector) ? on : [on]
left_on = Symbol[]
right_on = Symbol[]
Expand All @@ -41,7 +42,32 @@ struct DataFrameJoiner
"Symbol or Pair{Symbol,Symbol}."))
end
end
new(dfl, dfr, dfl[!, left_on], dfr[!, right_on], left_on, right_on)
dfl_on = dfl[!, left_on]
dfr_on = dfr[!, right_on]

if matchmissing === :error
for df in (dfl_on, dfr_on), col in eachcol(df)
if any(ismissing, col)
throw(ArgumentError("missing values in key columns are not allowed " *
"when matchmissing == :error"))
end
end
elseif matchmissing !== :equal
throw(ArgumentError("matchmissing allows only :error or :equal"))
end

for df in (dfl_on, dfr_on), col in eachcol(df)
if any(x -> (x isa Union{Complex, Real}) &&
(isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
throw(ArgumentError("currently for numeric values NaN and `-0.0` " *
"in their real or imaginary components are not " *
"allowed. Use CategoricalArrays.jl to wrap " *
"these values in a CategoricalVector to perform " *
"the requested join."))
end
end

new(dfl, dfr, dfl_on, dfr_on, left_on, right_on)
end
end

Expand Down Expand Up @@ -267,15 +293,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
indicator::Union{Nothing, Symbol, AbstractString},
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}},
left_rename::Union{Function, AbstractString, Symbol},
right_rename::Union{Function, AbstractString, Symbol})
right_rename::Union{Function, AbstractString, Symbol},
matchmissing::Symbol)
_check_consistency(df1)
_check_consistency(df2)

if on == []
throw(ArgumentError("Missing join argument 'on'."))
end

joiner = DataFrameJoiner(df1, df2, on)
joiner = DataFrameJoiner(df1, df2, on, matchmissing)

# Check merge key validity
left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false
Expand Down Expand Up @@ -448,10 +475,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
end

"""
innerjoin(df1, df2; on, makeunique = false,
validate = (false, false), renamecols = identity => identity)
innerjoin(df1, df2, dfs...; on, makeunique = false,
validate = (false, false))
innerjoin(df1, df2; on, makeunique=false, validate=(false, false),
renamecols=(identity => identity), matchmissing=:error)
innerjoin(df1, df2, dfs...; on, makeunique=false,
validate=(false, false), matchmissing=:error)
Perform an inner join of two or more data frame objects and return a `DataFrame`
containing the result. An inner join includes rows with keys that match in all
Expand Down Expand Up @@ -486,6 +513,14 @@ The order of rows in the result is undefined and may change in the future releas
to each column name, which is passed to it as a `String`. Note that `renamecols`
does not affect `on` columns, whose names are always taken from the left
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
Expand Down Expand Up @@ -556,26 +591,30 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector} = Symbol[],
makeunique::Bool=false,
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
renamecols::Pair=identity => identity)
renamecols::Pair=identity => identity,
matchmissing::Symbol=:error)
if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
" containing functions, strings, or `Symbol`s"))
end
return _join(df1, df2, on=on, kind=:inner, makeunique=makeunique,
indicator=nothing, validate=validate,
left_rename=first(renamecols), right_rename=last(renamecols))
left_rename=first(renamecols), right_rename=last(renamecols),
matchmissing=matchmissing)
end

# Inner join of three or more data frames, performed as a left-to-right fold:
# `df1` and `df2` are joined first and the result is then joined with `dfs...`.
# All keyword arguments, `matchmissing` included, are forwarded to BOTH the
# nested pairwise join and the outer call; otherwise the first join would fall
# back to the default `matchmissing=:error` and reject `missing` keys even when
# the caller asked for `matchmissing=:equal`.
innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...;
          on::Union{<:OnType, AbstractVector} = Symbol[],
          makeunique::Bool=false,
          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
          matchmissing::Symbol=:error) =
    innerjoin(innerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate,
                        matchmissing=matchmissing),
              dfs..., on=on, makeunique=makeunique, validate=validate,
              matchmissing=matchmissing)

"""
leftjoin(df1, df2; on, makeunique = false, indicator = nothing,
validate = (false, false), renamecols = identity => identity)
leftjoin(df1, df2; on, makeunique=false, indicator=nothing, validate=(false, false),
renamecols=(identity => identity), matchmissing=:error)
Perform a left join of two data frame objects and return a `DataFrame` containing
the result. A left join includes all rows from `df1`.
Expand Down Expand Up @@ -611,9 +650,17 @@ The order of rows in the result is undefined and may change in the future releas
to each column name, which is passed to it as a `String`. Note that `renamecols`
does not affect `on` columns, whose names are always taken from the left
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
All columns of the returned data table will support missing values.
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
of the right data frame.
Expand Down Expand Up @@ -682,19 +729,21 @@ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector} = Symbol[],
makeunique::Bool=false, indicator::Union{Nothing, Symbol, AbstractString} = nothing,
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
renamecols::Pair=identity => identity)
renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
" containing functions, strings, or `Symbol`s"))
end
return _join(df1, df2, on=on, kind=:left, makeunique=makeunique,
indicator=indicator, validate=validate,
left_rename=first(renamecols), right_rename=last(renamecols))
left_rename=first(renamecols), right_rename=last(renamecols),
matchmissing=matchmissing)
end

"""
rightjoin(df1, df2; on, makeunique = false, indicator = nothing,
validate = (false, false), renamecols = identity => identity)
rightjoin(df1, df2; on, makeunique=false, indicator = nothing,
validate=(false, false), renamecols=(identity => identity),
matchmissing=:error)
Perform a right join on two data frame objects and return a `DataFrame` containing
the result. A right join includes all rows from `df2`.
Expand Down Expand Up @@ -730,9 +779,17 @@ The order of rows in the result is undefined and may change in the future releas
to each column name, which is passed to it as a `String`. Note that `renamecols`
does not affect `on` columns, whose names are always taken from the left
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
All columns of the returned data table will support missing values.
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
of the right data frame.
Expand Down Expand Up @@ -801,21 +858,22 @@ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
indicator::Union{Nothing, Symbol, AbstractString} = nothing,
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
renamecols::Pair=identity => identity)
renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
" containing functions, strings, or `Symbol`s"))
end
return _join(df1, df2, on=on, kind=:right, makeunique=makeunique,
indicator=indicator, validate=validate,
left_rename=first(renamecols), right_rename=last(renamecols))
left_rename=first(renamecols), right_rename=last(renamecols),
matchmissing=matchmissing)
end

"""
outerjoin(df1, df2; on, makeunique = false, indicator = nothing,
validate = (false, false), renamecols = identity => identity)
outerjoin(df1, df2; on, makeunique=false, indicator=nothing, validate=(false, false),
renamecols=(identity => identity), matchmissing=:error)
outerjoin(df1, df2, dfs...; on, makeunique = false,
validate = (false, false))
validate = (false, false), matchmissing=:error)
Perform an outer join of two or more data frame objects and return a `DataFrame`
containing the result. An outer join includes rows with keys that appear in any
Expand Down Expand Up @@ -855,10 +913,17 @@ The order of rows in the result is undefined and may change in the future releas
to each column name, which is passed to it as a `String`. Note that `renamecols`
does not affect `on` columns, whose names are always taken from the left
data frame and left unchanged.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
All columns of the returned data table will support missing values.
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
of the right data frame.
Expand Down Expand Up @@ -933,24 +998,27 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector} = Symbol[],
makeunique::Bool=false, indicator::Union{Nothing, Symbol, AbstractString} = nothing,
validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
renamecols::Pair=identity => identity)
renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
" containing functions, strings, or `Symbol`s"))
end
return _join(df1, df2, on=on, kind=:outer, makeunique=makeunique,
indicator=indicator, validate=validate,
left_rename=first(renamecols), right_rename=last(renamecols))
left_rename=first(renamecols), right_rename=last(renamecols),
matchmissing=matchmissing)
end

# Outer join of three or more data frames, performed as a left-to-right fold:
# `df1` and `df2` are joined first and the result is then joined with `dfs...`.
# All keyword arguments, `matchmissing` included, are forwarded to BOTH the
# nested pairwise join and the outer call; otherwise the first join would fall
# back to the default `matchmissing=:error` and reject `missing` keys even when
# the caller asked for `matchmissing=:equal`.
outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...;
          on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
          matchmissing::Symbol=:error) =
    outerjoin(outerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate,
                        matchmissing=matchmissing),
              dfs..., on=on, makeunique=makeunique, validate=validate,
              matchmissing=matchmissing)

"""
semijoin(df1, df2; on, makeunique = false, validate = (false, false))
semijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error)
Perform a semi join of two data frame objects and return a `DataFrame`
containing the result. A semi join returns the subset of rows of `df1` that
Expand Down Expand Up @@ -980,6 +1048,14 @@ The order of rows in the result is undefined and may change in the future releas
Can be a tuple or a pair, with the first element indicating whether to
run check for `df1` and the second element for `df2`.
By default no check is performed.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
Expand Down Expand Up @@ -1044,13 +1120,14 @@ julia> semijoin(name, job2, on = [:ID => :identifier])
"""
function semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
                  on::Union{<:OnType, AbstractVector} = Symbol[],
                  makeunique::Bool=false,
                  validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
                  matchmissing::Symbol=:error)
    # Delegate to the shared `_join` implementation with `kind=:semi`; no
    # indicator column is requested and column names go through `identity`.
    return _join(df1, df2;
                 on=on, kind=:semi, makeunique=makeunique,
                 indicator=nothing, validate=validate,
                 left_rename=identity, right_rename=identity,
                 matchmissing=matchmissing)
end

"""
antijoin(df1, df2; on, makeunique = false, validate = (false, false))
antijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error)
Perform an anti join of two data frame objects and return a `DataFrame`
containing the result. An anti join returns the subset of rows of `df1` that do
Expand All @@ -1076,6 +1153,14 @@ The order of rows in the result is undefined and may change in the future releas
Can be a tuple or a pair, with the first element indicating whether to
run check for `df1` and the second element for `df2`.
By default no check is performed.
- `matchmissing` : if equal to `:error` throw an error if `missing` is present
in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
matched (`isequal` is used for comparisons of rows for equality)
It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
imaginary part of the number. If you need to perform a join on such values use
CategoricalArrays.jl and transform a column containing such values into a
`CategoricalVector`.
When merging `on` categorical columns that differ in the ordering of their
levels, the ordering of the left data frame takes precedence over the ordering
Expand Down Expand Up @@ -1137,10 +1222,12 @@ julia> antijoin(name, job2, on = [:ID => :identifier])
"""
function antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
                  on::Union{<:OnType, AbstractVector} = Symbol[],
                  makeunique::Bool=false,
                  validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
                  matchmissing::Symbol=:error)
    # Delegate to the shared `_join` implementation with `kind=:anti`; no
    # indicator column is requested and column names go through `identity`.
    return _join(df1, df2;
                 on=on, kind=:anti, makeunique=makeunique,
                 indicator=nothing, validate=validate,
                 left_rename=identity, right_rename=identity,
                 matchmissing=matchmissing)
end

"""
crossjoin(df1, df2, dfs...; makeunique = false)
Expand Down

0 comments on commit a156ef6

Please sign in to comment.