[BREAKING] add matchmissing kwarg to joins (#2504)

JuliaData · Nov 2, 2020 · a156ef6 · a156ef6
1 parent 5224175
commit a156ef6
Show file tree

Hide file tree

Showing 4 changed files with 166 additions and 42 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -44,6 +44,9 @@
 * in `describe` the specification of custom aggregation is now `function => name`;
   old `name => function` order is now deprecated
   ([#2401](https://github.com/JuliaData/DataFrames.jl/pull/2401))
+* in joins passing `NaN` or real or imaginary `-0.0` in `on` column now throws an
+  error; passing `missing` thows an error unless `matchmissing=:equal` keyword argument
+  is passed ([#2504](https://github.com/JuliaData/DataFrames.jl/pull/2504))
 * `unstack` now produces row and column keys in the order of their first appearance
    and has two new keyword arguments `allowmissing` and `allowduplicates`
   ([#2494](https://github.com/JuliaData/DataFrames.jl/pull/2494))

diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -19,7 +19,8 @@ struct DataFrameJoiner
     right_on::Vector{Symbol}
 
     function DataFrameJoiner(dfl::AbstractDataFrame, dfr::AbstractDataFrame,
-                             on::Union{<:OnType, AbstractVector})
+                             on::Union{<:OnType, AbstractVector},
+                             matchmissing::Symbol)
         on_cols = isa(on, AbstractVector) ? on : [on]
         left_on = Symbol[]
         right_on = Symbol[]
@@ -41,7 +42,32 @@ struct DataFrameJoiner
                                     "Symbol or Pair{Symbol,Symbol}."))
             end
         end
-        new(dfl, dfr, dfl[!, left_on], dfr[!, right_on], left_on, right_on)
+        dfl_on = dfl[!, left_on]
+        dfr_on = dfr[!, right_on]
+
+        if matchmissing === :error
+            for df in (dfl_on, dfr_on), col in eachcol(df)
+                if any(ismissing, col)
+                    throw(ArgumentError("missing values in key columns are not allowed " *
+                                        "when matchmissing == :error"))
+                end
+            end
+        elseif matchmissing !== :equal
+            throw(ArgumentError("matchmissing allows only :error or :equal"))
+        end
+
+        for df in (dfl_on, dfr_on), col in eachcol(df)
+            if any(x -> (x isa Union{Complex, Real}) &&
+                        (isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
+                throw(ArgumentError("currently for numeric values NaN and `-0.0` " *
+                                    "in their real or imaginary components are not " *
+                                    "allowed. Use CategoricalArrays.jl to wrap " *
+                                    "these values in a CategoricalVector to perform " *
+                                    "the requested join."))
+            end
+        end
+
+        new(dfl, dfr, dfl_on, dfr_on, left_on, right_on)
     end
 end
 
@@ -267,15 +293,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
                indicator::Union{Nothing, Symbol, AbstractString},
                validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}},
                left_rename::Union{Function, AbstractString, Symbol},
-               right_rename::Union{Function, AbstractString, Symbol})
+               right_rename::Union{Function, AbstractString, Symbol},
+               matchmissing::Symbol)
     _check_consistency(df1)
     _check_consistency(df2)
 
     if on == []
         throw(ArgumentError("Missing join argument 'on'."))
     end
 
-    joiner = DataFrameJoiner(df1, df2, on)
+    joiner = DataFrameJoiner(df1, df2, on, matchmissing)
 
     # Check merge key validity
     left_invalid = validate[1] ? any(nonunique(joiner.dfl, joiner.left_on)) : false
@@ -448,10 +475,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame;
 end
 
 """
-    innerjoin(df1, df2; on, makeunique = false,
-              validate = (false, false), renamecols = identity => identity)
-    innerjoin(df1, df2, dfs...; on, makeunique = false,
-              validate = (false, false))
+    innerjoin(df1, df2; on, makeunique=false, validate=(false, false),
+              renamecols=(identity => identity), matchmissing=:error)
+    innerjoin(df1, df2, dfs...; on, makeunique=false,
+              validate=(false, false), matchmissing=:error)
 
 Perform an inner join of two or more data frame objects and return a `DataFrame`
 containing the result. An inner join includes rows with keys that match in all
@@ -486,6 +513,14 @@ The order of rows in the result is undefined and may change in the future releas
   to each column name, which is passed to it as a `String`. Note that `renamecols`
   does not affect `on` columns, whose names are always taken from the left
   data frame and left unchanged.
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
+
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
 
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
@@ -556,26 +591,30 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
                    on::Union{<:OnType, AbstractVector} = Symbol[],
                    makeunique::Bool=false,
                    validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
-                   renamecols::Pair=identity => identity)
+                   renamecols::Pair=identity => identity,
+                   matchmissing::Symbol=:error)
     if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
         throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
                             " containing functions, strings, or `Symbol`s"))
     end
     return _join(df1, df2, on=on, kind=:inner, makeunique=makeunique,
                  indicator=nothing, validate=validate,
-                 left_rename=first(renamecols), right_rename=last(renamecols))
+                 left_rename=first(renamecols), right_rename=last(renamecols),
+                 matchmissing=matchmissing)
 end
 
 innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...;
           on::Union{<:OnType, AbstractVector} = Symbol[],
           makeunique::Bool=false,
-          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) =
+          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
+          matchmissing::Symbol=:error) =
     innerjoin(innerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate),
-              dfs..., on=on, makeunique=makeunique, validate=validate)
+              dfs..., on=on, makeunique=makeunique, validate=validate,
+              matchmissing=matchmissing)
 
 """
-    leftjoin(df1, df2; on, makeunique = false, indicator = nothing,
-             validate = (false, false), renamecols = identity => identity)
+    leftjoin(df1, df2; on, makeunique=false, indicator=nothing, validate=(false, false),
+             renamecols=(identity => identity), matchmissing=:error)
 
 Perform a left join of twodata frame objects and return a `DataFrame` containing
 the result. A left join includes all rows from `df1`.
@@ -611,9 +650,17 @@ The order of rows in the result is undefined and may change in the future releas
   to each column name, which is passed to it as a `String`. Note that `renamecols`
   does not affect `on` columns, whose names are always taken from the left
   data frame and left unchanged.
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -682,19 +729,21 @@ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
          on::Union{<:OnType, AbstractVector} = Symbol[],
          makeunique::Bool=false, indicator::Union{Nothing, Symbol, AbstractString} = nothing,
          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
-         renamecols::Pair=identity => identity)
+         renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
     if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
         throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
                             " containing functions, strings, or `Symbol`s"))
     end
     return _join(df1, df2, on=on, kind=:left, makeunique=makeunique,
                  indicator=indicator, validate=validate,
-                 left_rename=first(renamecols), right_rename=last(renamecols))
+                 left_rename=first(renamecols), right_rename=last(renamecols),
+                 matchmissing=matchmissing)
 end
 
 """
-    rightjoin(df1, df2; on, makeunique = false, indicator = nothing,
-              validate = (false, false), renamecols = identity => identity)
+    rightjoin(df1, df2; on, makeunique=false, indicator = nothing,
+              validate=(false, false), renamecols=(identity => identity),
+              matchmissing=:error)
 
 Perform a right join on two data frame objects and return a `DataFrame` containing
 the result. A right join includes all rows from `df2`.
@@ -730,9 +779,17 @@ The order of rows in the result is undefined and may change in the future releas
   to each column name, which is passed to it as a `String`. Note that `renamecols`
   does not affect `on` columns, whose names are always taken from the left
   data frame and left unchanged.
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -801,21 +858,22 @@ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
           on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
           indicator::Union{Nothing, Symbol, AbstractString} = nothing,
           validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
-          renamecols::Pair=identity => identity)
+          renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
     if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
         throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
                             " containing functions, strings, or `Symbol`s"))
     end
     return _join(df1, df2, on=on, kind=:right, makeunique=makeunique,
                  indicator=indicator, validate=validate,
-                 left_rename=first(renamecols), right_rename=last(renamecols))
+                 left_rename=first(renamecols), right_rename=last(renamecols),
+                 matchmissing=matchmissing)
 end
 
 """
-    outerjoin(df1, df2; on, makeunique = false, indicator = nothing,
-              validate = (false, false), renamecols = identity => identity)
+    outerjoin(df1, df2; on, makeunique=false, indicator=nothing, validate=(false, false),
+              renamecols=(identity => identity), matchmissing=:error)
     outerjoin(df1, df2, dfs...; on, makeunique = false,
-              validate = (false, false))
+              validate = (false, false), matchmissing=:error)
 
 Perform an outer join of two or more data frame objects and return a `DataFrame`
 containing the result. An outer join includes rows with keys that appear in any
@@ -855,10 +913,17 @@ The order of rows in the result is undefined and may change in the future releas
   to each column name, which is passed to it as a `String`. Note that `renamecols`
   does not affect `on` columns, whose names are always taken from the left
   data frame and left unchanged.
-
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
 
 All columns of the returned data table will support missing values.
 
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
+
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
 of the right data frame.
@@ -933,24 +998,27 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
           on::Union{<:OnType, AbstractVector} = Symbol[],
           makeunique::Bool=false, indicator::Union{Nothing, Symbol, AbstractString} = nothing,
           validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
-          renamecols::Pair=identity => identity)
+          renamecols::Pair=identity => identity, matchmissing::Symbol=:error)
     if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols)
         throw(ArgumentError("renamecols keyword argument must be a `Pair`" *
                             " containing functions, strings, or `Symbol`s"))
     end
     return _join(df1, df2, on=on, kind=:outer, makeunique=makeunique,
                  indicator=indicator, validate=validate,
-                 left_rename=first(renamecols), right_rename=last(renamecols))
+                 left_rename=first(renamecols), right_rename=last(renamecols),
+                 matchmissing=matchmissing)
 end
 
 outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...;
           on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
-          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) =
+          validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
+          matchmissing::Symbol=:error) =
     outerjoin(outerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate),
-              dfs..., on=on, makeunique=makeunique, validate=validate)
+              dfs..., on=on, makeunique=makeunique, validate=validate,
+              matchmissing=matchmissing)
 
 """
-    semijoin(df1, df2; on, makeunique = false, validate = (false, false))
+    semijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error)
 
 Perform a semi join of two data frame objects and return a `DataFrame`
 containing the result. A semi join returns the subset of rows of `df1` that
@@ -980,6 +1048,14 @@ The order of rows in the result is undefined and may change in the future releas
    Can be a tuple or a pair, with the first element indicating whether to
    run check for `df1` and the second element for `df2`.
    By default no check is performed.
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
+
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
 
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
@@ -1044,13 +1120,14 @@ julia> semijoin(name, job2, on = [:ID => :identifier])
 """
 semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
          on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
-         validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) =
+         validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
+         matchmissing::Symbol=:error) =
     _join(df1, df2, on=on, kind=:semi, makeunique=makeunique,
           indicator=nothing, validate=validate,
-          left_rename=identity, right_rename=identity)
+          left_rename=identity, right_rename=identity, matchmissing=matchmissing)
 
 """
-    antijoin(df1, df2; on, makeunique = false, validate = (false, false))
+    antijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error)
 
 Perform an anti join of two data frame objects and return a `DataFrame`
 containing the result. An anti join returns the subset of rows of `df1` that do
@@ -1076,6 +1153,14 @@ The order of rows in the result is undefined and may change in the future releas
    Can be a tuple or a pair, with the first element indicating whether to
    run check for `df1` and the second element for `df2`.
    By default no check is performed.
+- `matchmissing` : if equal to `:error` throw an error if `missing` is present
+  in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
+  matched (`isequal` is used for comparisons of rows for equality)
+
+It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
+imaginary part of the number. If you need to perform a join on such values use
+CategoricalArrays.jl and transform a column containing such values into a
+`CategoricalVector`.
 
 When merging `on` categorical columns that differ in the ordering of their
 levels, the ordering of the left data frame takes precedence over the ordering
@@ -1137,10 +1222,12 @@ julia> antijoin(name, job2, on = [:ID => :identifier])
 """
 antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame;
          on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false,
-         validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) =
+         validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false),
+         matchmissing::Symbol=:error) =
     _join(df1, df2, on=on, kind=:anti, makeunique=makeunique,
           indicator=nothing, validate=validate,
-          left_rename=identity, right_rename=identity)
+          left_rename=identity, right_rename=identity,
+          matchmissing=matchmissing)
 
 """
     crossjoin(df1, df2, dfs...; makeunique = false)