Merge 482df4e into 740978e

JuliaData · Dec 16, 2017 · d4624d1 · d4624d1
2 parents 740978e + 482df4e
commit d4624d1
Show file tree

Hide file tree

Showing 10 changed files with 197 additions and 67 deletions.
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -110,7 +110,7 @@ names!(df::AbstractDataFrame, vals)
 * `df` : the AbstractDataFrame
 * `vals` : column names, normally a Vector{Symbol} the same length as
   the number of columns in `df`
-* `allow_duplicates` : if `false` (the default), an error will be raised
+* `makeunique` : if `false` (the default), an error will be raised
   if duplicate names are found; if `true`, duplicate names will be suffixed
   with `_i` (`i` starting at 1 for the first duplicate).
 
@@ -125,12 +125,17 @@ names!(df::AbstractDataFrame, vals)
 df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
 names!(df, [:a, :b, :c])
 names!(df, [:a, :b, :a])  # throws ArgumentError
-names!(df, [:a, :b, :a], allow_duplicates=true)  # renames second :a to :a_1
+names!(df, [:a, :b, :a], makeunique=true)  # renames second :a to :a_1
 ```
 
 """
-function names!(df::AbstractDataFrame, vals; allow_duplicates=false)
-    names!(index(df), vals; allow_duplicates=allow_duplicates)
+# TODO: remove allow_duplicates after deprecation period
+function names!(df::AbstractDataFrame, vals; allow_duplicates=false, makeunique::Bool=false)
+    if allow_duplicates
+        Base.depwarn("Keyword allow_duplicates is deprecated. Use makeunique.", :names!)
+        makeunique = true
+    end
+    names!(index(df), vals, makeunique=makeunique)
     return df
 end
 
@@ -172,6 +177,9 @@ rename(f::Function, df::AbstractDataFrame)
 
 * `::AbstractDataFrame` : the updated result
 
+Renames are applied sequentially. Each new name must not already exist in the
+`DataFrame` at the moment its column is renamed.
+
 **Examples**
 
 ```julia
@@ -678,18 +686,25 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])
 
 # hcat's first argument must be an AbstractDataFrame
 # or AbstractVector if the second argument is AbstractDataFrame
-# Trailing arguments (currently) may also be vectors or scalars.
+# Trailing arguments (currently) may also be vectors.
 
 # hcat! is defined in DataFrames/DataFrames.jl
 # Its first argument (currently) must be a DataFrame.
 
 # catch-all to cover cases where indexing returns a DataFrame and copy doesn't
-Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
-Base.hcat(x, df::AbstractDataFrame) = hcat!(x, df[:, :])
-Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame) = hcat!(df1[:, :], df2)
 
-Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
-Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
+# TODO: after deprecation period change all to makeunique::Bool=false
+Base.hcat(df::AbstractDataFrame, x; makeunique::Bool=true) =
+    hcat!(df[:, :], x, makeunique=makeunique)
+Base.hcat(x, df::AbstractDataFrame; makeunique::Bool=true) =
+    hcat!(x, df[:, :], makeunique=makeunique)
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=true) =
+    hcat!(df1[:, :], df2, makeunique=makeunique)
+Base.hcat(df::AbstractDataFrame, x, y...; makeunique::Bool=true) =
+    hcat!(hcat(df, x, makeunique=makeunique), y..., makeunique=makeunique)
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
+          makeunique::Bool=true) =
+    hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)
 
 @generated function promote_col_type(cols::AbstractVector...)
     T = mapreduce(x -> Missings.T(eltype(x)), promote_type, cols)

diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl
@@ -307,5 +307,6 @@ DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
     append!(sink.columns[col], column)
 end
 
-
-Data.close!(df::DataFrameStream) = DataFrame(collect(Any, df.columns), Symbol.(df.header))
+# TODO: after deprecation period change all to makeunique::Bool=false
+Data.close!(df::DataFrameStream, makeunique::Bool=true) =
+    DataFrame(collect(Any, df.columns), Symbol.(df.header), makeunique=makeunique)
diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl
@@ -48,9 +48,12 @@ Base.length(x::RowIndexMap) = length(x.orig)
 
 # composes the joined data table using the maps between the left and right
 # table rows and the indices of rows in the result
+
+# TODO: after deprecation period change all to makeunique::Bool=false
 function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
                               left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
-                              right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
+                              right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap;
+                              makeunique::Bool=true)
     @assert length(left_ixs) == length(right_ixs)
     # compose left half of the result taking all left columns
     all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
@@ -95,7 +98,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
         copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
         permute!(cols[i+ncleft], right_perm)
     end
-    res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
+    res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique)
 
     if length(rightonly_ixs.join) > 0
         # some left rows are missings, so the values of the "on" columns
@@ -253,13 +256,14 @@ join(name, job2, on = :ID => :identifier)
 ```
 
 """
+# TODO: after deprecation period change all to makeunique::Bool=false
 function Base.join(df1::AbstractDataFrame,
                    df2::AbstractDataFrame;
                    on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
-                   kind::Symbol = :inner)
+                   kind::Symbol = :inner, makeunique::Bool=true)
     if kind == :cross
         (on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
-        return crossjoin(df1, df2)
+        return crossjoin(df1, df2, makeunique=makeunique)
     elseif on == Symbol[]
         throw(ArgumentError("Missing join argument 'on'."))
     end
@@ -269,19 +273,23 @@ function Base.join(df1::AbstractDataFrame,
     if kind == :inner
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, false, true, false)...)
+                                                            true, false, true, false)...,
+                                                            makeunique=makeunique)
     elseif kind == :left
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, true, true, false)...)
+                                                            true, true, true, false)...,
+                                                            makeunique=makeunique)
     elseif kind == :right
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on,
                                                             group_rows(joiner.dfl_on),
-                                                            true, true, true, false)[[3, 4, 1, 2]]...)
+                                                            true, true, true, false)[[3, 4, 1, 2]]...,
+                                                            makeunique=makeunique)
     elseif kind == :outer
         compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
                                                             group_rows(joiner.dfr_on),
-                                                            true, true, true, true)...)
+                                                            true, true, true, true)...,
+                                                            makeunique=makeunique)
     elseif kind == :semi
         # hash the right rows
         dfr_on_grp = group_rows(joiner.dfr_on)
@@ -315,10 +323,11 @@ function Base.join(df1::AbstractDataFrame,
     end
 end
 
-function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
+# TODO: after deprecation period change all to makeunique::Bool=false
+function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=true)
     r1, r2 = size(df1, 1), size(df2, 1)
+    colindex = merge(index(df1), index(df2), makeunique=makeunique)
     cols = Any[[repeat(c, inner=r2) for c in columns(df1)];
                [repeat(c, outer=r1) for c in columns(df2)]]
-    colindex = merge(index(df1), index(df2))
     DataFrame(cols, colindex)
 end
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -84,7 +84,7 @@ function stack(df::AbstractDataFrame, measure_vars::Vector{Int},
     DataFrame(Any[repeat(_names(df)[measure_vars], inner=nrow(df)),   # variable
                   vcat([df[c] for c in measure_vars]...),             # value
                   [repeat(df[c], outer=N) for c in id_vars]...],      # id_var columns
-              cnames)
+              cnames) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
 end
 function stack(df::AbstractDataFrame, measure_var::Int, id_var::Int;
                variable_name::Symbol=:variable, value_name::Symbol=:value)
@@ -254,7 +254,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
     copy!(col, levs)
     hadmissing && (col[end] = missing)
     df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
-    insert!(df2, 1, col, _names(df)[rowkey])
+    insert!(df2, 1, col, _names(df)[rowkey]) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
 end
 
 unstack(df::AbstractDataFrame, rowkey::ColumnIndex,
@@ -320,7 +320,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
         mask_filled[i, j] = true
     end
     df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
-    hcat(df1, df2)
+    hcat(df1, df2) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
 end
 
 unstack(df::AbstractDataFrame) = unstack(df, :id, :variable, :value)
@@ -526,7 +526,7 @@ function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int},
     DataFrame(Any[RepeatedVector(_names(df)[measure_vars], nrow(df), 1),   # variable
                   StackedVector(Any[df[:,c] for c in measure_vars]),     # value
                   [RepeatedVector(df[:,c], 1, N) for c in id_vars]...],     # id_var columns
-              cnames)
+              cnames) # duplicate names not allowed
 end
 function stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int;
                  variable_name::Symbol=:variable, value_name::Symbol=:value)

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -106,27 +106,34 @@ mutable struct DataFrame <: AbstractDataFrame
     end
 end
 
-function DataFrame(pairs::Pair{Symbol,<:Any}...)
+# TODO: after deprecation period change all to makeunique::Bool=false
+function DataFrame(pairs::Pair{Symbol,<:Any}...; makeunique::Bool=true)::DataFrame
     colnames = Symbol[k for (k,v) in pairs]
     columns = Any[v for (k,v) in pairs]
-    DataFrame(columns, Index(colnames))
+    DataFrame(columns, Index(colnames, makeunique=makeunique))
 end
 
+# TODO: after deprecation period change all to makeunique::Bool=false
 function DataFrame(; kwargs...)
     if isempty(kwargs)
         DataFrame(Any[], Index())
     else
-        DataFrame((k => v for (k,v) in kwargs)...)
+        DataFrame((k => v for (k,v) in kwargs)..., makeunique=true)::DataFrame
     end
 end
 
+# TODO: after deprecation period change all to makeunique::Bool=false
 function DataFrame(columns::AbstractVector,
-                   cnames::AbstractVector{Symbol} = gennames(length(columns)))
-    return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames)))
+                   cnames::AbstractVector{Symbol} = gennames(length(columns));
+                   makeunique::Bool=true)::DataFrame
+    return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames),
+                     makeunique=makeunique))
 end
 
 # Initialize an empty DataFrame with specific eltypes and names
-function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, nrows::Integer) where T<:Type
+# TODO: after deprecation period change all to makeunique::Bool=false
+function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
+                   nrows::Integer; makeunique::Bool=true)::DataFrame where T<:Type
     columns = Vector{Any}(length(column_eltypes))
     for (j, elty) in enumerate(column_eltypes)
         if elty >: Missing
@@ -143,13 +150,15 @@ function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Sym
             end
         end
     end
-    return DataFrame(columns, Index(convert(Vector{Symbol}, cnames)))
+    return DataFrame(columns, Index(convert(Vector{Symbol}, cnames), makeunique=makeunique))
 end
 
 # Initialize an empty DataFrame with specific eltypes and names
 # and whether a CategoricalArray should be created
+# TODO: after deprecation period change all to makeunique::Bool=false
 function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
-                   categorical::Vector{Bool}, nrows::Integer) where T<:Type
+                   categorical::Vector{Bool}, nrows::Integer;
+                   makeunique::Bool=true)::DataFrame where T<:Type
     # upcast Vector{DataType} -> Vector{Type} which can hold CategoricalValues
     updated_types = convert(Vector{Type}, column_eltypes)
     for i in eachindex(categorical)
@@ -160,7 +169,7 @@ function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Sym
             updated_types[i] = CategoricalValue{updated_types[i]}
         end
     end
-    return DataFrame(updated_types, cnames, nrows)
+    return DataFrame(updated_types, cnames, nrows, makeunique=makeunique)
 end
 
 # Initialize empty DataFrame objects of arbitrary size
@@ -596,10 +605,33 @@ Base.setindex!(df::DataFrame, x::Void, col_ind::Int) = delete!(df, col_ind)
 
 Base.empty!(df::DataFrame) = (empty!(df.columns); empty!(index(df)); df)
 
-function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol)
+# TODO: add docstring of makeunique after 0.4 release of DataFrames
+function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol;
+                      makeunique::Bool=false)
     0 < col_ind <= ncol(df) + 1 || throw(BoundsError())
     size(df, 1) == length(item) || size(df, 1) == 0 || error("number of rows does not match")
 
+    if haskey(df, name)
+        if makeunique
+            k = 1
+            while true
+                # Only the new column's name is made unique here; duplicate
+                # names that df already had before the insert are left as-is.
+                nn = Symbol("$(name)_$k")
+                if !haskey(df, nn)
+                    name = nn
+                    break
+                end
+                k += 1
+            end
+        else
+            # TODO: after 0.4 release of DataFrames remove depwarn and uncomment ArgumentError below
+            Base.depwarn("Inserting duplicate column name is deprecated.", :insert!)
+            # msg = """Duplicate variable name $(name).
+            #      Pass makeunique=true to make it unique using a suffix automatically."""
+            # throw(ArgumentError(msg))
+        end
+    end
     insert!(index(df), col_ind, name)
     insert!(df.columns, col_ind, item)
     df
@@ -609,6 +641,35 @@ function Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol)
     insert!(df, col_ind, upgrade_scalar(df, item), name)
 end
 
+"""
+Merge `DataFrame`s
+
+
+```julia
+merge!(df::DataFrame, others::AbstractDataFrame...)
+```
+
+For every column `c` with name `n` in `others`, sequentially perform `df[n] = c`.
+This behavior is identical to how `merge!` works for any `Associative` type.
+Use `join` if you want to join two `DataFrame`s.
+
+**Arguments**
+
+* `df` : the DataFrame to merge into
+* `others` : `AbstractDataFrame`s to be merged into `df`
+
+**Result**
+
+* `::DataFrame` : the updated result. Columns with duplicate names are overwritten.
+
+**Examples**
+
+```julia
+df = DataFrame(id = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
+df2 = DataFrame(id = 11:20, z = rand(10))
+merge!(df, df2)  # column z is added, column id is overwritten
+```
+"""
 function Base.merge!(df::DataFrame, others::AbstractDataFrame...)
     for other in others
         for n in _names(other)
@@ -698,36 +759,48 @@ end
 ##############################################################################
 
 # hcat! for 2 arguments, only a vector or a data frame is allowed
-function hcat!(df1::DataFrame, df2::AbstractDataFrame)
-    u = add_names(index(df1), index(df2))
+# TODO: after deprecation period change all to makeunique::Bool=false
+function hcat!(df1::DataFrame, df2::AbstractDataFrame; makeunique::Bool=true)
+    u = add_names(index(df1), index(df2), makeunique=makeunique)
     for i in 1:length(u)
         df1[u[i]] = df2[i]
     end
     return df1
 end
 
 # definition required to avoid hcat! ambiguity
-function hcat!(df1::DataFrame, df2::DataFrame)
-    invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2)
+# TODO: after deprecation period change all to makeunique::Bool=false
+function hcat!(df1::DataFrame, df2::DataFrame; makeunique::Bool=true)
+    invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2, makeunique=makeunique)
 end
 
-hcat!(df::DataFrame, x::AbstractVector) = hcat!(df, DataFrame(Any[x]))
-hcat!(x::AbstractVector, df::DataFrame) = hcat!(DataFrame(Any[x]), df)
-function hcat!(x, df::DataFrame)
+# TODO: after deprecation period change all to makeunique::Bool=false
+hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=true) =
+    hcat!(df, DataFrame(Any[x]), makeunique=makeunique)
+hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=true) =
+    hcat!(DataFrame(Any[x]), df, makeunique=makeunique)
+function hcat!(x, df::DataFrame; makeunique::Bool=true)
     throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
 end
-function hcat!(df::DataFrame, x)
+function hcat!(df::DataFrame, x; makeunique::Bool=true)
     throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
 end
 
 # hcat! for 1-n arguments
-hcat!(df::DataFrame) = df
-hcat!(a::DataFrame, b, c...) = hcat!(hcat!(a, b), c...)
+# TODO: after deprecation period change all to makeunique::Bool=false
+hcat!(df::DataFrame; makeunique::Bool=true) = df
+hcat!(a::DataFrame, b, c...; makeunique::Bool=true) =
+    hcat!(hcat!(a, b, makeunique=makeunique), c..., makeunique=makeunique)
 
 # hcat
-Base.hcat(df::DataFrame, x) = hcat!(copy(df), x)
-Base.hcat(df1::DataFrame, df2::AbstractDataFrame) = hcat!(copy(df1), df2)
-Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
+# TODO: after deprecation period change all to makeunique::Bool=false
+Base.hcat(df::DataFrame, x; makeunique::Bool=true) =
+    hcat!(copy(df), x, makeunique=makeunique)
+Base.hcat(df1::DataFrame, df2::AbstractDataFrame; makeunique::Bool=true) =
+    hcat!(copy(df1), df2, makeunique=makeunique)
+Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
+          makeunique::Bool=true) =
+    hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)
 
 ##############################################################################
 ##