Merge f215c69 into b71841b

JuliaData · Dec 14, 2017 · 533497e · 533497e
2 parents b71841b + f215c69
commit 533497e
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 25 deletions.
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -129,8 +129,12 @@ names!(df, [:a, :b, :a], allow_duplicates=true)  # renames second :a to :a_1
 ```
 
 """
-function names!(df::AbstractDataFrame, vals; allow_duplicates=false)
-    names!(index(df), vals; allow_duplicates=allow_duplicates)
+function names!(df::AbstractDataFrame, vals; allow_duplicates=false, make_unique=false)
+    if allow_duplicates
+        Base.depwarn("Keyword allow_duplicates is deprecated. Use make_unique.", :names!)
+        make_unique = allow_duplicates
+    end
+    names!(index(df), vals; make_unique=make_unique)
     return df
 end
 

diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -596,10 +596,34 @@ Base.setindex!(df::DataFrame, x::Void, col_ind::Int) = delete!(df, col_ind)
 
 Base.empty!(df::DataFrame) = (empty!(df.columns); empty!(index(df)); df)
 
-function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol)
+# TODO: add docstring of make_unique after 0.4 release of DataFrames
+function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol;
+                      make_unique::Bool=false)
     0 < col_ind <= ncol(df) + 1 || throw(BoundsError())
     size(df, 1) == length(item) || size(df, 1) == 0 || error("number of rows does not match")
 
+    if name in _names(df)
+        if make_unique
+            k = 1  # numeric suffix to try next: name_1, name_2, ...
+            while true
+                # we only make sure that new column name is unique
+                # if df originally had duplicates in names we do not fix it
+                nn = Symbol("$(name)_$k")
+                if !(nn in _names(df))
+                    name = nn
+                    break
+                end
+                k += 1
+            end
+        else
+            # TODO: after 0.4 release of DataFrames remove depwarn and uncomment ArgumentError below
+            Base.depwarn("Inserting duplicate column name is deprecated.", :insert!)  # symbol must name this function
+            # msg = """Duplicate variable name $(name).
+            #      Pass make_unique=true to make it unique using a suffix automatically."""
+            # throw(ArgumentError(msg))
+        end
+    end
+
     insert!(index(df), col_ind, name)
     insert!(df.columns, col_ind, item)
     df
@@ -609,6 +633,31 @@ function Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol)
     insert!(df, col_ind, upgrade_scalar(df, item), name)
 end
 
+"""
+Merge `DataFrame`s
+
+
+```julia
+merge!(df::DataFrame, others::AbstractDataFrame...)
+```
+
+**Arguments**
+
+* `df` : the DataFrame to merge into
+* `others` : `AbstractDataFrame`s to be merged into `df`
+
+**Result**
+
+* `::DataFrame` : the updated result. Columns with duplicate names are overwritten.
+
+**Examples**
+
+```julia
+df = DataFrame(id = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
+df2 = DataFrame(id = 11:20, z = rand(10))
+merge!(df, df2)  # column z is added, column id is overwritten
+```
+"""
 function Base.merge!(df::DataFrame, others::AbstractDataFrame...)
     for other in others
         for n in _names(other)
@@ -698,7 +747,16 @@ end
 ##############################################################################
 
 # hcat! for 2 arguments, only a vector or a data frame is allowed
-function hcat!(df1::DataFrame, df2::AbstractDataFrame)
+function hcat!(df1::DataFrame, df2::AbstractDataFrame; make_unique::Bool=false)
+    common = intersect(_names(df1), _names(df2))
+    if !make_unique && length(common) > 0
+        # TODO: after 0.4 release of DataFrames remove depwarn and uncomment ArgumentError below
+        Base.depwarn("Inserting duplicate column names is deprecated.", :hcat!)
+        # msg = """Duplicate variable names $(common).
+        #      Pass make_unique=true to make them unique using a suffix automatically."""
+        # throw(ArgumentError(msg))
+
+    end
     u = add_names(index(df1), index(df2))
     for i in 1:length(u)
         df1[u[i]] = df2[i]
@@ -707,27 +765,35 @@ function hcat!(df1::DataFrame, df2::AbstractDataFrame)
 end
 
 # definition required to avoid hcat! ambiguity
-function hcat!(df1::DataFrame, df2::DataFrame)
-    invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2)
+function hcat!(df1::DataFrame, df2::DataFrame; make_unique::Bool=false)
+    invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2, make_unique=make_unique)
 end
 
-hcat!(df::DataFrame, x::AbstractVector) = hcat!(df, DataFrame(Any[x]))
-hcat!(x::AbstractVector, df::DataFrame) = hcat!(DataFrame(Any[x]), df)
-function hcat!(x, df::DataFrame)
+hcat!(df::DataFrame, x::AbstractVector; make_unique::Bool=false) =
+    hcat!(df, DataFrame(Any[x]), make_unique=make_unique)
+hcat!(x::AbstractVector, df::DataFrame; make_unique::Bool=false) =
+    hcat!(DataFrame(Any[x]), df, make_unique=make_unique)
+function hcat!(x, df::DataFrame; make_unique::Bool=false)
     throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
 end
-function hcat!(df::DataFrame, x)
+function hcat!(df::DataFrame, x; make_unique::Bool=false)
     throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
 end
 
 # hcat! for 1-n arguments
-hcat!(df::DataFrame) = df
-hcat!(a::DataFrame, b, c...) = hcat!(hcat!(a, b), c...)
+hcat!(df::DataFrame; make_unique::Bool=false) = df
+hcat!(a::DataFrame, b, c...; make_unique::Bool=false) =
+    hcat!(hcat!(a, b, make_unique=make_unique), c..., make_unique=make_unique)
 
 # hcat
-Base.hcat(df::DataFrame, x) = hcat!(copy(df), x)
-Base.hcat(df1::DataFrame, df2::AbstractDataFrame) = hcat!(copy(df1), df2)
-Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
+# TODO: add docstring of make_unique after 0.4 release of DataFrames
+Base.hcat(df::DataFrame, x; make_unique::Bool=false) =
+    hcat!(copy(df), x, make_unique=make_unique)
+Base.hcat(df1::DataFrame, df2::AbstractDataFrame; make_unique::Bool=false) =
+    hcat!(copy(df1), df2, make_unique=make_unique)
+Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
+          make_unique::Bool=false) =
+    hcat!(hcat(df1, df2, make_unique=make_unique), dfn..., make_unique=make_unique)
 
 ##############################################################################
 ##

diff --git a/src/other/index.jl b/src/other/index.jl
@@ -7,8 +7,8 @@ mutable struct Index <: AbstractIndex   # an OrderedDict would be nice here...
     lookup::Dict{Symbol, Int}      # name => names array position
     names::Vector{Symbol}
 end
-function Index(names::Vector{Symbol}; allow_duplicates=true)
-    u = make_unique(names, allow_duplicates=allow_duplicates)
+function Index(names::Vector{Symbol}; make_unique=false)
+    u = _make_unique(names, make_unique=make_unique)
     lookup = Dict{Symbol, Int}(zip(u, 1:length(u)))
     Index(lookup, u)
 end
@@ -22,11 +22,16 @@ Base.isequal(x::Index, y::Index) = isequal(x.lookup, y.lookup) && isequal(x.name
 # Imported in DataFrames.jl for compatibility across Julia 0.4 and 0.5
 Base.:(==)(x::Index, y::Index) = isequal(x, y)
 
-function names!(x::Index, nms::Vector{Symbol}; allow_duplicates=false)
+# TODO: after DataFrames release 0.4 change docstring of names!
+function names!(x::Index, nms::Vector{Symbol}; allow_duplicates=false, make_unique=false)
+    if allow_duplicates
+        Base.depwarn("Keyword allow_duplicates is deprecated. Use make_unique.", :names!)
+        make_unique = allow_duplicates
+    end
     if length(nms) != length(x)
         throw(ArgumentError("Length of nms doesn't match length of x."))
     end
-    newindex = Index(nms, allow_duplicates=allow_duplicates)
+    newindex = Index(nms, make_unique=make_unique)
     x.names = newindex.names
     x.lookup = newindex.lookup
     return x

diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -49,7 +49,7 @@ function makeidentifier(s::AbstractString)
     return String(take!(res))
 end
 
-function make_unique(names::Vector{Symbol}; allow_duplicates=true)
+function _make_unique(names::Vector{Symbol}; make_unique=true)
     seen = Set{Symbol}()
     names = copy(names)
     dups = Int[]
@@ -58,10 +58,10 @@ function make_unique(names::Vector{Symbol}; allow_duplicates=true)
         in(name, seen) ? push!(dups, i) : push!(seen, name)
     end
 
-    if !allow_duplicates && length(dups) > 0
+    if !make_unique && length(dups) > 0
         d = unique(names[dups])
         msg = """Duplicate variable names: $d.
-                 Pass allow_duplicates=true to make them unique using a suffix automatically."""
+                 Pass make_unique=true to make them unique using a suffix automatically."""
         throw(ArgumentError(msg))
     end
 

diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -109,6 +109,14 @@ module TestDataFrame
     @test df[:b] == [3.0, 4.0]
     @test df[:newcol] == ["a", "b"]
 
+    @test insert!(df, 1, ["a1", "b1"], :newcol, make_unique=true) == df
+    @test names(df) == [:newcol_1, :newcol, :a, :b]
+    @test df[:a] == [1, 2]
+    @test df[:b] == [3.0, 4.0]
+    @test df[:newcol] == ["a", "b"]
+    @test df[:newcol_1] == ["a1", "b1"]
+
+
     df = DataFrame(a=[1, 2], b=[3.0, 4.0])
     df2 = DataFrame(b=["a", "b"], c=[:c, :d])
     @test merge!(df, df2) == df

diff --git a/test/utils.jl b/test/utils.jl
@@ -9,9 +9,9 @@ module TestUtils
     @test identifier("begin") == :_begin
     @test identifier("end") == :_end
 
-    @test DataFrames.make_unique([:x, :x, :x_1, :x2]) == [:x, :x_2, :x_1, :x2]
-    @test_throws ArgumentError DataFrames.make_unique([:x, :x, :x_1, :x2], allow_duplicates=false)
-    @test DataFrames.make_unique([:x, :x_1, :x2], allow_duplicates=false) == [:x, :x_1, :x2]
+    @test DataFrames._make_unique([:x, :x, :x_1, :x2]) == [:x, :x_2, :x_1, :x2]  # default make_unique=true
+    @test_throws ArgumentError DataFrames._make_unique([:x, :x, :x_1, :x2], make_unique=false)
+    @test DataFrames._make_unique([:x, :x_1, :x2], make_unique=false) == [:x, :x_1, :x2]
 
     # Check that reserved words are up to date