Skip to content

Commit

Permalink
Merge 482df4e into 740978e
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Dec 16, 2017
2 parents 740978e + 482df4e commit d4624d1
Show file tree
Hide file tree
Showing 10 changed files with 197 additions and 67 deletions.
35 changes: 25 additions & 10 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ names!(df::AbstractDataFrame, vals)
* `df` : the AbstractDataFrame
* `vals` : column names, normally a Vector{Symbol} the same length as
the number of columns in `df`
* `allow_duplicates` : if `false` (the default), an error will be raised
* `makeunique` : if `false` (the default), an error will be raised
if duplicate names are found; if `true`, duplicate names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).
Expand All @@ -125,12 +125,17 @@ names!(df::AbstractDataFrame, vals)
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
names!(df, [:a, :b, :c])
names!(df, [:a, :b, :a]) # throws ArgumentError
names!(df, [:a, :b, :a], allow_duplicates=true) # renames second :a to :a_1
names!(df, [:a, :b, :a], makeunique=true) # renames second :a to :a_1
```
"""
# Rename the columns of `df` to `vals` by delegating to `names!` on its column index.
# NOTE(review): this is a rendered diff hunk with no +/- markers. The first
# signature/body pair appears to be the removed version and the lines after the
# TODO its replacement; confirm against the repository before treating both as live.
function names!(df::AbstractDataFrame, vals; allow_duplicates=false)
names!(index(df), vals; allow_duplicates=allow_duplicates)
# TODO: remove allow_duplicates after deprecation period
function names!(df::AbstractDataFrame, vals; allow_duplicates=false, makeunique::Bool=false)
# Legacy keyword: emit a deprecation warning and treat it as `makeunique=true`.
if allow_duplicates
Base.depwarn("Keyword allow_duplicates is deprecated. Use makeunique.", :names!)
makeunique = true
end
names!(index(df), vals, makeunique=makeunique)
# Return the data frame itself rather than the result of the index-level call.
return df
end

Expand Down Expand Up @@ -172,6 +177,9 @@ rename(f::Function, df::AbstractDataFrame)
* `::AbstractDataFrame` : the updated result
New names are processed sequentially. A new name must not exist in the `DataFrame`
at the moment an attempt to rename a column is performed.
**Examples**
```julia
Expand Down Expand Up @@ -678,18 +686,25 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])

# hcat's first argument must be an AbstractDataFrame
# or AbstractVector if the second argument is AbstractDataFrame
# Trailing arguments (currently) may also be vectors or scalars.
# Trailing arguments (currently) may also be vectors.

# hcat! is defined in DataFrames/DataFrames.jl
# Its first argument (currently) must be a DataFrame.

# catch-all to cover cases where indexing returns a DataFrame and copy doesn't
# Generic `hcat` fallbacks for any AbstractDataFrame: the data frame argument is
# taken as `df[:, :]` and the actual concatenation is delegated to `hcat!`.
# NOTE(review): rendered diff hunk. The methods without `makeunique` appear to be
# the removed versions; the keyword-taking methods after the TODO replace them.
Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
Base.hcat(x, df::AbstractDataFrame) = hcat!(x, df[:, :])
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame) = hcat!(df1[:, :], df2)

# Variadic forms: concatenate the first two arguments, then append the rest.
Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
# TODO: after deprecation period change all to makeunique::Bool=false
# Each method below forwards `makeunique` unchanged to every `hcat`/`hcat!` call it makes.
Base.hcat(df::AbstractDataFrame, x; makeunique::Bool=true) =
hcat!(df[:, :], x, makeunique=makeunique)
Base.hcat(x, df::AbstractDataFrame; makeunique::Bool=true) =
hcat!(x, df[:, :], makeunique=makeunique)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=true) =
hcat!(df1[:, :], df2, makeunique=makeunique)
# Variadic forms: concatenate the first two arguments, then append the rest in place.
Base.hcat(df::AbstractDataFrame, x, y...; makeunique::Bool=true) =
hcat!(hcat(df, x, makeunique=makeunique), y..., makeunique=makeunique)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
makeunique::Bool=true) =
hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)

@generated function promote_col_type(cols::AbstractVector...)
T = mapreduce(x -> Missings.T(eltype(x)), promote_type, cols)
Expand Down
5 changes: 3 additions & 2 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,5 +307,6 @@ DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
append!(sink.columns[col], column)
end


# Finish a DataFrameStream by building a DataFrame from its columns (collected
# into a Vector{Any}) and its header entries converted to Symbols.
# NOTE(review): rendered diff hunk. The one-argument method appears to be the
# removed version and the method after the TODO its replacement.
Data.close!(df::DataFrameStream) = DataFrame(collect(Any, df.columns), Symbol.(df.header))
# TODO: after deprecation period change all to makeunique::Bool=false
# NOTE(review): `makeunique` is an optional POSITIONAL argument here (no `;`),
# unlike the keyword form used by the other methods touched in this change.
# Confirm this is intended.
Data.close!(df::DataFrameStream, makeunique::Bool=true) =
DataFrame(collect(Any, df.columns), Symbol.(df.header), makeunique=makeunique)
29 changes: 19 additions & 10 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,12 @@ Base.length(x::RowIndexMap) = length(x.orig)

# composes the joined data table using the maps between the left and right
# table rows and the indices of rows in the result

# TODO: after deprecation period change all to makeunique::Bool=false
function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap;
makeunique::Bool=true)
@assert length(left_ixs) == length(right_ixs)
# compose left half of the result taking all left columns
all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
Expand Down Expand Up @@ -95,7 +98,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
copy!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
end
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)))
res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), makeunique=makeunique)

if length(rightonly_ixs.join) > 0
# some left rows are missings, so the values of the "on" columns
Expand Down Expand Up @@ -253,13 +256,14 @@ join(name, job2, on = :ID => :identifier)
```
"""
# TODO: after deprecation period change all to makeunique::Bool=false
function Base.join(df1::AbstractDataFrame,
df2::AbstractDataFrame;
on::Union{<:OnType, AbstractVector{<:OnType}} = Symbol[],
kind::Symbol = :inner)
kind::Symbol = :inner, makeunique::Bool=true)
if kind == :cross
(on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'."))
return crossjoin(df1, df2)
return crossjoin(df1, df2, makeunique=makeunique)
elseif on == Symbol[]
throw(ArgumentError("Missing join argument 'on'."))
end
Expand All @@ -269,19 +273,23 @@ function Base.join(df1::AbstractDataFrame,
if kind == :inner
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, false, true, false)...)
true, false, true, false)...,
makeunique=makeunique)
elseif kind == :left
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, true, true, false)...)
true, true, true, false)...,
makeunique=makeunique)
elseif kind == :right
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on,
group_rows(joiner.dfl_on),
true, true, true, false)[[3, 4, 1, 2]]...)
true, true, true, false)[[3, 4, 1, 2]]...,
makeunique=makeunique)
elseif kind == :outer
compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on,
group_rows(joiner.dfr_on),
true, true, true, true)...)
true, true, true, true)...,
makeunique=makeunique)
elseif kind == :semi
# hash the right rows
dfr_on_grp = group_rows(joiner.dfr_on)
Expand Down Expand Up @@ -315,10 +323,11 @@ function Base.join(df1::AbstractDataFrame,
end
end

# Cross join: pair every row of `df1` with every row of `df2`.
# NOTE(review): rendered diff hunk. The first signature and the second
# `colindex = merge(...)` line appear to be the removed versions.
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
# TODO: after deprecation period change all to makeunique::Bool=false
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=true)
r1, r2 = size(df1, 1), size(df2, 1)
# Merge the two column indexes, passing `makeunique` through to the index merge.
# Presumably done before building the columns so a name clash surfaces early; verify.
colindex = merge(index(df1), index(df2), makeunique=makeunique)
# Left columns repeat each element `r2` times in place (`inner`); right columns
# are tiled `r1` times (`outer`), so both halves have r1 * r2 rows.
cols = Any[[repeat(c, inner=r2) for c in columns(df1)];
[repeat(c, outer=r1) for c in columns(df2)]]
colindex = merge(index(df1), index(df2))
DataFrame(cols, colindex)
end
8 changes: 4 additions & 4 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ function stack(df::AbstractDataFrame, measure_vars::Vector{Int},
DataFrame(Any[repeat(_names(df)[measure_vars], inner=nrow(df)), # variable
vcat([df[c] for c in measure_vars]...), # value
[repeat(df[c], outer=N) for c in id_vars]...], # id_var columns
cnames)
cnames) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
end
function stack(df::AbstractDataFrame, measure_var::Int, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
Expand Down Expand Up @@ -254,7 +254,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
copy!(col, levs)
hadmissing && (col[end] = missing)
df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
insert!(df2, 1, col, _names(df)[rowkey])
insert!(df2, 1, col, _names(df)[rowkey]) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
end

unstack(df::AbstractDataFrame, rowkey::ColumnIndex,
Expand Down Expand Up @@ -320,7 +320,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
mask_filled[i, j] = true
end
df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
hcat(df1, df2)
hcat(df1, df2) # TODO update this comment later: now it allows duplicate names, but after deprecation it will not
end

unstack(df::AbstractDataFrame) = unstack(df, :id, :variable, :value)
Expand Down Expand Up @@ -526,7 +526,7 @@ function stackdf(df::AbstractDataFrame, measure_vars::Vector{Int},
DataFrame(Any[RepeatedVector(_names(df)[measure_vars], nrow(df), 1), # variable
StackedVector(Any[df[:,c] for c in measure_vars]), # value
[RepeatedVector(df[:,c], 1, N) for c in id_vars]...], # id_var columns
cnames)
cnames) # duplicate names not allowed
end
function stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
Expand Down
119 changes: 96 additions & 23 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,27 +106,34 @@ mutable struct DataFrame <: AbstractDataFrame
end
end

# Build a DataFrame from `name => column` pairs: the pair keys become the column
# names and the pair values the columns, in the order given.
# NOTE(review): rendered diff hunk. The first signature and the `Index(colnames)`
# call without `makeunique` appear to be the removed versions.
function DataFrame(pairs::Pair{Symbol,<:Any}...)
# TODO: after deprecation period change all to makeunique::Bool=false
function DataFrame(pairs::Pair{Symbol,<:Any}...; makeunique::Bool=true)::DataFrame
colnames = Symbol[k for (k,v) in pairs]
columns = Any[v for (k,v) in pairs]
DataFrame(columns, Index(colnames))
# `makeunique` is handed to the Index constructor, which receives the column names.
DataFrame(columns, Index(colnames, makeunique=makeunique))
end

# TODO: after deprecation period change all to makeunique::Bool=false
# Keyword constructor: each keyword name becomes a column name and its value the column.
# With no keywords, an empty DataFrame with an empty Index is returned.
function DataFrame(; kwargs...)
if isempty(kwargs)
DataFrame(Any[], Index())
else
# NOTE(review): rendered diff hunk. The call without `makeunique` appears to be
# the removed line. In the new line `makeunique` is fixed to `true` rather than
# exposed, presumably because every keyword of this method is taken as a column
# name; confirm.
DataFrame((k => v for (k,v) in kwargs)...)
DataFrame((k => v for (k,v) in kwargs)..., makeunique=true)::DataFrame
end
end

# TODO: after deprecation period change all to makeunique::Bool=false
# Build a DataFrame from a vector of columns and a vector of column names.
# Names default to `gennames(length(columns))`. Columns are converted to
# Vector{Any} and names to Vector{Symbol} before the Index is built.
# NOTE(review): rendered diff hunk. The first default-argument line and the first
# `return` appear to be the removed versions.
function DataFrame(columns::AbstractVector,
cnames::AbstractVector{Symbol} = gennames(length(columns)))
return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames)))
cnames::AbstractVector{Symbol} = gennames(length(columns));
makeunique::Bool=true)::DataFrame
# `makeunique` is passed to the Index constructor, not to the inner DataFrame call.
return DataFrame(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames),
makeunique=makeunique))
end

# Initialize an empty DataFrame with specific eltypes and names
function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, nrows::Integer) where T<:Type
# TODO: after deprecation period change all to makeunique::Bool=false
function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
nrows::Integer; makeunique::Bool=true)::DataFrame where T<:Type
columns = Vector{Any}(length(column_eltypes))
for (j, elty) in enumerate(column_eltypes)
if elty >: Missing
Expand All @@ -143,13 +150,15 @@ function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Sym
end
end
end
return DataFrame(columns, Index(convert(Vector{Symbol}, cnames)))
return DataFrame(columns, Index(convert(Vector{Symbol}, cnames), makeunique=makeunique))
end

# Initialize an empty DataFrame with specific eltypes and names
# and whether a CategoricalArray should be created
# TODO: after deprecation period change all to makeunique::Bool=false
function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol},
categorical::Vector{Bool}, nrows::Integer) where T<:Type
categorical::Vector{Bool}, nrows::Integer;
makeunique::Bool=true)::DataFrame where T<:Type
# upcast Vector{DataType} -> Vector{Type} which can hold CategoricalValues
updated_types = convert(Vector{Type}, column_eltypes)
for i in eachindex(categorical)
Expand All @@ -160,7 +169,7 @@ function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Sym
updated_types[i] = CategoricalValue{updated_types[i]}
end
end
return DataFrame(updated_types, cnames, nrows)
return DataFrame(updated_types, cnames, nrows, makeunique=makeunique)
end

# Initialize empty DataFrame objects of arbitrary size
Expand Down Expand Up @@ -596,10 +605,33 @@ Base.setindex!(df::DataFrame, x::Void, col_ind::Int) = delete!(df, col_ind)

Base.empty!(df::DataFrame) = (empty!(df.columns); empty!(index(df)); df)

function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol)
# TODO: add docstring of makeunique after 0.4 release of DataFrames
function Base.insert!(df::DataFrame, col_ind::Int, item::AbstractVector, name::Symbol;
makeunique::Bool=false)
0 < col_ind <= ncol(df) + 1 || throw(BoundsError())
size(df, 1) == length(item) || size(df, 1) == 0 || error("number of rows does not match")

if haskey(df, name)
if makeunique
k = 1
while true
# we only make sure that new column name is unique
# if df originally had duplicates in names we do not fix it
nn = Symbol("$(name)_$k")
if !haskey(df, nn)
name = nn
break
end
k += 1
end
else
# TODO: after 0.4 release of DataFrames remove depwarn and uncomment ArgumentError below
Base.depwarn("Inserting duplicate column name is deprecated.", :insert!)
# msg = """Duplicate variable name $(name).
# Pass makeunique=true to make it unique using a suffix automatically."""
# throw(ArgumentError(msg))
end
end
insert!(index(df), col_ind, name)
insert!(df.columns, col_ind, item)
df
Expand All @@ -609,6 +641,35 @@ function Base.insert!(df::DataFrame, col_ind::Int, item, name::Symbol)
insert!(df, col_ind, upgrade_scalar(df, item), name)
end

"""
Merge `DataFrame`s
```julia
merge!(df::DataFrame, others::AbstractDataFrame...)
```
For every column `c` with name `n` in `others` sequentially performs `df[n] = c`.
This behavior is identical to how `merge!` works for any `Associative` type.
Use `join` if you want to join two `DataFrame`s.
**Arguments**
* `df` : the DataFrame to merge into
* `others` : `AbstractDataFrame`s to be merged into `df`
**Result**
* `::DataFrame` : the updated result. Columns with duplicate names are overwritten.
**Examples**
```julia
df = DataFrame(id = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
df2 = DataFrame(id = 11:20, z = rand(10))
merge!(df, df2) # column z is added, column id is overwritten
```
"""
function Base.merge!(df::DataFrame, others::AbstractDataFrame...)
for other in others
for n in _names(other)
Expand Down Expand Up @@ -698,36 +759,48 @@ end
##############################################################################

# hcat! for 2 arguments, only a vector or a data frame is allowed
# Append the columns of `df2` to `df1` in place and return `df1`.
# NOTE(review): rendered diff hunk. The first signature and the `add_names` call
# without `makeunique` appear to be the removed versions.
function hcat!(df1::DataFrame, df2::AbstractDataFrame)
u = add_names(index(df1), index(df2))
# TODO: after deprecation period change all to makeunique::Bool=false
function hcat!(df1::DataFrame, df2::AbstractDataFrame; makeunique::Bool=true)
# `u[i]` is the name under which the i-th column of `df2` is stored in `df1`.
u = add_names(index(df1), index(df2), makeunique=makeunique)
for i in 1:length(u)
df1[u[i]] = df2[i]
end
return df1
end

# definition required to avoid hcat! ambiguity
# Forward explicitly to the (DataFrame, AbstractDataFrame) method via `invoke`.
# NOTE(review): rendered diff hunk. The first signature and first `invoke` line
# appear to be the removed versions.
function hcat!(df1::DataFrame, df2::DataFrame)
invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2)
# TODO: after deprecation period change all to makeunique::Bool=false
function hcat!(df1::DataFrame, df2::DataFrame; makeunique::Bool=true)
invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2, makeunique=makeunique)
end

# Vector arguments are wrapped in a one-column DataFrame and handled by the
# data-frame methods; any other argument type is rejected with an ArgumentError.
# NOTE(review): rendered diff hunk. The signatures without `makeunique` appear to
# be the removed versions of the ones that follow them.
hcat!(df::DataFrame, x::AbstractVector) = hcat!(df, DataFrame(Any[x]))
hcat!(x::AbstractVector, df::DataFrame) = hcat!(DataFrame(Any[x]), df)
function hcat!(x, df::DataFrame)
# TODO: after deprecation period change all to makeunique::Bool=false
hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=true) =
hcat!(df, DataFrame(Any[x]), makeunique=makeunique)
hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=true) =
hcat!(DataFrame(Any[x]), df, makeunique=makeunique)
# Rejecting fallbacks: `makeunique` is accepted but not used by these two methods.
function hcat!(x, df::DataFrame; makeunique::Bool=true)
throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
end
function hcat!(df::DataFrame, x)
function hcat!(df::DataFrame, x; makeunique::Bool=true)
throw(ArgumentError("x must be AbstractVector or AbstractDataFrame"))
end

# hcat! for 1-n arguments
# One argument: return `df` unchanged. Three or more: combine the first two,
# then fold in the remaining arguments.
# NOTE(review): rendered diff hunk. The first two methods appear to be the
# removed versions of the keyword-taking methods after the TODO.
hcat!(df::DataFrame) = df
hcat!(a::DataFrame, b, c...) = hcat!(hcat!(a, b), c...)
# TODO: after deprecation period change all to makeunique::Bool=false
# `makeunique` is accepted but unused in the one-argument method.
hcat!(df::DataFrame; makeunique::Bool=true) = df
hcat!(a::DataFrame, b, c...; makeunique::Bool=true) =
hcat!(hcat!(a, b, makeunique=makeunique), c..., makeunique=makeunique)

# hcat
# Non-mutating `hcat` for DataFrame: copy the first argument with `copy`, then
# delegate to `hcat!`.
# NOTE(review): rendered diff hunk. The first three methods appear to be the
# removed versions of the keyword-taking methods after the TODO.
Base.hcat(df::DataFrame, x) = hcat!(copy(df), x)
Base.hcat(df1::DataFrame, df2::AbstractDataFrame) = hcat!(copy(df1), df2)
Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
# TODO: after deprecation period change all to makeunique::Bool=false
# Each method below forwards `makeunique` unchanged to every `hcat`/`hcat!` call it makes.
Base.hcat(df::DataFrame, x; makeunique::Bool=true) =
hcat!(copy(df), x, makeunique=makeunique)
Base.hcat(df1::DataFrame, df2::AbstractDataFrame; makeunique::Bool=true) =
hcat!(copy(df1), df2, makeunique=makeunique)
# Variadic form: concatenate the first two, then append the remaining data frames in place.
Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...;
makeunique::Bool=true) =
hcat!(hcat(df1, df2, makeunique=makeunique), dfn..., makeunique=makeunique)

##############################################################################
##
Expand Down

0 comments on commit d4624d1

Please sign in to comment.