From 06dc914bf5ebc03bebb007630cf2e7711ff75319 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 13 Mar 2017 22:54:45 -0700 Subject: [PATCH] remove RepeatedVector, StackedVector, unstackdt, meltdt --- docs/src/lib/manipulation.md | 2 - docs/src/man/reshaping_and_pivoting.md | 23 -- src/DataTables.jl | 2 - src/abstractdatatable/abstractdatatable.jl | 5 +- src/abstractdatatable/reshape.jl | 320 ++------------------- src/datatable/datatable.jl | 12 +- src/deprecated.jl | 4 +- test/data.jl | 16 +- test/datatable.jl | 15 +- test/show.jl | 7 - 10 files changed, 40 insertions(+), 366 deletions(-) diff --git a/docs/src/lib/manipulation.md b/docs/src/lib/manipulation.md index c67345a..8d24d4b 100644 --- a/docs/src/lib/manipulation.md +++ b/docs/src/lib/manipulation.md @@ -20,6 +20,4 @@ join melt stack unstack -stackdt -meltdt ``` diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 1b936e1..d99e814 100644 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -53,29 +53,6 @@ If the remaining columns are unique, you can skip the id variable and use: widedt = unstack(longdt, :variable, :value) ``` -`stackdt` and `meltdt` are two additional functions that work like `stack` and `melt`, but they provide a view into the original wide DataTable. Here is an example: - -```julia -d = stackdt(iris) -``` - -This saves memory. To create the view, several AbstractVectors are defined: - -`:variable` column -- `EachRepeatedVector` -This repeats the variables N times where N is the number of rows of the original AbstractDataTable. - -`:value` column -- `StackedVector` -This is provides a view of the original columns stacked together. - -Id columns -- `RepeatedVector` -This repeats the original columns N times where N is the number of columns stacked. - -For more details on the storage representation, see: - -```julia -dump(stackdt(iris)) -``` - None of these reshaping functions perform any aggregation. To do aggregation, use the split-apply-combine functions in combination with reshaping. Here is an example: ```julia diff --git a/src/DataTables.jl b/src/DataTables.jl index 799f7f6..4b89a3b 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -57,7 +57,6 @@ export @~, eltypes, groupby, melt, - meltdt, names!, ncol, nonunique, @@ -71,7 +70,6 @@ export @~, rename, showcols, stack, - stackdt, unique!, unstack, head, diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index 19bfe20..583ecba 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -777,7 +777,6 @@ end Convert columns with a `Nullable` element type without any null values to a non-`Nullable` equivalent array type. The table `dt` is modified in place. -`NullableVectors` are aliased to their `values` field. # Examples @@ -852,7 +851,7 @@ julia> eltypes(dt) See also [`denullify!`] & [`nullify`](@ref). """ -denullify(dt::AbstractDataTable) = denullify!(copy(dt)) +denullify(dt::AbstractDataTable) = denullify!(deepcopy(dt)) """ nullify!(dt::AbstractDataTable) @@ -933,7 +932,7 @@ julia> eltypes(dt) See also [`nullify!`](@ref) & [`denullify`](@ref). """ function nullify(dt::AbstractDataTable) - nullify!(copy(dt)) + nullify!(deepcopy(dt)) end ## Documentation for methods defined elsewhere diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index a537cca..5234864 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -53,11 +53,6 @@ melt(dt::AbstractDataTable, [id_vars], [measure_vars]; column `:variable` a Vector of Symbols with the `measure_vars` name, and with columns for each of the `id_vars`. -See also `stackdt` and `meltdt` for stacking methods that return a -view into the original DataTable. See `unstack` for converting from -long to wide format. - - ### Examples ```julia @@ -98,7 +93,7 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, id_var::Int; end function stack(dt::AbstractDataTable, measure_var::Int, id_vars::Vector{Int}; variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], id_vars; + stack(dt, [measure_var], id_vars; variable_name=variable_name, value_name=value_name) end function stack(dt::AbstractDataTable, measure_vars, id_vars; @@ -193,30 +188,19 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) # `rowkey` integer indicating which column to place along rows # `colkey` integer indicating which column to place along column headers # `value` integer indicating which column has values - refkeycol = NullableCategoricalArray(dt[rowkey]) - valuecol = dt[value] - keycol = NullableCategoricalArray(dt[colkey]) - Nrow = length(refkeycol.pool) - Ncol = length(keycol.pool) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - payload = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], - map(Symbol, levels(keycol))) - nowarning = true - for k in 1:nrow(dt) - j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) - i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]]) - if i > 0 && j > 0 - if nowarning && !isnull(payload[j][i]) - warn("Duplicate entries in unstack.") - nowarning = false - end - payload[j][i] = valuecol[k] - end + anchor = dt[rowkey] + values = dt[value] + newcols = dt[colkey] + uniquenewcols = unique(newcols) + nrow = length(anchor) + ncol = length(uniquenewcols) + 1 + columns = Vector{Any}(ncol) + columns[1] = unique(anchor) + for (i,coli) in enumerate(2:ncol) + columns[coli] = values[find(newcols .== uniquenewcols[i])] end - denullify!(insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey])) + colnames = vcat(names(dt)[rowkey], Symbol.(uniquenewcols)) + DataTable(columns, colnames) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) @@ -226,278 +210,16 @@ unstack(dt::AbstractDataTable, colkey, value) = unstack(dt, index(dt)[colkey], index(dt)[value]) function unstack(dt::AbstractDataTable, colkey::Int, value::Int) - # group on anything not a key or value: - g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]]), sort=true) - groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] - rowkey = zeros(Int, size(dt, 1)) - for i in 1:length(groupidxs) - rowkey[groupidxs[i]] = i - end - keycol = NullableCategoricalArray(dt[colkey]) - valuecol = dt[value] - dt1 = dt[g.idx[g.starts], g.cols] - Nrow = length(g) - Ncol = length(levels(keycol)) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - dt2 = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol], - map(@compat(Symbol), levels(keycol))) - for k in 1:nrow(dt) - j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) - i = rowkey[k] - if i > 0 && j > 0 - dt2[j][i] = valuecol[k] + anchor = unique(dt[deleteat!(names(dt), [colkey, value])]) + groups = groupby(dt, names(anchor)) + newcolnames = unique(dt[colkey]) + newcols = DataTable(Any[typeof(dt[value])(size(anchor,1)) for n in newcolnames], Symbol.(newcolnames)) + for (i, g) in enumerate(groups) + for col in newcolnames + newcols[i, Symbol(col)] = g[g[colkey] .== col, value][1] end end - denullify!(hcat(dt1, dt2)) + hcat(anchor, newcols) end unstack(dt::AbstractDataTable) = unstack(dt, :id, :variable, :value) - - -############################################################################## -## -## Reshaping using referencing (issue #145) -## New AbstractVector types (all read only): -## StackedVector -## RepeatedVector -## -############################################################################## - -""" -An AbstractVector{Any} that is a linear, concatenated view into -another set of AbstractVectors - -NOTE: Not exported. - -### Constructor - -```julia -StackedVector(d::AbstractVector...) -``` - -### Arguments - -* `d...` : one or more AbstractVectors - -### Examples - -```julia -StackedVector(Any[[1,2], [9,10], [11,12]]) # [1,2,9,10,11,12] -``` - -""" -type StackedVector <: AbstractVector{Any} - components::Vector{Any} -end - -function Base.getindex(v::StackedVector,i::Real) - lengths = [length(x)::Int for x in v.components] - cumlengths = [0; cumsum(lengths)] - j = searchsortedlast(cumlengths .+ 1, i) - if j > length(cumlengths) - error("indexing bounds error") - end - k = i - cumlengths[j] - if k < 1 || k > length(v.components[j]) - error("indexing bounds error") - end - v.components[j][k] -end - -function Base.getindex{I<:Real}(v::StackedVector,i::AbstractVector{I}) - result = similar(v.components[1], length(i)) - for idx in 1:length(i) - result[idx] = v[i[idx]] - end - result -end - -Base.size(v::StackedVector) = (length(v),) -Base.length(v::StackedVector) = sum(map(length, v.components)) -Base.ndims(v::StackedVector) = 1 -Base.eltype(v::StackedVector) = promote_type(map(eltype, v.components)...) -Base.similar(v::StackedVector, T, dims::Dims) = similar(v.components[1], T, dims) - -CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient - - -""" -An AbstractVector that is a view into another AbstractVector with -repeated elements - -NOTE: Not exported. - -### Constructor - -```julia -RepeatedVector(parent::AbstractVector, inner::Int, outer::Int) -``` - -### Arguments - -* `parent` : the AbstractVector that's repeated -* `inner` : the numer of times each element is repeated -* `outer` : the numer of times the whole vector is repeated after - expanded by `inner` - -`inner` and `outer` have the same meaning as similarly named arguments -to `repeat`. - -### Examples - -```julia -RepeatedVector([1,2], 3, 1) # [1,1,1,2,2,2] -RepeatedVector([1,2], 1, 3) # [1,2,1,2,1,2] -RepeatedVector([1,2], 2, 2) # [1,2,1,2,1,2,1,2] -``` - -""" -type RepeatedVector{T} <: AbstractVector{T} - parent::AbstractVector{T} - inner::Int - outer::Int -end - -function Base.getindex{T,I<:Real}(v::RepeatedVector{T},i::AbstractVector{I}) - N = length(v.parent) - idx = Int[Base.fld1(mod1(j,v.inner*N),v.inner) for j in i] - v.parent[idx] -end -function Base.getindex{T}(v::RepeatedVector{T},i::Real) - N = length(v.parent) - idx = Base.fld1(mod1(i,v.inner*N),v.inner) - v.parent[idx] -end -Base.getindex(v::RepeatedVector,i::Range) = getindex(v, [i;]) - -Base.size(v::RepeatedVector) = (length(v),) -Base.length(v::RepeatedVector) = v.inner * v.outer * length(v.parent) -Base.ndims(v::RepeatedVector) = 1 -Base.eltype{T}(v::RepeatedVector{T}) = T -Base.reverse(v::RepeatedVector) = RepeatedVector(reverse(v.parent), v.inner, v.outer) -Base.similar(v::RepeatedVector, T, dims::Dims) = similar(v.parent, T, dims) -Base.unique(v::RepeatedVector) = unique(v.parent) - -function CategoricalArrays.CategoricalArray(v::RepeatedVector) - res = CategoricalArrays.CategoricalArray(v.parent) - res.refs = repeat(res.refs, inner = [v.inner], outer = [v.outer]) - res -end - -############################################################################## -## -## stackdt() -## meltdt() -## Reshaping using referencing (issue #145), using the above vector types -## -############################################################################## - -""" -A stacked view of a DataTable (long format) - -Like `stack` and `melt`, but a view is returned rather than data -copies. - -```julia -stackdt(dt::AbstractDataTable, [measure_vars], [id_vars]; - variable_name::Symbol=:variable, value_name::Symbol=:value) -meltdt(dt::AbstractDataTable, [id_vars], [measure_vars]; - variable_name::Symbol=:variable, value_name::Symbol=:value) -``` - -### Arguments - -* `dt` : the wide AbstractDataTable - -* `measure_vars` : the columns to be stacked (the measurement - variables), a normal column indexing type, like a Symbol, - Vector{Symbol}, Int, etc.; for `melt`, defaults to all - variables that are not `id_vars` - -* `id_vars` : the identifier columns that are repeated during - stacking, a normal column indexing type; for `stack` defaults to all - variables that are not `measure_vars` - -### Result - -* `::DataTable` : the long-format datatable with column `:value` - holding the values of the stacked columns (`measure_vars`), with - column `:variable` a Vector of Symbols with the `measure_vars` name, - and with columns for each of the `id_vars`. - -The result is a view because the columns are special AbstractVectors -that return indexed views into the original DataTable. - -### Examples - -```julia -d1 = DataTable(a = repeat([1:3;], inner = [4]), - b = repeat([1:4;], inner = [3]), - c = randn(12), - d = randn(12), - e = map(string, 'a':'l')) - -d1s = stackdt(d1, [:c, :d]) -d1s2 = stackdt(d1, [:c, :d], [:a]) -d1m = meltdt(d1, [:a, :b, :e]) -``` - -""" -function stackdt(dt::AbstractDataTable, measure_vars::Vector{Int}, - id_vars::Vector{Int}; variable_name::Symbol=:variable, - value_name::Symbol=:value) - N = length(measure_vars) - cnames = names(dt)[id_vars] - insert!(cnames, 1, value_name) - insert!(cnames, 1, variable_name) - DataTable(Any[RepeatedVector(_names(dt)[measure_vars], nrow(dt), 1), # variable - StackedVector(Any[dt[:,c] for c in measure_vars]), # value - [RepeatedVector(dt[:,c], 1, N) for c in id_vars]...], # id_var columns - cnames) -end -function stackdt(dt::AbstractDataTable, measure_var::Int, id_var::Int; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], [id_var]; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars, id_var::Int; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, measure_vars, [id_var]; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_var::Int, id_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], id_vars; variable_name=variable_name, - value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars, id_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, index(dt)[measure_vars], index(dt)[id_vars]; - variable_name=variable_name, value_name=value_name) -end -function stackdt(dt::AbstractDataTable, measure_vars = numeric_vars(dt); - variable_name::Symbol=:variable, value_name::Symbol=:value) - m_inds = index(dt)[measure_vars] - stackdt(dt, m_inds, _setdiff(1:ncol(dt), m_inds); - variable_name=variable_name, value_name=value_name) -end - -""" -A stacked view of a DataTable (long format); see `stackdt` -""" -function meltdt(dt::AbstractDataTable, id_vars; variable_name::Symbol=:variable, - value_name::Symbol=:value) - id_inds = index(dt)[id_vars] - stackdt(dt, _setdiff(1:ncol(dt), id_inds), id_inds; - variable_name=variable_name, value_name=value_name) -end -function meltdt(dt::AbstractDataTable, id_vars, measure_vars; - variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, measure_vars, id_vars; variable_name=variable_name, - value_name=value_name) -end -meltdt(dt::AbstractDataTable; variable_name::Symbol=:variable, value_name::Symbol=:value) = - stackdt(dt; variable_name=variable_name, value_name=value_name) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index eed2e0a..9d8dd37 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -103,7 +103,7 @@ type DataTable <: AbstractDataTable strnames = string.(names(colindex)) for (i,u) in enumerate(uniques) indices = find(lengths .== u) - estring[i] = "column length ($(lengths[1])) for column(s) ($(join(strnames[indices], ", ")))" + estring[i] = "column length ($(uniques[i])) for column(s) ($(join(strnames[indices], ", ")))" end throw(DimensionMismatch(join(estring, " is incompatible with "))) end @@ -638,16 +638,6 @@ function Base.insert!(dt::DataTable, col_ind::Int, item::AbstractVector, name::S dt end -# FIXME: Needed to work around a crash: JuliaLang/julia#18299 -function Base.insert!(dt::DataTable, col_ind::Int, item::NullableArray, name::Symbol) - 0 < col_ind <= ncol(dt) + 1 || throw(BoundsError()) - size(dt, 1) == length(item) || size(dt, 1) == 0 || error("number of rows does not match") - - insert!(index(dt), col_ind, name) - insert!(dt.columns, col_ind, item) - dt -end - function Base.insert!(dt::DataTable, col_ind::Int, item, name::Symbol) insert!(dt, col_ind, upgrade_scalar(dt, item), name) end diff --git a/src/deprecated.jl b/src/deprecated.jl index 6f176a8..83912d7 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -18,5 +18,5 @@ import Base: keys, values, insert! @deprecate sub(dt::AbstractDataTable, rows) view(dt, rows) -@deprecate stackdf stackdt -@deprecate meltdf meltdt +@deprecate stackdf stack +@deprecate meltdf melt diff --git a/test/data.jl b/test/data.jl index a59b2bc..5f57b8a 100644 --- a/test/data.jl +++ b/test/data.jl @@ -169,22 +169,22 @@ module TestData d1m_named = melt(d1[[1,3,4]], :a, variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :a] - stackdt(d1, :a) - d1s = stackdt(d1, [:a, :b]) - d1s2 = stackdt(d1, [:c, :d]) - d1s3 = stackdt(d1) - d1m = meltdt(d1, [:c, :d, :e]) + stack(d1, :a) + d1s = stack(d1, [:a, :b]) + d1s2 = stack(d1, [:c, :d]) + d1s3 = stack(d1) + d1m = melt(d1, [:c, :d, :e]) @test isequal(d1s[1:12, :c], d1[:c]) @test isequal(d1s[13:24, :c], d1[:c]) @test isequal(d1s2, d1s3) @test names(d1s) == [:variable, :value, :c, :d, :e] @test isequal(d1s, d1m) - d1m = meltdt(d1[[1,3,4]], :a) + d1m = melt(d1[[1,3,4]], :a) @test names(d1m) == [:variable, :value, :a] - d1s_named = stackdt(d1, [:a, :b], variable_name=:letter, value_name=:someval) + d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test names(d1s_named) == [:letter, :someval, :c, :d, :e] - d1m_named = meltdt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) + d1m_named = melt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :c, :d, :e] d1s[:id] = [1:12; 1:12] diff --git a/test/datatable.jl b/test/datatable.jl index 95ea0a1..6769733 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -292,22 +292,19 @@ module TestDataTable # Check that reordering levels does not confuse unstack levels!(dt[1], ["XXX", "Bob", "Batman"]) #Unstack specifying a row column - dt2 = unstack(dt,:Fish, :Key, :Value) + dt2 = unstack(dt, :Fish, :Key, :Value) #Unstack without specifying a row column - dt3 = unstack(dt,:Key, :Value) + dt3 = unstack(dt, :Key, :Value) #The expected output - dt4 = DataTable(Fish = ["Batman", "Bob", "XXX"], - Color = NullableArray(["Grey", "Red", Nullable()]), - Mass = NullableArray(["18 g", "12 g", Nullable()])) + dt4 = DataTable(Fish = ["Bob", "Batman"], + Mass = ["12 g", "18 g"], + Color = ["Red", "Grey"] ) @test isequal(dt2, dt4) - @test isequal(dt3, denullify!(dt4[2:-1:1, :])) + @test isequal(dt3, dt4) # can't assign Nullable() to a typed column #Make sure unstack works with NULLs at the start of the value column # dt[1,:Value] = Nullable() dt2 = unstack(dt,:Fish, :Key, :Value) - #This changes the expected result - dt4[2,:Mass] = Nullable() - @test !isequal(dt2, dt4) dt = DataTable(A = 1:10, B = 'A':'J') @test !(dt[:,:] === dt) diff --git a/test/show.jl b/test/show.jl index 8bbbd78..abad44c 100644 --- a/test/show.jl +++ b/test/show.jl @@ -30,13 +30,6 @@ module TestShow dt = DataTable(A = Vector{String}(3)) - A = DataTables.StackedVector(Any[[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 5, 1) - show(io, A) - A = DataTables.RepeatedVector([1, 2, 3], 1, 5) - show(io, A) - #Test show output for REPL and similar dt = DataTable(Fish = ["Suzy", "Amir"], Mass = [1.5, Nullable()]) io = IOBuffer()