diff --git a/REQUIRE b/REQUIRE index 7bb9ed3..b18bc91 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,6 +1,6 @@ julia 0.5 NullableArrays 0.1.0 -CategoricalArrays 0.1.2 +CategoricalArrays 0.1.3 StatsBase 0.11.0 SortingAlgorithms Reexport diff --git a/src/DataTables.jl b/src/DataTables.jl index e69a70b..799f7f6 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -47,6 +47,8 @@ export @~, combine, completecases, deleterows!, + denullify!, + denullify, describe, dropnull, dropnull!, @@ -61,6 +63,8 @@ export @~, nonunique, nrow, nullable!, + nullify!, + nullify, order, printtable, rename!, diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index a885136..7879014 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -31,6 +31,10 @@ The following are normally implemented for AbstractDataTables: * [`nonunique`](@ref) : indexes of duplicate rows * [`unique!`](@ref) : remove duplicate rows * `similar` : a DataTable with similar columns as `d` +* `denullify` : unwrap `Nullable` columns +* `denullify!` : unwrap `Nullable` columns in-place +* `nullify` : convert all columns to NullableArrays +* `nullify!` : convert all columns to NullableArrays in-place **Indexing** @@ -706,83 +710,75 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2) Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...) Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...) -# vcat only accepts DataTables. Finds union of columns, maintaining order -# of first dt. Missing data become null values. - +""" + vcat(dts::AbstractDataTable...) + +Vertically concatenate `AbstractDataTables` that have the same column names in +the same order. 
+ +```jldoctest +julia> dt1 = DataTable(A=1:3, B=1:3); + +julia> dt2 = DataTable(A=4:6, B=4:6); + +julia> vcat(dt1, dt2) +6×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ +│ 4 │ 4 │ 4 │ +│ 5 │ 5 │ 5 │ +│ 6 │ 6 │ 6 │ +``` +""" Base.vcat(dt::AbstractDataTable) = dt - -Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...]) - -function Base.vcat{T<:AbstractDataTable}(dts::Vector{T}) +function Base.vcat(dts::AbstractDataTable...) isempty(dts) && return DataTable() - coltyps, colnams, similars = _colinfo(dts) - - res = DataTable() - Nrow = sum(nrow, dts) - for j in 1:length(colnams) - colnam = colnams[j] - col = similar(similars[j], coltyps[j], Nrow) - - i = 1 - for dt in dts - if haskey(dt, colnam) - copy!(col, i, dt[colnam]) - end - i += size(dt, 1) - end - - res[colnam] = col + allheaders = map(names, dts) + # don't vcat empty DataTables + notempty = find(x -> length(x) > 0, allheaders) + uniqueheaders = unique(allheaders[notempty]) + if length(uniqueheaders) == 0 + return DataTable() end - res -end - -_isnullable{T}(::AbstractArray{T}) = T <: Nullable -const EMPTY_DATA = NullableArray(Void, 0) - -function _colinfo{T<:AbstractDataTable}(dts::Vector{T}) - dt1 = dts[1] - colindex = copy(index(dt1)) - coltyps = eltypes(dt1) - similars = collect(columns(dt1)) - nonnull_ct = Int[_isnullable(c) for c in columns(dt1)] - - for i in 2:length(dts) - dt = dts[i] - for j in 1:size(dt, 2) - col = dt[j] - cn, ct = _names(dt)[j], eltype(col) - if haskey(colindex, cn) - idx = colindex[cn] - - oldtyp = coltyps[idx] - if !(ct <: oldtyp) - coltyps[idx] = promote_type(oldtyp, ct) - # Needed on Julia 0.4 since e.g. 
- # promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}, - # which is not a usable type: fall back to Nullable{Any} - if VERSION < v"0.5.0-dev" && - coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType) - coltyps[idx] = Nullable{Any} - end - end - nonnull_ct[idx] += !_isnullable(col) - else # new column - push!(colindex, cn) - push!(coltyps, ct) - push!(similars, col) - push!(nonnull_ct, !_isnullable(col)) + if length(uniqueheaders) > 1 + unionunique = union(uniqueheaders...) + coldiff = setdiff(unionunique, intersect(uniqueheaders...)) + if !isempty(coldiff) + # if any datatables are a full superset of names, skip them + filter!(u -> Set(u) != Set(unionunique), uniqueheaders) + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + matchingloci = find(h -> u == h, allheaders) + headerdiff = filter(x -> !in(x, u), coldiff) + headerdiff = length(headerdiff) > 1 ? + join(string.(headerdiff[1:end-1]), ", ") * " and " * string(headerdiff[end]) : + string(headerdiff[end]) + matchingloci = length(matchingloci) > 1 ? + join(string.(matchingloci[1:end-1]), ", ") * " and " * string(matchingloci[end]) : + string(matchingloci[end]) + estrings[i] = "column(s) $headerdiff are missing from argument(s) $matchingloci" end + length(estrings) == 1 ? throw(ArgumentError(estrings[1])) : + throw(ArgumentError(join(estrings[1:end-1], ", ") * ", and " * estrings[end])) + else + estrings = Vector{String}(length(uniqueheaders)) + for (i, u) in enumerate(uniqueheaders) + indices = find(a -> a == u, allheaders) + indices = length(indices) > 1 ? + join(string.(indices[1:end-1]), ", ") * " and " * string(indices[end]) : + string(indices[end]) + estrings[i] = "column order of argument(s) $indices" + end + throw(ArgumentError(join(estrings, " != "))) end + else + header = uniqueheaders[1] + dts_to_vcat = dts[notempty] + return DataTable(Any[vcat(map(dt -> dt[col], dts_to_vcat)...) 
for col in header], header) end - - for j in 1:length(colindex) - if nonnull_ct[j] < length(dts) && !_isnullable(similars[j]) - similars[j] = EMPTY_DATA - end - end - colnams = _names(colindex) - - coltyps, colnams, similars end ############################################################################## @@ -801,6 +797,180 @@ function Base.hash(dt::AbstractDataTable) return @compat UInt(h) end +""" + denullify!(dt::AbstractDataTable) + +Convert columns with a `Nullable` element type without any null values +to a non-`Nullable` equivalent array type. The table `dt` is modified in place. + +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. + +# Examples + +```jldoctest +julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(denullify!(dt)) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 +``` + +See also [`denullify`](@ref) and [`nullify!`](@ref). +""" +function denullify!(dt::AbstractDataTable) + for i in 1:size(dt,2) + if !anynull(dt[i]) + dt[i] = dropnull!(dt[i]) + end + end + dt +end + +""" + denullify(dt::AbstractDataTable) + +Return a copy of `dt` where columns with a `Nullable` element type without any +null values have been converted to a non-`Nullable` equivalent array type. + +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. If no aliasing is desired, use `denullify!(deepcopy(dt))`. 
+ +# Examples + +```jldoctest +julia> dt = DataTable(A = NullableArray(1:3), B = [Nullable(i) for i=1:3]) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(denullify(dt)) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} +``` + +See also [`denullify!`](@ref) and [`nullify`](@ref). +""" +denullify(dt::AbstractDataTable) = denullify!(copy(dt)) + +""" + nullify!(dt::AbstractDataTable) + +Convert all columns of `dt` to nullable arrays. The table `dt` is modified in place. + +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. + +# Examples + +```jldoctest +julia> dt = DataTable(A = 1:3, B = 1:3) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(nullify!(dt)) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(dt) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} +``` + +See also [`nullify`](@ref) and [`denullify!`](@ref). +""" +function nullify!(dt::AbstractDataTable) + for i in 1:size(dt,2) + dt[i] = nullify(dt[i]) + end + dt +end + +nullify(x::AbstractArray) = convert(NullableArray, x) +nullify(x::AbstractCategoricalArray) = convert(NullableCategoricalArray, x) + +""" + nullify(dt::AbstractDataTable) + +Return a copy of `dt` with all columns converted to nullable arrays. + +Columns in the returned `AbstractDataTable` may alias the columns of the +input `dt`. If no aliasing is desired, use `nullify!(deepcopy(dt))`. 
+ +# Examples + +```jldoctest +julia> dt = DataTable(A = 1:3, B = 1:3) +3×2 DataTables.DataTable +│ Row │ A │ B │ +├─────┼───┼───┤ +│ 1 │ 1 │ 1 │ +│ 2 │ 2 │ 2 │ +│ 3 │ 3 │ 3 │ + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 + +julia> eltypes(nullify(dt)) +2-element Array{Type,1}: + Nullable{Int64} + Nullable{Int64} + +julia> eltypes(dt) +2-element Array{Type,1}: + Int64 + Int64 +``` + +See also [`nullify!`](@ref) and [`denullify`](@ref). +""" +function nullify(dt::AbstractDataTable) + nullify!(copy(dt)) +end ## Documentation for methods defined elsewhere diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl index 1c32f5c..3c6ff81 100644 --- a/src/abstractdatatable/io.jl +++ b/src/abstractdatatable/io.jl @@ -44,17 +44,18 @@ function printtable(io::IO, for j in 1:p if !isnull(dt[j][i]) if ! (etypes[j] <: Real) - print(io, quotemark) - escapedprint(io, get(dt[i, j]), quotestr) - print(io, quotemark) + print(io, quotemark) + x = unsafe_get(dt[i, j]) + escapedprint(io, x, quotestr) + print(io, quotemark) else - print(io, dt[i, j]) + print(io, dt[i, j]) end else - print(io, nastring) + print(io, nastring) end if j < p - print(io, separator) + print(io, separator) else print(io, '\n') end @@ -167,7 +168,7 @@ function Base.show(io::IO, ::MIME"text/latex", dt::AbstractDataTable) write(io, " & ") cell = dt[row,col] if !isnull(cell) - content = get(cell) + content = unsafe_get(cell) if mimewritable(MIME("text/latex"), content) show(io, MIME("text/latex"), content) else diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 1ad170b..a239927 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -3,17 +3,17 @@ ## # Like similar, but returns a nullable array -similar_nullable{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = +similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableArray(T, dims) 
-similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = +similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableArray(eltype(T), dims) -similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) = +similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = NullableCategoricalArray(T, dims) -similar_nullable(dt::AbstractDataTable, dims::Int) = - DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) +similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) = + NullableCategoricalArray(T, dims) # helper structure for DataTables joining immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} @@ -44,31 +44,28 @@ Base.length(x::RowIndexMap) = length(x.orig) # composes the joined data table using the maps between the left and right # table rows and the indices of rows in the result -function compose_joined_table(joiner::DataTableJoiner, +function compose_joined_table(joiner::DataTableJoiner, kind::Symbol, left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) @assert length(left_ixs) == length(right_ixs) # compose left half of the result taking all left columns all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) - if length(leftonly_ixs) > 0 + + ril = length(right_ixs) + lil = length(left_ixs) + loil = length(leftonly_ixs) + roil = length(rightonly_ixs) + + if loil > 0 # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows # preserving the original rows order - all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs)) + all_orig_left_ixs = similar(left_ixs.orig, lil + loil) @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig else # the 
result contains only the left rows that are matched to right rows (left_ixs) all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere end - ril = length(right_ixs) - loil = length(leftonly_ixs) - roil = length(rightonly_ixs) - left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil) - for col in columns(joiner.dtl)], - names(joiner.dtl)) - - # compose right half of the result taking all right columns excluding on - dtr_noon = without(joiner.dtr, joiner.on_cols) # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) if length(leftonly_ixs) > 0 @@ -76,18 +73,31 @@ function compose_joined_table(joiner::DataTableJoiner, right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] end all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) - right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] - for col in columns(dtr_noon)], - names(dtr_noon)) - # merge left and right parts of the joined table - res = hcat!(left_dt, right_dt) + + # compose right half of the result taking all right columns excluding on + dtr_noon = without(joiner.dtr, joiner.on_cols) + + nrow = length(all_orig_left_ixs) + roil + @assert nrow == length(all_orig_right_ixs) + loil + ncleft = ncol(joiner.dtl) + cols = Vector{Any}(ncleft + ncol(dtr_noon)) + for (i, col) in enumerate(columns(joiner.dtl)) + cols[i] = kind == :inner ? col[all_orig_left_ixs] : + copy!(similar_nullable(col, nrow), col[all_orig_left_ixs]) + end + for (i, col) in enumerate(columns(dtr_noon)) + cols[i+ncleft] = kind == :inner ? 
col[all_orig_right_ixs][right_perm] : + copy!(similar_nullable(col, nrow), col[all_orig_right_ixs])[right_perm] + end + res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon))) if length(rightonly_ixs.join) > 0 # some left rows are nulls, so the values of the "on" columns # need to be taken from the right for (on_col_ix, on_col) in enumerate(joiner.on_cols) # fix the result of the rightjoin by taking the nonnull values from the right table - res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] + # end-length(rightonly_ixs.orig)+1:end was rightonly_ixs.join. Try and FIXME + res[on_col][end-length(rightonly_ixs.orig)+1:end] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] end end return res @@ -207,7 +217,8 @@ join(dt1::AbstractDataTable, - `:cross` : a full Cartesian product of the key combinations; every row of `dt1` is matched with every row of `dt2` -Null values are filled in where needed to complete joins. +For the three join operations that may introduce missing values, `:outer`, `:left`, +and `:right`, all columns of the returned datatable will be nullable. ### Result @@ -243,22 +254,21 @@ function Base.join(dt1::AbstractDataTable, joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, false, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) elseif kind == :left - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, false)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) 
elseif kind == :right - right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, - group_rows(joiner.dtl_on), - true, true, true, false) - compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false)[[3, 4, 1, 2]]...) elseif kind == :outer - compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, - group_rows(joiner.dtr_on), - true, true, true, true)...) + compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) elseif kind == :semi # hash the right rows dtr_on_grp = group_rows(joiner.dtr_on) diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index ed4d519..ed26cd4 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -81,9 +81,9 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, cnames = names(dt)[id_vars] insert!(cnames, 1, value_name) insert!(cnames, 1, variable_name) - DataTable(Any[Compat.repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable - vcat([dt[c] for c in measure_vars]...), # value - [Compat.repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns + DataTable(Any[repeat(_names(dt)[measure_vars], inner=nrow(dt)), # variable + vcat([dt[c] for c in measure_vars]...), # value + [repeat(dt[c], outer=N) for c in id_vars]...], # id_var columns cnames) end function stack(dt::AbstractDataTable, measure_var::Int, id_var::Int; @@ -98,8 +98,8 @@ function stack(dt::AbstractDataTable, measure_vars::Vector{Int}, id_var::Int; end function stack(dt::AbstractDataTable, measure_var::Int, id_vars::Vector{Int}; variable_name::Symbol=:variable, value_name::Symbol=:value) - stackdt(dt, [measure_var], id_vars; - variable_name=variable_name, value_name=value_name) + stack(dt, 
[measure_var], id_vars; + variable_name=variable_name, value_name=value_name) end function stack(dt::AbstractDataTable, measure_vars, id_vars; variable_name::Symbol=:variable, value_name::Symbol=:value) @@ -198,12 +198,7 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) keycol = NullableCategoricalArray(dt[colkey]) Nrow = length(refkeycol.pool) Ncol = length(keycol.pool) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - payload = DataTable(Any[NullableArray(T, Nrow) for i in 1:Ncol], - map(Symbol, levels(keycol))) + payload = DataTable(Any[similar_nullable(valuecol, Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) nowarning = true for k in 1:nrow(dt) j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) @@ -216,7 +211,9 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int) payload[j][i] = valuecol[k] end end - insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey]) + levs = levels(refkeycol) + col = similar_nullable(dt[rowkey], length(levs)) + insert!(payload, 1, copy!(col, levs), _names(dt)[rowkey]) end unstack(dt::AbstractDataTable, rowkey, colkey, value) = unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value]) @@ -235,15 +232,10 @@ function unstack(dt::AbstractDataTable, colkey::Int, value::Int) end keycol = NullableCategoricalArray(dt[colkey]) valuecol = dt[value] - dt1 = dt[g.idx[g.starts], g.cols] + dt1 = nullify!(dt[g.idx[g.starts], g.cols]) Nrow = length(g) Ncol = length(levels(keycol)) - T = eltype(valuecol) - if T <: Nullable - T = eltype(T) - end - dt2 = DataTable(Any[NullableArray(T, Nrow) for i in 1:Ncol], - map(@compat(Symbol), levels(keycol))) + dt2 = DataTable(Any[similar_nullable(valuecol, Nrow) for i in 1:Ncol], map(Symbol, levels(keycol))) nowarning = true for k in 1:nrow(dt) j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]]) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 5eb0e7b..91772a6 
100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -74,32 +74,50 @@ type DataTable <: AbstractDataTable colindex::Index function DataTable(columns::Vector{Any}, colindex::Index) - ncols = length(columns) - if ncols > 1 - nrows = length(columns[1]) - equallengths = true - for i in 2:ncols - equallengths &= length(columns[i]) == nrows + if length(columns) == length(colindex) == 0 + return new(Vector{Any}(0), Index()) + elseif length(columns) != length(colindex) + throw(DimensionMismatch("Number of columns ($(length(columns))) and number of column names ($(length(colindex))) are not equal")) + end + lengths = [isa(col, AbstractArray) ? length(col) : 1 for col in columns] + minlen, maxlen = extrema(lengths) + if minlen == 0 && maxlen == 0 + return new(columns, colindex) + elseif minlen != maxlen || minlen == maxlen == 1 + # recycle scalars + for i in 1:length(columns) + typeof(columns[i]) <: AbstractArray && continue + columns[i] = fill(columns[i], maxlen) + lengths[i] = maxlen end - if !equallengths - msg = "All columns in a DataTable must be the same length" - throw(ArgumentError(msg)) + uls = unique(lengths) + if length(uls) != 1 + strnames = string.(names(colindex)) + estring = ["column length ($(uls[i])) for column(s) ($(join(strnames[find(lengths .== u)], ", ")))" + for (i,u) in enumerate(uls)] + throw(DimensionMismatch(join(estring, " is incompatible with "))) end end - if length(colindex) != ncols - msg = "Columns and column index must be the same length" - throw(ArgumentError(msg)) + for (i,c) in enumerate(columns) + if isa(c, Range) + columns[i] = collect(c) + elseif !isa(c, AbstractVector) && isa(c, AbstractArray) + throw(DimensionMismatch("columns must be 1-dimensional")) + else + columns[i] = c + end end - new(columns, colindex) + return new(columns, colindex) end end function DataTable(; kwargs...) 
- result = DataTable(Any[], Index()) - for (k, v) in kwargs - result[k] = v + if length(kwargs) == 0 + return DataTable(Any[], Index()) end - return result + colnames = [Symbol(k) for (k,v) in kwargs] + columns = Any[v for (k,v) in kwargs] + DataTable(columns, Index(colnames)) end function DataTable(columns::AbstractVector, @@ -107,86 +125,68 @@ function DataTable(columns::AbstractVector, return DataTable(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames))) end - # Initialize empty DataTable objects of arbitrary size function DataTable(t::Type, nrows::Integer, ncols::Integer) columns = Vector{Any}(ncols) for i in 1:ncols - columns[i] = NullableArray(t, nrows) + columns[i] = Vector{t}(nrows) end cnames = gennames(ncols) return DataTable(columns, Index(cnames)) end # Initialize an empty DataTable with specific eltypes and names -function DataTable(column_eltypes::Vector, cnames::Vector, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - columns[j] = NullableArray(column_eltypes[j], nrows) + elty = column_eltypes[j] + if elty <: Nullable + if eltype(elty) <: CategoricalValue + columns[j] = NullableCategoricalVector{eltype(elty).parameters[1]}(nrows) + else + columns[j] = NullableVector{eltype(elty)}(nrows) + end + else + if elty <: CategoricalValue + columns[j] = CategoricalVector{elty.parameters[1]}(nrows) + else + columns[j] = Vector{elty}(nrows) + end + end end - return DataTable(columns, Index(cnames)) + return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end + # Initialize an empty DataTable with specific eltypes and names # and whether a nominal array should be created -function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, - nominal::Vector{Bool}, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, + 
nominal::Vector{Bool}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) for j in 1:p - if nominal[j] - columns[j] = NullableCategoricalArray{column_eltypes[j]}(nrows) - else - columns[j] = NullableArray{column_eltypes[j]}(nrows) - end + elty = column_eltypes[j] + if nominal[j] + columns[j] = elty <: Nullable ? NullableCategoricalArray{elty}(nrows) : CategoricalVector{elty}(nrows) + else + columns[j] = elty <: Nullable ? NullableArray{elty}(nrows) : Vector{elty}(nrows) + end end - return DataTable(columns, Index(cnames)) + return DataTable(columns, Index(convert(Vector{Symbol}, cnames))) end # Initialize an empty DataTable with specific eltypes -function DataTable(column_eltypes::Vector, nrows::Integer) +function DataTable{T<:Type}(column_eltypes::AbstractVector{T}, nrows::Integer) p = length(column_eltypes) columns = Vector{Any}(p) cnames = gennames(p) for j in 1:p - columns[j] = NullableArray{column_eltypes[j]}(nrows) + elty = column_eltypes[j] + columns[j] = elty <: Nullable ? 
NullableArray{elty}(nrows) : Vector{elty}(nrows) end return DataTable(columns, Index(cnames)) end -# Initialize from a Vector of Associatives (aka list of dicts) -function DataTable{D <: Associative}(ds::Vector{D}) - ks = Set() - for d in ds - union!(ks, keys(d)) - end - DataTable(ds, [ks...]) -end - -# Initialize from a Vector of Associatives (aka list of dicts) -function DataTable{D <: Associative}(ds::Vector{D}, ks::Vector) - #get column eltypes - col_eltypes = Type[@compat(Union{}) for _ = 1:length(ks)] - for d in ds - for (i,k) in enumerate(ks) - if haskey(d, k) && !_isnull(d[k]) - col_eltypes[i] = promote_type(col_eltypes[i], typeof(d[k])) - end - end - end - col_eltypes[col_eltypes .== @compat(Union{})] = Any - - # create empty DataTable, and fill - dt = DataTable(col_eltypes, ks, length(ds)) - for (i,d) in enumerate(ds) - for (j,k) in enumerate(ks) - dt[i,j] = get(d, k, Nullable()) - end - end - - dt -end - ############################################################################## ## ## AbstractDataTable interface @@ -363,24 +363,20 @@ function insert_multiple_entries!{T <: Real}(dt::DataTable, end end -upgrade_vector{T<:Nullable}(v::AbstractArray{T}) = v -upgrade_vector(v::CategoricalArray) = NullableCategoricalArray(v) -upgrade_vector(v::AbstractArray) = NullableArray(v) - function upgrade_scalar(dt::DataTable, v::AbstractArray) msg = "setindex!(::DataTable, ...) only broadcasts scalars, not arrays" throw(ArgumentError(msg)) end function upgrade_scalar(dt::DataTable, v::Any) n = (ncol(dt) == 0) ? 
1 : nrow(dt) - NullableArray(fill(v, n)) + fill(v, n) end # dt[SingleColumnIndex] = AbstractVector function Base.setindex!(dt::DataTable, v::AbstractVector, col_ind::ColumnIndex) - insert_single_column!(dt, upgrade_vector(v), col_ind) + insert_single_column!(dt, v, col_ind) end # dt[SingleColumnIndex] = Single Item (EXPANDS TO NROW(DT) if NCOL(DT) > 0) @@ -417,9 +413,8 @@ end function Base.setindex!{T <: ColumnIndex}(dt::DataTable, v::AbstractVector, col_inds::AbstractVector{T}) - dv = upgrade_vector(v) for col_ind in col_inds - dt[col_ind] = dv + dt[col_ind] = v end return dt end @@ -643,16 +638,6 @@ function Base.insert!(dt::DataTable, col_ind::Int, item::AbstractVector, name::S dt end -# FIXME: Needed to work around a crash: JuliaLang/julia#18299 -function Base.insert!(dt::DataTable, col_ind::Int, item::NullableArray, name::Symbol) - 0 < col_ind <= ncol(dt) + 1 || throw(BoundsError()) - size(dt, 1) == length(item) || size(dt, 1) == 0 || error("number of rows does not match") - - insert!(index(dt), col_ind, name) - insert!(dt.columns, col_ind, item) - dt -end - function Base.insert!(dt::DataTable, col_ind::Int, item, name::Symbol) insert!(dt, col_ind, upgrade_scalar(dt, item), name) end @@ -754,11 +739,7 @@ function hcat!(dt1::DataTable, dt2::AbstractDataTable) return dt1 end -hcat!(dt::DataTable, x::CategoricalArray) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::NullableCategoricalArray) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::NullableVector) = hcat!(dt, DataTable(Any[x])) -hcat!(dt::DataTable, x::Vector) = hcat!(dt, DataTable(Any[NullableArray(x)])) -hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[NullableArray([x])])) +hcat!(dt::DataTable, x) = hcat!(dt, DataTable(Any[x])) # hcat! 
for 1-n arguments hcat!(dt::DataTable) = dt @@ -834,35 +815,14 @@ function Base.convert(::Type{DataTable}, A::Matrix) return DataTable(cols, Index(gennames(n))) end -function _datatable_from_associative(dnames, d::Associative) - p = length(dnames) - p == 0 && return DataTable() - columns = Vector{Any}(p) - colnames = Vector{Symbol}(p) - n = length(d[dnames[1]]) - for j in 1:p - name = dnames[j] - col = d[name] - if length(col) != n - throw(ArgumentError("All columns in Dict must have the same length")) - end - columns[j] = NullableArray(col) - colnames[j] = Symbol(name) - end - return DataTable(columns, Index(colnames)) -end - function Base.convert(::Type{DataTable}, d::Associative) - dnames = collect(keys(d)) - return _datatable_from_associative(dnames, d) -end - -# A Dict is not sorted or otherwise ordered, and it's nicer to return a -# DataTable which is ordered in some way -function Base.convert(::Type{DataTable}, d::Dict) - dnames = collect(keys(d)) - sort!(dnames) - return _datatable_from_associative(dnames, d) + colnames = keys(d) + if isa(d, Dict) + colnames = sort!(collect(colnames)) + end + colindex = Index([Symbol(k) for k in colnames]) + columns = Any[d[c] for c in colnames] + DataTable(columns, colindex) end diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl index 711bcd3..2472976 100644 --- a/src/groupeddatatable/grouping.jl +++ b/src/groupeddatatable/grouping.jl @@ -193,7 +193,7 @@ combine(map(d -> mean(dropnull(d[:c])), gd)) """ function combine(ga::GroupApplied) gd, vals = ga.gd, ga.vals - valscat = vcat(vals) + valscat = vcat(vals...) 
idx = Vector{Int}(size(valscat, 1)) j = 0 @inbounds for (start, val) in zip(gd.starts, vals) diff --git a/test/cat.jl b/test/cat.jl index ab4e2ab..ba44d0a 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -72,14 +72,14 @@ module TestCat dt[1:2, 1:2] = [3,2] dt[[true,false,false,true], 2:3] = [2,3] - vcat([]) - vcat(null_dt) - vcat(null_dt, null_dt) - vcat(null_dt, dt) - vcat(dt, null_dt) - vcat(dt, dt) - vcat(dt, dt, dt) - @test vcat(DataTable[]) == DataTable() + @test vcat(null_dt) == DataTable() + @test vcat(null_dt, null_dt) == DataTable() + @test vcat(null_dt, dt) == dt + @test vcat(dt, null_dt) == dt + @test eltypes(vcat(dt, dt)) == [Float64, Float64, Int] + @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2)) + @test eltypes(vcat(dt, dt, dt)) == [Float64, Float64, Int] + @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2)) alt_dt = deepcopy(dt) vcat(dt, alt_dt) @@ -88,29 +88,14 @@ module TestCat dt[1] = zeros(Int, nrow(dt)) vcat(dt, alt_dt) - # Don't fail on non-matching names - names!(alt_dt, [:A, :B, :C]) - vcat(dt, alt_dt) - dtr = vcat(dt4, dt4) @test size(dtr, 1) == 8 @test names(dt4) == names(dtr) @test isequal(dtr, [dt4; dt4]) - dtr = vcat(dt2, dt3) - @test size(dtr) == (8,2) - @test names(dt2) == names(dtr) - @test isnull(dtr[8,:x2]) - # Eltype promotion - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] - else - @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}] - @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}] - end + @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64] + @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}] # Minimal 
container type promotion dta = DataTable(a = CategoricalArray([1, 2, 2])) @@ -118,17 +103,11 @@ module TestCat dtc = DataTable(a = NullableArray([2, 3, 4])) dtd = DataTable(Any[2:4], [:a]) dtab = vcat(dta, dtb) - dtac = vcat(dta, dtc) - @test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) - @test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4]) - @test isa(dtab[:a], NullableCategoricalVector{Int}) - # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T} - if VERSION >= v"0.5.0-dev" - @test isa(dtac[:a], NullableCategoricalVector{Int}) - else - @test isa(dtac[:a], NullableCategoricalVector{Any}) - end - # ^^ container may flip if container promotion happens in Base/DataArrays + @test isa(dtab[1], CategoricalArray) + dtac = vcat(nullify(dta), dtc) + @test isa(dtac[1], NullableCategoricalArray) + @test isequal(dtab[:a], [1, 2, 2, 2, 3, 4]) + @test isa(dtab[:a], CategoricalVector{Int}) dc = vcat(dtd, dtc) @test isequal(vcat(dtc, dtd), dc) @@ -137,15 +116,75 @@ module TestCat @test isequal(vcat(dtd, dtc0, dtc), dc) @test eltypes(vcat(dtd, dtc0)) == eltypes(dc) - # Missing columns - rename!(dtd, :a, :b) - dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]), - a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2])) - @test isequal(vcat(dtd, dta), dtda) - - # Alignment - @test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda)) - # vcat should be able to concatenate different implementations of AbstractDataTable (PR #944) @test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5])) + + @testset "vcat errors" begin + dt1 = DataTable(A = 1:3, B = 1:3) + dt2 = DataTable(A = 1:3) + # right missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # left missing 1 column + err = @test_throws ArgumentError vcat(dt2, dt1) + @test err.value.msg == "column(s) B 
are missing from argument(s) 1" + # multiple missing 1 column + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" + # argument missing >1columns + dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2" + # >1 arguments missing >1 columns + err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2) + @test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5" + # out of order + dt2 = dt1[reverse(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2" + # left >1 + err = @test_throws ArgumentError vcat(dt1, dt1, dt2) + @test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3" + # right >1 + err = @test_throws ArgumentError vcat(dt1, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3" + # left and right >1 + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6" + # >2 groups out of order + srand(1) + dt3 = dt1[shuffle(names(dt1))] + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3) + @test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10" + # missing columns throws error before out of order columns + dt1 = DataTable(A = 1, B = 1) + dt2 = DataTable(A = 1) + dt3 = DataTable(B = 1, A = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) B are missing from argument(s) 2" + # unique columns for both sides + dt1 = DataTable(A 
= 1, B = 1, C = 1, D = 1) + dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" + dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + # dt4 is a superset of names found in all other datatables and won't be shown in error + dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, 
dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" + end end diff --git a/test/constructors.jl b/test/constructors.jl index 6edf2e9..4053903 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -18,8 +18,6 @@ module TestConstructors @test isequal(dt, DataTable(Any[NullableCategoricalVector(zeros(3)), NullableCategoricalVector(ones(3))])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) dt2 = convert(DataTable, [0.0 1.0; 0.0 1.0; @@ -28,25 +26,76 @@ module TestConstructors @test isequal(dt[:x1], NullableArray(dt2[:x1])) @test isequal(dt[:x2], NullableArray(dt2[:x2])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0])) - @test isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0], - x2 = [1.0, 1.0, 1.0], + @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]), + x2 = NullableArray([1.0, 1.0, 1.0]))) + @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]), + x2 = NullableArray([1.0, 1.0, 1.0]), x3 = [2.0, 2.0, 2.0])[[:x1, :x2]]) dt = DataTable(Int, 2, 2) @test size(dt) == (2, 2) - @test eltypes(dt) == [Nullable{Int}, Nullable{Int}] + @test eltypes(dt) == [Int, Int] - dt = DataTable([Int, Float64], [:x1, :x2], 2) + dt = DataTable([Nullable{Int}, Nullable{Float64}], [:x1, :x2], 2) @test size(dt) == (2, 2) @test eltypes(dt) == [Nullable{Int}, Nullable{Float64}] - - @test isequal(dt, DataTable([Int, Float64], 2)) + @test isequal(dt, DataTable([Nullable{Int}, Nullable{Float64}], 2)) + @test all(isnull, (dt[:x1], dt[:x2])) @test_throws BoundsError 
SubDataTable(DataTable(A=1), 0) @test_throws BoundsError SubDataTable(DataTable(A=1), 0) @test isequal(SubDataTable(DataTable(A=1), 1), DataTable(A=1)) @test isequal(SubDataTable(DataTable(A=1:10), 1:4), DataTable(A=1:4)) @test isequal(view(SubDataTable(DataTable(A=1:10), 1:4), [true, true, false, false]), DataTable(A=1:2)) + + @test DataTable(a=1, b=1:2) == DataTable(a=[1,1], b=[1,2]) + + @testset "associative" begin + dt = DataTable(Dict(:A => 1:3, :B => 4:6)) + @test dt == DataTable(A = 1:3, B = 4:6) + @test all(e -> e <: Int, eltypes(dt)) + end + + @testset "recyclers" begin + @test DataTable(a = 1:5, b = 1) == DataTable(a = collect(1:5), b = fill(1, 5)) + @test DataTable(a = 1, b = 1:5) == DataTable(a = fill(1, 5), b = collect(1:5)) + end + + @testset "constructor errors" begin + @test_throws DimensionMismatch DataTable(a=1, b=[]) + @test_throws DimensionMismatch DataTable(Any[collect(1:10)], DataTables.Index([:A, :B])) + @test_throws DimensionMismatch DataTable(A = rand(2,2)) + @test_throws DimensionMismatch DataTable(A = rand(2,1)) + end + + @testset "column types" begin + dt = DataTable(A = 1:3, B = 2:4, C = 3:5) + answer = Any[Array{Int,1}, Array{Int,1}, Array{Int,1}] + @test map(typeof, dt.columns) == answer + dt[:D] = NullableArray([4, 5, Nullable()]) + push!(answer, NullableArray{Int,1}) + @test map(typeof, dt.columns) == answer + dt[:E] = 'c' + push!(answer, Array{Char,1}) + @test map(typeof, dt.columns) == answer + end + + @testset "null conversions" begin + dt = DataTable(A = 1:3, B = 2:4, C = 3:5) + nullfree = Any[Array{Int,1},Array{Int,1},Array{Int,1}] + nullified = convert(Vector{Any}, fill(NullableArray{Int,1}, 3)) + @test map(typeof, nullify(dt).columns) == nullified + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == 0 + nullify!(dt) + @test map(typeof, dt.columns) == nullified + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == reduce(*, size(dt)) + @test map(typeof, denullify(dt).columns) 
== nullfree + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == reduce(*, size(dt)) + denullify!(dt) + @test map(typeof, dt.columns) == nullfree + @test sum(isa(dt[i,j], Nullable) for i=1:size(dt, 1) for j=1:size(dt, 2)) == 0 + + dt = DataTable(A = [Nullable(i) for i=1:10]) + @test denullify!(dt).columns == Any[[i for i=1:10]] + end end diff --git a/test/conversions.jl b/test/conversions.jl index a0afd0d..8bf9465 100644 --- a/test/conversions.jl +++ b/test/conversions.jl @@ -35,8 +35,6 @@ module TestConversions @test isa(ai, Matrix{Int}) @test ai == convert(Matrix{Int}, dt) - dt[1,1] = Nullable() - @test_throws ErrorException convert(Array, dt) na = convert(NullableArray, dt) naa = convert(NullableArray{Any}, dt) nai = convert(NullableArray{Int}, dt) @@ -55,28 +53,28 @@ module TestConversions dt = convert(DataTable,di) @test isa(dt,DataTable) @test names(dt) == Symbol[x for x in sort(collect(keys(di)))] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + @test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) od = OrderedDict("c"=>c, "a"=>a, "b"=>b) dt = convert(DataTable,od) @test isa(dt, DataTable) @test names(dt) == Symbol[x for x in keys(od)] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + @test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) sd = SortedDict("c"=>c, "a"=>a, "b"=>b) dt = convert(DataTable,sd) @test isa(dt, DataTable) @test names(dt) == Symbol[x for x in keys(sd)] - @test isequal(dt[:a], NullableArray(a)) - @test isequal(dt[:b], NullableArray(b)) - @test isequal(dt[:c], NullableArray(c)) + @test isequal(dt[:a], a) + @test isequal(dt[:b], b) + @test isequal(dt[:c], c) - a = [1.0] + a = 1.0 di = Dict("a"=>a, "b"=>b, "c"=>c) - @test_throws ArgumentError convert(DataTable,di) + @test convert(DataTable,di)[:a] == [1.0, 1.0] end
diff --git a/test/data.jl b/test/data.jl index 9259a6e..d0272e1 100644 --- a/test/data.jl +++ b/test/data.jl @@ -46,9 +46,9 @@ module TestData dt6[3] = NullableArray(["un", "deux", "troix", "quatre"]) @test isequal(dt6[1, 3], Nullable("un")) dt6[:B] = [4, 3, 2, 1] - @test isequal(dt6[1,2], Nullable(4)) + @test dt6[1,2] == 4 dt6[:D] = [true, false, true, false] - @test isequal(dt6[1,4], Nullable(true)) + @test dt6[1,4] == true delete!(dt6, :D) @test names(dt6) == [:A, :B, :C] @test size(dt6, 2) == 3 @@ -74,7 +74,7 @@ module TestData @test size(sdt6d) == (2,1) #test_group("ref") - @test isequal(sdt6a[1,2], Nullable(4)) + @test sdt6a[1,2] == 4 #test_context("Within") #test_group("Associative") @@ -114,13 +114,14 @@ module TestData @test isequal(dt8[1:2, :d2], NullableCategoricalArray(["A", "B"])) @test size(dt8, 1) == 3 @test size(dt8, 2) == 5 - @test get(sum(dt8[:d1_length])) == N - @test all(dt8[:d1_length].values .> 0) - @test dt8[:d1_length].values == [4, 5, 11] + @test sum(dt8[:d1_length]) == N + @test all(dt8[:d1_length] .> 0) + @test dt8[2, :d1_length] == 5 + @test dt8[:d1_length] == [4, 5, 11] @test isequal(dt8, aggregate(groupby(dt7, :d2, sort=true), [sum, length])) - @test isequal(dt8[1, :d1_length], Nullable(4)) - @test isequal(dt8[2, :d1_length], Nullable(5)) - @test isequal(dt8[3, :d1_length], Nullable(11)) + @test dt8[1, :d1_length] == 4 + @test dt8[2, :d1_length] == 5 + @test dt8[3, :d1_length] == 11 @test isequal(dt8, aggregate(groupby(dt7, :d2), [sum, length], sort=true)) dt9 = dt7 |> groupby([:d2], sort=true) |> [sum, length] @@ -130,7 +131,7 @@ module TestData dt10 = DataTable( Any[[1:4;], [2:5;], ["a", "a", "a", "b" ], ["c", "d", "c", "d"]], - [:d1, :d2, :d3, :d4] + [:d1, :d2, :d3, :d4] ) gd = groupby(dt10, [:d3], sort=true) @@ -168,22 +169,22 @@ module TestData d1m_named = melt(d1[[1,3,4]], :a, variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :a] - stackdt(d1, :a) - d1s = stackdt(d1, [:a, :b]) - d1s2 = 
stackdt(d1, [:c, :d]) - d1s3 = stackdt(d1) - d1m = meltdt(d1, [:c, :d, :e]) + stack(d1, :a) + d1s = stack(d1, [:a, :b]) + d1s2 = stack(d1, [:c, :d]) + d1s3 = stack(d1) + d1m = melt(d1, [:c, :d, :e]) @test isequal(d1s[1:12, :c], d1[:c]) @test isequal(d1s[13:24, :c], d1[:c]) @test isequal(d1s2, d1s3) @test names(d1s) == [:variable, :value, :c, :d, :e] @test isequal(d1s, d1m) - d1m = meltdt(d1[[1,3,4]], :a) + d1m = melt(d1[[1,3,4]], :a) @test names(d1m) == [:variable, :value, :a] - d1s_named = stackdt(d1, [:a, :b], variable_name=:letter, value_name=:someval) + d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test names(d1s_named) == [:letter, :someval, :c, :d, :e] - d1m_named = meltdt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) + d1m_named = melt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval) @test names(d1m_named) == [:letter, :someval, :c, :d, :e] d1s[:id] = [1:12; 1:12] @@ -191,9 +192,9 @@ module TestData d1us = unstack(d1s, :id, :variable, :value) d1us2 = unstack(d1s2) d1us3 = unstack(d1s2, :variable, :value) - @test isequal(d1us[:a], d1[:a]) - @test isequal(d1us2[:d], d1[:d]) - @test isequal(d1us2[:3], d1[:d]) + @test isequal(d1us[:a], NullableArray(d1[:a])) + @test isequal(d1us2[:d], NullableArray(d1[:d])) + @test isequal(d1us2[:3], NullableArray(d1[:d])) @@ -215,10 +216,10 @@ module TestData v2 = randn(5)) m1 = join(dt1, dt2, on = :a, kind=:inner) - @test isequal(m1[:a], dt1[:a][dt1[:a].values .<= 5]) # preserves dt1 order + @test isequal(m1[:a], dt1[:a][dt1[:a] .<= 5]) # preserves dt1 order m2 = join(dt1, dt2, on = :a, kind = :outer) - @test isequal(m2[:a], dt1[:a]) # preserves dt1 order - @test isequal(m2[:b], dt1[:b]) # preserves dt1 order + @test isequal(m2[:a], NullableArray(dt1[:a])) # preserves dt1 order + @test isequal(m2[:b], NullableArray(dt1[:b])) # preserves dt1 order # TODO: Re-enable m2 = join(dt1, dt2, on = :a, kind = :outer) # @test isequal(m2[:b2], @@ -236,7 +237,7 @@ module TestData 
c = ["New World", "Old World", "New World"]) m1 = join(dt1, dt2, on = :a, kind = :inner) - @test isequal(m1[:a], NullableArray([1, 2])) + @test m1[:a] == [1, 2] m2 = join(dt1, dt2, on = :a, kind = :left) @test isequal(m2[:a], NullableArray([1, 2, 3])) @@ -271,13 +272,6 @@ module TestData v1 = randn(10) ) - dt2 = DataTable( - a = [:x,:y][[1,2,1,1,2]], - b = [:A,:B,:C][[1,1,1,2,3]], - v2 = randn(5) - ) - dt2[1,:a] = Nullable() - # # TODO: Restore this functionality # m1 = join(dt1, dt2, on = [:a,:b]) # @test isequal(m1[:a], NullableArray(["x", "x", "y", "y", fill("x", 5)])) diff --git a/test/datatable.jl b/test/datatable.jl index c75f5fe..b2ee0aa 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -39,17 +39,17 @@ module TestDataTable dtdc = deepcopy(dt) dt[1, :a] = 4 - get(dt[1, :b])[:e] = 5 + dt[1, :b][:e] = 5 names!(dt, [:f, :g]) @test names(dtc) == [:a, :b] @test names(dtdc) == [:a, :b] - @test get(dtc[1, :a]) === 4 - @test get(dtdc[1, :a]) === 2 + @test dtc[1, :a] === 4 + @test dtdc[1, :a] === 2 - @test names(get(dtc[1, :b])) == [:c, :e] - @test names(get(dtdc[1, :b])) == [:c] + @test names(dtc[1, :b]) == [:c, :e] + @test names(dtdc[1, :b]) == [:c] # @@ -69,18 +69,11 @@ module TestDataTable # Insert single value x[:d] = 3 - @test isequal(x[:d], NullableArray([3, 3, 3])) + @test x[:d] == [3, 3, 3] x0[:d] = 3 @test x0[:d] == Int[] - # similar / nulls - dt = DataTable(a = 1, b = "b", c = CategoricalArray([3.3])) - nulldt = DataTable(a = NullableArray{Int}(2), - b = NullableArray{String}(2), - c = NullableCategoricalArray{Float64}(2)) - @test isequal(nulldt, similar(dt, 2)) - # Associative methods dt = DataTable(a=[1, 2], b=[3., 4.]) @@ -99,9 +92,9 @@ module TestDataTable @test_throws ErrorException insert!(dt, 1, ["a"], :newcol) @test isequal(insert!(dt, 1, ["a", "b"], :newcol), dt) @test names(dt) == [:newcol, :a, :b] - @test isequal(dt[:a], NullableArray([1, 2])) - @test isequal(dt[:b], NullableArray([3., 4.])) - @test isequal(dt[:newcol], ["a", "b"]) + 
@test dt[:a] == [1, 2] + @test dt[:b] == [3., 4.] + @test dt[:newcol] == ["a", "b"] dt = DataTable(a=[1, 2], b=[3., 4.]) dt2 = DataTable(b=["a", "b"], c=[:c, :d]) @@ -112,43 +105,45 @@ module TestDataTable dt = DataTable(Int, 10, 3) @test size(dt, 1) == 10 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Int} - @test typeof(dt[:, 3]) == NullableVector{Int} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) - - dt = DataTable(Any[Int, Float64, String], 100) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Int} + @test typeof(dt[:, 3]) == Vector{Int} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + @test !anynull(dt[:, 3]) + + dt = DataTable([Int, Float64, String], 100) @test size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableVector{String} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) - - dt = DataTable(Any[Int, Float64, String], [:A, :B, :C], 100) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == Vector{String} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # array of #undef + # @test !anynull(dt[:, 3]) + + dt = DataTable([Int, Float64, String], [:A, :B, :C], 100) @test size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableVector{String} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == Vector{String} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # array of #undef + # @test !anynull(dt[:, 3]) dt = DataTable(DataType[Int, Float64, 
Compat.UTF8String],[:A, :B, :C], [false,false,true],100) @test size(dt, 1) == 100 @test size(dt, 2) == 3 - @test typeof(dt[:, 1]) == NullableVector{Int} - @test typeof(dt[:, 2]) == NullableVector{Float64} - @test typeof(dt[:, 3]) == NullableCategoricalVector{Compat.UTF8String,UInt32} - @test allnull(dt[:, 1]) - @test allnull(dt[:, 2]) - @test allnull(dt[:, 3]) + @test typeof(dt[:, 1]) == Vector{Int} + @test typeof(dt[:, 2]) == Vector{Float64} + @test typeof(dt[:, 3]) == CategoricalVector{Compat.UTF8String,UInt32} + @test !anynull(dt[:, 1]) + @test !anynull(dt[:, 2]) + # @test !anynull(dt[:, 3]) dt = convert(DataTable, zeros(10, 5)) @@ -166,25 +161,9 @@ module TestDataTable @test size(dt, 2) == 5 @test typeof(dt[:, 1]) == Vector{Float64} - #test_group("Other DataTable constructors") - dt = DataTable([@compat(Dict{Any,Any}(:a=>1, :b=>'c')), - @compat(Dict{Any,Any}(:a=>3, :b=>'d')), - @compat(Dict{Any,Any}(:a=>5))]) - @test size(dt, 1) == 3 - @test size(dt, 2) == 2 - @test typeof(dt[:,:a]) == NullableVector{Int} - @test typeof(dt[:,:b]) == NullableVector{Char} - - dt = DataTable([@compat(Dict{Any,Any}(:a=>1, :b=>'c')), - @compat(Dict{Any,Any}(:a=>3, :b=>'d')), - @compat(Dict{Any,Any}(:a=>5))], - [:a, :b]) - @test size(dt, 1) == 3 - @test size(dt, 2) == 2 - @test typeof(dt[:,:a]) == NullableVector{Int} - @test typeof(dt[:,:b]) == NullableVector{Char} - - @test DataTable(NullableArray[[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataTable(A = [1,2,3], B = [2.5,4.5,6.5]) + # test_group("Other DataTable constructors") + + @test DataTable([[1,2,3],[2.5,4.5,6.5]], [:A, :B]) == DataTable(A = [1,2,3], B = [2.5,4.5,6.5]) # This assignment was missing before dt = DataTable(Column = [:A]) @@ -307,38 +286,56 @@ module TestDataTable @test nothing == describe(f, NullableCategoricalArray(Nullable{String}["1", "2", Nullable()])) end - #Check the output of unstack - dt = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), - Key = ["Mass", "Color", "Mass", "Color"], - Value 
= ["12 g", "Red", "18 g", "Grey"]) - # Check that reordering levels does not confuse unstack - levels!(dt[1], ["XXX", "Bob", "Batman"]) - #Unstack specifying a row column - dt2 = unstack(dt,:Fish, :Key, :Value) - #Unstack without specifying a row column - dt3 = unstack(dt,:Key, :Value) - #The expected output - dt4 = DataTable(Fish = ["XXX", "Bob", "Batman"], - Color = Nullable{String}[Nullable(), "Red", "Grey"], - Mass = Nullable{String}[Nullable(), "12 g", "18 g"]) - @test isequal(dt2, dt4) - @test isequal(dt3, dt4[2:3, :]) - #Make sure unstack works with NULLs at the start of the value column - dt[1,:Value] = Nullable() - dt2 = unstack(dt,:Fish, :Key, :Value) - #This changes the expected result - dt4[2,:Mass] = Nullable() - @test isequal(dt2, dt4) + @testset "unstacking and nullables" begin + dtA = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), + Key = ["Mass", "Color", "Mass", "Color"], + Value = ["12 g", "Red", "18 g", "Grey"]) + # Check that reordering levels does not confuse unstack + levels!(dtA[1], ["XXX", "Bob", "Batman"]) + # should all return the same output, just different column types + dt2A = unstack(dtA, :Fish, :Key, :Value) + dt3A = unstack(dtA, :Key, :Value) + dt4A = DataTable(Fish = NullableCategoricalArray(["Bob", "Batman"]), + Color = NullableArray(["Red", "Grey"]), + Mass = NullableArray(["12 g", "18 g"])) + @test dt2A[[2, 3], :] == dt3A == dt4A + + dtB = DataTable(Fish = CategoricalArray(["Bob", "Bob", "Batman", "Batman"]), + Key = CategoricalArray(["Mass", "Color", "Mass", "Color"]), + Value = CategoricalArray(["12 g", "Red", "18 g", "Grey"])) + dt2B = unstack(dtB, :Fish, :Key, :Value) + dt3B = unstack(dtB, :Key, :Value) + dt4B = DataTable(Fish = NullableCategoricalArray(["Batman", "Bob"]), + Color = NullableCategoricalArray(["Grey", "Red"]), + Mass = NullableCategoricalArray(["18 g", "12 g"])) + @test dt2B == dt3B == dt4B + + # test multiple entries in unstack error + dt = DataTable(id=[1, 2, 1, 2], variable=["a", 
"b", "a", "b"], value=[3, 4, 5, 6]) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = Nullable[5, Nullable()], b = Nullable[Nullable(), 6]) + + dt = DataTable(id=1:2, variable=["a", "b"], value=3:4) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = Nullable[3, Nullable()], b = Nullable[Nullable(), 4]) + + dt = DataTable(id=1:2, variable=["a", "b"], value=3:4) + a = unstack(dt, :id, :variable, :value) + b = unstack(dt, :variable, :value) + @test a == b == DataTable(id = Nullable[1, 2], a = [3, Nullable()], b = [Nullable(), 4]) + end dt = DataTable(A = 1:10, B = 'A':'J') @test !(dt[:,:] === dt) @test append!(DataTable(A = 1:2, B = 1:2), DataTable(A = 3:4, B = 3:4)) == DataTable(A=1:4, B = 1:4) - @test !any(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6)).columns) - @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1,2]).columns) - @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A,:B]).columns) - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A]).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), :A).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1]).columns) == [1] - @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 1).columns) == [1] + @test !any(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6)).columns) + @test all(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1,2]).columns) + @test all(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A,:B]).columns) + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 
[:A]).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), :A).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1]).columns) == [1] + @test find(c -> isa(c, CategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 1).columns) == [1] end diff --git a/test/grouping.jl b/test/grouping.jl index 4ce63a8..b7e22a5 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -13,8 +13,8 @@ module TestGrouping @testset "colwise" begin @testset "::Function, ::AbstractDataTable" begin cw = colwise(sum, dt) - answer = NullableArray([20, 12, -0.4283098098931877]) - @test isa(cw, NullableArray{Any, 1}) + answer = Real[20, 12, -0.4283098098931877] + @test isa(cw, Array{Real, 1}) @test size(cw) == (ncol(dt),) @test isequal(cw, answer) @@ -32,8 +32,8 @@ module TestGrouping @testset "::Vector, ::AbstractDataTable" begin cw = colwise([sum], dt) - answer = NullableArray([20 12 -0.4283098098931877]) - @test isa(cw, NullableArray{Any, 2}) + answer = Real[20 12 -0.4283098098931877] + @test isa(cw, Array{Real, 2}) @test size(cw) == (length([sum]),ncol(dt)) @test isequal(cw, answer) @@ -59,8 +59,8 @@ module TestGrouping @testset "::Tuple, ::AbstractDataTable" begin cw = colwise((sum, length), dt) - answer = Any[Nullable(20) Nullable(12) Nullable(-0.4283098098931877); 8 8 8] - @test isa(cw, Array{Any, 2}) + answer = Real[20 12 -0.4283098098931877; 8 8 8] + @test isa(cw, Array{Real, 2}) @test size(cw) == (length((sum, length)), ncol(dt)) @test isequal(cw, answer) @@ -87,11 +87,11 @@ module TestGrouping @testset "::Function" begin cw = map(colwise(sum), (nullfree, dt)) - answer = ([55], NullableArray(Any[20, 12, -0.4283098098931877])) + answer = ([55], Real[20, 12, -0.4283098098931877]) @test isequal(cw, answer) cw = map(colwise((sum, length)), (nullfree, dt)) - answer = (reshape([55, 10], (2,1)), Any[Nullable(20) Nullable(12) Nullable(-0.4283098098931877); 8 8 8]) + answer = (reshape([55, 10], (2,1)), 
Real[20 12 -0.4283098098931877; 8 8 8]) @test isequal(cw, answer) cw = map(colwise([sum, length]), (nullfree, dt)) @@ -141,35 +141,62 @@ module TestGrouping @test groupby(DataTable(A=Int[1]), :A).starts == Int[1] # issue #960 - x = CategoricalArray(collect(1:20)) + x = categorical(collect(1:20)) dt = DataTable(v1=x, v2=x) groupby(dt, [:v1, :v2]) - dt2 = by(e->1, DataTable(x=Int64[]), :x) - @test size(dt2) == (0,1) - @test isequal(sum(dt2[:x]), Nullable(0)) + # what is this testing? + # dt2 = by(e->1, DataTable(x=Int64[]), :x) + # @test size(dt2) == (0,1) + # @test sum(dt2[:x]) == 0 # Check that reordering levels does not confuse groupby - dt = DataTable(Key1 = CategoricalArray(["A", "A", "B", "B"]), - Key2 = CategoricalArray(["A", "B", "A", "B"]), + dt = DataTable(Key1 = categorical(["A", "A", "B", "B"]), + Key2 = categorical(["A", "B", "A", "B"]), Value = 1:4) gd = groupby(dt, :Key1) - @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) - @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A", "A"]), + Key2 = categorical(["A", "B"]), + Value = collect(1:2)) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["B", "B"]), + Key2 = categorical(["A", "B"]), + Value = collect(3:4)) gd = groupby(dt, [:Key1, :Key2]) - @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1)) - @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2)) - @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3)) - @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["A"]), + Value = [1]) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["B"]), + Value = [2]) + @test gd[3].parent[gd[3].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["A"]), + Value = [3]) + @test
gd[4].parent[gd[4].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["B"]), + Value = [4]) # Reorder levels, add unused level levels!(dt[:Key1], ["Z", "B", "A"]) levels!(dt[:Key2], ["Z", "B", "A"]) gd = groupby(dt, :Key1) - @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) - @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A", "A"]), + Key2 = categorical(["A", "B"]), + Value = collect(1:2)) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["B", "B"]), + Key2 = categorical(["A", "B"]), + Value = collect(3:4)) gd = groupby(dt, [:Key1, :Key2]) - @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1)) - @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2)) - @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3)) - @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4)) + @test gd[1].parent[gd[1].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["A"]), + Value = [1]) + @test gd[2].parent[gd[2].rows, :] == DataTable(Key1 = categorical(["A"]), + Key2 = categorical(["B"]), + Value = [2]) + @test gd[3].parent[gd[3].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["A"]), + Value = [3]) + @test gd[4].parent[gd[4].rows, :] == DataTable(Key1 = categorical(["B"]), + Key2 = categorical(["B"]), + Value = [4]) + + @test names(gd) == names(dt) end diff --git a/test/index.jl b/test/index.jl index 484b434..5f8a930 100644 --- a/test/index.jl +++ b/test/index.jl @@ -57,6 +57,6 @@ end dt = DataTable(A=[0],B=[0]) dt[1:end] = 0.0 dt[1,:A] = 1.0 -@test dt[1,:B] === Nullable(0) +@test dt[1,:B] === 0 end diff --git a/test/iteration.jl b/test/iteration.jl index 365b44b..afa93b2 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -9,37 +9,37 @@ module TestIteration for row in eachrow(dt) @test isa(row, DataTableRow) - @test isequal(row[:B]-row[:A], Nullable(1)) 
+ @test row[:B]-row[:A] == 1 # issue #683 (https://github.com/JuliaStats/DataFrames.jl/pull/683) @test typeof(collect(row)) == @compat Array{Tuple{Symbol, Any}, 1} end for col in eachcol(dt) - @test isa(col, @compat Tuple{Symbol, NullableVector}) + @test isa(col, Tuple{Symbol, Vector{Int}}) end - @test isequal(map(x -> minimum(convert(Array, x)), eachrow(dt)), Any[1,2]) + @test isequal(map(x -> minimum(convert(Array, x)), eachrow(dt)), [1,2]) @test isequal(map(minimum, eachcol(dt)), DataTable(A = [1], B = [2])) row = DataTableRow(dt, 1) row[:A] = 100 - @test isequal(dt[1, :A], Nullable(100)) + @test dt[1, :A] == 100 row[1] = 101 - @test isequal(dt[1, :A], Nullable(101)) + @test dt[1, :A] == 101 dt = DataTable(A = 1:4, B = ["M", "F", "F", "M"]) s1 = view(dt, 1:3) s1[2,:A] = 4 - @test isequal(dt[2, :A], Nullable(4)) + @test dt[2, :A] == 4 @test isequal(view(s1, 1:2), view(dt, 1:2)) s2 = view(dt, 1:2:3) s2[2, :B] = "M" - @test isequal(dt[3, :B], Nullable("M")) + @test dt[3, :B] == "M" @test isequal(view(s2, 1:1:2), view(dt, [1,3])) # @test_fail for x in dt; end # Raises an error diff --git a/test/join.jl b/test/join.jl index 0ac3fe6..63fc38a 100644 --- a/test/join.jl +++ b/test/join.jl @@ -2,8 +2,8 @@ module TestJoin using Base.Test using DataTables - name = DataTable(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Joe Blogs"]) - job = DataTable(ID = [1, 2, 2, 4], Job = ["Lawyer", "Doctor", "Florist", "Farmer"]) + name = DataTable(ID = NullableArray([1, 2, 3]), Name = NullableArray(["John Doe", "Jane Doe", "Joe Blogs"])) + job = DataTable(ID = NullableArray([1, 2, 2, 4]), Job = NullableArray(["Lawyer", "Doctor", "Florist", "Farmer"])) # Join on symbols or vectors of symbols join(name, job, on = :ID) @@ -13,9 +13,9 @@ module TestJoin #@test_throws join(name, job) # Test output of various join types - outer = DataTable(ID = [1, 2, 2, 3, 4], - Name = NullableArray(Nullable{String}["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), - Job = 
NullableArray(Nullable{String}["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) + outer = DataTable(ID = NullableArray([1, 2, 2, 3, 4]), + Name = NullableArray(["John Doe", "Jane Doe", "Jane Doe", "Joe Blogs", Nullable()]), + Job = NullableArray(["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) # (Tests use current column ordering but don't promote it) right = outer[Bool[!isnull(x) for x in outer[:Job]], [:ID, :Name, :Job]] @@ -70,7 +70,7 @@ module TestJoin @test_throws ArgumentError join(dt1, dt2, on = :A, kind = :cross) # test empty inputs - simple_dt(len::Int, col=:A) = (dt = DataTable(); dt[col]=collect(1:len); dt) + simple_dt(len::Int, col=:A) = (dt = DataTable(); dt[col]=NullableArray(collect(1:len)); dt) @test isequal(join(simple_dt(0), simple_dt(0), on = :A, kind = :left), simple_dt(0)) @test isequal(join(simple_dt(2), simple_dt(0), on = :A, kind = :left), simple_dt(2)) @test isequal(join(simple_dt(0), simple_dt(2), on = :A, kind = :left), simple_dt(0)) @@ -104,17 +104,73 @@ module TestJoin # Test that Array{Nullable} works when combined with NullableArray (#1088) dt = DataTable(Name = Nullable{String}["A", "B", "C"], Mass = [1.5, 2.2, 1.1]) - dt2 = DataTable(Name = ["A", "B", "C", "A"], + dt2 = DataTable(Name = Nullable{String}["A", "B", "C", "A"], Quantity = [3, 3, 2, 4]) - @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = ["A", "B", "C", "A"], - Quantity = [3, 3, 2, 4], - Mass = [1.5, 2.2, 1.1, 1.5]) + @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = Nullable{String}["A", "B", "C", "A"], + Quantity = Nullable{Int}[3, 3, 2, 4], + Mass = Nullable{Float64}[1.5, 2.2, 1.1, 1.5]) # Test that join works when mixing Array and NullableArray (#1151) dt = DataTable([collect(1:10), collect(2:11)], [:x, :y]) dtnull = DataTable(x = 1:10, z = 3:12) @test join(dt, dtnull, on = :x) == - DataTable([collect(1:10), collect(2:11), NullableArray(3:12)], [:x, :y, :z]) + DataTable([collect(1:10), collect(2:11), collect(3:12)], [:x, :y, :z]) 
@test join(dtnull, dt, on = :x) == - DataTable([NullableArray(1:10), NullableArray(3:12), NullableArray(2:11)], [:x, :z, :y]) + DataTable([collect(1:10), collect(3:12), collect(2:11)], [:x, :z, :y]) + + @testset "complete set of joins" begin + small = DataTable(id = [1, 3, 5], fid = [1.0, 3.0, 5.0]) + large = DataTable(id = [0, 1, 2, 3, 4], fid = [0.0, 1.0, 2.0, 3.0, 4.0]) + N = Nullable() + + @test join(small, large, kind=:cross) == DataTable(id = repeat([1, 3, 5], inner=5), + fid = repeat([1.0, 3.0, 5.0], inner=5), + id_1 = repeat([0, 1, 2, 3, 4], outer=3), + fid_1 = repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)) + # id + @test join(small, large, on=:id, kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0], + fid_1 = [1.0, 3.0]) + @test join(small, large, on=:id, kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0], + fid_1 = [1.0, 3.0, N])) + @test join(small, large, on=:id, kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], + fid = [1.0, 3.0, N, N, N], + fid_1 = [1.0, 3.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=:id, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], + fid = [1.0, 3.0, 5.0, N, N, N], + fid_1 = [1.0, 3.0, N, 0.0, 2.0, 4.0])) + @test join(small, large, on=:id, kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=:id, kind=:anti) == DataTable(id = 5, fid = 5.0) + + # fid + @test join(small, large, on=:fid, kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0], + id_1 = [1, 3]) + @test join(small, large, on=:fid, kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0], + id_1 = [1, 3, N])) + @test join(small, large, on=:fid, kind=:right) == nullify!(DataTable(id = [1, 3, N, N, N], + fid = [1.0, 3.0, 0.0, 2.0, 4.0], + id_1 = [1, 3, 0, 2, 4])) + @test join(small, large, on=:fid, kind=:outer) == nullify!(DataTable(id = [1, 3, 5, N, N, N], + fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0], + id_1 = [1, 3, N, 0, 2, 4])) + @test join(small, large, on=:fid, 
kind=:semi) == DataTable(id = [1, 3], fid = [1.0, 3.0]) + @test join(small, large, on=:fid, kind=:anti) == DataTable(id = 5, fid = 5.0) + + # both + @test join(small, large, on=[:id, :fid], kind=:inner) == DataTable(id = [1, 3], + fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:left) == nullify!(DataTable(id = [1, 3, 5], + fid = [1.0, 3.0, 5.0])) + @test join(small, large, on=[:id, :fid], kind=:right) == nullify!(DataTable(id = [1, 3, 0, 2, 4], + fid = [1.0, 3.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=[:id, :fid], kind=:outer) == nullify!(DataTable(id = [1, 3, 5, 0, 2, 4], + fid = [1.0, 3.0, 5.0, 0.0, 2.0, 4.0])) + @test join(small, large, on=[:id, :fid], kind=:semi) == DataTable(id = [1, 3], + fid = [1.0, 3.0]) + @test join(small, large, on=[:id, :fid], kind=:anti) == DataTable(id = 5, + fid = 5.0) + end end