incorporate edits suggested during review

JuliaData · Mar 13, 2017 · 2c95f13 · nalimilan · Mar 15, 2017 · nalimilan
1 parent f5a53a1
commit 2c95f13
Show file tree

Hide file tree

Showing 8 changed files with 107 additions and 64 deletions.
diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl
@@ -777,6 +777,7 @@ end
 
 Convert columns with a `Nullable` element type without any null values
 to a non-`Nullable` equivalent array type. The table `dt` is modified in place.
+`NullableVectors` are aliased to their `values` field.
 
 # Examples
 
@@ -805,12 +806,12 @@ julia> eltypes(dt)
  Int64
 ```
 
-See also [`denullify`](@ref) & [`nullify!`](@ref).
+See also [`denullify`](@ref) and [`nullify!`](@ref).
 """
 function denullify!(dt::AbstractDataTable)
     for i in 1:size(dt,2)
         if !anynull(dt[i])
-            dt[i] = dropnull(dt[i])
+            dt[i] = dropnull!(dt[i])
         end
     end
     dt
@@ -889,11 +890,14 @@ See also [`nullify`](@ref) & [`denullify!`](@ref).
 """
 function nullify!(dt::AbstractDataTable)
+    # Replace every column in place with the result of `nullify` on it;
+    # the conversion applied is selected by dispatch on the column's array type.
     for i in 1:size(dt,2)
-        dt[i] = NullableArray(dt[i])
+        dt[i] = nullify(dt[i])
     end
+    # Return the same (mutated) table.
     dt
 end
 
+# Column-level conversion used by `nullify!`: convert an array to a
+# `NullableArray`.
+function nullify(x::AbstractArray)
+    return convert(NullableArray, x)
+end
+
+# Categorical arrays are converted to a `NullableCategoricalArray` instead.
+function nullify(x::AbstractCategoricalArray)
+    return convert(NullableCategoricalArray, x)
+end
+
 """
     nullify(dt::AbstractDataTable)
 

diff --git a/src/abstractdatatable/io.jl b/src/abstractdatatable/io.jl
@@ -45,7 +45,7 @@ function printtable(io::IO,
             if !isnull(dt[j][i])
                 if ! (etypes[j] <: Real)
                     print(io, quotemark)
-                    x = isa(dt[i, j], Nullable) ? get(dt[i, j]) : dt[i, j]
+                    x = isa(dt[i, j], Nullable) ? _unsafe_get(dt[i, j]) : dt[i, j]
                     escapedprint(io, x, quotestr)
                     print(io, quotemark)
                 else

diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl
@@ -2,6 +2,19 @@
 ## Join / merge
 ##
 
+# Nullable analogue of `similar`: allocate a nullable array with the requested
+# dimensions, with the element type taken from the input array.
+function similar_nullable{T}(dv::AbstractArray{T},
+                             dims::Union{Int, Tuple{Vararg{Int}}})
+    return NullableArray(T, dims)
+end
+
+# Element type is already `Nullable`: allocate with the wrapped type `eltype(T)`.
+function similar_nullable{T<:Nullable}(dv::AbstractArray{T},
+                                       dims::Union{Int, Tuple{Vararg{Int}}})
+    return NullableArray(eltype(T), dims)
+end
+
+# Categorical input: allocate a `NullableCategoricalArray` instead.
+function similar_nullable{T,R}(dv::CategoricalArray{T,R},
+                               dims::Union{Int, Tuple{Vararg{Int}}})
+    return NullableCategoricalArray(T, dims)
+end
+
+# Table method: one `similar_nullable` column per source column, paired with
+# a copy of the source table's index.
+function similar_nullable(dt::AbstractDataTable, dims::Int)
+    newcols = Any[similar_nullable(col, dims) for col in columns(dt)]
+    return DataTable(newcols, copy(index(dt)))
+end
+
 # helper structure for DataTables joining
 immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable}
     dtl::DT1
@@ -64,9 +77,9 @@ function compose_joined_table(joiner::DataTableJoiner,
     end
     all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
     resizelen = length(all_orig_right_ixs)+length(leftonly_ixs)
-    rightcols = Any[length(col[all_orig_right_ixs]) >= resizelen ?
-                               resize!(col[all_orig_right_ixs], resizelen)[right_perm] :
-                               NullableArray(vcat(col[all_orig_right_ixs], fill(Nullable(), resizelen - length(col[all_orig_right_ixs]))))[right_perm]
+    rightcols = Any[length(all_orig_right_ixs) >= resizelen ?
+                       resize!(col[all_orig_right_ixs], resizelen)[right_perm] :
+                       copy!(similar_nullable(col[all_orig_right_ixs], resizelen), col[all_orig_right_ixs])[right_perm]
                     for col in columns(dtr_noon)]
     right_dt = DataTable(rightcols, names(dtr_noon))
     # merge left and right parts of the joined table

diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl
@@ -204,14 +204,19 @@ function unstack(dt::AbstractDataTable, rowkey::Int, colkey::Int, value::Int)
     end
     payload = DataTable(Any[NullableVector{T}(Nrow) for i in 1:Ncol],
                         map(Symbol, levels(keycol)))
+    nowarning = true
     for k in 1:nrow(dt)
         j = Int(CategoricalArrays.order(keycol.pool)[keycol.refs[k]])
         i = Int(CategoricalArrays.order(refkeycol.pool)[refkeycol.refs[k]])
         if i > 0 && j > 0
+            if nowarning && !isnull(payload[j][i])
+                warn("Duplicate entries in unstack.")
+                nowarning = false
+            end
             payload[j][i]  = valuecol[k]
         end
     end
-    denullify!(insert!(payload, 1, levels(refkeycol), _names(dt)[rowkey]))
+    denullify!(insert!(payload, 1, NullableArray(levels(refkeycol)), _names(dt)[rowkey]))
 end
+# Generic-key method: resolve `rowkey`, `colkey` and `value` through
+# `index(dt)` and forward the results to `unstack`.
 unstack(dt::AbstractDataTable, rowkey, colkey, value) =
     unstack(dt, index(dt)[rowkey], index(dt)[colkey], index(dt)[value])

diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl
@@ -77,25 +77,42 @@ type DataTable <: AbstractDataTable
         if length(columns) == length(colindex) == 0
             return new(Vector{Any}(0), Index())
         elseif length(columns) != length(colindex)
-            throw(DimensionMismatch("Number of columns and column names are different"))
+            throw(DimensionMismatch("Number of columns ($(length(columns))) and column names ($(length(colindex))) are not equal"))
         end
+        # do we allow people assigning arrays to columns now?
+        # make sure that doesn't work
+        # can use !get(size(c, 2), 0)
         lengths = length.(columns)
         minlen, maxlen = extrema(lengths)
         if minlen == 0 && maxlen == 0
             return new(columns, colindex)
-        elseif (minlen == 0 && maxlen > 0) || any(x -> x != 0, mod(maxlen, lengths))
-            throw(DimensionMismatch("Incompatible lengths of arguments"))
-        else
-            for i in 1:length(columns)
-                if isa(columns[i], Range)
-                    columns[i] = collect(columns[i])
+        elseif minlen != maxlen
+            # recycle scalars
+            if minlen == 1 && maxlen > 1
+                indices = find(lengths .== minlen)
+                for i in indices
+                    if !(typeof(columns[i]) <: AbstractArray)
+                        columns[i] = fill(columns[i], maxlen)
+                        lengths[i] = maxlen
+                    end
                 end
-                repeats = div(maxlen, length(columns[i]))
-                if repeats == 1 && !(typeof(columns[i]) <: AbstractVector)
-                    columns[i] = [columns[i]]
-                elseif repeats !== 1
-                    columns[i] = isa(columns[i], Array) ? repeat(columns[i], outer=repeats) : fill(columns[i], repeats)
+            end
+            # After scalar recycling all columns must share one length; otherwise
+            # report every distinct length together with the columns that have it.
+            uniques = unique(lengths)
+            if length(uniques) != 1
+                estring = Vector{String}(length(uniques))
+                strnames = string.(names(colindex))
+                for (i,u) in enumerate(uniques)
+                    indices = find(lengths .== u)
+                    # Interpolate this group's own length `u`; `lengths[1]` would
+                    # print the first column's length for every group.
+                    estring[i] = "column length ($(u)) for column(s) ($(join(strnames[indices], ", ")))"
+                end
+                throw(DimensionMismatch(join(estring, " is incompatible with ")))
+            end
+        end
+        for (i,c) in enumerate(columns)
+            if isa(c, Range)
+                columns[i] = collect(c)
+            elseif !isa(c, AbstractVector)
+                columns[i] =  size(c, 2) > 1 ? reshape(c, length(c)) : [c]
             end
         end
         return new(columns, colindex)
@@ -106,14 +123,18 @@ function DataTable(; kwargs...)
     if length(kwargs) == 0
         return DataTable(Any[], Index())
     end
-    columns = Any[v for (k,v) in kwargs]
-    colindex = DataTables.Index([k for (k,v) in kwargs])
-    DataTable(columns, colindex)
+    colnames = Vector{Symbol}(length(kwargs))
+    columns = Vector{Any}(length(kwargs))
+    for (i,(k,v)) in enumerate(kwargs)
+        colnames[i] = Symbol(k)
+        columns[i] = v
+    end
+    DataTable(columns, Index(colnames))
 end
 
 function DataTable(columns::AbstractVector,
-                   cnames::AbstractVector{Symbol} = gennames(length(columns)))
-    return DataTable(convert(Vector{Any}, columns), Index(convert(Vector{Symbol}, cnames)))
 res = Array{Symbol}(n) 
 res = Array{Symbol}(n) 
+                   cnames::Vector{Symbol} = gennames(length(columns)))
+    return DataTable(convert(Vector{Any}, columns), Index(cnames))
 end
 
 
@@ -128,37 +149,40 @@ function DataTable(t::Type, nrows::Integer, ncols::Integer)
 end
 
 # Initialize an empty DataTable with specific eltypes and names
-function DataTable(column_eltypes::Vector, cnames::Vector, nrows::Integer)
+function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol}, nrows::Integer)
+    # One column of `nrows` elements is allocated per entry of `column_eltypes`.
     p = length(column_eltypes)
     columns = Vector{Any}(p)
     for j in 1:p
-        columns[j] = Vector{column_eltypes[j]}(nrows)
+        T = column_eltypes[j]
+        # A `Nullable` eltype gets a `NullableArray` parameterized by the wrapped
+        # type `eltype(T)`; any other eltype gets a plain `Vector{T}`.
+        columns[j] = T <: Nullable ? NullableArray{eltype(T)}(nrows) : Vector{T}(nrows)
     end
     return DataTable(columns, Index(cnames))
 end
 # Initialize an empty DataTable with specific eltypes and names
 # and whether a nominal array should be created
-function DataTable(column_eltypes::Vector, cnames::Vector,
+function DataTable(column_eltypes::Vector{DataType}, cnames::Vector{Symbol},
                    nominal::Vector{Bool}, nrows::Integer)
+    # One column of `nrows` elements is allocated per entry of `column_eltypes`;
+    # `nominal[j]` selects a categorical container for column `j`.
     p = length(column_eltypes)
     columns = Vector{Any}(p)
     for j in 1:p
-      if nominal[j]
-        columns[j] = CategoricalVector{column_eltypes[j]}(nrows)
-      else
-        columns[j] = Vector{column_eltypes[j]}(nrows)
-      end
+        T = column_eltypes[j]
+        # For a `Nullable` eltype the nullable containers are parameterized by
+        # the wrapped type `eltype(T)`, not by `T` itself, consistent with the
+        # `(column_eltypes, cnames, nrows)` method.
+        if nominal[j]
+            columns[j] = T <: Nullable ? NullableCategoricalArray{eltype(T)}(nrows) :
+                                         CategoricalVector{T}(nrows)
+        else
+            columns[j] = T <: Nullable ? NullableArray{eltype(T)}(nrows) :
+                                         Vector{T}(nrows)
+        end
     end
     return DataTable(columns, Index(cnames))
 end
 
 # Initialize an empty DataTable with specific eltypes
-function DataTable(column_eltypes::Vector, nrows::Integer)
+function DataTable(column_eltypes::Vector{DataType}, nrows::Integer)
+    # One column of `nrows` elements is allocated per entry of `column_eltypes`;
+    # column names are generated with `gennames`.
     p = length(column_eltypes)
     columns = Vector{Any}(p)
     cnames = gennames(p)
     for j in 1:p
-        columns[j] = Vector{column_eltypes[j]}(nrows)
+        T = column_eltypes[j]
+        # For a `Nullable` eltype the `NullableArray` is parameterized by the
+        # wrapped type `eltype(T)`, not by `T` itself, so the result matches the
+        # `(column_eltypes, cnames, nrows)` method.
+        columns[j] = T <: Nullable ? NullableArray{eltype(T)}(nrows) : Vector{T}(nrows)
     end
     return DataTable(columns, Index(cnames))
 end
@@ -806,8 +830,10 @@ function Base.convert(::Type{DataTable}, A::Matrix)
 end
 
 function Base.convert(::Type{DataTable}, d::Associative)
-    colnames = collect(keys(d))
-    isa(d, Dict) && sort!(colnames)
+    colnames = keys(d)
+    if isa(d, Dict)
+        colnames = sort!(collect(colnames))
+    end
     colindex = Index([Symbol(k) for k in colnames])
     columns = Any[d[c] for c in colnames]
     DataTable(columns, colindex)

diff --git a/test/cat.jl b/test/cat.jl
@@ -72,14 +72,14 @@ module TestCat
     dt[1:2, 1:2] = [3,2]
     dt[[true,false,false,true], 2:3] = [2,3]
 
-    vcat([])
-    vcat(null_dt)
-    vcat(null_dt, null_dt)
-    vcat(null_dt, dt)
-    vcat(dt, null_dt)
-    vcat(dt, dt)
-    vcat(dt, dt, dt)
-    @test vcat(DataTable()) == DataTable()
+    @test vcat(null_dt) == DataTable()
+    @test vcat(null_dt, null_dt) == DataTable()
+    @test vcat(null_dt, dt) == dt
+    @test vcat(dt, null_dt) == dt
+    @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt)), (Float64, Float64, Int)))
+    @test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2))
+    @test all(map((x,y) -> x <: y, eltypes(vcat(dt, dt, dt)), (Float64, Float64, Int)))
+    @test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2))
 
     alt_dt = deepcopy(dt)
     vcat(dt, alt_dt)
@@ -94,21 +94,16 @@ module TestCat
     @test isequal(dtr, [dt4; dt4])
 
     # Eltype promotion
-    # Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
-    if VERSION >= v"0.5.0-dev"
-        @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64]
-        @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]
-    else
-        @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Any]
-        @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}]
-    end
+    @test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Float64]
+    @test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]
 
     # Minimal container type promotion
     dta = DataTable(a = CategoricalArray([1, 2, 2]))
     dtb = DataTable(a = CategoricalArray([2, 3, 4]))
     dtc = DataTable(a = NullableArray([2, 3, 4]))
     dtd = DataTable(Any[2:4], [:a])
     dtab = vcat(dta, dtb)
+    dtac = vcat(nullify(dta), dtc)
     @test isequal(dtab[:a], [1, 2, 2, 2, 3, 4])
     @test isa(dtab[:a], CategoricalVector{Int})
     dc = vcat(dtd, dtc)

diff --git a/test/constructors.jl b/test/constructors.jl
@@ -18,8 +18,6 @@ module TestConstructors
 
     @test isequal(dt, DataTable(Any[NullableCategoricalVector(zeros(3)),
                                     NullableCategoricalVector(ones(3))]))
-    @test !isequal(dt, DataTable(x1 = [0.0, 0.0, 0.0],
-                                 x2 = [1.0, 1.0, 1.0]))
 
     dt2 = convert(DataTable, [0.0 1.0;
                               0.0 1.0;
@@ -28,19 +26,21 @@ module TestConstructors
     @test isequal(dt[:x1], NullableArray(dt2[:x1]))
     @test isequal(dt[:x2], NullableArray(dt2[:x2]))
 
-    @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]),
-                                x2 = NullableCategoricalVector([1.0, 1.0, 1.0])))
-    @test isequal(dt, DataTable(x1 = NullableCategoricalVector([0.0, 0.0, 0.0]),
-                                x2 = NullableCategoricalVector([1.0, 1.0, 1.0]),
+    @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]),
+                                x2 = NullableArray([1.0, 1.0, 1.0])))
+    @test isequal(dt, DataTable(x1 = NullableArray([0.0, 0.0, 0.0]),
+                                x2 = NullableArray([1.0, 1.0, 1.0]),
                                 x3 = [2.0, 2.0, 2.0])[[:x1, :x2]])
 
     dt = DataTable(Int, 2, 2)
     @test size(dt) == (2, 2)
     @test eltypes(dt) == [Int, Int]
 
-    dt = DataTable([Int, Float64], [:x1, :x2], 2)
+    dt = DataTable([Nullable{Int}, Nullable{Float64}], [:x1, :x2], 2)
     @test size(dt) == (2, 2)
-    @test eltypes(dt) == [Int, Float64]
+    @test eltypes(dt) == [Nullable{Int}, Nullable{Float64}]
+
+    @test isequal(dt, DataTable([Nullable{Int}, Nullable{Float64}], 2))
 
     @test_throws BoundsError SubDataTable(DataTable(A=1), 0)
     @test_throws BoundsError SubDataTable(DataTable(A=1), 0)
@@ -51,12 +51,12 @@ module TestConstructors
     @test DataTable(a=1, b=1:2) == DataTable(a=[1,1], b=[1,2])
 
     @testset "associative" begin
-        dt = DataTable(Dict(k => v for (k,v) in zip([:A, :B], [1:3, 4:6])))
+        dt = DataTable(Dict(:A => 1:3, :B => 4:6))
         @test dt == DataTable(A = 1:3, B = 4:6)
+        @test all(e -> e <: Int, eltypes(dt))
     end
 
     @testset "recyclers" begin
-        @test DataTable([collect(1:10), collect(1:20)], [:x, :y]) == DataTable(x = vcat(1:10, 1:10), y = 1:20)
         @test DataTable(a = 1:5, b = 1) == DataTable(a = collect(1:5), b = fill(1, 5))
         @test DataTable(a = 1, b = 1:5) == DataTable(a = fill(1, 5), b = collect(1:5))
     end

diff --git a/test/conversions.jl b/test/conversions.jl
@@ -73,7 +73,7 @@ module TestConversions
     @test isequal(dt[:b], b)
     @test isequal(dt[:c], c)
 
-    a = [1.0]
+    a = 1.0
     di = Dict("a"=>a, "b"=>b, "c"=>c)
     @test convert(DataTable,di)[:a] == [1.0, 1.0]