diff --git a/REQUIRE b/REQUIRE index 91fbd05d2d..a7308986b1 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,5 +1,5 @@ julia 0.6 -Missings 0.2.1 +Missings 0.2.3 CategoricalArrays 0.3.0 StatsBase 0.11.0 SortingAlgorithms diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index ac1625e27e..61f40f9d03 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -46,6 +46,7 @@ rename! rename show showcols +similar size sort sort! diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 323a549482..598c6afe02 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -233,8 +233,18 @@ Base.ndims(::AbstractDataFrame) = 2 ## ############################################################################## -Base.similar(df::AbstractDataFrame, dims::Int) = - DataFrame(Any[similar_missing(x, dims) for x in columns(df)], copy(index(df))) +""" + similar(df::AbstractDataFrame[, rows::Integer]) + +Create a new `DataFrame` with the same column names and column element types +as `df`. An optional second argument can be provided to request a number of rows +that is different than the number of rows present in `df`. +Throw an `ArgumentError` if `rows` is negative. +""" +function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1)) + rows < 0 && throw(ArgumentError("the number of rows must be non-negative")) + DataFrame(Any[similar(x, rows) for x in columns(df)], copy(index(df))) +end ############################################################################## ## diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index c47959014d..13ca6b321c 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -734,11 +734,29 @@ Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) 
= h ## Missing values support ## ############################################################################## +""" + allowmissing!(df::DataFrame) + +Convert all columns of `df` from element type `T` to +`Union{T, Missing}` to support missing values. + + allowmissing!(df::DataFrame, col::Union{Integer, Symbol}) + +Convert a single column of `df` from element type `T` to +`Union{T, Missing}` to support missing values. + + allowmissing!(df::DataFrame, cols::AbstractVector{<:Union{Integer, Symbol}}) + +Convert multiple columns of `df` from element type `T` to +`Union{T, Missing}` to support missing values. +""" +function allowmissing! end function allowmissing!(df::DataFrame, col::ColumnIndex) - df[col] = Vector{Union{eltype(df[col]), Missing}}(df[col]) + df[col] = allowmissing(df[col]) df end + function allowmissing!(df::DataFrame, cols::AbstractVector{<: ColumnIndex}=1:size(df, 2)) for col in cols allowmissing!(df, col) diff --git a/test/dataframe.jl b/test/dataframe.jl index 51fad2946a..2990853c38 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -79,9 +79,12 @@ module TestDataFrame b = Union{String, Missing}["b"], c = CategoricalArray{Union{Float64, Missing}}([3.3])) missingdf = DataFrame(a = missings(Int, 2), - b = missings(String, 2), - c = CategoricalArray{Union{Float64, Missing}}(2)) - @test missingdf ≅ similar(df, 2) + b = missings(String, 2), + c = CategoricalArray{Union{Float64, Missing}}(2)) + # https://github.com/JuliaData/Missings.jl/issues/66 + # @test missingdf ≅ similar(df, 2) + @test typeof.(similar(df, 2).columns) == typeof.(missingdf.columns) + @test size(similar(df, 2)) == size(missingdf) # Associative methods @@ -439,12 +442,35 @@ module TestDataFrame @test isa(df[1], Vector{Union{Int, Missing}}) @test !isa(df[2], Vector{Union{Int, Missing}}) - df = DataFrame(Any[collect(1:10), collect(1:10)]) + df = DataFrame(Any[collect(1:10), collect(1:10)]) allowmissing!(df, [1,2]) @test isa(df[1], Vector{Union{Int, Missing}}) && isa(df[2], 
Vector{Union{Int, Missing}}) - df = DataFrame(Any[collect(1:10), collect(1:10)]) + df = DataFrame(Any[collect(1:10), collect(1:10)]) allowmissing!(df) @test isa(df[1], Vector{Union{Int, Missing}}) && isa(df[2], Vector{Union{Int, Missing}}) + + df = DataFrame(Any[CategoricalArray(1:10), + CategoricalArray(string.('a':'j'))]) + allowmissing!(df) + @test all(issubtype.(typeof.(df.columns), CategoricalVector)) + @test eltypes(df)[1] <: Union{CategoricalValue{Int}, Missing} + @test eltypes(df)[2] <: Union{CategoricalString, Missing} + end + + @testset "similar" begin + df = DataFrame(a = ["foo"], + b = CategoricalArray(["foo"]), + c = [0.0], + d = CategoricalArray([0.0])) + @test typeof.(similar(df).columns) == typeof.(df.columns) + @test size(similar(df)) == size(df) + + rows = size(df, 1) + 5 + @test size(similar(df, rows)) == (rows, size(df, 2)) + @test typeof.(similar(df, rows).columns) == typeof.(df.columns) + + e = @test_throws ArgumentError similar(df, -1) + @test e.value.msg == "the number of rows must be non-negative" end end