Skip to content

Commit

Permalink
Small improvements to missing value support (#1298)
Browse files Browse the repository at this point in the history
- change behavior of `similar` on DataFrames to not auto-enable missing
  value support and instead use same column eltypes as parent df
- add docstrings for `similar` and `allowmissing!`
- change behavior of `allowmissing!` on CategoricalArrays to preserve
  correct Array type (which is a CategoricalArray)
- add `similar` to docs
  • Loading branch information
cjprybol authored and nalimilan committed Dec 7, 2017
1 parent bda0dd2 commit 83323bb
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 9 deletions.
2 changes: 1 addition & 1 deletion REQUIRE
@@ -1,5 +1,5 @@
julia 0.6
Missings 0.2.1
Missings 0.2.3
CategoricalArrays 0.3.0
StatsBase 0.11.0
SortingAlgorithms
Expand Down
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Expand Up @@ -46,6 +46,7 @@ rename!
rename
show
showcols
similar
size
sort
sort!
Expand Down
13 changes: 11 additions & 2 deletions src/abstractdataframe/abstractdataframe.jl
Expand Up @@ -233,8 +233,17 @@ Base.ndims(::AbstractDataFrame) = 2
##
##############################################################################

# NOTE(review): this appears to be the pre-change side of the diff hunk (markers lost).
# It builds a new DataFrame with one column per column of `df`, each produced by
# `similar_missing(x, dims)`, and reuses a copy of `df`'s index for the column names.
Base.similar(df::AbstractDataFrame, dims::Int) =
DataFrame(Any[similar_missing(x, dims) for x in columns(df)], copy(index(df)))
"""
similar(df::DataFrame[, rows::Integer])
Create a new `DataFrame` with the same column names and column element types
as `df`. An optional second argument can be provided to request a number of rows
that is different than the number of rows present in `df`.
"""
function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1))
rows < 0 && throw(ArgumentError("the number of rows must be positive"))
DataFrame(Any[similar(x, rows) for x in columns(df)], copy(index(df)))
end

##############################################################################
##
Expand Down
20 changes: 19 additions & 1 deletion src/dataframe/dataframe.jl
Expand Up @@ -734,11 +734,29 @@ Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = h
## Missing values support
##
##############################################################################
"""
allowmissing!(df::DataFrame)
Convert all columns of a `df` from element type `T` to
`Union{T, Missing}` to support missing values.
allowmissing!(df::DataFrame, col::Union{Integer, Symbol})
Convert a single column of a `df` from element type `T` to
`Union{T, Missing}` to support missing values.
allowmissing!(df::DataFrame, cols::AbstractVector{<:Union{Integer, Symbol}})
Convert multiple columns of a `df` from element type `T` to
`Union{T, Missing}` to support missing values.
"""
function allowmissing! end

# Single-column method: replaces column `col` of `df` with a version whose element
# type admits `Missing`, then returns `df`.
function allowmissing!(df::DataFrame, col::ColumnIndex)
# NOTE(review): this hunk appears to show both sides of the change (diff markers lost).
# Presumably the pre-change line: always materializes a plain `Vector{Union{T, Missing}}`.
df[col] = Vector{Union{eltype(df[col]), Missing}}(df[col])
# Presumably the post-change line: delegates to `allowmissing`; per the commit message
# this keeps the column's array type (e.g. a CategoricalArray) - confirm against the repo.
df[col] = allowmissing(df[col])
df
end

function allowmissing!(df::DataFrame, cols::AbstractVector{<: ColumnIndex}=1:size(df, 2))
for col in cols
allowmissing!(df, col)
Expand Down
36 changes: 31 additions & 5 deletions test/dataframe.jl
Expand Up @@ -79,9 +79,12 @@ module TestDataFrame
b = Union{String, Missing}["b"],
c = CategoricalArray{Union{Float64, Missing}}([3.3]))
missingdf = DataFrame(a = missings(Int, 2),
b = missings(String, 2),
c = CategoricalArray{Union{Float64, Missing}}(2))
@test missingdf ≅ similar(df, 2)
b = missings(String, 2),
c = CategoricalArray{Union{Float64, Missing}}(2))
# https://github.com/JuliaData/Missings.jl/issues/66
# @test missingdf ≅ similar(df, 2)
@test typeof.(similar(df, 2).columns) == typeof.(missingdf.columns)
@test size(similar(df, 2)) == size(missingdf)

# Associative methods

Expand Down Expand Up @@ -439,12 +442,35 @@ module TestDataFrame
@test isa(df[1], Vector{Union{Int, Missing}})
@test !isa(df[2], Vector{Union{Int, Missing}})

df = DataFrame(Any[collect(1:10), collect(1:10)])
df = DataFrame(Any[collect(1:10), collect(1:10)])
allowmissing!(df, [1,2])
@test isa(df[1], Vector{Union{Int, Missing}}) && isa(df[2], Vector{Union{Int, Missing}})

df = DataFrame(Any[collect(1:10), collect(1:10)])
df = DataFrame(Any[collect(1:10), collect(1:10)])
allowmissing!(df)
@test isa(df[1], Vector{Union{Int, Missing}}) && isa(df[2], Vector{Union{Int, Missing}})

df = DataFrame(Any[CategoricalArray(1:10),
CategoricalArray(string.('a':'j'))])
allowmissing!(df)
@test all(issubtype.(typeof.(df.columns), CategoricalVector))
@test eltypes(df)[1] <: Union{CategoricalValue{Int}, Missing}
@test eltypes(df)[2] <: Union{CategoricalString, Missing}
end

# `similar` on a DataFrame must keep the column types of its argument, honour an
# explicit row count, and reject a negative row count with a fixed message.
@testset "similar" begin
    df = DataFrame(a = ["foo"], b = CategoricalArray(["foo"]),
                   c = [0.0], d = CategoricalArray([0.0]))
    expected_types = typeof.(df.columns)
    nrows, ncols = size(df)

    # Default row count: same column types and same shape as the source.
    @test typeof.(similar(df).columns) == expected_types
    @test size(similar(df)) == (nrows, ncols)

    # Explicit row count larger than the source.
    rows = nrows + 5
    @test size(similar(df, rows)) == (rows, ncols)
    @test typeof.(similar(df, rows).columns) == expected_types

    # A negative row count is an error with a pinned message.
    e = @test_throws ArgumentError similar(df, -1)
    @test e.value.msg == "the number of rows must be positive"
end
end

0 comments on commit 83323bb

Please sign in to comment.