From 8dac8c578032428f0b1018f2c59125930b6738e7 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sat, 25 Nov 2017 19:54:55 +0100 Subject: [PATCH 1/4] switch to DataFrames 0.11 - switch to Missings.jl/CategoricalArrays.jl - add jlvec() methods handling conversion logic - fix conversion of RLogicalVector into Vector{Bool} - remove DataArrays.jl dependency --- NEWS.md | 11 +++++ REQUIRE | 5 ++- src/RData.jl | 2 +- src/convert.jl | 116 +++++++++++++++++++++++++++++++++---------------- test/RDA.jl | 43 +++++++++--------- 5 files changed, 116 insertions(+), 61 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71406f8..0d007e9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +## RData v0.3.0 Release Notes + +Updated to DataFrames v0.11, switched from [DataArrays](https://github.com/JuliaData/DataArrays.jl) to [Missings](https://github.com/JuliaData/Missings.jl) and [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl). + +##### Changes +* updated to DataFrames v0.11 [#28] +* switched from `DataVector` to `Vector{Union{T,Missing}}` for NAs [#28] +* R factors converted into `CategoricalVector` (instead of `PooledDataArray`) [#28] + +[#28]: https://github.com/JuliaStats/RData.jl/issues/28 + ## RData v0.2.0 Release Notes Updated to Julia v0.6 (older versions not supported). 
diff --git a/REQUIRE b/REQUIRE index 29030fa..f9e2ce0 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,5 +1,6 @@ julia 0.6 -DataFrames 0.9 -DataArrays 0.4 +DataFrames 0.11 +Missings 0.2 +CategoricalArrays 0.3 FileIO 0.1.2 CodecZlib 0.4 diff --git a/src/RData.jl b/src/RData.jl index dc559ce..182cf7c 100644 --- a/src/RData.jl +++ b/src/RData.jl @@ -2,7 +2,7 @@ __precompile__() module RData -using DataFrames, DataArrays, CodecZlib, FileIO +using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO import DataFrames: identifier import FileIO: load diff --git a/src/convert.jl b/src/convert.jl index 217eb1a..fee7a9c 100644 --- a/src/convert.jl +++ b/src/convert.jl @@ -3,83 +3,125 @@ function Base.convert(::Type{Hash}, pl::RPairList) res = Hash() - for i in 1:length(pl.items) - setindex!(res, pl.items[i], pl.tags[i]) + for i in eachindex(pl.items) + @inbounds setindex!(res, pl.items[i], pl.tags[i]) end res end ############################################################################## ## -## Conversion of intermediate R objects into DataArray and DataFrame objects +## Conversion of intermediate R objects into Vector{T} and DataFrame objects ## ############################################################################## -namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32) -namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32) -namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data))) +isna(x::Int32) = x == R_NA_INT32 +isna(x::Float64) = isna_float64(reinterpret(UInt64, x)) # if re or im is NA, the whole complex number is NA -# FIXME avoid temporary Vector{Bool} -namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)]) -namask(rv::RNullableVector) = rv.na +isna(x::Complex128) = isna(real(x)) || isna(imag(x)) -DataArrays.data(rv::RVEC) = DataArray(rv.data, namask(rv)) +# convert R vector into Vector holding elements of type T +# if force_missing is true, 
the result is always Vector{Union{T,Missing}}, +# otherwise it's Vector{T} if `rv` doesn't contain NAs +function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T + anyna = any(isna, rv.data) + if force_missing || anyna + res = convert(Vector{Union{T,Missing}}, rv.data) + if anyna + @inbounds for (i,x) in enumerate(rv.data) + isna(x) && (res[i] = missing) + end + end + return res + else + return convert(Vector{T}, rv.data) + end +end + +# convert R nullable vector (has an explicit NA mask) into Vector{T[?]} +function jlvec(::Type{T}, rv::RNullableVector{R}, force_missing::Bool=true) where {T, R} + anyna = any(rv.na) + if force_missing || anyna + res = convert(Vector{Union{T,Missing}}, rv.data) + anyna && @inbounds res[rv.na] = missing + return res + else + return convert(Vector{T}, rv.data) + end +end + +# convert R vector into Vector of appropriate type +jlvec(rv::RVEC, force_missing::Bool=true) = jlvec(eltype(rv.data), rv, force_missing) -function DataArrays.data(ri::RIntegerVector) - if !isfactor(ri) return DataArray(ri.data, namask(ri)) end - # convert factor into PooledDataArray - pool = getattr(ri, "levels", emptystrvec) - sz = length(pool) +# convert R logical vector (uses Int32 to store values) into Vector{Bool[?]} +function jlvec(rl::RLogicalVector, force_missing::Bool=true) + anyna = any(isna, rl.data) + if force_missing || anyna + return Union{Bool,Missing}[ifelse(isna(x), missing, x != 0) for x in rl.data] + else + return Bool[x != 0 for x in rl.data] + end +end + +# kernel method that converts Vector{Int32} into Vector{R} replacing R_NA_INT32 with 0 +# it's assumed that v fits into R +na2zero(::Type{R}, v::Vector{Int32}) where R = + [ifelse(!isna(x), x % R, zero(R)) for x in v] + +# convert to CategoricalVector{String[?]} if `ri` is a factor, +# or to Vector{Int32[?]} otherwise +function jlvec(ri::RIntegerVector, force_missing::Bool=true) + isfactor(ri) || return jlvec(eltype(ri.data), ri, force_missing) + + rlevels = getattr(ri, "levels", 
emptystrvec) + sz = length(rlevels) REFTYPE = sz <= typemax(UInt8) ? UInt8 : sz <= typemax(UInt16) ? UInt16 : sz <= typemax(UInt32) ? UInt32 : UInt64 - dd = ri.data - dd[namask(ri)] = 0 - refs = convert(Vector{REFTYPE}, dd) - return PooledDataArray(DataArrays.RefArray(refs), pool) + # FIXME set ordered flag + refs = na2zero(REFTYPE, ri.data) + anyna = any(iszero, refs) + pool = CategoricalPool{String, REFTYPE}(rlevels) + if force_missing || anyna + return CategoricalArray{Union{String, Missing}, 1}(refs, pool) + else + return CategoricalArray{String, 1}(refs, pool) + end end -# convert R logical vector (uses Int32 to store values) into DataVector{Bool} -DataArrays.data(rl::RLogicalVector) = - return DataArray(Bool[x != 0 for x in rl.data], namask(rl)) - function sexp2julia(rex::RSEXPREC) warn("Conversion of $(typeof(rex)) to Julia is not implemented") return nothing end function sexp2julia(rv::RVEC) - # FIXME dimnames - # FIXME forceDataArrays option to always convert to DataArray - nas = namask(rv) - hasna = any(nas) + # TODO dimnames? + # FIXME add force_missing option to control whether always convert to Union{T, Missing} + jv = jlvec(rv, false) if hasnames(rv) # if data has no NA, convert to simple Vector - return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv)) + return DictoVec(jv, names(rv)) else hasdims = hasdim(rv) if !hasdims && length(rv.data)==1 # scalar - # FIXME handle NAs - # if hasna - return rv.data[1] + return jv[1] elseif !hasdims # vectors - return hasna ? DataArray(rv.data, nas) : rv.data + return jv else # matrices and so on - dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...) - return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) : - reshape(rv.data, dims) + dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...) 
+ return reshape(jv, dims) end end end function sexp2julia(rl::RList) if isdataframe(rl) - # FIXME remove Any type assertion workaround - DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl))) + # FIXME add force_missing option to control whether always convert to Union{T, Missing} + DataFrame(Any[jlvec(col, false) for col in rl.data], identifier.(names(rl))) elseif hasnames(rl) DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl)) else diff --git a/test/RDA.jl b/test/RDA.jl index 900bc43..11b1b4b 100644 --- a/test/RDA.jl +++ b/test/RDA.jl @@ -6,10 +6,11 @@ module TestRDA # check for Float64 NA @testset "Detect R floating-point NAs" begin @test !RData.isna_float64(reinterpret(UInt64, 1.0)) - @test !RData.isna_float64(reinterpret(UInt64, NaN)) - @test !RData.isna_float64(reinterpret(UInt64, Inf)) - @test !RData.isna_float64(reinterpret(UInt64, -Inf)) - @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64)) + @test !RData.isna(1.0) + @test !RData.isna(NaN) + @test !RData.isna(Inf) + @test !RData.isna(-Inf) + @test RData.isna_float64(RData.R_NA_FLOAT64) # check that alternative NA is also recognized (#10) @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1))) end @@ -17,9 +18,9 @@ module TestRDA testdir = dirname(@__FILE__) @testset "Reading minimal RData" begin df = DataFrame(num = [1.1, 2.2]) - @test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df) - @test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df) - @test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df) + @test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df + @test load("$testdir/data/minimal.rda",convert=true)["df"] == df + @test load("$testdir/data/minimal_ascii.rda")["df"] == df end @testset "Conversion to Julia types" begin @@ -27,32 +28,32 @@ module TestRDA int = Int32[1, 2], logi = [true, false], chr = ["ab", "c"], - factor = 
pool(["ab", "c"]), - cplx = Complex128[1.1+0.5im, 1.0im]) + factor = categorical(["ab", "c"], true), + cplx = [1.1+0.5im, 1.0im]) rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"]) @test eltypes(rdf) == eltypes(df) - @test isequal(rdf, df) + @test rdf == df rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"]) @test eltypes(rdf_ascii) == eltypes(df) - @test isequal(rdf_ascii, df) + @test rdf_ascii == df end @testset "NAs conversion" begin - df = DataFrame(num = [1.1, 2.2], - int = Int32[1, 2], - logi = [true, false], - chr = ["ab", "c"], - factor = pool(["ab", "c"]), - cplx = Complex128[1.1+0.5im, 1.0im]) + df = DataFrame(num = Union{Float64, Missing}[1.1, 2.2], + int = Union{Int32, Missing}[1, 2], + logi = Union{Bool, Missing}[true, false], + chr = Union{String, Missing}["ab", "c"], + factor = categorical(Union{String, Missing}["ab", "c"], true), + cplx = Union{Complex128, Missing}[1.1+0.5im, 1.0im]) - df[2, :] = NA + df[2, :] = missing append!(df, df[2, :]) df[3, :num] = NaN - df[:, :cplx] = @data [NA, Complex128(1,NaN), NaN] + df[:, :cplx] = [missing, Complex128(1,NaN), NaN] @test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df) # ASCII format saves NaN as NA - df[3, :num] = NA - df[:, :cplx] = @data [NA, NA, NA] + df[3, :num] = missing + df[:, :cplx] = missing @test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df) end From c8ef95396d61d47ee5c01f7dcc899b69ebf487c0 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 26 Nov 2017 00:23:07 +0100 Subject: [PATCH 2/4] remove v0.5-isms from ctors (::Type{XXX})(...) -> XXX(...) --- src/io/ASCIIIO.jl | 2 +- src/io/NativeIO.jl | 3 ++- src/io/XDRIO.jl | 3 ++- src/sxtypes.jl | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/io/ASCIIIO.jl b/src/io/ASCIIIO.jl index cda4e61..15b72ad 100644 --- a/src/io/ASCIIIO.jl +++ b/src/io/ASCIIIO.jl @@ -4,7 +4,7 @@ ASCII RData format IO stream wrapper. 
struct ASCIIIO{T<:IO} <: RDAIO sub::T # underlying IO stream - (::Type{ASCIIIO})(io::T) where {T<:IO} = new{T}(io) + ASCIIIO(io::T) where {T<:IO} = new{T}(io) end readint32(io::ASCIIIO) = parse(Int32, readline(io.sub)) diff --git a/src/io/NativeIO.jl b/src/io/NativeIO.jl index 82ea4d6..b289411 100644 --- a/src/io/NativeIO.jl +++ b/src/io/NativeIO.jl @@ -5,5 +5,6 @@ TODO write readers """ struct NativeIO{T<:IO} <: RDAIO sub::T # underlying IO stream - (::Type{NativeIO})(io::T) where {T<:IO} = new{T}(io) + + NativeIO(io::T) where {T<:IO} = new{T}(io) end diff --git a/src/io/XDRIO.jl b/src/io/XDRIO.jl index 5c93498..9066b27 100644 --- a/src/io/XDRIO.jl +++ b/src/io/XDRIO.jl @@ -4,7 +4,8 @@ XDR (machine-independent binary) RData format IO stream wrapper. struct XDRIO{T<:IO} <: RDAIO sub::T # underlying IO stream buf::Vector{UInt8} # buffer for strings - (::Type{XDRIO})(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024)) + + XDRIO(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024)) end readint32(io::XDRIO) = ntoh(read(io.sub, Int32)) diff --git a/src/sxtypes.jl b/src/sxtypes.jl index d049aa5..a9bbe66 100644 --- a/src/sxtypes.jl +++ b/src/sxtypes.jl @@ -112,7 +112,7 @@ struct RVector{T, S} <: RVEC{T, S} data::Vector{T} attr::Hash # collection of R object attributes - (::Type{RVector{T,S}})(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} = + RVector{T,S}(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} = new{T,S}(v, attr) end From 400834c6eb6960b396dcf7cf138755d411e042c6 Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 26 Nov 2017 00:58:19 +0100 Subject: [PATCH 3/4] readfloat[orNA](io): avoid reinterpret() reinterpret() returns ReinterpretArray in 0.7 instead of Vector, whereas RVector{} only accepts Vector --- src/io/XDRIO.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/io/XDRIO.jl b/src/io/XDRIO.jl index 9066b27..3885a00 100644 --- a/src/io/XDRIO.jl +++ b/src/io/XDRIO.jl @@ -10,7 +10,7 @@ end readint32(io::XDRIO) = 
ntoh(read(io.sub, Int32)) readuint32(io::XDRIO) = ntoh(read(io.sub, UInt32)) -readfloat64(io::XDRIO) = reinterpret(Float64, ntoh(read(io.sub, Int64))) +readfloat64(io::XDRIO) = ntoh(read(io.sub, Float64)) readintorNA(io::XDRIO) = readint32(io) function readintorNA(io::XDRIO, n::RVecLength) @@ -22,8 +22,8 @@ end # R's NA is silently converted to NaN when the value is loaded in the register(?) #readfloatorNA(io::XDRIO) = readfloat64(io) function readfloatorNA(io::XDRIO, n::RVecLength) - v = read(io.sub, UInt64, n) - reinterpret(Float64, map!(ntoh, v, v)) + v = read(io.sub, Float64, n) + map!(ntoh, v, v) end readuint8(io::XDRIO, n::RVecLength) = read(io.sub, UInt8, n) From 2168169ec1496f559961a3569551b4702adf9e7e Mon Sep 17 00:00:00 2001 From: Alexey Stukalov Date: Sun, 26 Nov 2017 01:00:41 +0100 Subject: [PATCH 4/4] add readfloatorNA!(io, vec::AbstractVector) allows to fix the result of readcomplex() from ReinterpretArray to Vector on 0.7 --- src/io/ASCIIIO.jl | 15 ++++++++------- src/io/XDRIO.jl | 5 +++++ src/readers.jl | 5 +++-- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/io/ASCIIIO.jl b/src/io/ASCIIIO.jl index 15b72ad..a9f9453 100644 --- a/src/io/ASCIIIO.jl +++ b/src/io/ASCIIIO.jl @@ -24,20 +24,21 @@ readintorNA(io::ASCIIIO, n::RVecLength) = Int32[readintorNA(io) for i in 1:n] # str == R_NA_STRING ? 
R_NA_FLOAT64 : parse(Float64, str) #end -function readfloatorNA(io::ASCIIIO, n::RVecLength) - res = Vector{Float64}(n) - res_uint = reinterpret(UInt64, res) # alias of res for setting NA - @inbounds for i in 1:n +function readfloatorNA!(io::ASCIIIO, v::AbstractVector{Float64}) + v_uint = reinterpret(UInt64, v) # alias of v for setting NA + @inbounds for i in eachindex(v) str = chomp(readline(io.sub)) if str != R_NA_STRING - res[i] = parse(Float64, str) + v[i] = parse(Float64, str) else - res_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5 + v_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5 end end - res + v end +readfloatorNA(io::ASCIIIO, n::RVecLength) = readfloatorNA!(io, Vector{Float64}(n)) + readuint8(io::ASCIIIO, n::RVecLength) = UInt8[hex2bytes(chomp(readline(io.sub)))[1] for i in 1:n] # FIXME optimize for speed function readnchars(io::ASCIIIO, n::Int32) # reads N bytes-sized string diff --git a/src/io/XDRIO.jl b/src/io/XDRIO.jl index 3885a00..f8c2dc3 100644 --- a/src/io/XDRIO.jl +++ b/src/io/XDRIO.jl @@ -26,6 +26,11 @@ function readfloatorNA(io::XDRIO, n::RVecLength) map!(ntoh, v, v) end +function readfloatorNA!(io::XDRIO, v::AbstractVector{Float64}) + readbytes!(io.sub, reinterpret(UInt8, v)) + map!(ntoh, v, v) +end + readuint8(io::XDRIO, n::RVecLength) = read(io.sub, UInt8, n) function readnchars(io::XDRIO, n::Int32) # a single character string diff --git a/src/readers.jl b/src/readers.jl index d0e030e..feeea06 100644 --- a/src/readers.jl +++ b/src/readers.jl @@ -38,8 +38,9 @@ end function readcomplex(ctx::RDAContext, fl::RDATag) @assert sxtype(fl) == CPLXSXP n = readlength(ctx.io) - RComplexVector(reinterpret(Complex128, readfloatorNA(ctx.io, 2n)), - readattrs(ctx, fl)) + v = Vector{Complex128}(n) + readfloatorNA!(ctx.io, reinterpret(Float64, v)) + RComplexVector(v, readattrs(ctx, fl)) end function readstring(ctx::RDAContext, fl::RDATag)