Skip to content

Commit

Permalink
Merge 2168169 into ebc1335
Browse files Browse the repository at this point in the history
  • Loading branch information
alyst committed Nov 26, 2017
2 parents ebc1335 + 2168169 commit ee96a6c
Show file tree
Hide file tree
Showing 10 changed files with 141 additions and 77 deletions.
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
## RData v0.3.0 Release Notes

Updated to DataFrames v0.11, switched from [DataArrays](https://github.com/JuliaData/DataArrays.jl) to [Missings](https://github.com/JuliaData/Missings.jl) and [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl).

##### Changes
* updated to DataFrames v0.11 [#28]
* switched from `DataVector` to `Vector{Union{T,Missing}}` for NAs [#28]
* R factors converted into `CategoricalVector` (instead of `PooledDataArray`) [#28]

[#28]: https://github.com/JuliaStats/RData.jl/issues/28

## RData v0.2.0 Release Notes

Updated to Julia v0.6 (older versions not supported).
Expand Down
5 changes: 3 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
julia 0.6
DataFrames 0.9
DataArrays 0.4
DataFrames 0.11
Missings 0.2
CategoricalArrays 0.3
FileIO 0.1.2
CodecZlib 0.4
2 changes: 1 addition & 1 deletion src/RData.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ __precompile__()

module RData

using DataFrames, DataArrays, CodecZlib, FileIO
using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO
import DataFrames: identifier
import FileIO: load

Expand Down
116 changes: 79 additions & 37 deletions src/convert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,83 +3,125 @@

# Convert an R pairlist into a `Hash`: every element of `pl.items` is stored
# under the tag found at the same position of `pl.tags`.
function Base.convert(::Type{Hash}, pl::RPairList)
res = Hash()
for i in 1:length(pl.items)
setindex!(res, pl.items[i], pl.tags[i])
for i in eachindex(pl.items)
# NOTE(review): @inbounds also covers pl.tags[i], while the indices come from
# pl.items only; assumes both vectors have the same length -- confirm
@inbounds setindex!(res, pl.items[i], pl.tags[i])
end
res
end

##############################################################################
##
## Conversion of intermediate R objects into DataArray and DataFrame objects
## Conversion of intermediate R objects into Vector{T} and DataFrame objects
##
##############################################################################

# namask(v): mask of the NA positions of an R vector
# (logical and integer vectors compare against the R_NA_INT32 sentinel)
namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32)
namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32)
namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data)))
# isna(x): scalar NA test; an Int32 is NA when it equals R_NA_INT32,
# a Float64 is NA when isna_float64 accepts its UInt64 bit pattern
isna(x::Int32) = x == R_NA_INT32
isna(x::Float64) = isna_float64(reinterpret(UInt64, x))
# if re or im is NA, the whole complex number is NA
# FIXME avoid temporary Vector{Bool}
namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)])
# nullable vectors carry their own mask, which is returned as is
namask(rv::RNullableVector) = rv.na
isna(x::Complex128) = isna(real(x)) || isna(imag(x))

# wrap the raw values of an R vector together with its NA mask into a DataArray
function DataArrays.data(rv::RVEC)
    return DataArray(rv.data, namask(rv))
end
# Convert the data of an R vector into a Julia vector with elements of type T.
# R NA values (as detected by `isna`) become `missing`.
# With `force_missing == true` the result is always Vector{Union{T,Missing}};
# otherwise that type is used only when `rv` actually contains NAs and
# a plain Vector{T} is returned when it does not.
function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
    src = rv.data
    hasna = any(isna, src)
    if !(force_missing || hasna)
        # no NAs and Missing was not requested: plain element type is enough
        return convert(Vector{T}, src)
    end
    out = convert(Vector{Union{T,Missing}}, src)
    if hasna
        # overwrite every NA sentinel with `missing`
        @inbounds for i in eachindex(src)
            if isna(src[i])
                out[i] = missing
            end
        end
    end
    return out
end

# Convert an R nullable vector into a Julia vector with elements of type T.
# NA positions are taken from the explicit mask `rv.na` rather than from
# sentinel values in the data. The result is Vector{Union{T,Missing}} when
# `force_missing` is true or the mask has any set element, Vector{T} otherwise.
function jlvec(::Type{T}, rv::RNullableVector{R}, force_missing::Bool=true) where {T, R}
    hasna = any(rv.na)
    if !force_missing && !hasna
        return convert(Vector{T}, rv.data)
    end
    out = convert(Vector{Union{T,Missing}}, rv.data)
    if hasna
        # blank out all masked positions at once
        @inbounds out[rv.na] = missing
    end
    return out
end

# Convert an R vector into a Julia vector, taking the element type
# from the R vector's own data.
function jlvec(rv::RVEC, force_missing::Bool=true)
    return jlvec(eltype(rv.data), rv, force_missing)
end

function DataArrays.data(ri::RIntegerVector)
if !isfactor(ri) return DataArray(ri.data, namask(ri)) end
# convert factor into PooledDataArray
pool = getattr(ri, "levels", emptystrvec)
sz = length(pool)
# Convert an R logical vector into a vector of Bool.
# The R data are Int32 values: any non-zero value maps to `true`, zero to
# `false`, and NA (see `isna`) to `missing`. The element type is
# Union{Bool,Missing} when `force_missing` is true or an NA is present,
# plain Bool otherwise.
function jlvec(rl::RLogicalVector, force_missing::Bool=true)
    vals = rl.data
    hasna = any(isna, vals)
    if !(force_missing || hasna)
        return Bool[v != 0 for v in vals]
    end
    return Union{Bool,Missing}[isna(v) ? missing : (v != 0) for v in vals]
end

# Kernel that maps a Vector{Int32} onto elements of type R:
# NA entries (R_NA_INT32) become zero(R), every other entry is reduced
# with `x % R`. The caller must make sure that the values of `v` fit into R.
function na2zero(::Type{R}, v::Vector{Int32}) where R
    return [isna(x) ? zero(R) : x % R for x in v]
end

# convert to CategoricalVector{String[?]} if `ri` is a factor,
# or to Vector{Int32[?]} otherwise
# (`force_missing` controls whether Missing is always part of the element type)
function jlvec(ri::RIntegerVector, force_missing::Bool=true)
# not a factor: plain conversion of the integer data
isfactor(ri) || return jlvec(eltype(ri.data), ri, force_missing)

# factor levels come from the "levels" attribute (emptystrvec if it is absent)
rlevels = getattr(ri, "levels", emptystrvec)
sz = length(rlevels)
# pick the narrowest unsigned integer type that can index all `sz` levels
REFTYPE = sz <= typemax(UInt8) ? UInt8 :
sz <= typemax(UInt16) ? UInt16 :
sz <= typemax(UInt32) ? UInt32 :
UInt64
# NOTE: dd is the same array as ri.data, so the zeroing below modifies ri in place
dd = ri.data
dd[namask(ri)] = 0
refs = convert(Vector{REFTYPE}, dd)
return PooledDataArray(DataArrays.RefArray(refs), pool)
# FIXME set ordered flag
# level codes with NA replaced by 0 (ri.data itself is left untouched),
# so a zero ref marks an NA element
refs = na2zero(REFTYPE, ri.data)
anyna = any(iszero, refs)
pool = CategoricalPool{String, REFTYPE}(rlevels)
# allow Missing in the element type only when requested or actually needed
if force_missing || anyna
return CategoricalArray{Union{String, Missing}, 1}(refs, pool)
else
return CategoricalArray{String, 1}(refs, pool)
end
end

# Convert an R logical vector (values stored as Int32) into DataVector{Bool}:
# non-zero values map to `true`, the NA mask is supplied by `namask`.
function DataArrays.data(rl::RLogicalVector)
    flags = Bool[x != 0 for x in rl.data]
    return DataArray(flags, namask(rl))
end

# Fallback for R objects without a dedicated conversion method:
# warns about the unsupported type and yields `nothing`.
sexp2julia(rex::RSEXPREC) =
    (warn("Conversion of $(typeof(rex)) to Julia is not implemented"); nothing)

# Convert an R vector into the closest Julia value:
# * a DictoVec keyed by names(rv) when the vector has names;
# * otherwise the single element when it has no dims and length 1,
#   the vector itself when it has no dims,
#   or an array reshaped according to the "dim" attribute.
function sexp2julia(rv::RVEC)
# FIXME dimnames
# FIXME forceDataArrays option to always convert to DataArray
nas = namask(rv)
hasna = any(nas)
# TODO dimnames?
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
# force_missing=false: Missing enters the element type only if NAs are present
jv = jlvec(rv, false)
if hasnames(rv)
# if data has no NA, convert to simple Vector
return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv))
return DictoVec(jv, names(rv))
else
hasdims = hasdim(rv)
if !hasdims && length(rv.data)==1
# scalar
# FIXME handle NAs
# if hasna
return rv.data[1]
return jv[1]
elseif !hasdims
# vectors
return hasna ? DataArray(rv.data, nas) : rv.data
return jv
else
# matrices and so on
dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...)
return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) :
reshape(rv.data, dims)
# the "dim" attribute is turned into a tuple of Int to serve as the array shape
dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...)
return reshape(jv, dims)
end
end
end

function sexp2julia(rl::RList)
if isdataframe(rl)
# FIXME remove Any type assertion workaround
DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl)))
# FIXME add force_missing option to control whether always convert to Union{T, Missing}
DataFrame(Any[jlvec(col, false) for col in rl.data], identifier.(names(rl)))
elseif hasnames(rl)
DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl))
else
Expand Down
17 changes: 9 additions & 8 deletions src/io/ASCIIIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ASCII RData format IO stream wrapper.
struct ASCIIIO{T<:IO} <: RDAIO
sub::T # underlying IO stream

# inner constructor: wraps `io`, taking the type parameter T from its type
(::Type{ASCIIIO})(io::T) where {T<:IO} = new{T}(io)
ASCIIIO(io::T) where {T<:IO} = new{T}(io)
end

readint32(io::ASCIIIO) = parse(Int32, readline(io.sub))
Expand All @@ -24,20 +24,21 @@ readintorNA(io::ASCIIIO, n::RVecLength) = Int32[readintorNA(io) for i in 1:n]
# str == R_NA_STRING ? R_NA_FLOAT64 : parse(Float64, str)
#end

# Read floating-point values from the ASCII stream, one value per line of text.
# A line equal to R_NA_STRING is stored as the R_NA_FLOAT64 bit pattern,
# any other line is parsed as Float64. Returns the filled vector.
function readfloatorNA(io::ASCIIIO, n::RVecLength)
res = Vector{Float64}(n)
res_uint = reinterpret(UInt64, res) # alias of res for setting NA
@inbounds for i in 1:n
function readfloatorNA!(io::ASCIIIO, v::AbstractVector{Float64})
v_uint = reinterpret(UInt64, v) # alias of v for setting NA
@inbounds for i in eachindex(v)
str = chomp(readline(io.sub))
if str != R_NA_STRING
res[i] = parse(Float64, str)
v[i] = parse(Float64, str)
else
# NA is written as raw bits through the UInt64 alias
res_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5
v_uint[i] = R_NA_FLOAT64 # see JuliaStats/RData.jl#5
end
end
res
v
end

# allocating variant: creates a vector of length n and fills it with readfloatorNA!
readfloatorNA(io::ASCIIIO, n::RVecLength) = readfloatorNA!(io, Vector{Float64}(n))

readuint8(io::ASCIIIO, n::RVecLength) = UInt8[hex2bytes(chomp(readline(io.sub)))[1] for i in 1:n] # FIXME optimize for speed

function readnchars(io::ASCIIIO, n::Int32) # reads N bytes-sized string
Expand Down
3 changes: 2 additions & 1 deletion src/io/NativeIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ TODO write readers
"""
struct NativeIO{T<:IO} <: RDAIO
sub::T # underlying IO stream
# inner constructor: wraps `io`, taking the type parameter T from its type
(::Type{NativeIO})(io::T) where {T<:IO} = new{T}(io)

NativeIO(io::T) where {T<:IO} = new{T}(io)
end
14 changes: 10 additions & 4 deletions src/io/XDRIO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ XDR (machine-independent binary) RData format IO stream wrapper.
struct XDRIO{T<:IO} <: RDAIO
sub::T # underlying IO stream
buf::Vector{UInt8} # buffer for strings
# inner constructor: wraps `io` and allocates a 1024-byte string buffer
(::Type{XDRIO})(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024))

XDRIO(io::T) where {T <: IO} = new{T}(io, Vector{UInt8}(1024))
end

# scalar readers: each value is read from the underlying stream and
# converted from network byte order to the host's with ntoh
readint32(io::XDRIO) = ntoh(read(io.sub, Int32))
readuint32(io::XDRIO) = ntoh(read(io.sub, UInt32))
readfloat64(io::XDRIO) = reinterpret(Float64, ntoh(read(io.sub, Int64)))
readfloat64(io::XDRIO) = ntoh(read(io.sub, Float64))

# integers need no special NA handling at read time: the raw Int32 is returned
readintorNA(io::XDRIO) = readint32(io)
function readintorNA(io::XDRIO, n::RVecLength)
Expand All @@ -21,8 +22,13 @@ end
# R's NA is silently converted to NaN when the value is loaded in the register(?)
#readfloatorNA(io::XDRIO) = readfloat64(io)
# Read n floating-point values from the XDR stream and convert them
# in place from network to host byte order.
function readfloatorNA(io::XDRIO, n::RVecLength)
v = read(io.sub, UInt64, n)
reinterpret(Float64, map!(ntoh, v, v))
v = read(io.sub, Float64, n)
map!(ntoh, v, v)
end

# Fill `v` with Float64 values read from the XDR stream and convert them
# in place from network to host byte order. Returns `v`.
# Throws EOFError when the stream ends before `v` is completely filled:
# `readbytes!` stops silently at end of stream, and the unfilled tail of `v`
# would otherwise be handed to the caller as if it were data.
function readfloatorNA!(io::XDRIO, v::AbstractVector{Float64})
    raw = reinterpret(UInt8, v) # byte-level view used to fill v
    nread = readbytes!(io.sub, raw)
    nread == length(raw) || throw(EOFError())
    map!(ntoh, v, v)
end

readuint8(io::XDRIO, n::RVecLength) = read(io.sub, UInt8, n)
Expand Down
5 changes: 3 additions & 2 deletions src/readers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ end
# Read an R complex vector (CPLXSXP): the length, then the values, then the
# attributes. Each complex value takes two consecutive floats in the stream.
function readcomplex(ctx::RDAContext, fl::RDATag)
@assert sxtype(fl) == CPLXSXP
n = readlength(ctx.io)
RComplexVector(reinterpret(Complex128, readfloatorNA(ctx.io, 2n)),
readattrs(ctx, fl))
# allocate the result first and let the reader fill it through a Float64 alias
v = Vector{Complex128}(n)
readfloatorNA!(ctx.io, reinterpret(Float64, v))
RComplexVector(v, readattrs(ctx, fl))
end

function readstring(ctx::RDAContext, fl::RDATag)
Expand Down
2 changes: 1 addition & 1 deletion src/sxtypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ struct RVector{T, S} <: RVEC{T, S}
data::Vector{T}
attr::Hash # collection of R object attributes

(::Type{RVector{T,S}})(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} =
RVector{T,S}(v::Vector{T}=T[], attr::Hash=Hash()) where {T,S} =
new{T,S}(v, attr)
end

Expand Down
43 changes: 22 additions & 21 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,54 @@ module TestRDA
# check for Float64 NA
@testset "Detect R floating-point NAs" begin
@test !RData.isna_float64(reinterpret(UInt64, 1.0))
@test !RData.isna_float64(reinterpret(UInt64, NaN))
@test !RData.isna_float64(reinterpret(UInt64, Inf))
@test !RData.isna_float64(reinterpret(UInt64, -Inf))
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
@test !RData.isna(1.0)
@test !RData.isna(NaN)
@test !RData.isna(Inf)
@test !RData.isna(-Inf)
@test RData.isna_float64(RData.R_NA_FLOAT64)
# check that alternative NA is also recognized (#10)
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
end

testdir = dirname(@__FILE__)
@testset "Reading minimal RData" begin
df = DataFrame(num = [1.1, 2.2])
@test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df)
@test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df)
@test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df)
@test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df
@test load("$testdir/data/minimal.rda",convert=true)["df"] == df
@test load("$testdir/data/minimal_ascii.rda")["df"] == df
end

@testset "Conversion to Julia types" begin
df = DataFrame(num = [1.1, 2.2],
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = pool(["ab", "c"]),
cplx = Complex128[1.1+0.5im, 1.0im])
factor = categorical(["ab", "c"], true),
cplx = [1.1+0.5im, 1.0im])
rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"])
@test eltypes(rdf) == eltypes(df)
@test isequal(rdf, df)
@test rdf == df
rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"])
@test eltypes(rdf_ascii) == eltypes(df)
@test isequal(rdf_ascii, df)
@test rdf_ascii == df
end

@testset "NAs conversion" begin
df = DataFrame(num = [1.1, 2.2],
int = Int32[1, 2],
logi = [true, false],
chr = ["ab", "c"],
factor = pool(["ab", "c"]),
cplx = Complex128[1.1+0.5im, 1.0im])
df = DataFrame(num = Union{Float64, Missing}[1.1, 2.2],
int = Union{Int32, Missing}[1, 2],
logi = Union{Bool, Missing}[true, false],
chr = Union{String, Missing}["ab", "c"],
factor = categorical(Union{String, Missing}["ab", "c"], true),
cplx = Union{Complex128, Missing}[1.1+0.5im, 1.0im])

df[2, :] = NA
df[2, :] = missing
append!(df, df[2, :])
df[3, :num] = NaN
df[:, :cplx] = @data [NA, Complex128(1,NaN), NaN]
df[:, :cplx] = [missing, Complex128(1,NaN), NaN]
@test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
# ASCII format saves NaN as NA
df[3, :num] = NA
df[:, :cplx] = @data [NA, NA, NA]
df[3, :num] = missing
df[:, :cplx] = missing
@test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
end

Expand Down

0 comments on commit ee96a6c

Please sign in to comment.