Skip to content

Commit

Permalink
Merge pull request #10560 from tanmaykm/readcsv
Browse files Browse the repository at this point in the history
correctly parse integer types in readdlm. fixes #9289
  • Loading branch information
JeffBezanson committed Mar 20, 2015
2 parents 39e7015 + a68f950 commit e1d6e56
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 61 deletions.
80 changes: 46 additions & 34 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module DataFmt

importall Base
import Base: _default_delims
import Base: _default_delims, tryparse_internal

export countlines, readdlm, readcsv, writedlm, writecsv

Expand Down Expand Up @@ -137,15 +137,14 @@ type DLMStore{T,S<:ByteString} <: DLMHandler
sbuff::S
auto::Bool
eol::Char
tmp64::Array{Float64,1}
end

function DLMStore{T,S<:ByteString}(::Type{T}, dims::NTuple{2,Integer}, has_header::Bool, sbuff::S, auto::Bool, eol::Char)
(nrows,ncols) = dims
nrows <= 0 && throw(ArgumentError("number of rows in dims must be > 0, got $nrows"))
ncols <= 0 && throw(ArgumentError("number of columns in dims must be > 0, got $ncols"))
hdr_offset = has_header ? 1 : 0
DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol, Array(Float64,1))
DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol)
end

_chrinstr(sbuff::ByteString, chr::UInt8, startpos::Int, endpos::Int) = (endpos >= startpos) && (C_NULL != ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff.data)+startpos-1, chr, endpos-startpos+1))
Expand All @@ -158,7 +157,6 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
lastrow = dlmstore.lastrow
cells::Array{T,2} = dlmstore.data
sbuff::S = dlmstore.sbuff
tmp64 = dlmstore.tmp64

endpos = prevind(sbuff, nextind(sbuff,endpos))
(endpos > 0) && ('\n' == dlmstore.eol) && ('\r' == Char(sbuff[endpos])) && (endpos = prevind(sbuff, endpos))
Expand Down Expand Up @@ -189,9 +187,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
# fill data
if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
fail = colval(unescaped, 1, length(unescaped), cells, drow, col, tmp64)
fail = colval(unescaped, 1, length(unescaped), cells, drow, col)
else
fail = colval(sbuff, startpos, endpos, cells, drow, col, tmp64)
fail = colval(sbuff, startpos, endpos, cells, drow, col)
end
if fail
sval = SubString(sbuff,startpos,endpos)
Expand All @@ -204,9 +202,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
# fill header
if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col, tmp64)
colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col)
else
colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col, tmp64)
colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col)
end
end

Expand Down Expand Up @@ -304,6 +302,8 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ
idx = 1
offidx = 1
offsets = offarr[1]
row = 0
col = 0
try
dh = DLMStore(T, dims, has_header, sbuff, auto, eol)
while idx <= length(offsets)
Expand All @@ -320,40 +320,52 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ
return result(dh)
catch ex
isa(ex, TypeError) && (ex.func == :store_cell) && (return dlm_fill(ex.expected, offarr, dims, has_header, sbuff, auto, eol))
rethrow(ex)
error("at row $row, column $col : $ex")
end
end


function colval{T<:Bool, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1})
len = endpos-startpos+1
if (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("true"), 4))
cells[row,col] = true
return false
elseif (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("false"), 5))
cells[row,col] = false
return false
end
true
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Bool,2}, row::Int, col::Int)
n = tryparse_internal(Bool, sbuff, startpos, endpos, false)
isnull(n) || (cells[row,col] = get(n))
isnull(n)
end
function colval{T<:Number, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1})
if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64)
cells[row,col] = tmp64[1]
return false
end
true
function colval{T<:Integer, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int)
n = tryparse_internal(T, sbuff, startpos, endpos, 0, false)
isnull(n) || (cells[row,col] = get(n))
isnull(n)
end
colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false)
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int, tmp64::Array{Float64,1})
if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64)
cells[row,col] = tmp64[1]
else
cells[row,col] = SubString(sbuff, startpos, endpos)
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float64,2}, row::Int, col::Int)
n = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
isnull(n) || (cells[row,col] = get(n))
isnull(n)
end
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float32,2}, row::Int, col::Int)
n = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
isnull(n) || (cells[row,col] = get(n))
isnull(n)
end
colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false)
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int)
# if array is of Any type, attempt parsing only the most common types: Int, Bool, Float64 and fallback to SubString
len = endpos-startpos+1
if len > 0
# check Inteter
ni64 = tryparse_internal(Int, sbuff, startpos, endpos, 0, false)
isnull(ni64) || (cells[row,col] = get(ni64); return false)

# check Bool
nb = tryparse_internal(Bool, sbuff, startpos, endpos, false)
isnull(nb) || (cells[row,col] = get(nb); return false)

# check float64
nf64 = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
isnull(nf64) || (cells[row,col] = get(nf64); return false)
end
cells[row,col] = SubString(sbuff, startpos, endpos)
false
end
colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true)
colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int, tmp64::Array{Float64,1}) = true
colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true)
colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int) = true

dlm_parse(s::ASCIIString, eol::Char, dlm::Char, qchar::Char, cchar::Char, ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, skipstart::Int, skipblanks::Bool, dh::DLMHandler) = begin
dlm_parse(s.data, UInt32(eol)%UInt8, UInt32(dlm)%UInt8, UInt32(qchar)%UInt8, UInt32(cchar)%UInt8,
Expand Down
7 changes: 3 additions & 4 deletions base/gmp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,17 @@ signed(x::BigInt) = x
BigInt(x::BigInt) = x
BigInt(s::AbstractString) = parse(BigInt,s)

function tryparse_internal(::Type{BigInt}, s::AbstractString, base::Int, raise::Bool)
function tryparse_internal(::Type{BigInt}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool)
_n = Nullable{BigInt}()
s = bytestring(s)
sgn, base, i = Base.parseint_preamble(true,s,base)
sgn, base, i = Base.parseint_preamble(true,base,s,startpos,endpos)
if i == 0
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
return _n
end
z = BigInt()
err = ccall((:__gmpz_set_str, :libgmp),
Int32, (Ptr{BigInt}, Ptr{UInt8}, Int32),
&z, SubString(s,i), base)
&z, SubString(s,i,endpos), base)
if err != 0
raise && throw(ArgumentError("invalid BigInt: $(repr(s))"))
return _n
Expand Down
57 changes: 34 additions & 23 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1485,31 +1485,31 @@ function parse{T<:Integer}(::Type{T}, c::Char, base::Integer=36)
convert(T, d)
end

function parseint_next(s::AbstractString, i::Int=start(s))
done(s,i) && (return Char(0), 0, 0)
j = i
c, i = next(s,i)
c, i, j
function parseint_next(s::AbstractString, startpos::Int, endpos::Int)
(0 < startpos <= endpos) || (return Char(0), 0, 0)
j = startpos
c, startpos = next(s,startpos)
c, startpos, j
end

function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
c, i, j = parseint_next(s)
function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int)
c, i, j = parseint_next(s, startpos, endpos)

while isspace(c)
c, i, j = parseint_next(s,i)
c, i, j = parseint_next(s,i,endpos)
end
(j == 0) && (return 0, 0, 0)

sgn = 1
if signed
if c == '-' || c == '+'
(c == '-') && (sgn = -1)
c, i, j = parseint_next(s,i)
c, i, j = parseint_next(s,i,endpos)
end
end

while isspace(c)
c, i, j = parseint_next(s,i)
c, i, j = parseint_next(s,i,endpos)
end
(j == 0) && (return 0, 0, 0)

Expand All @@ -1518,7 +1518,7 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
c, i = next(s,i)
base = c=='b' ? 2 : c=='o' ? 8 : c=='x' ? 16 : 10
if base != 10
c, i, j = parseint_next(s,i)
c, i, j = parseint_next(s,i,endpos)
end
else
base = 10
Expand All @@ -1527,21 +1527,30 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
return sgn, base, j
end

function tryparse_internal{S<:ByteString}(::Type{Bool}, sbuff::S, startpos::Int, endpos::Int, raise::Bool)
len = endpos-startpos+1
p = pointer(sbuff)+startpos-1
(len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "true", 4)) && (return Nullable(true))
(len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "false", 5)) && (return Nullable(false))
raise && throw(ArgumentError("invalid Bool representation: $(repr(SubString(s,startpos,endpos)))"))
Nullable{Bool}()
end

safe_add{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? (n1 > (typemax(T) - n2)) : (n1 < (typemin(T) - n2))) ? Nullable{T}() : Nullable{T}(n1 + n2)
safe_mul{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? ((n1 > div(typemax(T),n2)) || (n1 < div(typemin(T),n2))) :
(n2 < -1) ? ((n1 > div(typemin(T),n2)) || (n1 < div(typemax(T),n2))) :
((n2 == -1) && n1 == typemin(T))) ? Nullable{T}() : Nullable{T}(n1 * n2)

function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, a::Int, raise::Bool)
function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, a::Int, raise::Bool)
_n = Nullable{T}()
sgn, base, i = parseint_preamble(T<:Signed,s,base)
sgn, base, i = parseint_preamble(T<:Signed, base, s, startpos, endpos)
if i == 0
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))"))
return _n
end
c, i = parseint_next(s,i)
c, i = parseint_next(s,i,endpos)
if i == 0
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))"))
return _n
end

Expand All @@ -1553,12 +1562,12 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
'A' <= c <= 'Z' ? c-'A'+10 :
'a' <= c <= 'z' ? c-'a'+a : base
if d >= base
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))"))
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))"))
return _n
end
n *= base
n += d
if done(s,i)
if i > endpos
n *= sgn
return Nullable{T}(n)
end
Expand All @@ -1571,7 +1580,7 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
'A' <= c <= 'Z' ? c-'A'+10 :
'a' <= c <= 'z' ? c-'a'+a : base
if d >= base
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))"))
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))"))
return _n
end
(T <: Signed) && (d *= sgn)
Expand All @@ -1583,20 +1592,22 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
return _n
end
n = get(safe_n)
done(s,i) && return Nullable{T}(n)
(i > endpos) && return Nullable{T}(n)
c, i = next(s,i)
end
while !done(s,i)
while i <= endpos
c, i = next(s,i)
if !isspace(c)
raise && throw(ArgumentError("extra characters after whitespace in $(repr(s))"))
raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))"))
return _n
end
end
return Nullable{T}(n)
end
tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, raise::Bool) =
tryparse_internal(T, s, base, base <= 36 ? 10 : 36, raise)
tryparse_internal(T,s,start(s),length(s),base,raise)
tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) =
tryparse_internal(T, s, startpos, endpos, base, base <= 36 ? 10 : 36, raise)
tryparse{T<:Integer}(::Type{T}, s::AbstractString, base::Int) =
2 <= base <= 62 ? tryparse_internal(T,s,Int(base),false) : throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base"))
tryparse{T<:Integer}(::Type{T}, s::AbstractString) = tryparse_internal(T,s,0,false)
Expand Down
5 changes: 5 additions & 0 deletions test/readdlm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,8 @@ let i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name
writedlm(i18n_buff, i18n_arr, '\t')
@test (data, hdr) == readdlm(i18n_buff, '\t', header=true)
end

@test isequaldlm(readcsv(IOBuffer("1,22222222222222222222222222222222222222,0x3,10e6\n2000.1,true,false,-10.34"), Any),
reshape(Any[1,2000.1,Float64(22222222222222222222222222222222222222),true,0x3,false,10e6,-10.34], 2, 4), Any)

@test isequaldlm(readcsv(IOBuffer("-9223355253176920979,9223355253176920979"), Int64), Int64[-9223355253176920979 9223355253176920979], Int64)

0 comments on commit e1d6e56

Please sign in to comment.