Skip to content

Commit

Permalink
Improve handling of missing values (#173)
Browse files Browse the repository at this point in the history
* Improve handling of missing values

Replace the nullable::Union{Bool,Missing} argument with allowmissing::Symbol, with possible choices
:all, :auto and :none, and make :all the default to avoid errors when the first missing value appears
beyond rows used for type detection. Also avoid overriding types specified explicitly via the types argument,
which allows for more flexibility.

While deprecating nullable, also deprecate null in favor of missingstring for consistency with truestring
and falsestring.

* Fix up merge commits

* Fix failure after rebase
  • Loading branch information
nalimilan authored and quinnj committed Mar 31, 2018
1 parent de16d0e commit 4ca71c5
Show file tree
Hide file tree
Showing 12 changed files with 274 additions and 224 deletions.
10 changes: 5 additions & 5 deletions benchmark/benchmarks.jl
Expand Up @@ -77,14 +77,14 @@ using CSV, TextParse
for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
println("comparing for T = $T...")
# T == WeakRefStrings.WeakRefString{UInt8} && continue
@time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
@time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
end

for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
println("comparing for T = $T...")
# T == WeakRefStrings.WeakRefString{UInt8} && continue
# @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
# @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
@time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
end

Expand Down Expand Up @@ -135,8 +135,8 @@ print(end - start)

T = Int64
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_$(T).csv";)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=true)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=false)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; allowmissing=:auto)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; allowmissing=:none)
# source.schema = DataStreams.Data.Schema(DataStreams.Data.header(source.schema), (Int, String, String, Float64, Float64, Date, DateTime), 9)
# @time df = CSV.read(source, NamedTuple);
sink = Si = NamedTuple
Expand Down Expand Up @@ -171,7 +171,7 @@ end

t = Vector{Int}(1000000)

# having CSV.parsefield(io, T) where T !>: Null decreases allocations by 1.00M
# having CSV.parsefield(io, T) where T !>: Missing decreases allocations by 1.00M
# inlining CSV.parsefield also dropped allocations
# making CSV.Options not have a type parameter also sped things up
#
Expand Down
14 changes: 8 additions & 6 deletions src/CSV.jl
Expand Up @@ -50,7 +50,7 @@ Keyword Arguments:
* `delim::Union{Char,UInt8}`: how fields in the file are delimited; default `','`
* `quotechar::Union{Char,UInt8}`: the character that indicates a quoted field that may contain the `delim` or newlines; default `'"'`
* `escapechar::Union{Char,UInt8}`: the character that escapes a `quotechar` in a quoted field; default `'\\'`
* `null::String`: indicates how NULL values are represented in the dataset; default `""`
* `missingstring::String`: indicates how missing values are represented in the dataset; default `""`
* `dateformat::Union{AbstractString,Dates.DateFormat}`: how dates/datetimes are represented in the dataset; default `Base.Dates.ISODateTimeFormat`
* `decimal::Union{Char,UInt8}`: character to recognize as the decimal point in a float number, e.g. `3.14` or `3,14`; default `'.'`
* `truestring`: string to represent `true::Bool` values in a csv file; default `"true"`. Note that `truestring` and `falsestring` cannot start with the same character.
Expand All @@ -60,8 +60,9 @@ struct Options{D}
delim::UInt8
quotechar::UInt8
escapechar::UInt8
null::Vector{UInt8}
nullcheck::Bool
missingstring::Vector{UInt8}
null::Union{Vector{UInt8},Nothing} # deprecated
missingcheck::Bool
dateformat::D
decimal::UInt8
truestring::Vector{UInt8}
Expand All @@ -73,16 +74,17 @@ struct Options{D}
types
end

# Keyword constructor for `Options`.
#
# Converts the user-facing keyword values into the representation stored in the struct:
# `delim`, `quotechar`, `escapechar` and `decimal` are reduced to a single `UInt8`,
# the string-valued options are converted to `Vector{UInt8}`, and a `dateformat` given
# as a string is turned into a `Dates.DateFormat`.
#
# `null` is the deprecated spelling of `missingstring`; `nothing` means it was not supplied,
# in which case the `null` field is stored as `nothing`.
function Options(; delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, missingstring="",
                 null=nothing, dateformat=nothing, decimal=PERIOD, truestring="true",
                 falsestring="false", datarow=-1, rows=0, header=1, types=Type[])
    missingbytes = map(UInt8, collect(ascii(String(missingstring))))
    # compare against `nothing` by identity (`===`/`!==`) rather than with `==`/`!=`
    nullbytes = null === nothing ? nothing : map(UInt8, collect(ascii(String(null))))
    # the check is enabled when `missingstring` differs from "" or when a `null`
    # other than "" was passed through the deprecated keyword
    missingcheck = missingstring != "" || (null !== nothing && null != "")
    fmt = isa(dateformat, AbstractString) ? Dates.DateFormat(dateformat) : dateformat
    return Options(delim % UInt8, quotechar % UInt8, escapechar % UInt8,
                   missingbytes, nullbytes, missingcheck, fmt, decimal % UInt8,
                   map(UInt8, collect(truestring)), map(UInt8, collect(falsestring)),
                   datarow, rows, header, types)
end
function Base.show(io::IO,op::Options)
println(io, " CSV.Options:")
println(io, " delim: '", Char(op.delim), "'")
println(io, " quotechar: '", Char(op.quotechar), "'")
print(io, " escapechar: '"); escape_string(io, string(Char(op.escapechar)), "\\"); println(io, "'")
print(io, " null: \""); escape_string(io, isempty(op.null) ? "" : String(collect(op.null)), "\\"); println(io, "\"")
print(io, " missingstring: \""); escape_string(io, isempty(op.missingstring) ? "" : String(collect(op.missingstring)), "\\"); println(io, "\"")
println(io, " dateformat: ", op.dateformat)
println(io, " decimal: '", Char(op.decimal), "'")
println(io, " truestring: '$(String(op.truestring))'")
Expand Down
12 changes: 6 additions & 6 deletions src/Sink.jl
Expand Up @@ -2,7 +2,7 @@ function Sink(fullpath::Union{AbstractString, IO};
delim::Char=',',
quotechar::Char='"',
escapechar::Char='\\',
null::AbstractString="",
missingstring::AbstractString="",
dateformat=nothing,
header::Bool=true,
colnames::Vector{String}=String[],
Expand All @@ -11,7 +11,7 @@ function Sink(fullpath::Union{AbstractString, IO};
delim = delim % UInt8; quotechar = quotechar % UInt8; escapechar = escapechar % UInt8
dateformat = isa(dateformat, AbstractString) ? Dates.DateFormat(dateformat) : dateformat
io = IOBuffer()
options = CSV.Options(delim=delim, quotechar=quotechar, escapechar=escapechar, null=null, dateformat=dateformat)
options = CSV.Options(delim=delim, quotechar=quotechar, escapechar=escapechar, missingstring=missingstring, dateformat=dateformat)
!append && header && !isempty(colnames) && writeheaders(io, colnames, options, Val{quotefields})
return Sink(options, io, fullpath, position(io), !append && header && !isempty(colnames), colnames, length(colnames), append, Val{quotefields})
end
Expand Down Expand Up @@ -62,7 +62,7 @@ end

const EMPTY_UINT8_ARRAY = UInt8[]
# Write a `missing` field to the sink's buffer: the configured `missingstring` bytes when
# the options' `missingcheck` flag is set (an empty byte array otherwise), followed by
# NEWLINE if `col` is the sink's last column and by the delimiter if it is not.
function Data.streamto!(sink::Sink, ::Type{Data.Field}, val::Missing, row, col::Int)
    opts = sink.options
    # `Options` declares this flag as `missingcheck`; it has no `missingstringcheck` field
    Base.write(sink.io, opts.missingcheck ? opts.missingstring : EMPTY_UINT8_ARRAY,
               ifelse(col == sink.cols, NEWLINE, opts.delim))
    return nothing
end

Expand Down Expand Up @@ -91,7 +91,7 @@ Keyword Arguments:
* `delim::Union{Char,UInt8}`; how fields in the file will be delimited; default is `UInt8(',')`
* `quotechar::Union{Char,UInt8}`; the character that indicates a quoted field that may contain the `delim` or newlines; default is `UInt8('"')`
* `escapechar::Union{Char,UInt8}`; the character that escapes a `quotechar` in a quoted field; default is `UInt8('\\')`
* `null::String`; the ascii string that indicates how NULL values will be represented in the dataset; default is the emtpy string `""`
* `missingstring::String`; the ascii string that indicates how missing values will be represented in the dataset; default is the empty string `""`
* `dateformat`; how dates/datetimes will be represented in the dataset; default is ISO-8601 `yyyy-mm-ddTHH:MM:SS.s`
* `header::Bool`; whether to write out the column names from `source`
* `colnames::Vector{String}`; a vector of string column names to be used when writing the header row
Expand All @@ -106,8 +106,8 @@ CSV.write("out.csv", df)
# write out a DataFrame, this time as a tab-delimited file
CSV.write("out.csv", df; delim='\t')
# write out a DataFrame, with null values represented by the string "NA"
CSV.write("out.csv", df; null="NA")
# write out a DataFrame, with missing values represented by the string "NA"
CSV.write("out.csv", df; missingstring="NA")
# write out a "header-less" file, with actual data starting on row 1
CSV.write("out.csv", df; header=false)
Expand Down
72 changes: 48 additions & 24 deletions src/Source.jl
Expand Up @@ -4,12 +4,14 @@ function Source(fullpath::Union{AbstractString,IO};
delim=COMMA,
quotechar=QUOTE,
escapechar=ESCAPE,
null::AbstractString="",
missingstring::AbstractString="",
null::Union{AbstractString,Nothing}=nothing,

header::Union{Integer, UnitRange{Int}, Vector}=1, # header can be a row number, range of rows, or actual string vector
datarow::Int=-1, # by default, data starts immediately after header or start of file
types=Type[],
nullable::Union{Bool, Missing}=missing,
allowmissing::Symbol=:all,
nullable::Union{Bool,Missing,Nothing}=nothing,
dateformat=nothing,
decimal=PERIOD,
truestring="true",
Expand All @@ -29,8 +31,8 @@ function Source(fullpath::Union{AbstractString,IO};
options=CSV.Options(delim=typeof(delim) <: String ? UInt8(first(delim)) : (delim % UInt8),
quotechar=typeof(quotechar) <: String ? UInt8(first(quotechar)) : (quotechar % UInt8),
escapechar=typeof(escapechar) <: String ? UInt8(first(escapechar)) : (escapechar % UInt8),
null=null, dateformat=dateformat, decimal=decimal, truestring=truestring, falsestring=falsestring),
header=header, datarow=datarow, types=types, nullable=nullable, categorical=categorical, weakrefstrings=weakrefstrings, footerskip=footerskip,
missingstring=missingstring, null=null, dateformat=dateformat, decimal=decimal, truestring=truestring, falsestring=falsestring),
header=header, datarow=datarow, types=types, allowmissing=allowmissing, nullable=nullable, categorical=categorical, weakrefstrings=weakrefstrings, footerskip=footerskip,
rows_for_type_detect=rows_for_type_detect, rows=rows, use_mmap=use_mmap)
end

Expand All @@ -40,7 +42,8 @@ function Source(;fullpath::Union{AbstractString,IO}="",
header::Union{Integer,UnitRange{Int},Vector}=1, # header can be a row number, range of rows, or actual string vector
datarow::Int=-1, # by default, data starts immediately after header or start of file
types=Type[],
nullable::Union{Bool, Missing}=missing,
allowmissing::Symbol=:all,
nullable::Union{Bool,Missing,Nothing}=nothing,
categorical::Bool=true,
weakrefstrings::Bool=true,

Expand All @@ -52,6 +55,16 @@ function Source(;fullpath::Union{AbstractString,IO}="",
isa(fullpath, AbstractString) && (isfile(fullpath) || throw(ArgumentError("\"$fullpath\" is not a valid file")))
header = (isa(header, Integer) && header == 1 && datarow == 1) ? -1 : header
isa(header, Integer) && datarow != -1 && (datarow > header || throw(ArgumentError("data row ($datarow) must come after header row ($header)")))
if options.null !== nothing
resize!(options.missingstring, length(options.null))
copyto!(options.missingstring, options.null)
Base.depwarn("null option is deprecated, use missingstring instead", :Source)
end
if nullable !== nothing
allowmissing = ismissing(nullable) ? :auto :
nullable ? :all : :none
Base.depwarn("nullable=$nullable argument is deprecated, use allowmissing=$(repr(allowmissing)) instead", :Source)
end

# open the file for property detection
if isa(fullpath, IOBuffer)
Expand Down Expand Up @@ -86,7 +99,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",

# figure out # of columns and header, either an Integer, AbstractRange, or Vector{String}
# also ensure that `f` is positioned at the start of data
row_vals = Vector{RawField}()
row_vals = RawField[]
if isa(header, Integer)
# default header = 1
if header <= 0
Expand Down Expand Up @@ -134,7 +147,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",
# types might be a Vector{DataType}, which will be a problem if Unions are needed
columntypes = convert(Vector{Type}, types)
elseif isa(types, Dict) || isempty(types)
columntypes = fill!(Vector{Type}(uninitialized, cols), Any)
columntypes = fill!(Vector{Type}(undef, cols), Any)
levels = [Dict{WeakRefString{UInt8}, Int}() for _ = 1:cols]
lineschecked = 0
while !eof(source) && lineschecked < min(rows < 0 ? rows_for_type_detect : rows, rows_for_type_detect)
Expand All @@ -151,7 +164,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",
if options.dateformat === nothing && any(x->Missings.T(x) <: Dates.TimeType, columntypes)
# auto-detected TimeType
options = Options(delim=options.delim, quotechar=options.quotechar, escapechar=options.escapechar,
null=options.null, dateformat=Dates.ISODateTimeFormat, decimal=options.decimal,
missingstring=options.missingstring, dateformat=Dates.ISODateTimeFormat, decimal=options.decimal,
datarow=options.datarow, rows=options.rows, header=options.header, types=options.types)
end
if categorical
Expand All @@ -165,30 +178,41 @@ function Source(;fullpath::Union{AbstractString,IO}="",
else
throw(ArgumentError("$cols number of columns detected; `types` argument has $(length(types)) entries"))
end
if isa(types, Dict{Int, <:Any})
for (col, typ) in types
columntypes[col] = typ
if isa(types, Dict)
if isa(types, Dict{String})
@static if VERSION >= v"0.7.0-DEV.3627"
colinds = indexin(keys(types), columnnames)
else
colinds = indexin(collect(keys(types)), columnnames)
end
else
colinds = keys(types)
end
elseif isa(types, Dict{String, <:Any})
for (col, typ) in types
c = findfirst(x->x == col, columnnames)
columntypes[c] = typ
for (col, typ) in zip(colinds, values(types))
columntypes[col] = typ
end
autocols = setdiff(1:cols, colinds)
elseif isempty(types)
autocols = collect(1:cols)
else
autocols = Int[]
end
if !weakrefstrings
columntypes = [(T !== Missing && Missings.T(T) <: WeakRefString) ? substitute(T, String) : T for T in columntypes]
end
if !ismissing(nullable)
if nullable # allow missing values in all columns
for i = 1:cols
if allowmissing != :auto
if allowmissing == :all # allow missing values in all automatically detected columns
for i = autocols
T = columntypes[i]
columntypes[i] = Union{Missings.T(T), Missing}
end
else # disallow missing values in all columns
for i = 1:cols
elseif allowmissing == :none # disallow missing values in all automatically detected columns
for i = autocols
T = columntypes[i]
columntypes[i] = Missings.T(T)
end
else
throw(ArgumentError("allowmissing must be either :all, :none or :auto"))
end
end
seek(source, datapos)
Expand Down Expand Up @@ -231,15 +255,15 @@ Keyword Arguments:
* `delim::Union{Char,UInt8}`: how fields in the file are delimited; default `','`
* `quotechar::Union{Char,UInt8}`: the character that indicates a quoted field that may contain the `delim` or newlines; default `'"'`
* `escapechar::Union{Char,UInt8}`: the character that escapes a `quotechar` in a quoted field; default `'\\'`
* `null::String`: indicates how NULL values are represented in the dataset; default `""`
* `missingstring::String`: indicates how missing values are represented in the dataset; default `""`
* `dateformat::Union{AbstractString,Dates.DateFormat}`: how dates/datetimes are represented in the dataset; default `Base.Dates.ISODateTimeFormat`
* `decimal::Union{Char,UInt8}`: character to recognize as the decimal point in a float number, e.g. `3.14` or `3,14`; default `'.'`
* `truestring`: string to represent `true::Bool` values in a csv file; default `"true"`. Note that `truestring` and `falsestring` cannot start with the same character.
* `falsestring`: string to represent `false::Bool` values in a csv file; default `"false"`
* `header`: column names can be provided manually as a complete Vector{String}, or as an Int/AbstractRange which indicates the row/rows that contain the column names
* `datarow::Int`: specifies the row on which the actual data starts in the file; by default, the data is expected on the next row after the header row(s); for a file without column names (header), specify `datarow=1`
* `types`: column types can be provided manually as a complete Vector{Type}, or in a Dict to reference individual columns by name or number
* `nullable::Bool`: indicates whether values can be nullable or not; `true` by default. If set to `false` and missing values are encountered, a `Data.NullException` will be thrown
* `allowmissing::Symbol=:all`: indicates whether columns should allow for missing values or not, that is whether their element type should be `Union{T,Missing}`; by default, all columns are allowed to contain missing values. If set to `:none`, no column can contain missing values, and if set to `:auto`, only columns which contain missing values in the first `rows_for_type_detect` rows are allowed to contain missing values. Column types specified via `types` are not affected by this argument.
* `footerskip::Int`: indicates the number of rows to skip at the end of the file
* `rows_for_type_detect::Int=100`: indicates how many rows should be read to infer the types of columns
* `rows::Int`: indicates the total number of rows to read from the file; by default the file is pre-parsed to count the # of rows; `-1` can be passed to skip a full-file scan, but the `Data.Sink` must be set up to account for a potentially unknown # of rows
Expand Down Expand Up @@ -267,8 +291,8 @@ Other example invocations may include:
# read in a tab-delimited file
CSV.read(file; delim='\t')
# read in a comma-delimited file with null values represented as '\\N', such as a MySQL export
CSV.read(file; null="\\N")
# read in a comma-delimited file with missing values represented as '\\N', such as a MySQL export
CSV.read(file; missingstring="\\N")
# read a csv file that happens to have column names in the first column, and grouped data in rows instead of columns
CSV.read(file; transpose=true)
Expand Down

0 comments on commit 4ca71c5

Please sign in to comment.