Improve handling of missing values (#173)

* Improve handling of missing values

  Replace the nullable::Union{Bool,Missing} argument with allowmissing::Symbol, with possible choices :all, :auto and :none, and make :all the default to avoid errors when the first missing value appears beyond rows used for type detection. Also avoid overriding types specified explicitly via the types argument, which allows for more flexibility. While deprecating nullable, also deprecate null in favor of missingstring for consistency with truestring and falsestring.

* Fix up merge commits
* Fix failure after rebase
JuliaData · Mar 31, 2018 · 4ca71c5 · 4ca71c5
1 parent de16d0e
commit 4ca71c5
Show file tree

Hide file tree

Showing 12 changed files with 274 additions and 224 deletions.
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -77,14 +77,14 @@ using CSV, TextParse
 for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
     println("comparing for T = $T...")
     # T == WeakRefStrings.WeakRefString{UInt8} && continue
-    @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
+    @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
     # @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
 end
 
 for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
     println("comparing for T = $T...")
     # T == WeakRefStrings.WeakRefString{UInt8} && continue
-    # @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
+    # @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
     @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
 end
 
@@ -135,8 +135,8 @@ print(end - start)
 
 T = Int64
 @time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_$(T).csv";)
-@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=true)
-@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=false)
+@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; allowmissing=:auto)
+@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; allowmissing=:none)
 # source.schema = DataStreams.Data.Schema(DataStreams.Data.header(source.schema), (Int, String, String, Float64, Float64, Date, DateTime), 9)
 # @time df = CSV.read(source, NamedTuple);
 sink = Si = NamedTuple
@@ -171,7 +171,7 @@ end
 
 t = Vector{Int}(1000000)
 
-# having CSV.parsefield(io, T) where T !>: Null decreases allocations by 1.00M
+# having CSV.parsefield(io, T) where T !>: Missing decreases allocations by 1.00M
 # inlining CSV.parsefield also dropped allocations
 # making CSV.Options not have a type parameter also sped things up
 #

diff --git a/src/CSV.jl b/src/CSV.jl
@@ -50,7 +50,7 @@ Keyword Arguments:
  * `delim::Union{Char,UInt8}`: how fields in the file are delimited; default `','`
  * `quotechar::Union{Char,UInt8}`: the character that indicates a quoted field that may contain the `delim` or newlines; default `'"'`
  * `escapechar::Union{Char,UInt8}`: the character that escapes a `quotechar` in a quoted field; default `'\\'`
- * `null::String`: indicates how NULL values are represented in the dataset; default `""`
+ * `missingstring::String`: indicates how missing values are represented in the dataset; default `""`
  * `dateformat::Union{AbstractString,Dates.DateFormat}`: how dates/datetimes are represented in the dataset; default `Base.Dates.ISODateTimeFormat`
  * `decimal::Union{Char,UInt8}`: character to recognize as the decimal point in a float number, e.g. `3.14` or `3,14`; default `'.'`
  * `truestring`: string to represent `true::Bool` values in a csv file; default `"true"`. Note that `truestring` and `falsestring` cannot start with the same character.
@@ -60,8 +60,9 @@ struct Options{D}
     delim::UInt8
     quotechar::UInt8
     escapechar::UInt8
-    null::Vector{UInt8}
-    nullcheck::Bool
+    missingstring::Vector{UInt8}
+    null::Union{Vector{UInt8},Nothing} # deprecated
+    missingcheck::Bool
     dateformat::D
     decimal::UInt8
     truestring::Vector{UInt8}
@@ -73,16 +74,17 @@ struct Options{D}
     types
 end
 
-Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null="", dateformat=nothing, decimal=PERIOD, truestring="true", falsestring="false", datarow=-1, rows=0, header=1, types=Type[]) =
+Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, missingstring="", null=nothing, dateformat=nothing, decimal=PERIOD, truestring="true", falsestring="false", datarow=-1, rows=0, header=1, types=Type[]) =
     Options(delim%UInt8, quotechar%UInt8, escapechar%UInt8,
-            map(UInt8, collect(ascii(String(null)))), null != "", isa(dateformat, AbstractString) ? Dates.DateFormat(dateformat) : dateformat,
+            map(UInt8, collect(ascii(String(missingstring)))), null === nothing ? nothing : map(UInt8, collect(ascii(String(null)))), missingstring != "" || (null !== nothing && null != ""),
+            isa(dateformat, AbstractString) ? Dates.DateFormat(dateformat) : dateformat,
             decimal%UInt8, map(UInt8, collect(truestring)), map(UInt8, collect(falsestring)), datarow, rows, header, types)
 function Base.show(io::IO,op::Options)
     println(io, "    CSV.Options:")
     println(io, "        delim: '", Char(op.delim), "'")
     println(io, "        quotechar: '", Char(op.quotechar), "'")
     print(io, "        escapechar: '"); escape_string(io, string(Char(op.escapechar)), "\\"); println(io, "'")
-    print(io, "        null: \""); escape_string(io, isempty(op.null) ? "" : String(collect(op.null)), "\\"); println(io, "\"")
+    print(io, "        missingstring: \""); escape_string(io, isempty(op.missingstring) ? "" : String(collect(op.missingstring)), "\\"); println(io, "\"")
     println(io, "        dateformat: ", op.dateformat)
     println(io, "        decimal: '", Char(op.decimal), "'")
     println(io, "        truestring: '$(String(op.truestring))'")

diff --git a/src/Sink.jl b/src/Sink.jl
@@ -2,7 +2,7 @@ function Sink(fullpath::Union{AbstractString, IO};
               delim::Char=',',
               quotechar::Char='"',
               escapechar::Char='\\',
-              null::AbstractString="",
+              missingstring::AbstractString="",
               dateformat=nothing,
               header::Bool=true,
               colnames::Vector{String}=String[],
@@ -11,7 +11,7 @@ function Sink(fullpath::Union{AbstractString, IO};
     delim = delim % UInt8; quotechar = quotechar % UInt8; escapechar = escapechar % UInt8
     dateformat = isa(dateformat, AbstractString) ? Dates.DateFormat(dateformat) : dateformat
     io = IOBuffer()
-    options = CSV.Options(delim=delim, quotechar=quotechar, escapechar=escapechar, null=null, dateformat=dateformat)
+    options = CSV.Options(delim=delim, quotechar=quotechar, escapechar=escapechar, missingstring=missingstring, dateformat=dateformat)
     !append && header && !isempty(colnames) && writeheaders(io, colnames, options, Val{quotefields})
     return Sink(options, io, fullpath, position(io), !append && header && !isempty(colnames), colnames, length(colnames), append, Val{quotefields})
 end
@@ -62,7 +62,7 @@ end
 
 const EMPTY_UINT8_ARRAY = UInt8[]
 function Data.streamto!(sink::Sink, ::Type{Data.Field}, val::Missing, row, col::Int)
-    Base.write(sink.io, sink.options.nullcheck ? sink.options.null : EMPTY_UINT8_ARRAY, ifelse(col == sink.cols, NEWLINE, sink.options.delim))
+    Base.write(sink.io, sink.options.missingcheck ? sink.options.missingstring : EMPTY_UINT8_ARRAY, ifelse(col == sink.cols, NEWLINE, sink.options.delim))
     return nothing
 end
 
@@ -91,7 +91,7 @@ Keyword Arguments:
 * `delim::Union{Char,UInt8}`; how fields in the file will be delimited; default is `UInt8(',')`
 * `quotechar::Union{Char,UInt8}`; the character that indicates a quoted field that may contain the `delim` or newlines; default is `UInt8('"')`
 * `escapechar::Union{Char,UInt8}`; the character that escapes a `quotechar` in a quoted field; default is `UInt8('\\')`
-* `null::String`; the ascii string that indicates how NULL values will be represented in the dataset; default is the emtpy string `""`
+* `missingstring::String`; the ascii string that indicates how missing values will be represented in the dataset; default is the empty string `""`
 * `dateformat`; how dates/datetimes will be represented in the dataset; default is ISO-8601 `yyyy-mm-ddTHH:MM:SS.s`
 * `header::Bool`; whether to write out the column names from `source`
 * `colnames::Vector{String}`; a vector of string column names to be used when writing the header row
@@ -106,8 +106,8 @@ CSV.write("out.csv", df)
 # write out a DataFrame, this time as a tab-delimited file
 CSV.write("out.csv", df; delim='\t')
 
-# write out a DataFrame, with null values represented by the string "NA"
-CSV.write("out.csv", df; null="NA")
+# write out a DataFrame, with missing values represented by the string "NA"
+CSV.write("out.csv", df; missingstring="NA")
 
 # write out a "header-less" file, with actual data starting on row 1
 CSV.write("out.csv", df; header=false)

diff --git a/src/Source.jl b/src/Source.jl
@@ -4,12 +4,14 @@ function Source(fullpath::Union{AbstractString,IO};
               delim=COMMA,
               quotechar=QUOTE,
               escapechar=ESCAPE,
-              null::AbstractString="",
+              missingstring::AbstractString="",
+              null::Union{AbstractString,Nothing}=nothing,
 
               header::Union{Integer, UnitRange{Int}, Vector}=1, # header can be a row number, range of rows, or actual string vector
               datarow::Int=-1, # by default, data starts immediately after header or start of file
               types=Type[],
-              nullable::Union{Bool, Missing}=missing,
+              allowmissing::Symbol=:all,
+              nullable::Union{Bool,Missing,Nothing}=nothing,
               dateformat=nothing,
               decimal=PERIOD,
               truestring="true",
@@ -29,8 +31,8 @@ function Source(fullpath::Union{AbstractString,IO};
                         options=CSV.Options(delim=typeof(delim) <: String ? UInt8(first(delim)) : (delim % UInt8),
                                             quotechar=typeof(quotechar) <: String ? UInt8(first(quotechar)) : (quotechar % UInt8),
                                             escapechar=typeof(escapechar) <: String ? UInt8(first(escapechar)) : (escapechar % UInt8),
-                                            null=null, dateformat=dateformat, decimal=decimal, truestring=truestring, falsestring=falsestring),
-                        header=header, datarow=datarow, types=types, nullable=nullable, categorical=categorical, weakrefstrings=weakrefstrings, footerskip=footerskip,
+                                            missingstring=missingstring, null=null, dateformat=dateformat, decimal=decimal, truestring=truestring, falsestring=falsestring),
+                        header=header, datarow=datarow, types=types, allowmissing=allowmissing, nullable=nullable, categorical=categorical, weakrefstrings=weakrefstrings, footerskip=footerskip,
                         rows_for_type_detect=rows_for_type_detect, rows=rows, use_mmap=use_mmap)
 end
 
@@ -40,7 +42,8 @@ function Source(;fullpath::Union{AbstractString,IO}="",
                 header::Union{Integer,UnitRange{Int},Vector}=1, # header can be a row number, range of rows, or actual string vector
                 datarow::Int=-1, # by default, data starts immediately after header or start of file
                 types=Type[],
-                nullable::Union{Bool, Missing}=missing,
+                allowmissing::Symbol=:all,
+                nullable::Union{Bool,Missing,Nothing}=nothing,
                 categorical::Bool=true,
                 weakrefstrings::Bool=true,
 
@@ -52,6 +55,16 @@ function Source(;fullpath::Union{AbstractString,IO}="",
     isa(fullpath, AbstractString) && (isfile(fullpath) || throw(ArgumentError("\"$fullpath\" is not a valid file")))
     header = (isa(header, Integer) && header == 1 && datarow == 1) ? -1 : header
     isa(header, Integer) && datarow != -1 && (datarow > header || throw(ArgumentError("data row ($datarow) must come after header row ($header)")))
+    if options.null !== nothing
+        resize!(options.missingstring, length(options.null))
+        copyto!(options.missingstring, options.null)
+        Base.depwarn("null option is deprecated, use missingstring instead", :Source)
+    end
+    if nullable !== nothing
+        allowmissing = ismissing(nullable) ? :auto :
+                       nullable            ? :all  : :none
+        Base.depwarn("nullable=$nullable argument is deprecated, use allowmissing=$(repr(allowmissing)) instead", :Source)
+    end
 
     # open the file for property detection
     if isa(fullpath, IOBuffer)
@@ -86,7 +99,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",
 
     # figure out # of columns and header, either an Integer, AbstractRange, or Vector{String}
     # also ensure that `f` is positioned at the start of data
-    row_vals = Vector{RawField}()
+    row_vals = RawField[]
     if isa(header, Integer)
         # default header = 1
         if header <= 0
@@ -134,7 +147,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",
         # types might be a Vector{DataType}, which will be a problem if Unions are needed
         columntypes = convert(Vector{Type}, types)
     elseif isa(types, Dict) || isempty(types)
-        columntypes = fill!(Vector{Type}(uninitialized, cols), Any)
+        columntypes = fill!(Vector{Type}(undef, cols), Any)
         levels = [Dict{WeakRefString{UInt8}, Int}() for _ = 1:cols]
         lineschecked = 0
         while !eof(source) && lineschecked < min(rows < 0 ? rows_for_type_detect : rows, rows_for_type_detect)
@@ -151,7 +164,7 @@ function Source(;fullpath::Union{AbstractString,IO}="",
         if options.dateformat === nothing && any(x->Missings.T(x) <: Dates.TimeType, columntypes)
             # auto-detected TimeType
             options = Options(delim=options.delim, quotechar=options.quotechar, escapechar=options.escapechar,
-                              null=options.null, dateformat=Dates.ISODateTimeFormat, decimal=options.decimal,
+                              missingstring=options.missingstring, dateformat=Dates.ISODateTimeFormat, decimal=options.decimal,
                               datarow=options.datarow, rows=options.rows, header=options.header, types=options.types)
         end
         if categorical
@@ -165,30 +178,41 @@ function Source(;fullpath::Union{AbstractString,IO}="",
     else
         throw(ArgumentError("$cols number of columns detected; `types` argument has $(length(types)) entries"))
     end
-    if isa(types, Dict{Int, <:Any})
-        for (col, typ) in types
-            columntypes[col] = typ
+    if isa(types, Dict)
+        if isa(types, Dict{String})
+            @static if VERSION >= v"0.7.0-DEV.3627"
+                colinds = indexin(keys(types), columnnames)
+            else
+                colinds = indexin(collect(keys(types)), columnnames)
+            end
+        else
+            colinds = keys(types)
         end
-    elseif isa(types, Dict{String, <:Any})
-        for (col, typ) in types
-            c = findfirst(x->x == col, columnnames)
-            columntypes[c] = typ
+        for (col, typ) in zip(colinds, values(types))
+            columntypes[col] = typ
         end
+        autocols = setdiff(1:cols, colinds)
+    elseif isempty(types)
+        autocols = collect(1:cols)
+    else
+        autocols = Int[]
     end
     if !weakrefstrings
         columntypes = [(T !== Missing && Missings.T(T) <: WeakRefString) ? substitute(T, String) : T for T in columntypes]
     end
-    if !ismissing(nullable)
-        if nullable # allow missing values in all columns
-            for i = 1:cols
+    if allowmissing != :auto
+        if allowmissing == :all # allow missing values in all automatically detected columns
+            for i = autocols
                 T = columntypes[i]
                 columntypes[i] = Union{Missings.T(T), Missing}
             end
-        else # disallow missing values in all columns
-            for i = 1:cols
+        elseif allowmissing == :none # disallow missing values in all automatically detected columns
+            for i = autocols
                 T = columntypes[i]
                 columntypes[i] = Missings.T(T)
             end
+        else
+            throw(ArgumentError("allowmissing must be either :all, :none or :auto"))
         end
     end
     seek(source, datapos)
@@ -231,15 +255,15 @@ Keyword Arguments:
 * `delim::Union{Char,UInt8}`: how fields in the file are delimited; default `','`
 * `quotechar::Union{Char,UInt8}`: the character that indicates a quoted field that may contain the `delim` or newlines; default `'"'`
 * `escapechar::Union{Char,UInt8}`: the character that escapes a `quotechar` in a quoted field; default `'\\'`
-* `null::String`: indicates how NULL values are represented in the dataset; default `""`
+* `missingstring::String`: indicates how missing values are represented in the dataset; default `""`
 * `dateformat::Union{AbstractString,Dates.DateFormat}`: how dates/datetimes are represented in the dataset; default `Base.Dates.ISODateTimeFormat`
 * `decimal::Union{Char,UInt8}`: character to recognize as the decimal point in a float number, e.g. `3.14` or `3,14`; default `'.'`
 * `truestring`: string to represent `true::Bool` values in a csv file; default `"true"`. Note that `truestring` and `falsestring` cannot start with the same character.
 * `falsestring`: string to represent `false::Bool` values in a csv file; default `"false"`
 * `header`: column names can be provided manually as a complete Vector{String}, or as an Int/AbstractRange which indicates the row/rows that contain the column names
 * `datarow::Int`: specifies the row on which the actual data starts in the file; by default, the data is expected on the next row after the header row(s); for a file without column names (header), specify `datarow=1`
 * `types`: column types can be provided manually as a complete Vector{Type}, or in a Dict to reference individual columns by name or number
-* `nullable::Bool`: indicates whether values can be nullable or not; `true` by default. If set to `false` and missing values are encountered, a `Data.NullException` will be thrown
+* `allowmissing::Symbol=:all`: indicates whether columns should allow for missing values or not, that is whether their element type should be `Union{T,Missing}`; by default, all columns are allowed to contain missing values. If set to `:none`, no column can contain missing values, and if set to `:auto`, only columns which contain missing values in the first `rows_for_type_detect` rows are allowed to contain missing values. Column types specified via `types` are not affected by this argument.
 * `footerskip::Int`: indicates the number of rows to skip at the end of the file
 * `rows_for_type_detect::Int=100`: indicates how many rows should be read to infer the types of columns
 * `rows::Int`: indicates the total number of rows to read from the file; by default the file is pre-parsed to count the # of rows; `-1` can be passed to skip a full-file scan, but the `Data.Sink` must be set up to account for a potentially unknown # of rows
@@ -267,8 +291,8 @@ Other example invocations may include:
 # read in a tab-delimited file
 CSV.read(file; delim='\t')
 
-# read in a comma-delimited file with null values represented as '\\N', such as a MySQL export
-CSV.read(file; null="\\N")
+# read in a comma-delimited file with missing values represented as '\\N', such as a MySQL export
+CSV.read(file; missingstring="\\N")
 
 # read a csv file that happens to have column names in the first column, and grouped data in rows instead of columns
 CSV.read(file; transpose=true)