Support groupmark (#1093)

* support groupmark * add more documentation --------- Co-authored-by: Lilith Hafner <Lilith.Hafner@gmail.com>
JuliaData · Jun 5, 2023 · 03c22d9 · 03c22d9
1 parent 631e456
commit 03c22d9
Show file tree

Hide file tree

Showing 10 changed files with 78 additions and 11 deletions.
diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -542,6 +542,42 @@ col1;col2;col3
 file = CSV.File(IOBuffer(data); delim=';', decimal=',')
 ```
 
+## [Thousands separator](@id thousands_example)
+
+```julia
+using CSV
+
+# In many places in the world, digits to the left of the decimal place are broken into
+# groups by a thousands separator. We can ignore those separators by passing the `groupmark`
+# keyword argument.
+data = """
+x y
+1 2
+2 1,729
+3 87,539,319
+"""
+
+file = CSV.File(IOBuffer(data); groupmark=',')
+```
+
+
+## [Custom groupmarks](@id groupmark_example)
+
+```julia
+using CSV
+
+# In some contexts, separators other than thousands separators group digits in a number.
+# `groupmark` supports ignoring them as long as the separator character is ASCII.
+data = """
+name;ssn;credit card number
+Ayodele Beren;597-21-8366;5538-6111-0574-2633
+Trinidad Shiori;387-35-5126;3017-9300-0776-5301
+Ori Cherokee;731-12-4606;4682-5416-0636-3877
+"""
+
+file = CSV.File(IOBuffer(data); groupmark='-')
+```
+
 ## [Custom bool strings](@id truestrings_example)
 
 ```julia
@@ -577,11 +613,11 @@ data = """
 file = CSV.File(IOBuffer(data); header=false)
 file = CSV.File(IOBuffer(data); header=false, delim=' ', types=Float64)
 
-# as a last step if you want to convert this to a Matrix, this can be done by reading in first as a DataFrame and then 
+# as a last step if you want to convert this to a Matrix, this can be done by reading in first as a DataFrame and then
 # function chaining to a Matrix
 using DataFrames
 A = file|>DataFrame|>Matrix
- 
+
 # another alternative is to simply use CSV.Tables.matrix and say
 B = file|>CSV.Tables.matrix # does not require DataFrames
 ```

diff --git a/docs/src/reading.md b/docs/src/reading.md
@@ -162,6 +162,14 @@ An ASCII `Char` argument that is used when parsing float values that indicates w
 ### Examples
   * [Custom decimal separator](@ref decimal_example)
 
+## [`groupmark` / thousands separator](@id groupmark)
+
+A "groupmark" is a symbol that separates groups of digits so that it is easier for humans to read a number. Thousands separators are a common example of groupmarks. The argument `groupmark`, if provided, must be an ASCII `Char` which will be ignored during parsing when it occurs between two digits on the left hand side of the decimal. For example, the groupmark in the integer `1,729` is `','` and the groupmark for the US social security number `875-39-3196` is `'-'`. By default, `groupmark=nothing`, which indicates that there are no stray characters separating digits.
+
+### Examples
+  * [Thousands separator](@ref thousands_example)
+  * [Custom groupmarks](@ref groupmark_example)
+
 ## [`truestrings` / `falsestrings`](@id truestrings)
 
 These arguments can be provided as `Vector{String}` to specify custom values that should be treated as the `Bool` `true`/`false` values for all the columns of a data input. By default, `["true", "True", "TRUE", "T", "1"]` string values are used to detect `true` values, and `["false", "False", "FALSE", "F", "0"]` string values are used to detect `false` values. Note that even though `"1"` and `"0"` _can_ be used to parse `true`/`false` values, in terms of _auto_ detecting column types, those values will be parsed as `Int64` first, instead of `Bool`. To instead parse those values as `Bool`s for a column, you can manually provide that column's type as `Bool` (see the [type](@ref types) argument).

diff --git a/src/chunks.jl b/src/chunks.jl
@@ -58,6 +58,7 @@ function Chunks(source::ValidSources;
     dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
     dateformats=nothing,
     decimal::Union{UInt8, Char}=UInt8('.'),
+    groupmark::Union{Char, Nothing}=nothing,
     truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
     falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
     stripwhitespace::Bool=false,
@@ -77,7 +78,7 @@ function Chunks(source::ValidSources;
     validate=true,
     )
 
-    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
+    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, groupmark, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
     !ctx.threaded && throw(ArgumentError("unable to iterate chunks from input file source"))
     foreach(col -> col.lock = ReentrantLock(), ctx.columns)
     return Chunks(ctx)

diff --git a/src/context.jl b/src/context.jl
@@ -233,6 +233,7 @@ function Context(source::ValidSources;
     dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
     dateformats=nothing,
     decimal::Union{UInt8, Char}=UInt8('.'),
+    groupmark::Union{Char, Nothing}=nothing,
     truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
     falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
     stripwhitespace::Bool=false,
@@ -251,7 +252,7 @@ function Context(source::ValidSources;
     parsingdebug::Bool=false,
     validate::Bool=true,
     )
-    return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
+    return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, groupmark, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
 end
 
 @refargs function Context(source::ValidSources,
@@ -288,6 +289,7 @@ end
     dateformat::Union{Nothing, String, Dates.DateFormat, Parsers.Format, AbstractVector, AbstractDict},
     dateformats::Union{Nothing, String, Dates.DateFormat, Parsers.Format, AbstractVector, AbstractDict},
     decimal::Union{UInt8, Char},
+    groupmark::Union{Char, Nothing},
     truestrings::Union{Nothing, Vector{String}},
     falsestrings::Union{Nothing, Vector{String}},
     stripwhitespace::Bool,
@@ -439,14 +441,14 @@ end
         d, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows, del)
         wh1 = d == UInt(' ') ? 0x00 : wh1
         wh2 = d == UInt8('\t') ? 0x00 : wh2
-        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
+        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace, false, groupmark)
     elseif del isa Char
         _, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows)
-        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
+        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace, false, groupmark)
         d = del
     elseif del isa String
         _, rowsguess = detectdelimandguessrows(buf, headerpos, datapos, len, oq, eq, cq, cmt, ignoreemptyrows)
-        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace)
+        options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, del, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, quoted, parsingdebug, stripwhitespace, false, groupmark)
         d = del
     else
         error("invalid delim type")
@@ -501,7 +503,7 @@ end
             # devdoc: if we want to add any other column-specific parsing options, this is where we'd at the logic
             # e.g. per-column sentinel, decimal, trues, falses, openquotechar, closequotechar, escapechar, etc.
             if df !== nothing
-                columns[i].options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, true, parsingdebug, stripwhitespace)
+                columns[i].options = Parsers.Options(sentinel, wh1, wh2, oq, cq, eq, d, decimal, trues, falses, df, ignorerepeated, ignoreemptyrows, comment, true, parsingdebug, stripwhitespace, false, groupmark)
             end
         end
         validate && checkinvalidcolumns(dateformat, "dateformat", ncols, names)

diff --git a/src/file.jl b/src/file.jl
@@ -194,6 +194,7 @@ function File(source::ValidSources;
     dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
     dateformats=nothing,
     decimal::Union{UInt8, Char}=UInt8('.'),
+    groupmark::Union{Char, Nothing}=nothing,
     truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
     falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
     stripwhitespace::Bool=false,
@@ -215,10 +216,10 @@ function File(source::ValidSources;
     # header=1;normalizenames=false;datarow=-1;skipto=-1;footerskip=0;transpose=false;comment=nothing;ignoreemptyrows=true;ignoreemptylines=nothing;
     # select=nothing;drop=nothing;limit=nothing;threaded=nothing;ntasks=Threads.nthreads();tasks=nothing;rows_to_check=30;lines_to_check=nothing;missingstrings=String[];missingstring="";
     # delim=nothing;ignorerepeated=false;quoted=true;quotechar='"';openquotechar=nothing;closequotechar=nothing;escapechar='"';dateformat=nothing;
-    # dateformats=nothing;decimal=UInt8('.');truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=IdDict{Type,Type}();
+    # dateformats=nothing;decimal=UInt8('.');groupmark=nothing;truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=IdDict{Type,Type}();
     # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=false;parsingdebug=false;buffer_in_memory=false
     # @descend CSV.Context(CSV.Arg(source), CSV.Arg(header), CSV.Arg(normalizenames), CSV.Arg(datarow), CSV.Arg(skipto), CSV.Arg(footerskip), CSV.Arg(transpose), CSV.Arg(comment), CSV.Arg(ignoreemptyrows), CSV.Arg(ignoreemptylines), CSV.Arg(select), CSV.Arg(drop), CSV.Arg(limit), CSV.Arg(buffer_in_memory), CSV.Arg(threaded), CSV.Arg(ntasks), CSV.Arg(tasks), CSV.Arg(rows_to_check), CSV.Arg(lines_to_check), CSV.Arg(missingstrings), CSV.Arg(missingstring), CSV.Arg(delim), CSV.Arg(ignorerepeated), CSV.Arg(quoted), CSV.Arg(quotechar), CSV.Arg(openquotechar), CSV.Arg(closequotechar), CSV.Arg(escapechar), CSV.Arg(dateformat), CSV.Arg(dateformats), CSV.Arg(decimal), CSV.Arg(truestrings), CSV.Arg(falsestrings), CSV.Arg(type), CSV.Arg(types), CSV.Arg(typemap), CSV.Arg(pool), CSV.Arg(downcast), CSV.Arg(lazystrings), CSV.Arg(stringtype), CSV.Arg(strict), CSV.Arg(silencewarnings), CSV.Arg(maxwarnings), CSV.Arg(debug), CSV.Arg(parsingdebug), CSV.Arg(false))
-    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
+    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, groupmark, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, false)
     return File(ctx)
 end
 

diff --git a/src/keyworddocs.jl b/src/keyworddocs.jl
@@ -28,6 +28,7 @@ const KEYWORD_DOCS = """
   * `escapechar='"'`: the `Char` used to escape quote characters in a quoted field
   * `dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}`: a date format string to indicate how Date/DateTime columns are formatted for the entire file; if given as an `AbstractDict`, date format strings to indicate how the Date/DateTime columns corresponding to the keys are formatted. The Dict can map column index `Int`, or name `Symbol` or `String` to the format string for that column.
   * `decimal='.'`: a `Char` indicating how decimals are separated in floats, i.e. `3.14` uses `'.'`, or `3,14` uses a comma `','`
+  * `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that have, e.g., thousands separators (`1,000.00`).
   * `truestrings`, `falsestrings`: `Vector{String}`s that indicate how `true` or `false` values are represented; by default `"true", "True", "TRUE", "T", "1"` are used to detect `true` and `"false", "False", "FALSE", "F", "0"` are used to detect `false`; note that columns with only `1` and `0` values will default to `Int64` column type unless explicitly requested to be `Bool` via `types` keyword argument
   * `stripwhitespace=false`: if true, leading and trailing whitespace are stripped from string values, including column names
 

diff --git a/src/rows.jl b/src/rows.jl
@@ -102,6 +102,7 @@ function Rows(source::ValidSources;
     dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
     dateformats=nothing,
     decimal::Union{UInt8, Char}=UInt8('.'),
+    groupmark::Union{Char, Nothing}=nothing,
     truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
     falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
     stripwhitespace::Bool=false,
@@ -121,7 +122,7 @@ function Rows(source::ValidSources;
     validate::Bool=true,
     reusebuffer::Bool=false,
     )
-    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, nothing, nothing, 0, nothing, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, true)
+    ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, nothing, nothing, nothing, 0, nothing, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, groupmark, truestrings, falsestrings, stripwhitespace, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, validate, true)
     foreach(col -> col.pool = 0.0, ctx.columns)
     allocate!(ctx.columns, 1)
     values = all(x->x.type === ctx.stringtype && x.anymissing, ctx.columns) && ctx.stringtype === PosLenString ? Vector{PosLen}(undef, ctx.cols) : Vector{Any}(undef, ctx.cols)

diff --git a/test/testfiles.jl b/test/testfiles.jl
@@ -676,6 +676,17 @@ testfiles = [
             col3 = ["quoted field 3", "quoted\"field C", missing, "C", "w\n\n\n\"", missing, " z", "\\\n,", "unquotedfield"]
         )
     ),
+    # https://github.com/JuliaData/CSV.jl/pull/1093
+    ("groupmark_quote.csv", (groupmark=',',),
+        (2, 2),
+        NamedTuple{(:x, :y), Tuple{Int, Int}},
+        (x = [15887, 23603], y = [24651, 14076])
+    ),
+    ("groupmark_space.csv", (groupmark=',',),
+        (2, 2),
+        NamedTuple{(:x, :y), Tuple{Int, Int}},
+        (x = [15887, 23603], y = [24651, 14076])
+    ),
 ];
 
 @static if VERSION >= v"1.3-DEV"

diff --git a/test/testfiles/groupmark_quote.csv b/test/testfiles/groupmark_quote.csv
@@ -0,0 +1,3 @@
+x,y
+"15,887","24,651"
+"23,603","14,076"
diff --git a/test/testfiles/groupmark_space.csv b/test/testfiles/groupmark_space.csv
@@ -0,0 +1,3 @@
+x y
+15,887 24,651
+23,603 14,076