Merge pull request #789 from JuliaData/jq/788
Ensure we check for commented rows when skipping rows for header/data
quinnj committed Nov 24, 2020
2 parents c94256a + 7308bb3 commit def726a
Showing 6 changed files with 54 additions and 23 deletions.
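For context, this is the scenario from #788 that the change addresses: a commented row (in the test, one containing a quote character) appears before the header and data rows, and `skiptorow` previously did not account for comments while counting rows. A minimal sketch of the now-expected behavior, mirroring the test added in this PR:

```julia
using CSV

# The first physical row is a comment containing a quote character; the header
# is physical row 2 and data starts at physical row 4.
f = CSV.File(IOBuffer("""
# 1'2"
name
junk
1
"""), comment="#", header=2, datarow=4)

@assert length(f) == 1   # one data row parsed
@assert f[1].name == 1   # the `name` column holds 1
```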
39 changes: 25 additions & 14 deletions src/detection.jl
@@ -5,22 +5,19 @@ function detectheaderdatapos(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, h
if header isa Integer
if header <= 0
# no header row in dataset; skip to data
datapos = skiptorow(buf, pos, len, oq, eq, cq, 1, datarow)
datapos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, datarow)
else
headerpos = skiptorow(buf, pos, len, oq, eq, cq, 1, header)
headerpos = checkcommentandemptyline(buf, headerpos, len, cmt, ignoreemptylines)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, header, datarow)
headerpos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, header)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, cmt, ignoreemptylines, header, datarow)
end
elseif header isa AbstractVector{<:Integer}
headerpos = skiptorow(buf, pos, len, oq, eq, cq, 1, header[1])
headerpos = checkcommentandemptyline(buf, headerpos, len, cmt, ignoreemptylines)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, header[1], datarow)
headerpos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, header[1])
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, cmt, ignoreemptylines, header[1], datarow)
elseif header isa Union{AbstractVector{Symbol}, AbstractVector{String}}
datapos = skiptorow(buf, pos, len, oq, eq, cq, 1, datarow)
datapos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, datarow)
else
throw(ArgumentError("unsupported header argument: $header"))
end
datapos = max(1, checkcommentandemptyline(buf, datapos, len, cmt, ignoreemptylines))
return headerpos, datapos
end

@@ -163,6 +160,8 @@ function incr!(c::ByteValueCounter, b::UInt8)
return
end

# extract the `ignoreemptylines` flag from the `Parsers.Options` type parameters
ignoreemptylines(opts::Parsers.Options{ir, iel}) where {ir, iel} = iel

# given the various header and normalization options, figure out column names for a file
function detectcolumnnames(buf, headerpos, datapos, len, options, header, normalizenames)
if header isa Union{AbstractVector{Symbol}, AbstractVector{String}}
@@ -182,7 +181,7 @@ function detectcolumnnames(buf, headerpos, datapos, len, options, header, normal
elseif header isa AbstractVector{<:Integer}
names, pos = readsplitline(buf, headerpos, len, options)
for row = 2:length(header)
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, 1, header[row] - header[row - 1])
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, options.cmt, ignoreemptylines(options), 1, header[row] - header[row - 1])
fields, pos = readsplitline(buf, pos, len, options)
for (i, x) in enumerate(fields)
names[i] *= "_" * x
@@ -193,9 +192,12 @@ end
end

# efficiently skip from `cur` to `dest` row
function skiptorow(buf, pos, len, oq, eq, cq, cur, dest)
cur >= dest && return pos
for _ = 1:(dest - cur)
function skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, cur, dest)
# skip any leading commented/empty rows, counting each one toward `cur`
nlines = Ref{Int}(0)
pos = checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines)
cur += nlines[]
nlines[] = 0
while cur < dest && pos < len
while pos <= len
@inbounds b = buf[pos]
pos += 1
@@ -217,12 +219,17 @@ function skiptorow(buf, pos, len, oq, eq, cq, cur, dest)
end
elseif b == UInt8('\n')
typeof(buf) == ReversedBuf && pos <= len && buf[pos] == UInt8('\r') && (pos += 1)
cur += 1
break
elseif b == UInt8('\r')
pos <= len && buf[pos] == UInt8('\n') && (pos += 1)
cur += 1
break
end
end
pos = checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines)
cur += nlines[]
nlines[] = 0
end
return pos
end
@@ -268,7 +275,9 @@ end
return pos
end

function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
# module-level scratch counter used as the default `nlines` argument when the caller doesn't need the count of skipped lines
const NLINES = Ref{Int}(0)

function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines=NLINES)
cmtptr, cmtlen = cmt === nothing ? (C_NULL, 0) : cmt
ptr = pointer(buf, pos)
while pos <= len
@@ -278,6 +287,7 @@ function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
if newpos > pos
pos = newpos
skipped = true
nlines[] += 1
end
end
if cmtlen > 0 && (pos + cmtlen - 1) <= len
@@ -293,6 +303,7 @@ function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
end
b == UInt8('\r') && pos <= len && buf[pos + 1] == UInt8('\n') && (pos += 1)
pos += 1
nlines[] += 1
end
end
(skipped | matched) || break
10 changes: 5 additions & 5 deletions src/file.jl
@@ -139,14 +139,14 @@ tbl = CSV.File(file) |> SQLite.load!(db, "sqlite_table")
Supported keyword arguments include:
* File layout options:
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.)
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.). Note that if `header` is given as a row number and `comment` or `ignoreemptylines` is also provided, the header row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header row will actually be the next non-commented row.
* `normalizenames=false`: whether column names should be "normalized" into valid Julia identifier symbols; useful when iterating rows and accessing column values of a row via `getproperty` (e.g. `row.col1`)
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument. Note that if `datarow` is given as a row number and `comment` or `ignoreemptylines` is also provided, the data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the data row will actually be the next non-commented row.
* `skipto::Int`: identical to `datarow`, specifies the number of rows to skip before starting to read data
* `footerskip::Int`: number of rows at the end of a file to skip parsing
* `footerskip::Int`: number of rows at the end of a file to skip parsing. Note that commented rows (see the `comment` keyword argument) *do not* count towards the row number provided for `footerskip`; they are completely ignored by the parser
* `limit`: an `Int` to indicate a limited number of rows to parse in a csv file; use in combination with `skipto` to read a specific, contiguous chunk within a file; note that for large files when multiple threads are used for parsing, the `limit` argument may not result in an exact number of rows parsed; use `threaded=false` to ensure an exact limit if necessary
* `transpose::Bool`: read a csv file "transposed", i.e. each column is parsed as a row
* `comment`: rows that begin with this `String` will be skipped while parsing
* `comment`: rows that begin with this `String` will be skipped while parsing. Note that if `header` or `datarow` is given as a row number and `comment` is also provided, the header/data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header/data row will actually be the next non-commented row.
* `ignoreemptylines::Bool=true`: whether empty rows/lines in a file should be ignored (if `false`, each column will be assigned `missing` for that empty row)
* `threaded::Bool`: whether parsing should utilize multiple threads; by default threads are used on large enough files, but isn't allowed when `transpose=true`; only available in Julia 1.3+
* `tasks::Integer=Threads.nthreads()`: for multithreaded parsing, this controls the number of tasks spawned to read a file in chunks concurrently; defaults to the # of threads Julia was started with (i.e. `JULIA_NUM_THREADS` environment variable)
@@ -701,7 +701,7 @@ end
options.silencewarnings || numwarnings[] > maxwarnings || toomanycolumns(ncols, rowoffset + row)
numwarnings[] += 1
# ignore the rest of the line
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, 1, 2)
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, options.cmt, ignoreemptylines(options), 1, 2)
end
end
end
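To make the new docstring notes in `src/file.jl` concrete, here is a small illustrative sketch (not part of the diff) of `header` landing on a commented row; per the note above, the header is then taken from the next non-commented row:

```julia
using CSV

# `header=1` points at a commented row, so column names come from the next
# non-commented row ("a,b"), and data starts on the row after that.
f = CSV.File(IOBuffer("""
# generated by an export tool
a,b
1,2
"""), comment="#", header=1)

@assert f.a == [1]
@assert f.b == [2]
```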
2 changes: 1 addition & 1 deletion src/header.jl
@@ -131,7 +131,7 @@ getdf(x::AbstractDict{Int}, nm, i) = haskey(x, i) ? x[i] : nothing
lastbyte = buf[end]
endpos = (lastbyte == UInt8('\r') || lastbyte == UInt8('\n')) +
(lastbyte == UInt8('\n') && buf[end - 1] == UInt8('\r'))
revlen = skiptorow(ReversedBuf(buf), 1 + endpos, len, oq, eq, cq, 0, footerskip) - 2
revlen = skiptorow(ReversedBuf(buf), 1 + endpos, len, oq, eq, cq, cmt, ignoreemptylines, 0, footerskip) - 2
len -= revlen
debug && println("adjusted for footerskip, len = $(len + revlen - 1) => $len")
end
6 changes: 3 additions & 3 deletions src/rows.jl
@@ -70,13 +70,13 @@ end
Supported keyword arguments include:
* File layout options:
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.)
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.). Note that if `header` is given as a row number and `comment` or `ignoreemptylines` is also provided, the header row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header row will actually be the next non-commented row.
* `normalizenames=false`: whether column names should be "normalized" into valid Julia identifier symbols; useful when iterating rows and accessing column values of a row via `getproperty` (e.g. `row.col1`)
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument. Note that if `datarow` is given as a row number and `comment` or `ignoreemptylines` is also provided, the data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the data row will actually be the next non-commented row.
* `skipto::Int`: similar to `datarow`, specifies the number of rows to skip before starting to read data
* `limit`: an `Int` to indicate a limited number of rows to parse in a csv file; use in combination with `skipto` to read a specific, contiguous chunk within a file
* `transpose::Bool`: read a csv file "transposed", i.e. each column is parsed as a row
* `comment`: rows that begin with this `String` will be skipped while parsing
* `comment`: rows that begin with this `String` will be skipped while parsing. Note that if `header` or `datarow` is given as a row number and `comment` is also provided, the header/data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header/data row will actually be the next non-commented row.
* `ignoreemptylines::Bool=true`: whether empty rows/lines in a file should be ignored (if `false`, each column will be assigned `missing` for that empty row)
* Parsing options:
* `missingstrings`, `missingstring`: either a `String`, or `Vector{String}` to use as sentinel values that will be parsed as `missing`; by default, only an empty field (two consecutive delimiters) is considered `missing`
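The same comment-aware row skipping applies to `CSV.Rows`. A short hedged sketch; values come back as strings because `CSV.Rows` skips type detection by default:

```julia
using CSV

# The commented first row is skipped, "x" becomes the header, and the two
# remaining rows are streamed lazily.
for row in CSV.Rows(IOBuffer("# header comment\nx\n1\n2\n"); comment="#")
    println(row.x)   # prints "1" then "2"
end
```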
1 change: 1 addition & 0 deletions src/utils.jl
@@ -411,6 +411,7 @@ end
Base.size(a::ReversedBuf) = size(a.buf)
Base.IndexStyle(::Type{ReversedBuf}) = Base.IndexLinear()
Base.getindex(a::ReversedBuf, i::Int) = a.buf[end + 1 - i]
Base.pointer(a::ReversedBuf, pos::Integer=1) = pointer(a.buf, length(a.buf) + 1 - pos)

memset!(ptr, value, num) = ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), ptr, value, num)

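The new `Base.pointer` method exists because `checkcommentandemptyline` calls `pointer(buf, pos)`, and the `footerskip` path now runs it over a `ReversedBuf`. A hedged sketch of the mapping it provides, assuming `ReversedBuf` is the single-field wrapper around a `Vector{UInt8}` defined in `src/utils.jl`:

```julia
using CSV

# Position `pos` in the reversed view maps to byte `length(buf) + 1 - pos` of
# the underlying buffer, so the pointer for the reversed view mirrors getindex.
buf = Vector{UInt8}("abc")
rbuf = CSV.ReversedBuf(buf)

@assert rbuf[1] == UInt8('c')
@assert unsafe_load(pointer(rbuf, 1)) == UInt8('c')
@assert unsafe_load(pointer(rbuf, 3)) == UInt8('a')
```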
19 changes: 19 additions & 0 deletions test/basics.jl
@@ -545,4 +545,23 @@ f = CSV.File(codeunits("a\n1"))
@test length(f) == 1
@test f.a == [1]

# 788
f = CSV.File(IOBuffer("""
# 1'2
name
junk
1
"""), comment="#", header=2, datarow=4)
@test length(f) == 1
@test f[1].name == 1

f = CSV.File(IOBuffer("""
# 1'2"
name
junk
1
"""), comment="#", header=2, datarow=4)
@test length(f) == 1
@test f[1].name == 1

end
