Merge pull request #789 from JuliaData/jq/788
Ensure we check for commented rows when skipping rows for header/data
quinnj committed Nov 24, 2020
2 parents c94256a + 7308bb3 commit def726a
Showing 6 changed files with 54 additions and 23 deletions.
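For context, this is the scenario from #788 that the change addresses: a commented row (in the test, one containing a quote character) appears before the header and data rows, and `skiptorow` previously did not account for comments while counting rows. A minimal sketch of the now-expected behavior, mirroring the test added in this PR:

```julia
using CSV

# The first physical row is a comment containing a quote character; the header
# is physical row 2 and data starts at physical row 4.
f = CSV.File(IOBuffer("""
# 1'2"
name
junk
1
"""), comment="#", header=2, datarow=4)

@assert length(f) == 1   # one data row parsed
@assert f[1].name == 1   # the `name` column holds 1
```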
39 changes: 25 additions & 14 deletions src/detection.jl
@@ -5,22 +5,19 @@ function detectheaderdatapos(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, h
if header isa Integer
if header <= 0
# no header row in dataset; skip to data
datapos = skiptorow(buf, pos, len, oq, eq, cq, 1, datarow)
datapos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, datarow)
else
headerpos = skiptorow(buf, pos, len, oq, eq, cq, 1, header)
headerpos = checkcommentandemptyline(buf, headerpos, len, cmt, ignoreemptylines)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, header, datarow)
headerpos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, header)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, cmt, ignoreemptylines, header, datarow)
end
elseif header isa AbstractVector{<:Integer}
headerpos = skiptorow(buf, pos, len, oq, eq, cq, 1, header[1])
headerpos = checkcommentandemptyline(buf, headerpos, len, cmt, ignoreemptylines)
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, header[1], datarow)
headerpos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, header[1])
datapos = skiptorow(buf, headerpos, len, oq, eq, cq, cmt, ignoreemptylines, header[1], datarow)
elseif header isa Union{AbstractVector{Symbol}, AbstractVector{String}}
datapos = skiptorow(buf, pos, len, oq, eq, cq, 1, datarow)
datapos = skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, 1, datarow)
else
throw(ArgumentError("unsupported header argument: $header"))
end
datapos = max(1, checkcommentandemptyline(buf, datapos, len, cmt, ignoreemptylines))
return headerpos, datapos
end

@@ -163,6 +160,8 @@ function incr!(c::ByteValueCounter, b::UInt8)
return
end

# extract the `ignoreemptylines` flag from the `Parsers.Options` type parameters
ignoreemptylines(opts::Parsers.Options{ir, iel}) where {ir, iel} = iel

# given the various header and normalization options, figure out column names for a file
function detectcolumnnames(buf, headerpos, datapos, len, options, header, normalizenames)
if header isa Union{AbstractVector{Symbol}, AbstractVector{String}}
@@ -182,7 +181,7 @@ function detectcolumnnames(buf, headerpos, datapos, len, options, header, normal
elseif header isa AbstractVector{<:Integer}
names, pos = readsplitline(buf, headerpos, len, options)
for row = 2:length(header)
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, 1, header[row] - header[row - 1])
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, options.cmt, ignoreemptylines(options), 1, header[row] - header[row - 1])
fields, pos = readsplitline(buf, pos, len, options)
for (i, x) in enumerate(fields)
names[i] *= "_" * x
@@ -193,9 +192,12 @@ end
end

# efficiently skip from `cur` to `dest` row
function skiptorow(buf, pos, len, oq, eq, cq, cur, dest)
cur >= dest && return pos
for _ = 1:(dest - cur)
function skiptorow(buf, pos, len, oq, eq, cq, cmt, ignoreemptylines, cur, dest)
# skip any leading commented/empty rows, counting each one toward `cur`
nlines = Ref{Int}(0)
pos = checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines)
cur += nlines[]
nlines[] = 0
while cur < dest && pos < len
while pos <= len
@inbounds b = buf[pos]
pos += 1
@@ -217,12 +219,17 @@ function skiptorow(buf, pos, len, oq, eq, cq, cur, dest)
end
elseif b == UInt8('\n')
typeof(buf) == ReversedBuf && pos <= len && buf[pos] == UInt8('\r') && (pos += 1)
cur += 1
break
elseif b == UInt8('\r')
pos <= len && buf[pos] == UInt8('\n') && (pos += 1)
cur += 1
break
end
end
pos = checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines)
cur += nlines[]
nlines[] = 0
end
return pos
end
@@ -268,7 +275,9 @@ end
return pos
end

function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
# module-level scratch counter used as the default `nlines` argument when the caller doesn't need the count of skipped lines
const NLINES = Ref{Int}(0)

function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines, nlines=NLINES)
cmtptr, cmtlen = cmt === nothing ? (C_NULL, 0) : cmt
ptr = pointer(buf, pos)
while pos <= len
@@ -278,6 +287,7 @@ function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
if newpos > pos
pos = newpos
skipped = true
nlines[] += 1
end
end
if cmtlen > 0 && (pos + cmtlen - 1) <= len
@@ -293,6 +303,7 @@ function checkcommentandemptyline(buf, pos, len, cmt, ignoreemptylines)
end
b == UInt8('\r') && pos <= len && buf[pos + 1] == UInt8('\n') && (pos += 1)
pos += 1
nlines[] += 1
end
end
(skipped | matched) || break
10 changes: 5 additions & 5 deletions src/file.jl
@@ -139,14 +139,14 @@ tbl = CSV.File(file) |> SQLite.load!(db, "sqlite_table")
Supported keyword arguments include:
* File layout options:
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.)
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.). Note that if `header` is given as a row number and `comment` or `ignoreemptylines` is also provided, the header row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header row will actually be the next non-commented row.
* `normalizenames=false`: whether column names should be "normalized" into valid Julia identifier symbols; useful when iterating rows and accessing column values of a row via `getproperty` (e.g. `row.col1`)
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument. Note that if `datarow` is given as a row number and `comment` or `ignoreemptylines` is also provided, the data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the data row will actually be the next non-commented row.
* `skipto::Int`: identical to `datarow`, specifies the number of rows to skip before starting to read data
* `footerskip::Int`: number of rows at the end of a file to skip parsing
* `footerskip::Int`: number of rows at the end of a file to skip parsing. Note that commented rows (see the `comment` keyword argument) *do not* count towards the row number provided for `footerskip`; they are completely ignored by the parser
* `limit`: an `Int` to indicate a limited number of rows to parse in a csv file; use in combination with `skipto` to read a specific, contiguous chunk within a file; note that for large files when multiple threads are used for parsing, the `limit` argument may not result in an exact number of rows parsed; use `threaded=false` to ensure an exact limit if necessary
* `transpose::Bool`: read a csv file "transposed", i.e. each column is parsed as a row
* `comment`: rows that begin with this `String` will be skipped while parsing
* `comment`: rows that begin with this `String` will be skipped while parsing. Note that if `header` or `datarow` is given as a row number and `comment` is also provided, the header/data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header/data row will actually be the next non-commented row.
* `ignoreemptylines::Bool=true`: whether empty rows/lines in a file should be ignored (if `false`, each column will be assigned `missing` for that empty row)
* `threaded::Bool`: whether parsing should utilize multiple threads; by default threads are used on large enough files, but isn't allowed when `transpose=true`; only available in Julia 1.3+
* `tasks::Integer=Threads.nthreads()`: for multithreaded parsing, this controls the number of tasks spawned to read a file in chunks concurrently; defaults to the # of threads Julia was started with (i.e. `JULIA_NUM_THREADS` environment variable)
@@ -701,7 +701,7 @@ end
options.silencewarnings || numwarnings[] > maxwarnings || toomanycolumns(ncols, rowoffset + row)
numwarnings[] += 1
# ignore the rest of the line
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, 1, 2)
pos = skiptorow(buf, pos, len, options.oq, options.e, options.cq, options.cmt, ignoreemptylines(options), 1, 2)
end
end
end
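To make the new docstring notes in `src/file.jl` concrete, here is a small illustrative sketch (not part of the diff) of `header` landing on a commented row; per the note above, the header is then taken from the next non-commented row:

```julia
using CSV

# `header=1` points at a commented row, so column names come from the next
# non-commented row ("a,b"), and data starts on the row after that.
f = CSV.File(IOBuffer("""
# generated by an export tool
a,b
1,2
"""), comment="#", header=1)

@assert f.a == [1]
@assert f.b == [2]
```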
2 changes: 1 addition & 1 deletion src/header.jl
@@ -131,7 +131,7 @@ getdf(x::AbstractDict{Int}, nm, i) = haskey(x, i) ? x[i] : nothing
lastbyte = buf[end]
endpos = (lastbyte == UInt8('\r') || lastbyte == UInt8('\n')) +
(lastbyte == UInt8('\n') && buf[end - 1] == UInt8('\r'))
revlen = skiptorow(ReversedBuf(buf), 1 + endpos, len, oq, eq, cq, 0, footerskip) - 2
revlen = skiptorow(ReversedBuf(buf), 1 + endpos, len, oq, eq, cq, cmt, ignoreemptylines, 0, footerskip) - 2
len -= revlen
debug && println("adjusted for footerskip, len = $(len + revlen - 1) => $len")
end
6 changes: 3 additions & 3 deletions src/rows.jl
@@ -70,13 +70,13 @@ end
Supported keyword arguments include:
* File layout options:
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.)
* `header=1`: the `header` argument can be an `Int`, indicating the row to parse for column names; or a `Range`, indicating a span of rows to be concatenated together as column names; or an entire `Vector{Symbol}` or `Vector{String}` to use as column names; if a file doesn't have column names, either provide them as a `Vector`, or set `header=0` or `header=false` and column names will be auto-generated (`Column1`, `Column2`, etc.). Note that if `header` is given as a row number and `comment` or `ignoreemptylines` is also provided, the header row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header row will actually be the next non-commented row.
* `normalizenames=false`: whether column names should be "normalized" into valid Julia identifier symbols; useful when iterating rows and accessing column values of a row via `getproperty` (e.g. `row.col1`)
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data
* `datarow`: an `Int` argument to specify the row where the data starts in the csv file; by default, the next row after the `header` row is used. If `header=0`, then the 1st row is assumed to be the start of data; providing a `datarow` or `skipto` argument does _not_ affect the `header` argument. Note that if `datarow` is given as a row number and `comment` or `ignoreemptylines` is also provided, the data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the data row will actually be the next non-commented row.
* `skipto::Int`: similar to `datarow`, specifies the number of rows to skip before starting to read data
* `limit`: an `Int` to indicate a limited number of rows to parse in a csv file; use in combination with `skipto` to read a specific, contiguous chunk within a file
* `transpose::Bool`: read a csv file "transposed", i.e. each column is parsed as a row
* `comment`: rows that begin with this `String` will be skipped while parsing
* `comment`: rows that begin with this `String` will be skipped while parsing. Note that if `header` or `datarow` is given as a row number and `comment` is also provided, the header/data row will be the first non-commented/non-empty row at or after that row number, meaning if the provided row number is itself a commented row, the header/data row will actually be the next non-commented row.
* `ignoreemptylines::Bool=true`: whether empty rows/lines in a file should be ignored (if `false`, each column will be assigned `missing` for that empty row)
* Parsing options:
* `missingstrings`, `missingstring`: either a `String`, or `Vector{String}` to use as sentinel values that will be parsed as `missing`; by default, only an empty field (two consecutive delimiters) is considered `missing`
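The same comment-aware row skipping applies to `CSV.Rows`. A short hedged sketch; values come back as strings because `CSV.Rows` skips type detection by default:

```julia
using CSV

# The commented first row is skipped, "x" becomes the header, and the two
# remaining rows are streamed lazily.
for row in CSV.Rows(IOBuffer("# header comment\nx\n1\n2\n"); comment="#")
    println(row.x)   # prints "1" then "2"
end
```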
1 change: 1 addition & 0 deletions src/utils.jl
@@ -411,6 +411,7 @@ end
Base.size(a::ReversedBuf) = size(a.buf)
Base.IndexStyle(::Type{ReversedBuf}) = Base.IndexLinear()
Base.getindex(a::ReversedBuf, i::Int) = a.buf[end + 1 - i]
Base.pointer(a::ReversedBuf, pos::Integer=1) = pointer(a.buf, length(a.buf) + 1 - pos)

memset!(ptr, value, num) = ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), ptr, value, num)

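The new `Base.pointer` method exists because `checkcommentandemptyline` calls `pointer(buf, pos)`, and the `footerskip` path now runs it over a `ReversedBuf`. A hedged sketch of the mapping it provides, assuming `ReversedBuf` is the single-field wrapper around a `Vector{UInt8}` defined in `src/utils.jl`:

```julia
using CSV

# Position `pos` in the reversed view maps to byte `length(buf) + 1 - pos` of
# the underlying buffer, so the pointer for the reversed view mirrors getindex.
buf = Vector{UInt8}("abc")
rbuf = CSV.ReversedBuf(buf)

@assert rbuf[1] == UInt8('c')
@assert unsafe_load(pointer(rbuf, 1)) == UInt8('c')
@assert unsafe_load(pointer(rbuf, 3)) == UInt8('a')
```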
19 changes: 19 additions & 0 deletions test/basics.jl
@@ -545,4 +545,23 @@ f = CSV.File(codeunits("a\n1"))
@test length(f) == 1
@test f.a == [1]

# 788
f = CSV.File(IOBuffer("""
# 1'2
name
junk
1
"""), comment="#", header=2, datarow=4)
@test length(f) == 1
@test f[1].name == 1

f = CSV.File(IOBuffer("""
# 1'2"
name
junk
1
"""), comment="#", header=2, datarow=4)
@test length(f) == 1
@test f[1].name == 1

end
