-
Notifications
You must be signed in to change notification settings - Fork 52
/
Tables.jl
727 lines (593 loc) · 35 KB
/
Tables.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
module Tables
using LinearAlgebra, DataValueInterfaces, DataAPI, TableTraits, IteratorInterfaceExtensions, OrderedCollections
export rowtable, columntable
# Allow integer "property" access on tuples: `getproperty(t, 2)` returns `t[2]`.
# The `hasmethod` guard adds the method only when no applicable
# `getproperty(::Tuple, ::Int)` method exists yet.
# NOTE(review): this extends a Base function on a Base type (type piracy); presumably it
# is needed so plain tuples can be used as rows — confirm it is still required.
if !hasmethod(getproperty, Tuple{Tuple, Int})
Base.getproperty(t::Tuple, i::Int) = t[i]
end
import Base: ==
"""
Tables.AbstractColumns
An interface type defined as an ordered set of columns that support
retrieval of individual columns by name or index. A retrieved column
must be a 1-based indexable collection with known length, i.e. an object
that supports `length(col)` and `col[i]` for any `i = 1:length(col)`.
`Tables.columns` must return an object that satisfies the `Tables.AbstractColumns` interface.
While `Tables.AbstractColumns` is an abstract type that custom "columns" types may subtype for
useful default behavior (indexing, iteration, property-access, etc.), users should not use it
for dispatch, as Tables.jl interface objects **are not required** to subtype, but only
implement the required interface methods.
Interface definition:
| Required Methods | Default Definition | Brief Description |
|----------------------------------------------------------|-----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Tables.getcolumn(table, i::Int)` | getfield(table, i) | Retrieve a column by index |
| `Tables.getcolumn(table, nm::Symbol)` | getproperty(table, nm) | Retrieve a column by name |
| `Tables.columnnames(table)` | propertynames(table) | Return column names for a table as an indexable collection |
| **Optional methods** | | |
| `Tables.getcolumn(table, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(table, nm) | Given a column eltype `T`, index `i`, and column name `nm`, retrieve the column. Provides a type-stable or even constant-prop-able mechanism for efficiency. |
Note that subtypes of `Tables.AbstractColumns` **must** overload all required methods listed
above instead of relying on these methods' default definitions.
While types aren't required to subtype `Tables.AbstractColumns`, benefits of doing so include:
* Indexing interface defined (using `getcolumn`); i.e. `tbl[i]` will retrieve the column at index `i`
* Property access interface defined (using `columnnames` and `getcolumn`); i.e. `tbl.col1` will retrieve column named `col1`
* Iteration interface defined; i.e. `for col in table` will iterate each column in the table
* `AbstractDict` methods defined (`get`, `haskey`, etc.) for checking and retrieving columns
* A default `show` method
This allows a custom table type to behave as close as possible to a builtin `NamedTuple` of vectors object.
"""
abstract type AbstractColumns end  # optional supertype; subtypes pick up the generic Base fallbacks defined for it in this module
"""
Tables.AbstractRow
Abstract interface type representing the expected `eltype` of the iterator returned from `Tables.rows(table)`.
`Tables.rows` must return an iterator of elements that satisfy the `Tables.AbstractRow` interface.
While `Tables.AbstractRow` is an abstract type that custom "row" types may subtype for
useful default behavior (indexing, iteration, property-access, etc.), users should not use it
for dispatch, as Tables.jl interface objects **are not required** to subtype, but only
implement the required interface methods.
Interface definition:
| Required Methods | Default Definition | Brief Description |
|--------------------------------------------------------|---------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Tables.getcolumn(row, i::Int)` | getfield(row, i) | Retrieve a column value by index |
| `Tables.getcolumn(row, nm::Symbol)` | getproperty(row, nm) | Retrieve a column value by name |
| `Tables.columnnames(row)` | propertynames(row) | Return column names for a row as an indexable collection |
| **Optional methods** | | |
| `Tables.getcolumn(row, ::Type{T}, i::Int, nm::Symbol)` | Tables.getcolumn(row, nm) | Given a column element type `T`, index `i`, and column name `nm`, retrieve the column value. Provides a type-stable or even constant-prop-able mechanism for efficiency. |
Note that subtypes of `Tables.AbstractRow` **must** overload all required methods listed above
instead of relying on these methods' default definitions.
While custom row types aren't required to subtype `Tables.AbstractRow`, benefits of doing so include:
* Indexing interface defined (using `getcolumn`); i.e. `row[i]` will return the column value at index `i`
* Property access interface defined (using `columnnames` and `getcolumn`); i.e. `row.col1` will retrieve the value for the column named `col1`
* Iteration interface defined; i.e. `for x in row` will iterate each column value in the row
* `AbstractDict` methods defined (`get`, `haskey`, etc.) for checking and retrieving column values
* A default `show` method
This allows the custom row type to behave as close as possible to a builtin `NamedTuple` object.
"""
abstract type AbstractRow end  # optional supertype; subtypes pick up the generic Base fallbacks defined for it in this module
"""
Tables.getcolumn(::AbstractColumns, nm::Symbol) => Indexable collection with known length
Tables.getcolumn(::AbstractColumns, i::Int) => Indexable collection with known length
Tables.getcolumn(::AbstractColumns, T, i::Int, nm::Symbol) => Indexable collection with known length
Tables.getcolumn(::AbstractRow, nm::Symbol) => Column value
Tables.getcolumn(::AbstractRow, i::Int) => Column value
Tables.getcolumn(::AbstractRow, T, i::Int, nm::Symbol) => Column value
Retrieve an entire column (from `AbstractColumns`) or single row column value (from an `AbstractRow`) by column name (`nm`), index (`i`),
or if desired, by column element type (`T`), index (`i`), and name (`nm`). When called on an `AbstractColumns` interface object,
the returned object should be an indexable collection with known length. When called on an `AbstractRow` interface
object, it returns the single column value. The methods taking a single `Symbol` or `Int` are both required
for the `AbstractColumns` and `AbstractRow` interfaces; the third method is optional if type stability is possible.
The default definition of `Tables.getcolumn(x, i::Int)` is `getfield(x, i)`. The default definition of
`Tables.getcolumn(x, nm::Symbol)` is `getproperty(x, nm)`.
"""
function getcolumn end

# Generic fallback: positional lookup reads the `i`-th field of `x`.
function getcolumn(x, i::Int)
    return getfield(x, i)
end

# Generic fallback: lookup by name goes through property access.
function getcolumn(x, nm::Symbol)
    return getproperty(x, nm)
end

# The typed form ignores the element type and index unless a table overloads it.
function getcolumn(x, ::Type{T}, i::Int, nm::Symbol) where {T}
    return getcolumn(x, nm)
end

# For a `NamedTuple` the typed form reads the field by position.
function getcolumn(
    x::NamedTuple{names, types}, ::Type{T}, i::Int, nm::Symbol
) where {names, types, T}
    return Core.getfield(x, i)
end
"""
Tables.columnnames(::Union{AbstractColumns, AbstractRow}) => Indexable collection
Retrieves the list of column names as an indexable collection (like a `Tuple` or `Vector`)
for an `AbstractColumns` or `AbstractRow` interface object.
The default definition calls `propertynames(x)`.
The returned column names must be unique.
"""
function columnnames end

# Fallback: an object's property names double as its column names.
function columnnames(x)
    return propertynames(x)
end
"""
Tables.isrowtable(x) => Bool
For convenience, some table objects that are naturally "row oriented" can
define `Tables.isrowtable(::Type{TableType}) = true` to simplify satisfying the
Tables.jl interface. Requirements for defining `isrowtable` include:
* `Tables.rows(x) === x`, i.e. the table object itself is a `Row` iterator
* If the table object is mutable, it should support:
* `push!(x, row)`: allow pushing a single row onto table
* `append!(x, rows)`: allow appending set of rows onto table
* If table object is mutable and indexable, it should support:
* `x[i] = row`: allow replacing of a row with another row by index
A table object that defines `Tables.isrowtable` will have definitions for
`Tables.istable`, `Tables.rowaccess`, and `Tables.rows` automatically defined.
"""
function isrowtable end

# Instances defer to the trait defined on their type.
function isrowtable(::T) where {T}
    return isrowtable(T)
end

# Types are not row tables unless they opt in.
function isrowtable(::Type{T}) where {T}
    return false
end

# Dedicated method for `AbstractVector{Union{}}` types, present only to avoid ambiguities.
function isrowtable(::Type{T}) where {T <: AbstractVector{Union{}}}
    return false
end
# Fallbacks that let an `AbstractDict` serve as an `AbstractColumns` or `AbstractRow`.

# Symbol-keyed dictionaries: the keys are the column names.
function columnnames(x::AbstractDict{Symbol})
    return collect(keys(x))
end

function getcolumn(x::AbstractDict{Symbol}, i::Int)
    nm = columnnames(x)[i]
    return x[nm]
end

function getcolumn(x::AbstractDict{Symbol}, nm::Symbol)
    return x[nm]
end

function getcolumn(x::AbstractDict{Symbol}, ::Type{T}, i::Int, nm::Symbol) where {T}
    return x[nm]
end

# String-keyed dictionaries: names are exposed as `Symbol`s and converted back to
# `String` for the lookup.
function columnnames(x::AbstractDict{String})
    return [Symbol(k) for k in keys(x)]
end

function getcolumn(x::AbstractDict{String}, i::Int)
    nm = columnnames(x)[i]
    return x[String(nm)]
end

function getcolumn(x::AbstractDict{String}, nm::Symbol)
    return x[String(nm)]
end

function getcolumn(x::AbstractDict{String}, ::Type{T}, i::Int, nm::Symbol) where {T}
    return x[String(nm)]
end
# A vector of `String`- or `Symbol`-keyed dictionaries is accepted as a row table.
const DictRows = AbstractVector{<:Union{AbstractDict{String}, AbstractDict{Symbol}}}

function isrowtable(::Type{<:DictRows})
    return true
end

# No schema is reported for `DictRows`: value types may change from row to row
# (e.g. `missing`, then `1.1`), so inspecting only the first row would be unsafe.
# Returning `nothing` is the safest option.
function schema(x::DictRows)
    return nothing
end
# Dict of AbstractVectors for Tables.columns: a dictionary keyed by `Symbol` or `String`
# whose values are vectors is itself a column-access table.
const DictColumns = AbstractDict{K, V} where {K <: Union{Symbol, String}, V <: AbstractVector}
istable(::Type{<:DictColumns}) = true
columnaccess(::Type{<:DictColumns}) = true
# the dictionary already is the columns object, so no copy is made
columns(x::DictColumns) = x
# names come from the keys, element types from each value vector
schema(x::DictColumns) = Schema(collect(keys(x)), eltype.(values(x)))
# for other AbstractDict, let's throw an informative error; the message lists both key
# types that `DictColumns` accepts
columns(x::T) where {T <: AbstractDict} = error("to treat $T as a table, it must have a key type of `Symbol` or `String`, and a value type `<: AbstractVector`")
# Shared fallbacks for everything that subtypes `AbstractRow` or `AbstractColumns`.
const RorC = Union{AbstractRow, AbstractColumns}

# Subtypes have to supply their own accessors. Erroring here avoids mutual recursion
# with the default definitions (issue #221).
function getcolumn(::T, ::Int) where {T <: RorC}
    error("`Tables.getcolumn` must be specifically overloaded for $T <: Union{AbstractRow, AbstractColumns}`")
end

function getcolumn(::T, ::Symbol) where {T <: RorC}
    error("`Tables.getcolumn` must be specifically overloaded for $T <: Union{AbstractRow, AbstractColumns}`")
end

function columnnames(::T) where {T <: RorC}
    error("`Tables.columnnames` must be specifically overloaded for $T <: Union{AbstractRow, AbstractColumns}`")
end
# Size and indexing traits: a row/columns object is as long as its list of column names
# and is indexed linearly starting at 1.
function Base.IteratorSize(::Type{<:RorC})
    return Base.HasLength()
end

function Base.length(r::RorC)
    return length(columnnames(r))
end

function Base.IndexStyle(::Type{<:RorC})
    return Base.IndexLinear()
end

function Base.firstindex(r::RorC)
    return 1
end

function Base.lastindex(r::RorC)
    return length(r)
end

# Indexing and property access all forward to `getcolumn`.
function Base.getindex(r::RorC, i::Int)
    return getcolumn(r, i)
end

function Base.getindex(r::RorC, nm::Symbol)
    return getcolumn(r, nm)
end

function Base.getproperty(r::RorC, nm::Symbol)
    return getcolumn(r, nm)
end

function Base.getproperty(r::RorC, i::Int)
    return getcolumn(r, i)
end

# Property names and keys are the column names; values are gathered by iterating.
function Base.propertynames(r::RorC)
    return columnnames(r)
end

function Base.keys(r::RorC)
    return columnnames(r)
end

function Base.values(r::RorC)
    return collect(r)
end
# Dictionary-style lookups on rows/columns.
#
# `get` has always accepted any `Integer` key, but `haskey` and `getcolumn` only had `Int`
# methods, so e.g. `get(row, Int32(1), default)` raised a `MethodError`. `haskey` now
# accepts any `Integer`, and integer keys are converted to `Int` before reaching
# `getcolumn`.
Base.haskey(r::RorC, key::Symbol) = key in columnnames(r)
Base.haskey(r::RorC, i::Integer) = 0 < i <= length(columnnames(r))

# Normalize a lookup key to what the `getcolumn` methods dispatch on.
_columnkey(key::Symbol) = key
_columnkey(key::Integer) = Int(key)

# Return the column (value) stored under `key`, or `default` when `key` is not present.
function Base.get(r::RorC, key::Union{Integer, Symbol}, default)
    return haskey(r, key) ? getcolumn(r, _columnkey(key)) : default
end

# Same as above, but the fallback value is produced lazily by calling `f()`.
function Base.get(f::Base.Callable, r::RorC, key::Union{Integer, Symbol})
    return haskey(r, key) ? getcolumn(r, _columnkey(key)) : f()
end
# Iteration walks the columns (or column values) by position; the state is the next index.
function Base.iterate(r::RorC, i=1)
    i > length(r) && return nothing
    return (getcolumn(r, i), i + 1)
end

function Base.isempty(r::RorC)
    return length(r) == 0
end
# Build a `NamedTuple` holding every column (or column value) of `r` under its name.
function Base.NamedTuple(r::RorC)
    colnames = columnnames(r)
    fieldnames_tuple = Tuple(map(Symbol, colnames))
    fieldvalues = Tuple(getcolumn(r, nm) for nm in colnames)
    return NamedTuple{fieldnames_tuple}(fieldvalues)
end
# Display a row. With `:compact` or `:limit` set on `io` the row is printed on one line
# as a `NamedTuple`; otherwise names and values are printed as a two-column matrix.
function Base.show(io::IO, x::T) where {T <: AbstractRow}
    oneline = get(io, :compact, false) || get(io, :limit, false)
    if !oneline
        println(io, "$T:")
        nms = collect(columnnames(x))
        vals = [getcolumn(x, nm) for nm in nms]
        Base.print_matrix(io, hcat(nms, vals))
    else
        print(io, "$T: ")
        show(io, NamedTuple(x))
    end
end
# Display a columns object: a one-line summary (type, row count, column count) followed
# by its schema when one is available. `max_cols` is accepted but not used here.
function Base.show(io::IO, table::AbstractColumns; max_cols = 20)
    ncols = length(columnnames(table))
    summary_line = string(
        typeof(table), " with ", rowcount(table), " rows, ", ncols, " columns, and "
    )
    print(io, summary_line)
    sch = schema(table)
    if sch === nothing
        print(io, "an unknown schema.")
    else
        print(io, "schema:\n")
        # the summary line already introduces the schema, so suppress its own header
        show(IOContext(io, :print_schema_header => false), sch)
    end
end
# A vector whose elements subtype `AbstractRow` is a row table; no schema is reported.
const AbstractRowTable = AbstractVector{<:AbstractRow}

function isrowtable(::Type{<:AbstractRowTable})
    return true
end

function schema(x::AbstractRowTable)
    return nothing
end

# Any `AbstractColumns` subtype is a column-access table and is its own columns object;
# its schema is unknown unless a subtype overloads `schema`.
function istable(::Type{<:AbstractColumns})
    return true
end

function columnaccess(::Type{<:AbstractColumns})
    return true
end

function columns(x::AbstractColumns)
    return x
end

function schema(x::AbstractColumns)
    return nothing
end
"""
Tables.Row(row)
Convenience type to wrap any `AbstractRow` interface object in a dedicated struct
to provide useful default behaviors (allows any `AbstractRow` to be used like a `NamedTuple`):
* Indexing interface defined; i.e. `row[i]` will return the column value at index `i`, `row[nm]` will return column value for column name `nm`
* Property access interface defined; i.e. `row.col1` will retrieve the value for the column named `col1`
* Iteration interface defined; i.e. `for x in row` will iterate each column value in the row
* `AbstractDict` methods defined (`get`, `haskey`, etc.) for checking and retrieving column values
"""
struct Row{T} <: AbstractRow
    x::T  # the wrapped row object
end

# Wrapping something that is already a `Row` returns it unchanged.
function Row(x::Row)
    return x
end
"""
Tables.Columns(tbl)
Convenience type that calls `Tables.columns` on an input `tbl` and wraps the resulting `AbstractColumns` interface object in a dedicated struct
to provide useful default behaviors (allows any `AbstractColumns` to be used like a `NamedTuple` of `Vectors`):
* Indexing interface defined; i.e. `cols[i]` will return the column at index `i`, `cols[nm]` will return the column for column name `nm`
* Property access interface defined; i.e. `cols.col1` will retrieve the column named `col1`
* Iteration interface defined; i.e. `for col in cols` will iterate each column in the table
* `AbstractDict` methods defined (`get`, `haskey`, etc.) for checking and retrieving columns
Note that `Tables.Columns` calls `Tables.columns` internally on the provided table argument.
`Tables.Columns` can be used for dispatch if needed.
"""
struct Columns{T} <: AbstractColumns
    x::T  # the object returned by `Tables.columns` for the wrapped table

    # The input is always passed through `columns` first; the type parameter is the
    # type of that result, not of the original input.
    function Columns(tbl)
        wrapped = columns(tbl)
        return new{typeof(wrapped)}(wrapped)
    end
end

# Wrapping something that is already a `Columns` returns it unchanged.
function Columns(x::Columns)
    return x
end
# A `Columns` always wraps the columns of a table, so the wrapped object's schema is
# passed straight through.
function schema(x::Columns)
    return schema(getx(x))
end

const RorC2 = Union{Row, Columns}

# Unwrap via `getfield`: `getproperty` on these wrappers is a column lookup, so `x.x`
# would not return the field.
function getx(x::RorC2)
    return getfield(x, :x)
end

# The interface methods forward to the wrapped object.
function getcolumn(x::RorC2, i::Int)
    return getcolumn(getx(x), i)
end

function getcolumn(x::RorC2, nm::Symbol)
    return getcolumn(getx(x), nm)
end

function getcolumn(x::RorC2, ::Type{T}, i::Int, nm::Symbol) where {T}
    return getcolumn(getx(x), T, i, nm)
end

function columnnames(x::RorC2)
    return columnnames(getx(x))
end
"""
Tables.istable(x) => Bool
Check if an object has specifically defined that it is a table. Note that
not all valid tables will return true, since it's possible to satisfy the
Tables.jl interface at "run-time", e.g. a `Generator` of `NamedTuple`s iterates
`NamedTuple`s, which satisfies the `AbstractRow` interface, but there's no static way
of knowing that the generator is a table.
It is recommended that for users implementing `MyType`, they define only
`istable(::Type{MyType})`. `istable(::MyType)` will then automatically delegate to this
method.
"""
function istable end

# Instances: first ask the type-level trait, then fall back to the TableTraits.jl check.
function istable(x::T) where {T}
    istable(T) && return true
    return TableTraits.isiterabletable(x) === true
end

# Types: a type is a table by default exactly when it is declared a row table.
function istable(::Type{T}) where {T}
    return isrowtable(T)
end

# Dedicated methods for `AbstractVector{Union{}}`, present only to avoid ambiguities.
function istable(::Type{T}) where {T <: AbstractVector{Union{}}}
    return false
end

function istable(::AbstractVector{Union{}})
    return false
end
"""
Tables.rowaccess(x) => Bool
Check whether an object has specifically defined that it implements the `Tables.rows`
function that does _not_ copy table data. That is to say, `Tables.rows(x)` must be done
with O(1) time and space complexity when `Tables.rowaccess(x) == true`. Note that
`Tables.rows` will work on any object that iterates `AbstractRow`-compatible objects, even if
they don't define `rowaccess`, e.g. a `Generator` of `NamedTuple`s. However, this
generic fallback may copy the data from input table `x`. Also note that just because
an object defines `rowaccess` doesn't mean a user should call `Tables.rows` on it;
`Tables.columns` will also work, providing a valid `AbstractColumns` object from the rows.
Hence, users should call `Tables.rows` or `Tables.columns` depending on what is most
natural for them to *consume* instead of worrying about what and how the input is oriented.
It is recommended that for users implementing `MyType`, they define only
`rowaccess(::Type{MyType})`. `rowaccess(::MyType)` will then automatically delegate to this
method.
"""
function rowaccess end

# Instances defer to the trait defined on their type.
function rowaccess(x::T) where {T}
    return rowaccess(T)
end

# By default a type has row access exactly when it is declared a row table.
function rowaccess(::Type{T}) where {T}
    return isrowtable(T)
end
"""
Tables.columnaccess(x) => Bool
Check whether an object has specifically defined that it implements the `Tables.columns`
function that does _not_ copy table data. That is to say, `Tables.columns(x)` must be done
with O(1) time and space complexity when `Tables.columnaccess(x) == true`. Note that
`Tables.columns` has generic fallbacks allowing it to produce `AbstractColumns` objects, even if
the input doesn't define `columnaccess`. However, this generic fallback may copy the data
from input table `x`. Also note that just because an object defines `columnaccess` doesn't
mean a user should call `Tables.columns` on it; `Tables.rows` will also work, providing a
valid `AbstractRow` iterator. Hence, users should call `Tables.rows` or `Tables.columns` depending
on what is most natural for them to *consume* instead of worrying about what and how the
input is oriented.
It is recommended that for users implementing `MyType`, they define only
`columnaccess(::Type{MyType})`. `columnaccess(::MyType)` will then automatically delegate to
this method.
"""
function columnaccess end

# Instances defer to the trait defined on their type.
function columnaccess(x::T) where {T}
    return columnaccess(T)
end

# Column access is opt-in: types do not have it unless they say so.
function columnaccess(::Type{T}) where {T}
    return false
end
"""
Tables.schema(x) => Union{Nothing, Tables.Schema}
Attempt to retrieve the schema of the object returned by `Tables.rows` or `Tables.columns`.
If the `AbstractRow` iterator or `AbstractColumns` object can't determine its schema, `nothing` will be returned.
Otherwise, a `Tables.Schema` object is returned, with the column names and types available for use.
"""
function schema end

# Fallback: the schema is unknown.
function schema(x)
    return nothing
end
"""
Tables.materializer(x) => Callable
For a table input, return the "sink" function or "materializing" function that can take a
Tables.jl-compatible table input and make an instance of the table type. This enables "transform"
workflows that take table inputs, apply transformations, potentially converting the table to
a different form, and end with producing a table of the same type as the original input. The
default materializer is `Tables.columntable`, which converts any table input into a `NamedTuple`
of `Vector`s.
It is recommended that for users implementing `MyType`, they define only
`materializer(::Type{<:MyType})`. `materializer(::MyType)` will then automatically delegate to
this method.
"""
function materializer end

# Instances defer to the definition for their type.
function materializer(x::T) where {T}
    return materializer(T)
end

# Default sink: `columntable`.
function materializer(::Type{T}) where {T}
    return columntable
end
"""
Tables.columns(x) => AbstractColumns-compatible object
Accesses data of input table source `x` by returning an [`AbstractColumns`](@ref)-compatible
object, which allows retrieving entire columns by name or index. A retrieved column
is a 1-based indexable object that has a known length, i.e. supports
`length(col)` and `col[i]` for any `i = 1:length(col)`. Note that
even if the input table source is row-oriented by nature, an efficient generic
definition of `Tables.columns` is defined in Tables.jl to build an `AbstractColumns`-
compatible object from the input rows.
The [`Tables.Schema`](@ref) of an `AbstractColumns` object can be queried via `Tables.schema(columns)`,
which may return `nothing` if the schema is unknown.
Column names can always be queried by calling `Tables.columnnames(columns)`, and individual columns
can be accessed by calling `Tables.getcolumn(columns, i::Int)` or `Tables.getcolumn(columns, nm::Symbol)`
with a column index or name, respectively.
"""
function columns end  # declaration only; methods are attached per table type elsewhere in the module
"""
Tables.rows(x) => Row iterator
Accesses data of input table source `x` row-by-row by returning an [`AbstractRow`](@ref)-compatible iterator.
Note that even if the input table source is column-oriented by nature, an efficient generic
definition of `Tables.rows` is defined in Tables.jl to return an iterator of row views into
the columns of the input.
The [`Tables.Schema`](@ref) of an `AbstractRow` iterator can be queried via `Tables.schema(rows)`,
which may return `nothing` if the schema is unknown.
Column names can always be queried by calling `Tables.columnnames(row)` on an individual row,
and row values can be accessed by calling `Tables.getcolumn(row, i::Int)` or
`Tables.getcolumn(row, nm::Symbol)` with a column index or name, respectively.
See also [`rowtable`](@ref) and [`namedtupleiterator`](@ref).
"""
function rows end  # declaration only; methods are attached per table type elsewhere in the module
# Schema implementation
"""
Tables.Schema(names, types)
Create a `Tables.Schema` object that holds the column names and types for an `AbstractRow` iterator
returned from `Tables.rows` or an `AbstractColumns` object returned from `Tables.columns`.
`Tables.Schema` is dual-purposed: provide an easy interface for users to query these properties,
as well as provide a convenient "structural" type for code generation.
To get a table's schema, one can call `Tables.schema` on the result of `Tables.rows` or `Tables.columns`,
but also note that a table may return `nothing`, indicating that its column names and/or column element types
are unknown (usually not inferrable). This is similar to the `Base.EltypeUnknown()` trait for iterators
when `Base.IteratorEltype` is called. Users should account for the `Tables.schema(tbl) => nothing` case
by using the properties of the results of `Tables.rows(x)` and `Tables.columns(x)` directly.
To access the names, one can simply call `sch.names` to return a collection of Symbols (`Tuple` or `Vector`).
To access column element types, one can similarly call `sch.types`, which will return a collection of types (like `(Int64, Float64, String)`).
The actual type definition is
```julia
struct Schema{names, types}
storednames::Union{Nothing, Vector{Symbol}}
storedtypes::Union{Nothing, Vector{Type}}
end
```
Where `names` is a tuple of `Symbol`s or `nothing`, and `types` is a tuple _type_ of types (like `Tuple{Int64, Float64, String}`) or `nothing`.
Encoding the names & types as type parameters allows convenient use of the type in generated functions
and other optimization use-cases, but users should note that when `names` and/or `types` are the `nothing` value, the names and/or types
are stored in the `storednames` and `storedtypes` fields. This is to account for extremely wide tables with columns in the 10s of thousands
where encoding the names/types as type parameters becomes prohibitive to the compiler. So while optimizations can be written on the typed
`names`/`types` type parameters, users should also consider handling the extremely wide tables by specializing on `Tables.Schema{nothing, nothing}`.
"""
struct Schema{names, types}
# Column names when they are not encoded in the `names` type parameter; `nothing` otherwise.
storednames::Union{Nothing, Vector{Symbol}}
# Column element types when they are not encoded in the `types` type parameter; `nothing` otherwise.
storedtypes::Union{Nothing, Vector{Type}}
end
# Fully parameterized schema: nothing needs to be stored in the fields.
function Schema{names, types}() where {names, types}
    return Schema{names, types}(nothing, nothing)
end

# Build from a tuple of names plus a tuple *type* of column element types.
function Schema(names::Tuple{Vararg{Symbol}}, ::Type{T}) where {T <: Tuple}
    return Schema{names, T}()
end

# Build from a `NamedTuple` type, reusing its names and field types.
function Schema(::Type{NamedTuple{names, types}}) where {names, types}
    return Schema{names, types}()
end

# whether names/types are stored or not
function stored(::Schema{names, types}) where {names, types}
    return names === nothing && types === nothing
end

function stored(::Nothing)
    return false
end
# Normalize a column name to a `Symbol`; `Int`s pass through unchanged so that
# tuples can act as rows.
function sym(x)
    return Symbol(x)
end

function sym(x::Int)
    return x
end

# Schema with known names but unknown column element types.
function Schema(names, ::Nothing)
    return Schema{Tuple(map(sym, names)), nothing}()
end
# Above this many columns, names/types are kept in the fields rather than in the
# type parameters.
const SCHEMA_SPECIALIZATION_THRESHOLD = (2^16) - 1

# General constructor. Names and types go into the type parameters unless `stored=true`
# is requested or the table is wider than `SCHEMA_SPECIALIZATION_THRESHOLD`, in which
# case they are placed in the `storednames`/`storedtypes` fields instead.
function Schema(names, types; stored::Bool=false)
    parameterize = !stored && length(names) <= SCHEMA_SPECIALIZATION_THRESHOLD
    if parameterize
        return Schema{Tuple(map(sym, names)), Tuple{types...}}()
    end
    return Schema{nothing, nothing}([sym(x) for x in names], Type[T for T in types])
end
# Print the schema as a two-column matrix of names and types. The "Tables.Schema:"
# header can be suppressed by setting `:print_schema_header => false` on `io`.
function Base.show(io::IO, sch::Schema)
    if get(io, :print_schema_header, true)
        println(io, "Tables.Schema:")
    end
    nms = sch.names
    tps = sch.types
    namecol = nms isa Vector ? nms : collect(nms)
    # unknown types are shown as a column of `nothing`
    typecol = tps === nothing ? fill(nothing, length(nms)) : collect(tps)
    Base.print_matrix(io, hcat(namecol, typecol))
end
# `sch.names` / `sch.types` are virtual properties: they come from the type parameters
# when those are set, and from the stored fields otherwise. Any other property name
# throws an `ArgumentError`.
function Base.getproperty(sch::Schema{names, types}, field::Symbol) where {names, types}
    if field === :names
        names === nothing && return getfield(sch, :storednames)
        return names
    elseif field === :types
        # with no type parameter, fall back to the stored types (which may be `nothing`)
        types === nothing && return getfield(sch, :storedtypes)
        return Tuple(fieldtype(types, i) for i = 1:fieldcount(types))
    end
    throw(ArgumentError("unsupported property for Tables.Schema"))
end
# Only the virtual properties are advertised; the storage fields stay internal.
Base.propertynames(::Schema) = (:names, :types)

# Two schemas are equal when their names and their types compare equal.
==(a::Schema, b::Schema) = a.names == b.names && a.types == b.types

# `hash` is kept consistent with `==` above: without it, two equal schemas whose names
# and types live in the stored fields would hash differently, which breaks their use as
# `Dict` keys or `Set` elements.
Base.hash(sch::Schema, h::UInt) = hash(sch.types, hash(sch.names, hash(:Schema, h)))
# partitions
"""
Tables.partitions(x)
Request a "table" iterator from `x`. Each iterated element must be a "table" in the sense
that one may call `Tables.rows` or `Tables.columns` to get a row-iterator or collection
of columns. All iterated elements _must_ have identical schema, so that users may call
`Tables.schema(first_element)` on the first iterated element and know that each
subsequent iteration will match the same schema. The default definition is:
```julia
Tables.partitions(x) = (x,)
```
So that any input is assumed to be a single "table". This means users should feel free
to call `Tables.partitions` anywhere they're currently calling `Tables.columns` or
`Tables.rows`, and get back an iterator of those instead. In other words, "sink" functions
can use `Tables.partitions` whether or not the user passes a partitionable table, since the
default is to treat a single input as a single, non-partitioned table.
[`Tables.partitioner(itr)`](@ref) is a convenience wrapper to provide table partitions
from any table iterator; this allows for easy wrapping of a `Vector` or iterator of tables
as valid partitions, since by default, they'd be treated as a single table.
A 2nd convenience method is provided with the definition:
```julia
Tables.partitions(x...) = x
```
That allows passing vararg tables and they'll be treated as separate partitions. Sink
functions may allow vararg table inputs and can "splat them through" to `partitions`.
For convenience, `Tables.partitions(Iterators.partition(...))` is defined for cases
where user-controlled partitioning is desired over an applicable input (an input iterator).
"""
function partitions(x)
    return (x,)
end

# Several tables passed as separate arguments are treated as separate partitions.
function partitions(x...)
    return x
end

# An `Iterators.partition` iterator is returned as-is.
function partitions(x::Iterators.PartitionIterator)
    return x
end
"""
Tables.LazyTable(f, arg)
A "table" type that delays materialization until `Tables.columns` or `Tables.rows` is called.
This allows, for example, sending a `LazyTable` to a remote process or thread which can
then call `Tables.columns` or `Tables.rows` to "materialize" the table. Is used by default
in `Tables.partitioner(f, itr)` where a materializer function `f` is passed to each element
of an iterable `itr`, allowing distributed/concurrent patterns like:
```julia
for tbl in Tables.partitions(Tables.partitioner(CSV.File, list_of_csv_files))
Threads.@spawn begin
cols = Tables.columns(tbl)
# do stuff with cols
end
end
```
In this example, `CSV.File` will be called like `CSV.File(x)` for each element of the
`list_of_csv_files` iterable, but _not until_ `Tables.columns(tbl)` is called, which
in this case happens in a thread-spawned task, allowing files to be parsed and processed
in parallel.
"""
struct LazyTable{F, T}
    f::F  # materializer, called as `f(x)`
    x::T  # argument handed to the materializer
end

# Materialization happens only here: `f(x)` is evaluated on each call and the result
# is passed on to `columns` / `rows`.
function columns(x::LazyTable)
    materialized = x.f(x.x)
    return columns(materialized)
end

function rows(x::LazyTable)
    materialized = x.f(x.x)
    return rows(materialized)
end
# Single-field wrapper type.
# NOTE(review): presumably wraps the iterator of tables produced by `Tables.partitioner`;
# the methods that consume it are outside this chunk — confirm.
struct Partitioner{T}
x::T
end
"""
Tables.subset(x, inds; viewhint=nothing)
Return one or more rows from table `x` according to the position(s) specified by `inds`:
- If `inds` is a single non-boolean integer return a row object.
- If `inds` is a vector of non-boolean integers, a vector of booleans, or a `:`, return a subset of the original table according to the indices.
In this case, the returned type is not necessarily the same as the original table type.
If other types of `inds` are passed than specified above the behavior is undefined.
The `viewhint` argument tries to influence whether the returned object is a view of the original table
or an independent copy:
- If `viewhint=nothing` (the default) then the implementation for a specific table type
is free to decide whether to return a copy or a view.
- If `viewhint=true` then a view is returned and if `viewhint=false` a copy is returned.
This applies both to returning a row or a table.
Any specialized implementation of `subset` must support the `viewhint=nothing` argument.
Support for `viewhint=true` or `viewhint=false` is optional
(i.e. implementations may ignore the keyword argument and return a view or a copy regardless of `viewhint` value).
"""
function subset(x::T, inds; viewhint::Union{Bool, Nothing}=nothing, view::Union{Bool, Nothing}=nothing) where {T}
    # `view` is the deprecated spelling of `viewhint`; when given, it takes precedence.
    # Because the keyword shadows `Base.view`, the function is called with its
    # qualified name below.
    if view !== nothing
        @warn "`view` keyword argument is deprecated for `Tables.subset`, use `viewhint` instead"
        viewhint = view
    end
    # because this method is being called, we know `x` didn't define its own Tables.subset
    # first check if it supports column access, and if so, apply inds and wrap columns in a DictColumnTable
    if columnaccess(x)
        cols = columns(x)
        if inds isa Integer
            # a single index yields a row over the columns
            return ColumnsRow(cols, inds)
        else
            # only an explicit `viewhint=true` produces views; `nothing`/`false` copy
            ret = viewhint === true ? _map(c -> Base.view(c, inds), cols) : _map(c -> c[inds], cols)
            return DictColumnTable(schema(cols), ret)
        end
    end
    # otherwise, let's get the rows and see if we can apply inds to them
    r = rows(x)
    if r isa AbstractVector
        inds isa Integer && return r[inds]
        # index the row vector `r`, not the table `x`: `x` need not be indexable itself,
        # only `rows(x)` is known to be an `AbstractVector` here
        ret = viewhint === true ? Base.view(r, inds) : r[inds]
        # anything but a vector result means `inds` was not a valid multi-row selector
        return vectorcheck(ret)
    end
    throw(ArgumentError("no default `Tables.subset` implementation for type: $T"))
end
# Guard used when subsetting: an `AbstractVector` is passed through untouched,
# anything else is rejected with an `ArgumentError` naming the offending type.
function vectorcheck(x::AbstractVector)
    return x
end

function vectorcheck(x)
    msg = "`Tables.subset`: invalid `inds` argument, expected `AbstractVector` output, got $(typeof(x))"
    throw(ArgumentError(msg))
end
# Apply `f` to every column of `cols` (looked up by name) and collect the results into an
# `OrderedDict` keyed by column name, following the order of `columnnames(cols)`.
# Each result must pass `vectorcheck`, i.e. be an `AbstractVector`.
function _map(f, cols)
    return OrderedDict(
        colname => vectorcheck(f(getcolumn(cols, colname))) for colname in columnnames(cols)
    )
end
"""
Tables.partitioner(f, itr)
Tables.partitioner(x)
Convenience methods to generate table iterators. The first method takes a "materializer"
function `f` and an iterator `itr`, and will call `Tables.LazyTable(f, x) for x in itr`
for each iteration. This allows delaying table materialization until `Tables.columns`
or `Tables.rows` are called on the `LazyTable` object (which will call `f(x)`). This
allows a common desired pattern of materializing and processing a table on a remote
process or thread, like:
```julia
for tbl in Tables.partitions(Tables.partitioner(CSV.File, list_of_csv_files))
Threads.@spawn begin
cols = Tables.columns(tbl)
# do stuff with cols
end
end
```
The second method is provided because the default behavior of `Tables.partitions(x)`
is to treat `x` as a single, non-partitioned table. This method allows users to easily
wrap a `Vector` or generator of tables as table partitions to pass to sink functions
able to utilize `Tables.partitions`.
"""
function partitioner(x)
    # wrap `x` unchanged; `partitions` on the resulting `Partitioner` returns it as-is
    return Partitioner(x)
end

function partitioner(f, itr)
    # lazy generator: `f` is only stored in each `LazyTable` here, not called
    lazytables = (LazyTable(f, x) for x in itr)
    return partitioner(lazytables)
end
# A `Partitioner` already is the iterator of partitions, so it is returned unchanged.
function partitions(x::Partitioner)
    return x
end
# Iteration interface for `Partitioner`: every trait and method forwards to the wrapped
# iterator (field `x`) or, for the type-level traits, to the wrapped iterator's type.
function Base.IteratorEltype(::Type{Partitioner{T}}) where {T}
    return Base.IteratorEltype(T)
end

function Base.eltype(x::Partitioner)
    return eltype(x.x)
end

function Base.eltype(::Type{Partitioner{T}}) where {T}
    return eltype(T)
end

function Base.IteratorSize(::Type{Partitioner{T}}) where {T}
    return Base.IteratorSize(T)
end

function Base.length(x::Partitioner)
    return length(x.x)
end

function Base.size(x::Partitioner)
    return size(x.x)
end

# `state...` covers both the initial call (no state) and subsequent calls (one state value)
function Base.iterate(x::Partitioner, state...)
    return iterate(x.x, state...)
end
# NOTE(review): not referenced in the visible part of this file; presumably a size cutoff
# consulted elsewhere in the package — confirm the meaning of 100 at its use sites.
const SPECIALIZATION_THRESHOLD = 100
# reference implementations: Vector of NamedTuples and NamedTuple of Vectors
include("namedtuples.jl")
# helper functions
include("utils.jl")
# generic fallback definitions
include("fallbacks.jl")
# allow any valid iterator to be a table
include("tofromdatavalues.jl")
# matrix integration
include("matrix.jl")
# dict tables
include("dicts.jl")
"""
Tables.columnindex(table, name::Symbol)
Return the column index (1-based) of a column by `name` in a table with a known schema; returns 0 if `name` doesn't exist in table
"""
function columnindex(table, colname::Symbol)
    # fetch the table's schema, then look `colname` up in it
    sch = schema(table)
    return columnindex(sch, colname)
end
"""
Tables.columntype(table, name::Symbol)
Return the column element type of a column by `name` in a table with a known schema; returns Union{} if `name` doesn't exist in table
"""
function columntype(table, colname::Symbol)
    # fetch the table's schema, then look up the type of `colname` in it
    sch = schema(table)
    return columntype(sch, colname)
end
# Schema lookup: the column names come from the `names` type parameter of `Schema`
# and are forwarded to the tuple-of-names method.
Base.@pure function columnindex(::Schema{names, types}, name::Symbol) where {names, types}
    return columnindex(names, name)
end
"given names and a Symbol `name`, compute the index (1-based) of the name in names"
Base.@pure function columnindex(names::Tuple{Vararg{Symbol}}, name::Symbol)
    # linear scan; the first match wins, and 0 signals that `name` is absent
    for pos in 1:length(names)
        if names[pos] === name
            return pos
        end
    end
    return 0
end
# Schema lookup: both `names` and `types` come from the type parameters of `Schema`
# and are forwarded to the names/types method.
Base.@pure function columntype(::Schema{names, types}, name::Symbol) where {names, types}
    return columntype(names, types, name)
end
"given tuple type and a Symbol `name`, compute the type of the name in the tuples types"
Base.@pure function columntype(names::Tuple{Vararg{Symbol}}, ::Type{types}, name::Symbol) where {types <: Tuple}
    # linear scan over the names; the matching position selects the field type of `types`.
    # `Union{}` signals that `name` is absent.
    for pos in 1:length(names)
        if names[pos] === name
            return fieldtype(types, pos)
        end
    end
    return Union{}
end
end # module