Skip to content

Commit

Permalink
Improve performance of push!! to DataFrames
Browse files Browse the repository at this point in the history
  • Loading branch information
tkf committed Nov 4, 2019
1 parent c099249 commit 912b96f
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 5 deletions.
42 changes: 42 additions & 0 deletions benchmark/bench_dataframes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Benchmarks for `push!!`-ing rows into a `DataFrame`, covering several
# combinations of initial column contents and pushed value type.
module BenchDataFrames

using BangBang: push!!
using BenchmarkTools: @benchmarkable, BenchmarkGroup
using CategoricalArrays: CategoricalVector
using DataFrames: DataFrame
using DataStructures: LittleDict

"""
    repeat_push(df, row, n = 10)

Run `n` rounds of: empty every column of `df`, then `push!!` `row` into `df`
`2^10` times. `row` is converted to a `LittleDict` (same keys and values)
before the rounds start.
"""
function repeat_push(df, row, n = 10)
    rowdict = LittleDict(keys(row), values(row))
    npush = 2^10
    for _ in 1:n
        # Reset the table so that every round starts from zero rows.
        for col in eachcol(df)
            empty!(col)
        end
        for _ in 1:npush
            push!!(df, rowdict)
        end
    end
end

"""
    mkrow(x)

Build a `NamedTuple` with the eight fields `a` through `h`. The field values
come from broadcasting `x` against the field names, so a scalar (or
`Ref`-wrapped) `x` ends up in every field.
"""
function mkrow(x)
    fieldsyms = Symbol.('a':'h')
    return (; tuple.(fieldsyms, x)...)
end

suite = BenchmarkGroup()

# Each entry pairs a one-row `DataFrame` (first argument) with the row that is
# pushed repeatedly (second argument).
suite["Int-Int"] = @benchmarkable repeat_push(
    $(DataFrame([mkrow(0)])),
    $(mkrow(0))
)
suite["Missing-Int"] = @benchmarkable repeat_push(
    $(DataFrame([mkrow(missing)])),
    $(mkrow(0))
)
suite["String-String"] = @benchmarkable repeat_push(
    $(DataFrame([mkrow("x")])),
    $(mkrow("x"))
)
suite["Missing-String"] = @benchmarkable repeat_push(
    $(DataFrame([mkrow(missing)])),
    $(mkrow("x"))
)
suite["Categorical"] = @benchmarkable repeat_push(
    $(DataFrame(mkrow(Ref(CategoricalVector(["x"]))))),
    $(mkrow("x"))
)

end # module
BenchDataFrames.suite
41 changes: 36 additions & 5 deletions src/dataframes_impl.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Look up the value of one column in a row-like object `x`.
# Positional containers are indexed by `pos`, keyed containers by `name`,
# and anything else is read through `getproperty` using `name`.
_getvalue(x::Union{AbstractVector,Tuple}, pos, name) = getindex(x, pos)
_getvalue(x::Union{NamedTuple,AbstractDict}, pos, name) = getindex(x, name)
_getvalue(x, pos, name) = getproperty(x, name)

# Report whether the row-like object `x` provides a column named `n`.
# Generic fallback: ask the object whether it has a property of that name.
function _hascolumn(x, n)
    return hasproperty(x, n)
end

# Optimization for `NamedTuple`: the field names are read from the type
# parameter, so the value itself is never inspected.
function _hascolumn(::NamedTuple{fields}, n) where {fields}
    return n in fields
end

# Dictionaries expose their columns as keys rather than properties.
function _hascolumn(x::AbstractDict, n)
    return haskey(x, n)
end

function checkcolumnnames(x, columnnames)
Expand All @@ -28,13 +30,42 @@ function df_append_columns!!(df, table)
return df
end

function df_append_rows!!(df, rows)
# `@manually_specialize(expr, head, tail...)` expands to an `if`/`elseif`
# chain that tests `head` and then each predicate in `tail`, with the very
# same `expr` as the body of every branch (including the final `else`).
# The whole expansion is escaped, so it is evaluated in the caller's scope.
macro manually_specialize(expr, head, tail...)
    fallback = manually_specialize_impl(expr, tail)
    return esc(Expr(:if, head, expr, fallback))
end

# Build the part of an `if` expression that follows its first branch: one
# `:elseif` node per entry of `predicates`, each with `expr` as its body, and
# `expr` itself as the innermost (final `else`) branch. With no predicates the
# result is just `expr`.
function manually_specialize_impl(expr, predicates)
    # Fold from the last predicate backwards so that the first predicate ends
    # up outermost and the bare `expr` sits at the bottom of the chain.
    return foldr(predicates; init = expr) do predicate, rest
        Expr(:elseif, predicate, expr, rest)
    end
end

function df_append_rows!!(df, table)
columns = getfield(df, :columns)
for x in rows
checkcolumnnames(x, propertynames(df))
for (pos, (name, col)) in enumerate(zip(propertynames(df), columns))
colnames = DataFrames._names(df) # avoid copy
# colnames = propertynames(df)
for x in table
checkcolumnnames(x, colnames)
for (pos, (name, col)) in enumerate(zip(colnames, columns))
v = _getvalue(x, pos, name)
columns[pos] = push!!(col, v)
@manually_specialize(
columns[pos] = push!!(col, v),
# "Inline" some method lookups for typical column types:
col isa Vector{Int},
col isa Vector{Union{Int,Missing}},
col isa Vector{Float64},
col isa Vector{Union{Float64,Missing}},
col isa Vector{Symbol},
col isa Vector{Union{Symbol,Missing}},
col isa Vector{String},
col isa Vector{Union{String,Missing}},
)
end
end
return df
Expand Down

0 comments on commit 912b96f

Please sign in to comment.