Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add broadcasting of AbstractDataFrame #1840

Merged
merged 30 commits into from
Jun 23, 2019
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8bb061d
add broadcasting of AbstractDataFrame
bkamins Jun 8, 2019
6a6e954
switch to Tables.allocatecolumn
bkamins Jun 8, 2019
6f151ac
one more fix
bkamins Jun 8, 2019
628229b
revert similar and fix tests
bkamins Jun 8, 2019
8af3a0f
Apply suggestions from code review
bkamins Jun 9, 2019
c9a98bd
corrections after code review
bkamins Jun 9, 2019
5b4cc36
fix typo
bkamins Jun 9, 2019
24566e4
fix broadcasting assignment bug
bkamins Jun 20, 2019
2b82f79
fix SubDataFrame case
bkamins Jun 20, 2019
3810f75
add unaliasing of data frame against data frame
bkamins Jun 20, 2019
8e335f1
small fixes in legacy code
bkamins Jun 20, 2019
80a131e
optimized broadcasting
bkamins Jun 20, 2019
82e53a6
correct unaliasing
bkamins Jun 20, 2019
87206a2
small performance optimization
bkamins Jun 20, 2019
fba7cef
performance improvements
bkamins Jun 20, 2019
0e63fb8
add more broadcasting tests
bkamins Jun 20, 2019
bb4862b
more tests
bkamins Jun 20, 2019
5b8d2ec
Merge branch 'master' into new_dataframe_broadcasting
bkamins Jun 21, 2019
699cb6b
Merge branch 'master' into new_dataframe_broadcasting
bkamins Jun 21, 2019
6780b26
even more tests
bkamins Jun 21, 2019
432a530
getcolbc cleanup
bkamins Jun 21, 2019
3fdf733
fix after a code review
bkamins Jun 21, 2019
a67a3f5
unalias optimizations
bkamins Jun 21, 2019
5784100
more tests for common cases
bkamins Jun 21, 2019
b1813e1
improve helper signature
bkamins Jun 21, 2019
6a87c42
minor improvements
bkamins Jun 22, 2019
27c730a
minor improvements 2
bkamins Jun 22, 2019
9a68d30
Apply suggestions from code review
bkamins Jun 23, 2019
5d3ec40
fixes after code review
bkamins Jun 23, 2019
1f46086
Fix indentation
nalimilan Jun 23, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
82 changes: 72 additions & 10 deletions src/other/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ function copyto_widen!(res::AbstractVector{T},
return res
end

# Build the broadcast object for a single column of a data frame broadcast.
#
# Every argument of `bcf` that is an `AbstractDataFrame` is replaced by its column
# `colind`; all other arguments are passed through unchanged. Each resulting
# argument is wrapped with `Base.Broadcast.extrude`. The broadcasted function
# `bcf.f` and the axes `bcf.axes` are reused as they are.
function getcolbc(bcf::Base.Broadcast.Broadcasted{Style}, colind) where {Style}
    # we assume that bcf is already flattened and unaliased
    colargs = map(bcf.args) do arg
        picked = arg isa AbstractDataFrame ? arg[colind] : arg
        Base.Broadcast.extrude(picked)
    end
    return Base.Broadcast.Broadcasted{Style}(bcf.f, colargs, bcf.axes)
end

function Base.copy(bc::Base.Broadcast.Broadcasted{DataFrameStyle})
bcf = Base.Broadcast.flatten(bc)
colnames = unique([_names(df) for df in bcf.args if df isa AbstractDataFrame])
Expand All @@ -42,23 +49,23 @@ function Base.copy(bc::Base.Broadcast.Broadcasted{DataFrameStyle})
throw(ArgumentError("Column names in broadcasted data frames must match. " *
"Non matching column names are $msg"))
end
nrows = length(axes(bc)[1])
nrows = length(axes(bcf)[1])
bkamins marked this conversation as resolved.
Show resolved Hide resolved
df = DataFrame()
for i in axes(bc)[2]
for i in axes(bcf)[2]
if nrows == 0
col = Any[]
else
v1 = bc[CartesianIndex(1, i)]
bcf′ = getcolbc(bcf, i)
v1 = bcf′[CartesianIndex(1, i)]
startcol = similar(Vector{typeof(v1)}, nrows)
startcol[1] = v1
col = copyto_widen!(startcol, bc, 2, i)
col = copyto_widen!(startcol, bcf′, 2, i)
end
df[colnames[1][i]] = col
end
return df
end


### Broadcasting assignment

struct LazyNewColDataFrame
Expand Down Expand Up @@ -113,9 +120,60 @@ function _copyto_helper!(dfcol::AbstractVector, bc::Base.Broadcast.Broadcasted,
end
end

# Unalias an arbitrary broadcast argument `src` against a data frame destination.
#
# `src` is passed through `Base.Broadcast.unalias` once for every column of
# `dest`, each call receiving the result of the previous one. If `dest` has no
# columns, `src` is returned untouched.
function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src)
    return foldl(eachcol(dest); init=src) do acc, col
        Base.Broadcast.unalias(col, acc)
    end
end

# Unalias a data frame `src` against a data frame destination `dest`.
#
# Returns `src` itself when no column of `src` that still has to be read may share
# memory with a column of `dest` that is written earlier (or at the same step);
# otherwise returns a shallow copy of `src` in which every such column has been
# replaced by `Base.unaliascopy` of it. The object passed in as `src` is never
# mutated: the first time a column must be replaced, `src` is rebound to a copy
# made with `copycols=false` (for a `SubDataFrame`, a new view over such a copy of
# its parent).
#
# Throws `ArgumentError` if `dest` and `src` have a different number of columns.
function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src::AbstractDataFrame)
    if size(dest, 2) != size(src, 2)
        throw(ArgumentError("Dimension mismatch in broadcasting."))
    end
    # Broadcasting results are written to `dest` column by column, from 1 to ncol,
    # and column `col2` of `src` is read while column `col2` of `dest` is written.
    # Writing `dest[col1]` can therefore only corrupt source columns `col2 >= col1`,
    # so earlier source columns do not have to be checked.
    wascopied = false
    for col1 in axes(dest, 2)
        # the destination column does not change inside the inner loop
        dcol = dest[col1]
        for col2 in col1:ncol(src)
            scol = src[col2]
            if Base.mightalias(dcol, scol)
                if src isa SubDataFrame
                    if !wascopied
                        # rebind `src` to a view over a shallow copy of its parent,
                        # so that columns are replaced in the copy only
                        src = SubDataFrame(copy(parent(src), copycols=false),
                                           index(src), rows(src))
                        wascopied = true
                    end
                    parentidx = parentcols(index(src), col2)
                    parent(src)[parentidx] = Base.unaliascopy(parent(src)[parentidx])
                else
                    if !wascopied
                        # shallow copy: columns are shared until replaced below
                        src = copy(src, copycols=false)
                        wascopied = true
                    end
                    src[col2] = Base.unaliascopy(scol)
                end
                # Deliberately no `break`: several source columns may alias the
                # same destination column (e.g. the same vector stored twice in
                # `src`), and each of them has to be replaced.
            end
        end
    end
    return src
end

function Base.copyto!(df::AbstractDataFrame, bc::Base.Broadcast.Broadcasted)
for col in axes(df, 2)
_copyto_helper!(df[col], bc, col)
bcf = Base.Broadcast.flatten(bc)
colnames = unique([_names(df) for df in bcf.args if df isa AbstractDataFrame])
if length(colnames) > 1 || (length(colnames) == 1 && _names(df) != colnames[1])
wrongnames = setdiff(union(colnames...), intersect(colnames...))
msg = join(wrongnames, ", ", " and ")
throw(ArgumentError("Column names in broadcasted data frames must match. " *
"Non matching column names are $msg"))
end

bcf′ = Base.Broadcast.preprocess(df, bcf)
for i in axes(df, 2)
_copyto_helper!(df[i], getcolbc(bcf′, i), i)
end
df
end
Expand All @@ -128,13 +186,17 @@ function Base.copyto!(df::AbstractDataFrame, bc::Base.Broadcast.Broadcasted{<:Ba
end
df
else
copyto!(df, convert(Broadcasted{Nothing}, bc))
copyto!(df, convert(Base.Broadcast.Broadcasted{Nothing}, bc))
end
end

# Unaliasing against a data frame row is delegated to unaliasing against the
# parent of that row.
function Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src)
    return Base.Broadcast.broadcast_unalias(parent(dest), src)
end

function Base.copyto!(dfr::DataFrameRow, bc::Base.Broadcast.Broadcasted)
for I in eachindex(bc)
dfr[I] = bc[I]
bc′ = Base.Broadcast.preprocess(dfr, bc)
for I in eachindex(bc′)
bkamins marked this conversation as resolved.
Show resolved Hide resolved
dfr[I] = bc′[I]
end
dfr
end
227 changes: 226 additions & 1 deletion test/broadcasting.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
module TestBroadcasting

using Test, DataFrames, PooledArrays
using Test, DataFrames, PooledArrays, Random

const ≅ = isequal

refdf = DataFrame(reshape(1.5:15.5, (3,5)))

@testset "CartesianIndex" begin
    df = DataFrame(rand(2, 3))

    # reading and writing a cell through CartesianIndex matches df[row, col]
    for row in axes(df, 1), col in axes(df, 2)
        idx = CartesianIndex(row, col)
        @test df[row, col] == df[idx]
        val = rand()
        df[idx] = val
        @test df[row, col] == val
    end

    # reading through any out-of-range index throws BoundsError
    for (row, col) in ((0, 1), (0, 0), (1, 0), (5, 1), (5, 5), (1, 5))
        @test_throws BoundsError df[CartesianIndex(row, col)]
    end

    # writing: the expected exception type differs between the cases below
    for (row, col, err) in ((0, 1, BoundsError),
                            (0, 0, ArgumentError),
                            (1, 0, ArgumentError),
                            (5, 1, BoundsError),
                            (5, 5, ArgumentError),
                            (1, 5, ArgumentError))
        @test_throws err df[CartesianIndex(row, col)] = 1
    end
end

@testset "broadcasting of AbstractDataFrame objects" begin
for df in (copy(refdf), view(copy(refdf), :, :))
@test identity.(df) == refdf
bkamins marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -50,6 +73,26 @@ end
@test_throws ArgumentError df .+ 1 .+ df2
end

@testset "broadcasting expansion" begin
    # out-of-place: a one-row data frame combined with a two-row data frame
    onerow = DataFrame(x=1, y=2)
    tworows = DataFrame(x=[1,11], y=[2,12])
    @test onerow .+ tworows == DataFrame(x=[2,12], y=[4,14])

    # in-place: the destination keeps its original column vectors
    onerow = DataFrame(x=1, y=2)
    tworows = DataFrame(x=[1,11], y=[2,12])
    xcol = tworows.x
    ycol = tworows.y
    tworows .+= onerow
    @test tworows.x === xcol
    @test tworows.y === ycol
    @test tworows == DataFrame(x=[2,12], y=[4,14])

    # in-place with a one-row view of the destination itself as the source
    whole = DataFrame(x=[1,11], y=[2,12])
    firstrow = view(whole, 1:1, 1:2)
    whole .-= firstrow
    @test whole == DataFrame(x=[0,10], y=[0,10])
end

@testset "broadcasting of AbstractDataFrame objects corner cases" begin
df = DataFrame(c11 = categorical(["a", "b"]), c12 = categorical([missing, "b"]), c13 = categorical(["a", missing]),
c21 = categorical([1, 2]), c22 = categorical([missing, 2]), c23 = categorical([1, missing]),
Expand Down Expand Up @@ -616,4 +659,186 @@ end
end
end

@testset "scalar broadcasting" begin
    # the right-hand sides below all evaluate to a single scalar
    df = DataFrame(x = zeros(2))

    df .= 1 ./ (1 + 2)
    @test df.x == fill(1/3, 2)

    df .= 1 ./ (1 .+ 3)
    @test df.x == fill(1/4, 2)

    df .= sqrt.(1 ./ 2)
    @test df.x == fill(sqrt(1/2), 2)
end

@testset "tuple broadcasting" begin
    # a 2-tuple broadcast into a 2x3 data frame fills every column with (1, 2)
    target = DataFrame(zeros(2, 3))
    target .= (1, 2)
    @test target == DataFrame([1 1 1; 2 2 2])

    # the tuple combined with a scalar and with the destination itself
    target = DataFrame(zeros(2, 3))
    target .= (1, 2) .+ 10 .- target
    @test target == DataFrame([11 11 11; 12 12 12])

    # updating assignment with a tuple on the right-hand side
    target = DataFrame(zeros(2, 3))
    target .+= (1, 2) .+ 10
    @test target == DataFrame([11 11 11; 12 12 12])

    # a type passed as a broadcast argument, out-of-place and in-place
    vals = DataFrame(rand(2, 3))
    @test floor.(Int, vals ./ (1,)) == DataFrame(zeros(Int, 2, 3))
    vals .= floor.(Int, vals ./ (1,))
    @test vals == DataFrame(zeros(2, 3))

    # conversion errors propagate; a valid conversion yields Int columns
    vals = DataFrame(rand(2, 3))
    @test_throws InexactError convert.(Int, vals)
    asint = convert.(Int, floor.(vals))
    @test asint == DataFrame(zeros(Int, 2, 3))
    @test eltypes(asint) == [Int, Int, Int]
end

@testset "scalar on assignment side" begin
    # broadcast assignment where the left-hand side is a single cell
    cells = DataFrame(rand(2, 3))

    cells[1, 1] .= cells[1, 1] .- cells[1, 1]
    @test cells[1, 1] == 0

    cells[1, 2] .-= cells[1, 2]
    @test cells[1, 2] == 0
end

@testset "nothing test" begin
    # `nothing` can be broadcast into columns with element type Any
    anydf = DataFrame(Any[1 2; 3 4])
    anydf .= nothing
    allnothing = anydf .== nothing
    @test allnothing == DataFrame(trues(2, 2))
end

@testset "aliasing test" begin
    # source is a row-permuted view of the destination column
    df = DataFrame(x=[1, 2])
    y = view(df.x, [2, 1])
    df .= y
    @test df.x == [2, 1]

    # same, but the destination is a view of the data frame
    df = DataFrame(x=[1, 2])
    y = view(df.x, [2, 1])
    dfv = view(df, :, :)
    dfv .= y
    @test df.x == [2, 1]

    # destination is a data frame row, source is a view into one of its cells
    df = DataFrame(x=2, y=1, z=1)
    dfr = df[1, :]
    y = view(df.x, 1)
    dfr .= 2 .* y
    @test Vector(dfr) == [4, 4, 4]

    # a later source column is an earlier destination column
    df = DataFrame(x=[1, 2], y=[11,12])
    df2 = DataFrame()
    df2.x = [-1, -2]
    df2.y = df.x
    df3 = copy(df2)
    df .= df2
    @test df == df3

    # Randomized checks. In each round columns of df1 are assigned into random
    # positions of the source data frame(s); the final `!=` checks rely on the
    # source having changed after df1 was overwritten. The order of `rand` calls
    # is kept fixed so that the seeded sequence is reproducible.
    Random.seed!(1234)

    # data frame source, data frame destination
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
        end
        df3 = copy(df2)
        df1 .= df2
        @test df1 == df3
        @test df2 != df3
    end

    # view of a data frame as the source
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
        end
        df3 = copy(df2)
        df1 .= view(df2, :, :)
        @test df1 == df3
        @test df2 != df3
    end

    # view of a data frame as the destination
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
        end
        df3 = copy(df2)
        view(df1, :, :) .= df2
        @test df1 == df3
        @test df2 != df3
    end

    # nested expression mixing a cell view, a column and two data frames;
    # df4 is the reference result computed on matrices
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        df3 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
            df3[rand(1:100)] = df1[j]
        end
        df6 = copy(df2)
        df7 = copy(df3)
        df4 = DataFrame(sin.(df1[1,1] .+ copy(df1[1]) .+ Matrix(df2) ./ Matrix(df3)))
        df5 = sin.(view(df1,1,1) .+ df1[1] .+ df2 ./ df3)
        df1 .= sin.(view(df1,1,1) .+ df1[1] .+ df2 ./ df3)
        @test df1 == df4 == df5
        @test df2 != df6
        @test df3 != df7
    end

    # same expression with views of columns and of data frames as arguments
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        df3 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
            df3[rand(1:100)] = df1[j]
        end
        df6 = copy(df2)
        df7 = copy(df3)
        df4 = DataFrame(sin.(df1[1,1] .+ copy(df1[1]) .+ Matrix(df2) ./ Matrix(df3)))
        df5 = sin.(view(df1,1,1) .+ df1[1] .+ view(df2, :, :) ./ df3)
        df1 .= sin.(view(df1[1],1) .+ view(df1[1], :) .+ df2 ./ view(df3, :, :))
        @test df1 == df4 == df5
        @test df2 != df6
        @test df3 != df7
    end

    # same again, writing through a view of the destination
    for _ in 1:10
        df1 = DataFrame(rand(100, 100))
        df2 = copy(df1)
        df3 = copy(df1)
        for j in 1:100
            df2[rand(1:100)] = df1[j]
            df3[rand(1:100)] = df1[j]
        end
        df6 = copy(df2)
        df7 = copy(df3)
        df4 = DataFrame(sin.(df1[1,1] .+ copy(df1[1]) .+ Matrix(df2) ./ Matrix(df3)))
        df5 = sin.(view(df1,1,1) .+ df1[1] .+ view(df2, :, :) ./ df3)
        view(df1, :, :) .= sin.(view(df1[1],1) .+ view(df1[1], :) .+ df2 ./ view(df3, :, :))
        @test df1 == df4 == df5
        @test df2 != df6
        @test df3 != df7
    end
end

@testset "@. test" begin
    # Each `@.` expression on data frames is checked against the same expression
    # evaluated on plain matrix copies of the data.
    df = DataFrame(rand(2, 3))
    sdf = view(df, 1:1, :)
    dfm = Matrix(df)
    sdfm = Matrix(sdf)

    # out-of-place: the reference must come from the matrices, otherwise the
    # comparison is between two identical data frame computations
    r1 = @. (df + sdf + 5) / sdf
    r2 = @. (dfm + sdfm + 5) / sdfm
    @test r1 == DataFrame(r2)

    # in-place: `sdf` is a view of `df`, while `sdfm` is an independent copy
    @. df = sin(sdf / (df + 1))
    @. dfm = sin(sdfm / (dfm + 1))
    @test df == DataFrame(dfm)
end

end # module