Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow scalar broadcasting into an empty data frame #1890

Merged
merged 12 commits into from
Jul 25, 2019
10 changes: 9 additions & 1 deletion docs/src/lib/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ In such an operation `AbstractDataFrame` is considered as two-dimensional and `D
`DataFrameRow` is considered to be column-oriented.

Additional rules:
* in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` and `df[row, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` syntaxes we broadcast `v` into the contents of `df[row, col]` (this is consistent with Base behavior);
bkamins marked this conversation as resolved.
Show resolved Hide resolved
* in the `df[row, cols] .= v` syntax the assignment to `df` is performed in-place;
* in the `df[rows, col] .= v` and `df[rows, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` and it is missing from `df` then a new column is added;
* `df[!, cols] = v` syntax is currently disallowed, but is planned to be supported in the future;
Expand All @@ -186,3 +187,10 @@ Note that `sdf[!, col] .= v` and `sdf[!, cols] .= v` syntaxes are not allowed as

If column indexing using `Symbol` names in `cols` is performed, the order of columns in the operation is specified
by the order of names.

The `df[!, col] .= v` syntax follows several special convenience rules:
bkamins marked this conversation as resolved.
Show resolved Hide resolved
* if `ncol(df) == 0` then it is allowed to add a column `v` as a freshly allocated column `col`;
the length of this column is equal to `length(v)` if `v` is a vector and `1` if it is a scalar or
a scalar broadcasting operation;
* if `ncol(df) > 0` and `nrow(df) == 0` then it is allowed to add a column only if `v` is a scalar;
the length of this column is `0` and its element type is `typeof(v)`;
80 changes: 71 additions & 9 deletions src/other/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,52 @@ struct LazyNewColDataFrame{T}
col::T
end

# LazyNewColDataFrame is also allowed for data frames with no columns.
# The axes reported here always come from the left hand side (the number of
# rows of the wrapped data frame).
# When the data frame has no columns, the `materialize!` methods for
# LazyNewColDataFrame take the length from the right hand side instead.
Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# Materialize the broadcast `bc` into the column that `dest` is about to create.
#
# * A right hand side whose single axis is empty is rejected with `ArgumentError`.
# * If `dest.df` already has columns, `bc` is rebuilt over `axes(dest)`, so the
#   new column takes its length from the data frame.
# * If `dest.df` has no columns, the length comes from the right hand side:
#   a one-dimensional broadcast is used as is, a zero-dimensional one is
#   given the axes `(Base.OneTo(1),)`, and a broadcast with more than one
#   dimension throws `DimensionMismatch`.
function Base.Broadcast.materialize!(dest::LazyNewColDataFrame,
                                     bc::Base.Broadcast.Broadcasted{Style}) where {Style}
    ibc = Base.Broadcast.instantiate(bc)
    bcaxes = axes(ibc)
    ndim = length(bcaxes)
    if ndim == 1 && length(bcaxes[1]) == 0
        throw(ArgumentError("Cannot broadcast over an empty container"))
    end
    if ncol(dest.df) > 0
        return copyto!(dest, Base.Broadcast.instantiate(
            Base.Broadcast.Broadcasted{Style}(bc.f, bc.args, axes(dest))))
    end
    if ndim > 1
        # the two literals are concatenated, so the separating space must be explicit
        throw(DimensionMismatch("Cannot broadcast $(ndim)-dimensional " *
                                "object into a vector"))
    end
    # one-dimensional right hand side: its own axes define the column length
    ndim == 1 && return copyto!(dest, ibc)
    # zero-dimensional right hand side: broadcast it over a single-element axis
    return copyto!(dest, Base.Broadcast.instantiate(
        Base.Broadcast.Broadcasted{Style}(bc.f, bc.args, (Base.OneTo(1),))))
end

# Materialize a right hand side `x` that is not a `Broadcasted` object into the
# column that `dest` is about to create, by wrapping it in an `identity` broadcast.
#
# * An `x` whose single axis is empty is rejected with `ArgumentError`.
# * If `dest.df` already has columns, the broadcast runs over `axes(dest)`.
# * If `dest.df` has no columns, the broadcast runs over `axes(x)` for a
#   one-dimensional `x`, over `(Base.OneTo(1),)` for a zero-dimensional `x`,
#   and throws `DimensionMismatch` when `x` has more than one dimension.
function Base.Broadcast.materialize!(dest::LazyNewColDataFrame, x)
    xaxes = axes(x)
    ndim = length(xaxes)
    if ndim == 1 && length(xaxes[1]) == 0
        throw(ArgumentError("Cannot broadcast over an empty container"))
    end
    if ncol(dest.df) > 0
        target = axes(dest)
    elseif ndim > 1
        # the two literals are concatenated, so the separating space must be explicit
        throw(DimensionMismatch("Cannot broadcast $(ndim)-dimensional " *
                                "object into a vector"))
    elseif ndim == 1
        target = xaxes
    else
        target = (Base.OneTo(1),)
    end
    return copyto!(dest, Base.Broadcast.instantiate(
        Base.Broadcast.Broadcasted(identity, (x,), target)))
end

# ColReplaceDataFrame is reserved for future extensions if we decide to allow df[!, cols] .= v
# # ColReplaceDataFrame allows for column replacement in broadcasting
# struct ColReplaceDataFrame
Expand All @@ -90,15 +133,16 @@ Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# Base.axes(x::ColReplaceDataFrame) = axes(x.df)

# `Base.maybeview` methods selecting the destination of a `df[...] .= v` assignment.
# NOTE(review): the next two methods have the same signature, so the second
# definition replaces the first one and a `CartesianIndex{2}` selection returns
# `df[idx]` rather than a view — confirm that the first line is stale.
Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = view(df, idx[1], idx[2])
Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = df[idx]
# a single cell selected by an integer row and a column index: return `df[row, col]`
Base.maybeview(df::AbstractDataFrame, row::Integer, col::ColumnIndex) = df[row, col]
# any other row/column selection: return a view of the data frame
Base.maybeview(df::AbstractDataFrame, rows, cols) = view(df, rows, cols)

function Base.maybeview(df::DataFrame, ::typeof(!), cols)
if !(cols isa ColumnIndex)
throw(ArgumentError("broadcasting with column replacement is currently allowed only for single column index"))
end
if ncol(df) == 0
throw(ArgumentError("broadcasting into a data frame with no columns is not allowed"))
if !(cols isa Symbol) && cols > ncol(df)
throw(ArgumentError("creating new columns using an integer index by broadcasting is disallowed"))
end
# in the future we might allow cols to target multiple columns
# in which case ColReplaceDataFrame(df, index(df)[cols]) will be returned
Expand All @@ -109,16 +153,34 @@ Base.maybeview(df::SubDataFrame, ::typeof(!), idxs) =
throw(ArgumentError("broadcasting with ! row selector is not allowed for SubDataFrame"))

# Build a new column from the broadcast `bc` and store it in the wrapped data
# frame with `lazydf.df[!, lazydf.col] = col`.
function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted)
# NOTE(review): if `isempty` is true for a data frame without columns, this guard
# makes the `nrows = 1` branches below unreachable; it looks like a leftover of
# an earlier implementation — confirm.
if isempty(lazydf.df)
throw(ArgumentError("creating a column via broadcasting is not allowed on empty data frames"))
end
# zero-dimensional, flat `identity` broadcast of a single argument,
# i.e. a plain scalar on the right hand side
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} &&
bc.f === identity && bc.args isa Tuple{Any} && Base.Broadcast.isflat(bc)
T = typeof(bc.args[1][])
# NOTE(review): `col` is assigned again a few lines below, so this
# allocation looks stale — confirm.
col = similar(Vector{T}, nrow(lazydf.df))
# the column gets the number of rows of the data frame when it already has
# columns and exactly one element otherwise
if ncol(lazydf.df) > 0
nrows = nrow(lazydf.df)
else
nrows = 1
end
col = similar(Vector{T}, nrows)
copyto!(col, bc)
else
# NOTE(review): this result is overwritten in both branches of the `if` below
# and is computed before the zero-row guard — looks stale; confirm.
col = Base.Broadcast.materialize(bc)
# with zero rows and at least one column only the scalar assignment handled
# in the first branch is accepted
if ncol(lazydf.df) > 0 && isempty(lazydf.df)
throw(ArgumentError("creating a column via broadcasting is not allowed on data frames " *
"with zero rows and non-zero columns unless it is a scalar assignment"))
end
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}}
# zero-dimensional broadcast: flatten it, evaluate it once and fill the
# whole column with the resulting value
bcf = Base.Broadcast.flatten(bc)
v = bcf.f(getindex.(bcf.args)...)
if ncol(lazydf.df) > 0
nrows = nrow(lazydf.df)
else
nrows = 1
end
col = similar(Vector{typeof(v)}, nrows)
fill!(col, v)
else
# any other broadcast is materialized into a fresh array
col = Base.Broadcast.materialize(bc)
end
end
lazydf.df[!, lazydf.col] = col
end
Expand Down
103 changes: 75 additions & 28 deletions test/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,14 +598,41 @@ end
@test_throws BoundsError dfr[10] .= ones(3)
@test_throws ArgumentError dfr[:z] .= ones(3)
@test df == cdf

df = DataFrame()
df[!, :a] .= sin.(1:3)
df[!, :b] .= sin.(1)
df[!, :c] .= sin(1) .+ 1
@test df == DataFrame(a=sin.(1:3), b=sin.([1,1,1]), c=sin.([1,1,1]).+1)
end

@testset "empty data frame corner case" begin
df = DataFrame()
@test_throws ArgumentError df[!, 1] .= 1
@test_throws ArgumentError df[!, :a] .= [1]
@test_throws ArgumentError df[!, [:a,:b]] .= [1]
@test df == DataFrame()
@test_throws ArgumentError df[!, 2] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= [1]
@test_throws ArgumentError df[!, [:a, :b]] .= 1
@test_throws DimensionMismatch df[!, :a] .= [1 2]

for rhs in [1, [1], [1, 2], "abc", ["abc"], ["abc", "def"]]
df = DataFrame()
df[!, :a] .= rhs
@test df == DataFrame(a = rhs)

df = DataFrame()
df[!, :a] .= length.(rhs)
@test df == DataFrame(a = length.(rhs))

df = DataFrame()
df[!, :a] .= length.(rhs) .+ 1
@test df == DataFrame(a = length.(rhs) .+ 1)

df = DataFrame()
@. df[!, :a] = length(rhs) + 1
@test df == DataFrame(a = length.(rhs) .+ 1)
end

df = DataFrame()
df .= 1
@test df == DataFrame()
df .= [1]
Expand All @@ -615,11 +642,27 @@ end
@test_throws DimensionMismatch df .= ones(1,2)
@test_throws DimensionMismatch df .= ones(1,1,1)

@test_throws ArgumentError df[!, :a] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= 1

df = DataFrame(a=[])
@test_throws ArgumentError df[!, :b] .= 1
@test_throws ArgumentError df[!, :b] .= sin.(1)
@test_throws ArgumentError df[!, :b] .= [1]
df[!, :b] .= 1
@test names(df) == [:a, :b]
@test eltype(df.b) == Int

c = categorical(["a", "b", "c"])
df = DataFrame()
df[!, :a] .= c
@test df.a == c
@test df.a !== c
@test df.a isa CategoricalVector
df[!, :b] .= c[1]
@test df.b == [c[1], c[1], c[1]]
@test df.b isa CategoricalVector
bkamins marked this conversation as resolved.
Show resolved Hide resolved

df = DataFrame()
df[!, :a] .= c[1]
@test df.a == c[1:1]
@test df.a isa CategoricalVector
end

@testset "test categorical values" begin
Expand Down Expand Up @@ -694,9 +737,11 @@ end

@testset "scalar on assignment side" begin
df = DataFrame(rand(2, 3))
df[1, 1] .= df[1, 1] .- df[1, 1]
@test_throws MethodError df[1, 1] .= df[1, 1] .- df[1, 1]
df[1, 1:1] .= df[1, 1] .- df[1, 1]
@test df[1, 1] == 0
df[1, 2] .-= df[1, 2]
@test_throws MethodError df[1, 2] .-= df[1, 2]
df[1:1, 2] .-= df[1, 2]
@test df[1, 2] == 0
end

Expand Down Expand Up @@ -940,26 +985,20 @@ end
@testset "additional checks of post-! broadcasting rules" begin
df = copy(refdf)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = copy(refdf)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = copy(refdf)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = copy(refdf)
Expand Down Expand Up @@ -1088,7 +1127,7 @@ end
@test df == refdf
@test_throws ArgumentError df[!, 10] .= [1,2,3]
@test df == refdf
@test_throws DimensionMismatch df[!, 10] .= [1 2 3]
@test_throws ArgumentError df[!, 10] .= [1 2 3]
@test df == refdf

df = copy(refdf)
Expand All @@ -1110,26 +1149,20 @@ end

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = view(copy(refdf), :, :)
Expand Down Expand Up @@ -1277,4 +1310,18 @@ end
@test df.a !== a
end

# Checks that `df[row, col] .= v` broadcasts into the contents of the selected
# cell, for integer, `Symbol` and `CartesianIndex` addressing.
@testset "add new correct rules for df[row, col] .= v broadcasting" begin
# the cell holds an integer: broadcasting into it is expected to throw `MethodError`
df = DataFrame(a=1)
@test_throws MethodError df[1,1] .= 10
@test_throws MethodError df[1,:a] .= 10
@test_throws MethodError df[CartesianIndex(1,1)] .= 10
# the cell holds a vector: the value is broadcast into every element of that vector
df = DataFrame(a=[[1,2,3]])
df[1,1] .= 10
@test df == DataFrame(a=[[10,10,10]])
df[1,:a] .= 100
@test df == DataFrame(a=[[100,100,100]])
df[CartesianIndex(1,1)] .= 1000
@test df == DataFrame(a=[[1000,1000,1000]])
end

end # module