Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow scalar broadcasting into an empty data frame #1890

Merged
merged 12 commits into from
Jul 25, 2019
5 changes: 3 additions & 2 deletions docs/src/lib/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,10 @@ In such an operation `AbstractDataFrame` is considered as two-dimensional and `D
`DataFrameRow` is considered to be column-oriented.

Additional rules:
* in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` and `df[row, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[CartesianIndex(row, col)] .= v` and `df[row, col] .= v` syntaxes `v` is broadcasted into the contents of `df[row, col]` (this is consistent with Base behavior);
bkamins marked this conversation as resolved.
Show resolved Hide resolved
* in the `df[row, cols] .= v` syntax the assignment to `df` is performed in-place;
* in the `df[rows, col] .= v` and `df[rows, cols] .= v` syntaxes the assignment to `df` is performed in-place;
* in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` and it is missing from `df` then a new column is added;
* in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; if `col` is `Symbol` and it is missing from `df` then a new column is added; the length of the column is always the value of `nrow(df)` before the assignment takes place;
* `df[!, cols] .= v` syntax is currently disallowed, but is planned to be supported in the future;
* `df.col .= v` syntax is allowed and performs in-place assignment to an existing vector `df.col`.
* in the `sdf[CartesianIndex(row, col)] .= v`, `sdf[row, col] .= v` and `sdf[row, cols] .= v` syntaxes the assignment to `sdf` is performed in-place;
Expand Down
23 changes: 11 additions & 12 deletions src/other/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ struct LazyNewColDataFrame{T}
col::T
end

# we allow LazyNewColDataFrame only for data frames with at least one column
# we allow LazyNewColDataFrame also for data frames
# that are empty, i.e. `nrow(df) == 0`; in this case we create a zero-length column
bkamins marked this conversation as resolved.
Show resolved Hide resolved
Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# ColReplaceDataFrame is reserved for future extensions if we decide to allow df[!, cols] .= v
Expand All @@ -90,15 +91,16 @@ Base.axes(x::LazyNewColDataFrame) = (Base.OneTo(nrow(x.df)),)

# Base.axes(x::ColReplaceDataFrame) = axes(x.df)

Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = view(df, idx[1], idx[2])
Base.maybeview(df::AbstractDataFrame, idx::CartesianIndex{2}) = df[idx]
Base.maybeview(df::AbstractDataFrame, row::Integer, col::ColumnIndex) = df[row, col]
Base.maybeview(df::AbstractDataFrame, rows, cols) = view(df, rows, cols)

function Base.maybeview(df::DataFrame, ::typeof(!), cols)
if !(cols isa ColumnIndex)
throw(ArgumentError("broadcasting with column replacement is currently allowed only for single column index"))
end
if ncol(df) == 0
throw(ArgumentError("broadcasting into a data frame with no columns is not allowed"))
if !(cols isa Symbol) && cols > ncol(df)
throw(ArgumentError("creating new columns using an integer index by broadcasting is disallowed"))
end
# in the future we might allow cols to target multiple columns
# in which case ColReplaceDataFrame(df, index(df)[cols]) will be returned
Expand All @@ -108,14 +110,11 @@ end
Base.maybeview(df::SubDataFrame, ::typeof(!), idxs) =
throw(ArgumentError("broadcasting with ! row selector is not allowed for SubDataFrame"))

function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted)
if isempty(lazydf.df)
throw(ArgumentError("creating a column via broadcasting is not allowed on empty data frames"))
end
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}} &&
bc.f === identity && bc.args isa Tuple{Any} && Base.Broadcast.isflat(bc)
T = typeof(bc.args[1][])
col = similar(Vector{T}, nrow(lazydf.df))
function Base.copyto!(lazydf::LazyNewColDataFrame, bc::Base.Broadcast.Broadcasted{T}) where T
if bc isa Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}}
bc_tmp = Base.Broadcast.Broadcasted{T}(bc.f, bc.args, ())
v = Base.Broadcast.materialize(bc_tmp)
col = similar(Vector{typeof(v)}, nrow(lazydf.df))
copyto!(col, bc)
else
col = Base.Broadcast.materialize(bc)
Expand Down
124 changes: 96 additions & 28 deletions test/broadcasting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,14 +598,67 @@ end
@test_throws BoundsError dfr[10] .= ones(3)
@test_throws ArgumentError dfr[:z] .= ones(3)
@test df == cdf

df = DataFrame()
@test_throws DimensionMismatch df[!, :a] .= sin.(1:3)
df[!, :b] .= sin.(1)
df[!, :c] .= sin(1) .+ 1
@test df == DataFrame(b=Float64[], c=Float64[])
end

@testset "empty data frame corner case" begin
df = DataFrame()
@test_throws ArgumentError df[!, 1] .= 1
@test_throws ArgumentError df[!, :a] .= [1]
@test_throws ArgumentError df[!, [:a,:b]] .= [1]
@test df == DataFrame()
@test_throws ArgumentError df[!, 2] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= [1]
@test_throws ArgumentError df[!, [:a, :b]] .= 1
@test_throws DimensionMismatch df[!, :a] .= [1 2]
@test_throws DimensionMismatch df[!, :a] .= [1, 2]
@test_throws DimensionMismatch df[!, :a] .= sin.(1) .+ [1, 2]

for rhs in [1, [1], Int[], "abc", ["abc"]]
df = DataFrame()
df[!, :a] .= rhs
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs))

df = DataFrame()
df[!, :a] .= length.(rhs)
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame()
df[!, :a] .= length.(rhs) .+ 1
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame()
@. df[!, :a] = length(rhs) + 1
@test size(df) == (0, 1)
@test eltype(df[!, 1]) == Int

df = DataFrame(x=Int[])
df[!, :a] .= rhs
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs))

df = DataFrame(x=Int[])
df[!, :a] .= length.(rhs)
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int

df = DataFrame(x=Int[])
df[!, :a] .= length.(rhs) .+ 1
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int

df = DataFrame(x=Int[])
@. df[!, :a] = length(rhs) + 1
@test size(df) == (0, 2)
@test eltype(df[!, 2]) == Int
end

df = DataFrame()
df .= 1
@test df == DataFrame()
df .= [1]
Expand All @@ -615,11 +668,22 @@ end
@test_throws DimensionMismatch df .= ones(1,2)
@test_throws DimensionMismatch df .= ones(1,1,1)

@test_throws ArgumentError df[!, :a] .= 1
@test_throws ArgumentError df[!, [:a, :b]] .= 1

df = DataFrame(a=[])
@test_throws ArgumentError df[!, :b] .= 1
df[!, :b] .= sin.(1)
@test eltype(df.b) == Float64
df[!, :b] .= [1]
@test eltype(df.b) == Int
df[!, :b] .= 'a'
@test eltype(df.b) == Char
@test names(df) == [:a, :b]

c = categorical(["a", "b", "c"])
df = DataFrame()
@test_throws DimensionMismatch df[!, :a] .= c

df[!, :b] .= c[1]
@test nrow(df) == 0
@test df.b isa CategoricalVector
bkamins marked this conversation as resolved.
Show resolved Hide resolved
end

@testset "test categorical values" begin
Expand Down Expand Up @@ -694,9 +758,11 @@ end

@testset "scalar on assignment side" begin
df = DataFrame(rand(2, 3))
df[1, 1] .= df[1, 1] .- df[1, 1]
@test_throws MethodError df[1, 1] .= df[1, 1] .- df[1, 1]
df[1, 1:1] .= df[1, 1] .- df[1, 1]
@test df[1, 1] == 0
df[1, 2] .-= df[1, 2]
@test_throws MethodError df[1, 2] .-= df[1, 2]
df[1:1, 2] .-= df[1, 2]
@test df[1, 2] == 0
end

Expand Down Expand Up @@ -940,26 +1006,20 @@ end
@testset "additional checks of post-! broadcasting rules" begin
df = copy(refdf)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = copy(refdf)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = copy(refdf)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = copy(refdf)
Expand Down Expand Up @@ -1088,7 +1148,7 @@ end
@test df == refdf
@test_throws ArgumentError df[!, 10] .= [1,2,3]
@test df == refdf
@test_throws DimensionMismatch df[!, 10] .= [1 2 3]
@test_throws ArgumentError df[!, 10] .= [1 2 3]
@test df == refdf

df = copy(refdf)
Expand All @@ -1110,26 +1170,20 @@ end

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[CartesianIndex(1, 1)] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[CartesianIndex(1, 1)] .= 1
@test_throws MethodError df[CartesianIndex(1, 1)] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[CartesianIndex(1, 1)] .= [1,2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, 1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, 1] .= 1
@test_throws MethodError df[1, 1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, 1] .= [1, 2]

df = view(copy(refdf), :, :)
v1 = df[!, 1]
df[1, :x1] .= VERSION >= v"1.1.0" ? 'd' : Ref('d')
@test v1 == [100.0, 2.5, 3.5]
@test_throws MethodError df[1, :x1] .= 1
@test_throws MethodError df[1, :x1] .= "d"
@test v1 == [100.0, 2.5, 3.5]
@test_throws DimensionMismatch df[1, :x1] .= [1, 2]

df = view(copy(refdf), :, :)
Expand Down Expand Up @@ -1277,4 +1331,18 @@ end
@test df.a !== a
end

@testset "add new correct rules for df[row, col] .= v broadcasting" begin
df = DataFrame(a=1)
@test_throws MethodError df[1,1] .= 10
@test_throws MethodError df[1,:a] .= 10
@test_throws MethodError df[CartesianIndex(1,1)] .= 10
df = DataFrame(a=[[1,2,3]])
df[1,1] .= 10
@test df == DataFrame(a=[[10,10,10]])
df[1,:a] .= 100
@test df == DataFrame(a=[[100,100,100]])
df[CartesianIndex(1,1)] .= 1000
@test df == DataFrame(a=[[1000,1000,1000]])
end

end # module