diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6de797902f..6ca522c869 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,6 +44,8 @@ jobs: ${{ runner.os }}- - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + env: + JULIA_NUM_THREADS: 2 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v1 with: diff --git a/NEWS.md b/NEWS.md index e9e9295f88..9d361bceb7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,11 @@ ## New functionalities +* `combine`, `select` and `transform` with `GroupedDataFrame` now have + support multithreading for some optimized grouped reductions. + This can be enabled using an experimental global option via + `DataFrames.NTHREADS[] = n` (with n > 1) + ([#2491](https://github.com/JuliaData/DataFrames.jl/pull/2491)). ## Deprecated diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 0715c40a3e..4bbf13aac2 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -3,7 +3,7 @@ module DataFrames using Statistics, Printf, REPL using Reexport, SortingAlgorithms, Compat, Unicode, PooledArrays @reexport using Missings, InvertedIndices -using Base.Sort, Base.Order, Base.Iterators +using Base.Sort, Base.Order, Base.Iterators, Base.Threads using TableTraits, IteratorInterfaceExtensions import LinearAlgebra: norm using Markdown @@ -91,6 +91,8 @@ else export only end +const NTHREADS = Ref(1) + include("other/utils.jl") include("other/index.jl") diff --git a/src/groupeddataframe/fastaggregates.jl b/src/groupeddataframe/fastaggregates.jl index 4bce0e6c5f..bfcd66dc0c 100644 --- a/src/groupeddataframe/fastaggregates.jl +++ b/src/groupeddataframe/fastaggregates.jl @@ -163,18 +163,80 @@ function groupreduce!(res::AbstractVector, f, op, condf, adjust, checkempty::Boo counts = zeros(Int, n) end groups = gd.groups - @inbounds for i in eachindex(incol, groups) - gix = groups[i] - x = incol[i] - if gix > 0 && (condf === nothing || condf(x)) - # this check should be optimized out if U is not Any - if eltype(res) === Any && !isassigned(res, gix) - res[gix] = f(x, gix) - else - res[gix] = op(res[gix], f(x, gix)) + nt = min(NTHREADS[], Threads.nthreads()) + if nt <= 1 || axes(incol) != axes(groups) + @inbounds for i in eachindex(incol, groups) + gix = groups[i] + x = incol[i] + if gix > 0 && (condf === nothing || condf(x)) + # this check should be optimized out if eltype is not Any + if eltype(res) === Any && !isassigned(res, gix) + res[gix] = f(x, gix) + else + res[gix] = op(res[gix], f(x, gix)) + end + if adjust !== nothing || checkempty + counts[gix] += 1 + end end + end + else + res_vec = Vector{typeof(res)}(undef, nt) + # needs to be always allocated to fix type instability with @threads + counts_vec = Vector{Vector{Int}}(undef, nt) + res_vec[1] = res + if adjust !== nothing || checkempty + counts_vec[1] = counts + end + for i in 2:nt + res_vec[i] = copy(res) if adjust !== nothing || checkempty - counts[gix] += 1 + counts_vec[i] = zeros(Int, n) + end + end + @sync for tid in 1:nt + Threads.@spawn begin + res′ = res_vec[tid] + if adjust !== nothing || checkempty + counts′ = counts_vec[tid] + end + start = 1 + ((tid - 1) * length(groups)) ÷ nt + stop = (tid * length(groups)) ÷ nt + @inbounds for i in start:stop + gix = groups[i] + x = incol[i] + if gix > 0 && (condf === nothing || condf(x)) + # this check should be optimized out if eltype is not Any + if eltype(res′) === Any && !isassigned(res′, gix) + res′[gix] = f(x, gix) + else + res′[gix] = op(res′[gix], f(x, gix)) + end + if adjust !== nothing || checkempty + counts′[gix] += 1 + end + end + end + end + end + for i in 2:length(res_vec) + resi = res_vec[i] + @inbounds @simd for j in eachindex(res) + # this check should be optimized out if eltype is not Any + if eltype(res) === Any + if isassigned(resi, j) && isassigned(res, j) + res[j] = op(res[j], resi[j]) + elseif isassigned(resi, j) + res[j] = resi[j] + end + else + res[j] = op(res[j], resi[j]) + end + end + end + if adjust !== nothing || checkempty + for i in 2:length(counts_vec) + counts .+= counts_vec[i] end end end diff --git a/test/grouping.jl b/test/grouping.jl index 1e84898e53..c8868092ef 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -98,6 +98,32 @@ function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) return ogd end +function combine_checked(gd::GroupedDataFrame, args...; kwargs...) + @assert DataFrames.NTHREADS[] == 1 + res1 = combine(gd, args...; kwargs...) + if Base.Threads.nthreads() > 1 + DataFrames.NTHREADS[] = 2 + res2 = combine(gd, args...; kwargs...) + DataFrames.NTHREADS[] = 1 + else + @warn "Only one thread available: multithreading not tested" + end + @test names(res1) == names(res2) + for (c1, c2) in zip(eachcol(res1), eachcol(res2)) + @test typeof(c1) === typeof(c2) + if eltype(c1) <: Union{AbstractFloat, Missing} + @test ismissing.(c1) == ismissing.(c2) + if !all(ismissing, c1) + @test isapprox(collect(skipmissing(c1)), collect(skipmissing(c2)), + nans=true) + end + else + @test c1 ≅ c2 + end + end + res1 +end + @testset "parent" begin df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8]) gd = groupby_checked(df, :a) @@ -911,7 +937,7 @@ Base.isless(::TestType, ::TestType) = false gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x1 => f => :y) + res = combine_checked(gd, :x1 => f => :y) expected = combine(gd, :x1 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -921,7 +947,7 @@ Base.isless(::TestType, ::TestType) = false df.x3 = Vector{T}(df.x1) gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) + res = combine_checked(gd, :x3 => f => :y) expected = combine(gd, :x3 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -933,11 +959,11 @@ Base.isless(::TestType, ::TestType) = false df.x3[1] = missing gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) + res = combine_checked(gd, :x3 => f => :y) expected = combine(gd, :x3 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) - res = combine(gd, :x3 => f∘skipmissing => :y) + res = combine_checked(gd, :x3 => f∘skipmissing => :y) expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -949,7 +975,7 @@ Base.isless(::TestType, ::TestType) = false if f in (maximum, minimum, first, last) @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y) else - res = combine(gd, :x3 => f∘skipmissing => :y) + res = combine_checked(gd, :x3 => f∘skipmissing => :y) expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -960,7 +986,7 @@ Base.isless(::TestType, ::TestType) = false gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x2 => f => :y) + res = combine_checked(gd, :x2 => f => :y) expected = combine(gd, :x2 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -973,14 +999,14 @@ Base.isless(::TestType, ::TestType) = false m && (df.x3[1] = missing) gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) + res = combine_checked(gd, :x3 => f => :y) expected = combine(gd, :x3 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) f === length && continue - res = combine(gd, :x3 => f∘skipmissing => :y) + res = combine_checked(gd, :x3 => f∘skipmissing => :y) expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -989,7 +1015,7 @@ Base.isless(::TestType, ::TestType) = false @test_throws ArgumentError combine(gd, :x3 => f∘skipmissing => :y) end end - @test combine(gd, :x1 => maximum => :y, :x2 => sum => :z) ≅ + @test combine_checked(gd, :x1 => maximum => :y, :x2 => sum => :z) ≅ combine(gd, :x1 => (x -> maximum(x)) => :y, :x2 => (x -> sum(x)) => :z) # Test floating point corner cases @@ -1000,7 +1026,7 @@ Base.isless(::TestType, ::TestType) = false gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x1 => f => :y) + res = combine_checked(gd, :x1 => f => :y) expected = combine(gd, :x1 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) @@ -1011,18 +1037,18 @@ Base.isless(::TestType, ::TestType) = false df.x3[1] = missing gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices - res = combine(gd, :x3 => f => :y) + res = combine_checked(gd, :x3 => f => :y) expected = combine(gd, :x3 => (x -> f(x)) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) - res = combine(gd, :x3 => f∘skipmissing => :y) + res = combine_checked(gd, :x3 => f∘skipmissing => :y) expected = combine(gd, :x3 => (x -> f(collect(skipmissing(x)))) => :y) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) end df = DataFrame(x = [1, 1, 2, 2], y = Any[1, 2.0, 3.0, 4.0]) - res = combine(groupby_checked(df, :x), :y => maximum => :z) + res = combine_checked(groupby_checked(df, :x), :y => maximum => :z) @test res.z isa Vector{Float64} @test res.z == combine(groupby_checked(df, :x), :y => (x -> maximum(x)) => :z).z @@ -1031,7 +1057,7 @@ Base.isless(::TestType, ::TestType) = false gd = groupby_checked(df, :x, skipmissing=skip, sort=sort) indices && @test gd.idx !== nothing # Trigger computation of indices for f in (maximum, minimum) - res = combine(gd, :y => maximum => :z) + res = combine_checked(gd, :y => maximum => :z) @test res.z isa Vector{Any} @test res.z == combine(gd, :y => (x -> maximum(x)) => :z).z end @@ -2570,43 +2596,67 @@ end @testset "corner cases of group_reduce" begin df = DataFrame(g=[1, 1, 1, 2, 2, 2], x=Any[1, 1, 1, 1.5, 1.5, 1.5]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum) == DataFrame(g=1:2, x_sum=[3.0, 4.5]) - - @test combine(gdf, :x => sum∘skipmissing) == DataFrame(g=1:2, x_sum_skipmissing=[3.0, 4.5]) - @test combine(gdf, :x => mean∘skipmissing) == DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.5]) - @test combine(gdf, :x => var∘skipmissing) == DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) - @test combine(gdf, :x => mean) == DataFrame(g=1:2, x_mean=[1.0, 1.5]) - @test combine(gdf, :x => var) == DataFrame(g=1:2, x_var=[0.0, 0.0]) + @test combine_checked(gdf, :x => sum) == DataFrame(g=1:2, x_sum=[3.0, 4.5]) + + @test combine_checked(gdf, :x => sum∘skipmissing) == + DataFrame(g=1:2, x_sum_skipmissing=[3.0, 4.5]) + @test combine_checked(gdf, :x => mean∘skipmissing) == + DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.5]) + @test combine_checked(gdf, :x => var∘skipmissing) == + DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) + @test combine_checked(gdf, :x => mean) == + DataFrame(g=1:2, x_mean=[1.0, 1.5]) + @test combine_checked(gdf, :x => var) == + DataFrame(g=1:2, x_var=[0.0, 0.0]) df = DataFrame(g=[1, 1, 1, 2, 2, 2], x=Any[1, 1, 1, 1, 1, missing]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum∘skipmissing) == DataFrame(g=1:2, x_sum_skipmissing=[3, 2]) - @test combine(gdf, :x => mean∘skipmissing) == DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.0]) - @test combine(gdf, :x => var∘skipmissing) == DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) - @test combine(gdf, :x => sum) ≅ DataFrame(g=1:2, x_sum=[3, missing]) - @test combine(gdf, :x => mean) ≅ DataFrame(g=1:2, x_mean=[1.0, missing]) - @test combine(gdf, :x => var) ≅ DataFrame(g=1:2, x_var=[0.0, missing]) + @test combine_checked(gdf, :x => sum∘skipmissing) == + DataFrame(g=1:2, x_sum_skipmissing=[3, 2]) + @test combine_checked(gdf, :x => mean∘skipmissing) == + DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.0]) + @test combine_checked(gdf, :x => var∘skipmissing) == + DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) + @test combine_checked(gdf, :x => sum) ≅ + DataFrame(g=1:2, x_sum=[3, missing]) + @test combine_checked(gdf, :x => mean) ≅ + DataFrame(g=1:2, x_mean=[1.0, missing]) + @test combine_checked(gdf, :x => var) ≅ + DataFrame(g=1:2, x_var=[0.0, missing]) df = DataFrame(g=[1, 1, 1, 2, 2, 2], x=Union{Real, Missing}[1, 1, 1, 1, 1, missing]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum∘skipmissing) == DataFrame(g=1:2, x_sum_skipmissing=[3, 2]) - @test combine(gdf, :x => mean∘skipmissing) == DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.0]) - @test combine(gdf, :x => var∘skipmissing) == DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) - @test combine(gdf, :x => sum) ≅ DataFrame(g=1:2, x_sum=[3, missing]) - @test combine(gdf, :x => mean) ≅ DataFrame(g=1:2, x_mean=[1.0, missing]) - @test combine(gdf, :x => var) ≅ DataFrame(g=1:2, x_var=[0.0, missing]) + @test combine_checked(gdf, :x => sum∘skipmissing) == + DataFrame(g=1:2, x_sum_skipmissing=[3, 2]) + @test combine_checked(gdf, :x => mean∘skipmissing) == + DataFrame(g=1:2, x_mean_skipmissing=[1.0, 1.0]) + @test combine_checked(gdf, :x => var∘skipmissing) == + DataFrame(g=1:2, x_var_skipmissing=[0.0, 0.0]) + @test combine_checked(gdf, :x => sum) ≅ + DataFrame(g=1:2, x_sum=[3, missing]) + @test combine_checked(gdf, :x => mean) ≅ + DataFrame(g=1:2, x_mean=[1.0, missing]) + @test combine_checked(gdf, :x => var) ≅ + DataFrame(g=1:2, x_var=[0.0, missing]) Random.seed!(1) df = DataFrame(g = rand(1:2, 1000), x1 = rand(Int, 1000)) df.x2 = big.(df.x1) gdf = groupby_checked(df, :g) - res = combine(gdf, :x1 => sum, :x2 => sum, :x1 => x -> sum(x), :x2 => x -> sum(x)) + res = combine_checked(gdf, + :x1 => sum, :x2 => sum, + :x1 => x -> sum(x), + :x2 => x -> sum(x)) @test res.x1_sum == res.x1_function @test res.x2_sum == res.x2_function @test res.x1_sum != res.x2_sum # we are large enough to be sure we differ - res = combine(gdf, :x1 => mean, :x2 => mean, :x1 => x -> mean(x), :x2 => x -> mean(x)) + res = combine_checked(gdf, + :x1 => mean, + :x2 => mean, + :x1 => x -> mean(x), + :x2 => x -> mean(x)) if VERSION >= v"1.5" @test res.x1_mean ≈ res.x1_function else @@ -2618,71 +2668,87 @@ end # make sure we do correct promotions in corner case similar to Base df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=Real[1, 1, big(typemax(Int)), 1, 1, 1]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] == sum(df.x) - @test eltype(combine(gdf, :x => sum)[!, 2]) === BigInt + @test combine_checked(gdf, :x => sum)[1, 2] == sum(df.x) + @test eltype(combine_checked(gdf, :x => sum)[!, 2]) === BigInt df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=Real[1, 1, typemax(Int), 1, 1, 1]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] == sum(df.x) - @test eltype(combine(gdf, :x => sum)[!, 2]) === Int + @test combine_checked(gdf, :x => sum)[1, 2] == sum(df.x) + @test eltype(combine_checked(gdf, :x => sum)[!, 2]) === Int df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=fill(missing, 6)) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] isa Missing - @test eltype(combine(gdf, :x => sum)[!, 2]) === Missing + res = combine_checked(gdf, :x => sum) + @test res[1, 2] isa Missing + @test eltype(res[!, 2]) === Missing @test_throws MethodError combine(gdf, :x => sum∘skipmissing) df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=convert(Vector{Union{Real, Missing}}, fill(missing, 6))) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] isa Missing - @test eltype(combine(gdf, :x => sum)[!, 2]) === Missing - @test combine(gdf, :x => sum∘skipmissing) == DataFrame(g=1, x_sum_skipmissing=0) - @test eltype(combine(gdf, :x => sum∘skipmissing)[!, 2]) === Int + @test combine_checked(gdf, :x => sum)[1, 2] isa Missing + @test eltype(combine_checked(gdf, :x => sum)[!, 2]) === Missing + @test combine_checked(gdf, :x => sum∘skipmissing) == DataFrame(g=1, x_sum_skipmissing=0) + @test eltype(combine_checked(gdf, :x => sum∘skipmissing)[!, 2]) === Int df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=convert(Vector{Union{Int, Missing}}, fill(missing, 6))) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] isa Missing - @test eltype(combine(gdf, :x => sum)[!, 2]) === Missing - @test combine(gdf, :x => sum∘skipmissing)[1, 2] == 0 - @test eltype(combine(gdf, :x => sum∘skipmissing)[!, 2]) === Int + @test combine_checked(gdf, :x => sum)[1, 2] isa Missing + @test eltype(combine_checked(gdf, :x => sum)[!, 2]) === Missing + @test combine_checked(gdf, :x => sum∘skipmissing)[1, 2] == 0 + @test eltype(combine_checked(gdf, :x => sum∘skipmissing)[!, 2]) === Int df = DataFrame(g=[1, 1, 1, 1, 1, 1], x=convert(Vector{Any}, fill(missing, 6))) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum)[1, 2] isa Missing - @test eltype(combine(gdf, :x => sum)[!, 2]) === Missing + res = combine_checked(gdf, :x => sum) + @test res[1, 2] isa Missing + @test eltype(res[!, 2]) === Missing @test_throws MethodError combine(gdf, :x => sum∘skipmissing) # these questions can go to a final exam in "mastering combine" class df = DataFrame(g=[1, 2, 3], x=["a", "b", "c"]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum => :a, :x => prod => :b) == - combine(gdf, :x => (x -> sum(x)) => :a, :x => (x -> prod(x)) => :b) + @test combine_checked(gdf, :x => sum => :a, :x => prod => :b) == + combine_checked(gdf, :x => (x -> sum(x)) => :a, :x => (x -> prod(x)) => :b) + df = DataFrame(g=[1, 2, 3], x=Any["a", "b", "c"]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum => :a, :x => prod => :b) == - combine(gdf, :x => (x -> sum(x)) => :a, :x => (x -> prod(x)) => :b) + @test combine_checked(gdf, :x => sum => :a, :x => prod => :b) == + combine_checked(gdf, :x => (x -> sum(x)) => :a, :x => (x -> prod(x)) => :b) + df = DataFrame(g=[1, 1], x=[missing, "a"]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum∘skipmissing => :a, :x => prod∘skipmissing => :b) == - combine(gdf, :x => (x -> sum(skipmissing(x))) => :a, :x => (x -> prod(skipmissing(x))) => :b) + @test combine_checked(gdf, + :x => sum∘skipmissing => :a, + :x => prod∘skipmissing => :b) == + combine_checked(gdf, + :x => (x -> sum(skipmissing(x))) => :a, + :x => (x -> prod(skipmissing(x))) => :b) + df = DataFrame(g=[1, 1], x=Any[missing, "a"]) gdf = groupby_checked(df, :g) - @test combine(gdf, :x => sum∘skipmissing => :a, :x => prod∘skipmissing => :b) == - combine(gdf, :x => (x -> sum(skipmissing(x))) => :a, :x => (x -> prod(skipmissing(x))) => :b) + @test combine_checked(gdf, + :x => sum∘skipmissing => :a, + :x => prod∘skipmissing => :b) == + combine_checked(gdf, + :x => (x -> sum(skipmissing(x))) => :a, + :x => (x -> prod(skipmissing(x))) => :b) df = DataFrame(g=[1, 2], x=Any[nothing, "a"]) gdf = groupby_checked(df, :g) - df2 = combine(gdf, :x => sum => :a, :x => prod => :b) + df2 = combine_checked(gdf, :x => sum => :a, :x => prod => :b) @test df2 == DataFrame(g=[1, 2], a=[nothing, "a"], b=[nothing, "a"]) @test eltype(df2.a) === eltype(df2.b) === Union{Nothing, String} + df = DataFrame(g=[1, 2], x=Any[1, 1.0]) gdf = groupby_checked(df, :g) - df2 = combine(gdf, :x => sum => :a, :x => prod => :b) + df2 = combine_checked(gdf, :x => sum => :a, :x => prod => :b) @test df2 == DataFrame(g=[1, 2], a=ones(2), b=ones(2)) @test eltype(df2.a) === eltype(df2.b) === Float64 + df = DataFrame(g=[1, 2], x=[1, "1"]) gdf = groupby_checked(df, :g) - df2 = combine(gdf, :x => sum => :a, :x => prod => :b) + df2 = combine_checked(gdf, :x => sum => :a, :x => prod => :b) @test df2 == DataFrame(g=[1, 2], a=[1, "1"], b=[1, "1"]) @test eltype(df2.a) === eltype(df2.b) === Any + df = DataFrame(g=[1, 1, 2], x=[UInt8(1), UInt8(1), missing]) gdf = groupby_checked(df, :g) - df2 = combine(gdf, :x => sum => :a, :x => prod => :b) + df2 = combine_checked(gdf, :x => sum => :a, :x => prod => :b) @test df2 ≅ DataFrame(g=[1, 2], a=[2, missing], b=[1, missing]) @test eltype(df2.a) === eltype(df2.b) === Union{UInt, Missing} end