From 2f9d423e338026ef6c4848a33f890fa23dc33ac7 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Wed, 15 Jul 2020 14:39:06 -0400 Subject: [PATCH] add Grouping contrasts (#339) * add Grouping contrasts * export and add a getproperty patch to work around statsmodels * warn users to use Grouping when OOM during schema call * Update src/grouping.jl Co-authored-by: Phillip Alday --- src/MixedModels.jl | 2 ++ src/grouping.jl | 34 ++++++++++++++++++++++++++++++++++ src/linearmixedmodel.jl | 10 +++++++++- test/grouping.jl | 21 +++++++++++++++++++++ test/runtests.jl | 1 + 5 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 src/grouping.jl create mode 100644 test/grouping.jl diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 8cebde08d..211ea10ae 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -39,6 +39,7 @@ export @formula, BlockedSparse, DummyCoding, EffectsCoding, + Grouping, Gamma, GeneralizedLinearMixedModel, HelmertCoding, @@ -155,5 +156,6 @@ include("linalg.jl") include("simulate.jl") include("bootstrap.jl") include("blockdescription.jl") +include("grouping.jl") end # module diff --git a/src/grouping.jl b/src/grouping.jl new file mode 100644 index 000000000..d4b3d3e8c --- /dev/null +++ b/src/grouping.jl @@ -0,0 +1,34 @@ + +""" + struct Grouping <: StatsModels.AbstractContrasts end + +A placeholder type to indicate that a categorical variable is only used for +grouping and not for contrasts. When creating a [`CategoricalTerm`](@ref), this +skips constructing the contrasts matrix which makes it robust to large numbers +of levels, while still holding onto the vector of levels and constructing the +level-to-index mapping (`invindex` field of the [`ContrastsMatrix`](@ref).). + +Note that calling `modelcols` on a `CategoricalTerm{Grouping}` is an error. + +# Examples + +```julia +julia> schema((; grp = string.(1:100_000))) +# out-of-memory error + +julia> schema((; grp = string.(1:100_000)), Dict(:grp => Grouping())) +""" +struct Grouping <: StatsModels.AbstractContrasts +end + +# return an empty matrix +StatsModels.contrasts_matrix(::Grouping, baseind, n) = zeros(0,0) +StatsModels.termnames(::Grouping, levels::AbstractVector, baseind::Integer) = levels + +# this is needed until StatsModels stops assuming all contrasts have a .levels field +Base.getproperty(g::Grouping, prop::Symbol) = + prop == :levels ? nothing : getfield(g, prop) + +# special-case categorical terms with Grouping contrasts. +StatsModels.modelcols(::CategoricalTerm{Grouping}, d::NamedTuple) = + error("can't create model columns directly from a Grouping term") diff --git a/src/linearmixedmodel.jl b/src/linearmixedmodel.jl index 9ea429e00..59c8be6c9 100644 --- a/src/linearmixedmodel.jl +++ b/src/linearmixedmodel.jl @@ -52,7 +52,15 @@ function LinearMixedModel( # TODO: perform missing_omit() after apply_schema() when improved # missing support is in a StatsModels release tbl, _ = StatsModels.missing_omit(tbl, f) - form = apply_schema(f, schema(f, tbl, contrasts), LinearMixedModel) + sch = try + schema(f, tbl, contrasts) + catch e + if isa(e, OutOfMemoryError) + @warn "Random effects grouping variables with many levels can cause out-of-memory errors. Try manually specifying `Grouping()` contrasts for those variables." + end + rethrow(e) + end + form = apply_schema(f, sch, LinearMixedModel) # tbl, _ = StatsModels.missing_omit(tbl, form) y, Xs = modelcols(form, tbl) diff --git a/test/grouping.jl b/test/grouping.jl new file mode 100644 index 000000000..930746444 --- /dev/null +++ b/test/grouping.jl @@ -0,0 +1,21 @@ +using Test +using StatsModels + +@testset "Grouping pseudo-contrasts" begin + d = (y = rand(2_000_000), grp=string.([1:1_000_000; 1:1_000_000])) + ## OOM seems to result in the process being killed on Mac so this messes up CI + # @test_throws OutOfMemoryError schema(d) + sch = schema(d, Dict(:grp => Grouping())) + t = sch[term(:grp)] + @test t isa CategoricalTerm{Grouping} + @test size(t.contrasts.matrix) == (0,0) + @test length(t.contrasts.levels) == 1_000_000 + + levs = sort(string.(1:1_000_000)) + + @test all(t.contrasts.invindex[lev] == i for (i,lev) in enumerate(levs)) + @test all(t.contrasts.levels[i] == lev for (i,lev) in enumerate(levs)) + + # @test_throws OutOfMemoryError fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d) + @test fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d, contrasts=Dict(:grp => Grouping())) isa LinearMixedModel +end diff --git a/test/runtests.jl b/test/runtests.jl index cf485f692..d94ba1e4f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,3 +12,4 @@ include("gausshermite.jl") include("fit.jl") include("missing.jl") include("likelihoodratiotest.jl") +include("grouping.jl")