auto apply Grouping() to grouping variables (#652)

JuliaStats · Sep 1, 2023 · 9ee4a0a · 9ee4a0a · palday · Sep 1, 2023
1 parent d2a05aa
commit 9ee4a0a
Show file tree

Hide file tree

Showing 10 changed files with 188 additions and 46 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+MixedModels v4.21.0 Release Notes
+==============================
+* Auto apply `Grouping()` to grouping variables that don't already have an explicit contrast. [#652]
+
 MixedModels v4.20.0 Release Notes
 ==============================
 * The `.tbl` property of a `MixedModelBootstrap` now includes the correlation parameters for lower triangular elements of the `λ` field. [#702]
@@ -452,6 +456,7 @@ Package dependencies
 [#639]: https://github.com/JuliaStats/MixedModels.jl/issues/639
 [#648]: https://github.com/JuliaStats/MixedModels.jl/issues/648
 [#651]: https://github.com/JuliaStats/MixedModels.jl/issues/651
+[#652]: https://github.com/JuliaStats/MixedModels.jl/issues/652
 [#653]: https://github.com/JuliaStats/MixedModels.jl/issues/653
 [#657]: https://github.com/JuliaStats/MixedModels.jl/issues/657
 [#663]: https://github.com/JuliaStats/MixedModels.jl/issues/663

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MixedModels"
 uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
 author = ["Phillip Alday <me@phillipalday.com>", "Douglas Bates <dmbates@gmail.com>", "Jose Bayoan Santiago Calderon <jbs3hp@virginia.edu>"]
-version = "4.20.0"
+version = "4.21.0"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/src/MixedModels.jl b/src/MixedModels.jl
@@ -211,7 +211,8 @@ using PrecompileTools
         fit(MixedModel,
             @formula(use ~ 1 + age + abs2(age) + urban + livch + (1 | urban & dist)),
             contra,
-            Bernoulli(); progress)
+            Bernoulli();
+            progress)
     end
 end
 

diff --git a/src/grouping.jl b/src/grouping.jl
@@ -1,4 +1,3 @@
-
 """
     struct Grouping <: StatsModels.AbstractContrasts end
 
@@ -34,3 +33,30 @@ function StatsModels.ContrastsMatrix(
 )
     return StatsModels.ContrastsMatrix(zeros(0, 0), levels, levels, contrasts)
 end
+
+# this arises when there's an interaction as a grouping variable without a corresponding
+# non-interaction grouping, e.g. urban&dist in the contra dataset
+# adapted from https://github.com/JuliaStats/StatsModels.jl/blob/463eb0acb49bc5428374d749c4da90ea2a6c74c4/src/schema.jl#L355-L372
+function StatsModels.apply_schema(
+    t::CategoricalTerm{Grouping},
+    schema::FullRank,
+    ::Type{<:MixedModel},
+    context::AbstractTerm,
+)
+    aliased = drop_term(context, t)
+    @debug "$t in context of $context: aliases $aliased\n  seen already: $(schema.already)"
+    for seen in schema.already
+        if StatsModels.symequal(aliased, seen)
+            @debug "  aliased term already present: $seen"
+            return t
+        end
+    end
+    # aliased term not seen already:
+    # add aliased term to already seen:
+    push!(schema.already, aliased)
+    # repair:
+    new_contrasts = StatsModels.ContrastsMatrix(Grouping(), t.contrasts.levels)
+    t = CategoricalTerm(t.sym, new_contrasts)
+    @debug "  aliased term absent, repairing: $t"
+    return t
+end
diff --git a/src/linearmixedmodel.jl b/src/linearmixedmodel.jl
@@ -69,22 +69,12 @@ function LinearMixedModel(
     # TODO: perform missing_omit() after apply_schema() when improved
     # missing support is in a StatsModels release
     tbl, _ = StatsModels.missing_omit(tbl, f)
-    sch = try
-        schema(f, tbl, contrasts)
-    catch e
-        if isa(e, OutOfMemoryError)
-            @warn "Random effects grouping variables with many levels can cause out-of-memory errors.  Try manually specifying `Grouping()` contrasts for those variables."
-        end
-        rethrow(e)
-    end
-    form = apply_schema(f, sch, LinearMixedModel)
 
+    form = schematize(f, tbl, contrasts)
     if form.rhs isa MatrixTerm || !any(x -> isa(x, AbstractReTerm), form.rhs)
         throw(_MISSING_RE_ERROR)
     end
 
-    # tbl, _ = StatsModels.missing_omit(tbl, form)
-
     y, Xs = modelcols(form, tbl)
 
     return LinearMixedModel(y, Xs, form, wts, σ, amalgamate)

diff --git a/src/randomeffectsterm.jl b/src/randomeffectsterm.jl
@@ -31,6 +31,26 @@ function StatsModels.terms(t::RandomEffectsTerm)
     return union(StatsModels.terms(t.lhs), StatsModels.terms(t.rhs))
 end
 
+schema(t, data, hints) = StatsModels.schema(t, data, hints)
+
+function schema(t::FunctionTerm{typeof(|)}, data, hints::Dict{Symbol})
+    sch = schema(t.args[1], data, hints)
+    vars = StatsModels.termvars.(t.args[2])
+    # in the event that someone has x|x, then the Grouping()
+    # gets overwrriten by the broader schema BUT
+    # that doesn't matter because we detect and throw an error
+    # for that in apply_schema
+    grp_hints = Dict(rr => Grouping() for rr in vars)
+    return merge(schema(t.args[2], data, grp_hints), sch)
+end
+
+is_randomeffectsterm(::Any) = false
+is_randomeffectsterm(::AbstractReTerm) = true
+# RE with free covariance structure
+is_randomeffectsterm(::FunctionTerm{typeof(|)}) = true
+# not zerocorr() or the like
+is_randomeffectsterm(tt::FunctionTerm) = is_randomeffectsterm(tt.args[1])
+
 # | in MixedModel formula -> RandomEffectsTerm
 function StatsModels.apply_schema(
     t::FunctionTerm{typeof(|)},
@@ -81,7 +101,6 @@ function StatsModels.apply_schema(
     )
         lhs = InterceptTerm{true}() + lhs
     end
-
     lhs, rhs = apply_schema.((lhs, rhs), Ref(schema), Mod)
 
     # check whether grouping terms are categorical or interaction of categorical
@@ -230,6 +249,10 @@ function Base.show(io::IO, ::MIME"text/plain", t::ZeroCorr)
     return print(io, "zerocorr", t.term)
 end
 
+function schema(t::FunctionTerm{typeof(zerocorr)}, data, hints::Dict{Symbol})
+    return schema(only(t.args), data, hints)
+end
+
 function StatsModels.apply_schema(
     t::FunctionTerm{typeof(zerocorr)}, sch::MultiSchema, Mod::Type{<:MixedModel}
 )

diff --git a/src/schema.jl b/src/schema.jl
@@ -49,3 +49,38 @@ function StatsModels.apply_schema(t::FormulaTerm, schema::Schema, Mod::Type{<:Mi
         collect_matrix_terms(apply_schema(t.rhs, MultiSchema(schema), Mod)),
     )
 end
+
+"""
+    schematize(f, tbl, contrasts::Dict{Symbol}, Mod=LinearMixedModel)
+
+Find and apply the schema for f in a way that automatically uses `Grouping()`
+contrasts when appropriate.
+
+!!! warn
+    This is an internal method.
+"""
+function schematize(f, tbl, contrasts::Dict{Symbol}, Mod=LinearMixedModel)
+    # if there is only one term on the RHS, then you don't have an iterator
+    # also we want this to be a vector so we can sort later
+    rhs = f.rhs isa AbstractTerm ? [f.rhs] : collect(f.rhs)
+    fe = filter(!is_randomeffectsterm, rhs)
+    # init with lhs so we don't need an extra merge later
+    # and so that things work even when we have empty fixed effects
+    init = schema(f.lhs, tbl, contrasts)
+    sch_fe = mapfoldl(merge, fe; init) do tt
+        return schema(tt, tbl, contrasts)
+    end
+    re = filter(is_randomeffectsterm, rhs)
+    sch_re = mapfoldl(merge, re; init) do tt
+        # this allows us to control dispatch on a more subtle level
+        # and force things to use the schema
+        return schema(tt, tbl, contrasts)
+    end
+    # we want to make sure we don't overwrite any schema
+    # determined on the basis of the fixed effects
+    # recall: merge prefers the entry in the second argument when there's a duplicate key
+    # XXX could we take advantage of MultiSchema here?
+    sch = merge(sch_re, sch_fe)
+
+    return apply_schema(f, sch, Mod)
+end
diff --git a/test/FactorReTerm.jl b/test/FactorReTerm.jl
@@ -165,23 +165,6 @@ end
     end
 end
 
-@testset "Categorical Blocking Variable" begin
-    # deepcopy because we're going to modify it.  Don't need the copy if dataset returns an Arrow.Table
-    #slp = deepcopy(DataFrame(dataset("sleepstudy")))
-    slp = DataFrame(dataset("sleepstudy"))
-    contrasts =  Dict{Symbol,Any}()
-    f = @formula(reaction ~ 1 + (1|subj))
-
-    # String blocking-variables work fine because StatsModels is smart enough to
-    # treat strings as Categorical. Note however that this is a
-    # far less efficient to store the original dataframe, although it doesn't
-    # matter for the contrast matrix
-    slp[!,:subj] = convert.(String, slp[!, :subj])
-    # @test_throws ArgumentError LinearMixedModel(f, slp)
-    slp.subj = parse.(Int, getindex.(slp.subj, Ref(2:4)))
-    @test_throws ArgumentError LinearMixedModel(f, slp)
-end
-
 @testset "random effects term syntax" begin
 
     dat = (y = rand(18),

diff --git a/test/grouping.jl b/test/grouping.jl
@@ -1,13 +1,19 @@
-using Test
+using MixedModels
 using StatsModels
+using Test
+
+using MixedModels: schematize
+using StatsModels: ContrastsMatrix, FullDummyCoding
 
 @testset "Grouping" begin
     g = Grouping()
     @test isnothing(g.levels)
 end
 
 @testset "Grouping pseudo-contrasts" begin
-    d = (y = rand(2_000_000), grp=string.([1:1_000_000; 1:1_000_000]))
+    d = (; y=rand(2_000_000),
+         grp=string.([1:1_000_000; 1:1_000_000]),
+         outer=rand('A':'z', 2_000_000))
     ## OOM seems to result in the process being killed on Mac so this messes up CI
     # @test_throws OutOfMemoryError schema(d)
     sch = schema(d, Dict(:grp => Grouping()))
@@ -21,7 +27,87 @@ end
 
     @test all(t.contrasts.invindex[lev] == i for (i,lev) in enumerate(levs))
     @test all(t.contrasts.levels[i] == lev for (i,lev) in enumerate(levs))
+end
+
+@testset "Auto application of Grouping()" begin
+
+    d = (; y=rand(100),
+        x=rand('A':'Z', 100),
+        z=rand('A':'Z', 100),
+        grp=rand('a':'z', 100))
+    contrasts = Dict{Symbol, Any}()
+
+    @testset "blocking variables are grouping" for f in [@formula(y ~ 1 + x + (1|grp)),
+                                                         @formula(y ~ 1 + x + zerocorr(1|grp))]
+        fsch = schematize(f, d, contrasts)
+        fe = fsch.rhs[1]
+        x = last(fe.terms)
+        @test x.contrasts isa ContrastsMatrix{DummyCoding}
+        re = fsch.rhs[2]
+        grp = re.rhs
+        @test grp.contrasts isa ContrastsMatrix{Grouping}
+    end
+
+    @testset "FE contrasts take priority" for f in [@formula(y ~ 1 + x + (1|x)),
+                                                    @formula(y ~ 1 + x + zerocorr(1|x))]
+        fsch = schematize(f, d, contrasts)
+        fe = fsch.rhs[1]
+        x = last(fe.terms)
+        @test x.contrasts isa ContrastsMatrix{DummyCoding}
+        re = fsch.rhs[2]
+        grp = re.rhs
+        @test grp.contrasts isa ContrastsMatrix{DummyCoding}
+
+        fsch = schematize(@formula(y ~ 1 + x + (1|x)), d, Dict(:x => EffectsCoding()))
+        fe = fsch.rhs[1]
+        x = last(fe.terms)
+        @test x.contrasts isa ContrastsMatrix{EffectsCoding}
+        re = fsch.rhs[2]
+        grp = re.rhs
+        @test grp.contrasts isa ContrastsMatrix{EffectsCoding}
+    end
+
+    @testset "Nesting and interactions" for f in [@formula(y ~ 1 + x + (1|grp/z))]
+        # XXX zerocorr(1|grp/z) doesn't work!
+        fsch = schematize(f, d, contrasts)
+        fe = fsch.rhs[1]
+        x = last(fe.terms)
+        @test x.contrasts isa ContrastsMatrix{DummyCoding}
+        re = fsch.rhs[2:end]
+        grp = re[1].rhs
+        @test grp.contrasts isa ContrastsMatrix{Grouping}
+        interaction = re[2].rhs
+        # this is less than ideal but we need it for now to get the nesting logic to work
+        @test interaction.terms[1].contrasts isa ContrastsMatrix{FullDummyCoding}
+        # this is the desired behavior
+        @test_broken interaction.terms[1].contrasts isa ContrastsMatrix{Grouping}
+        @test interaction.terms[2].contrasts isa ContrastsMatrix{Grouping}
+    end
+
+    @testset "Interactions where one component is FE" for f in [@formula(y ~ 1 + x + (1|x&grp)),
+                                                                @formula(y ~ 1 + x + zerocorr(1|x&grp))]
+        # occurs in e.g. the contra models
+        # @formula(use ~ 1+age+abs2(age)+urban+livch+(1|urban&dist)
+        fsch = schematize(f, d, contrasts)
+        fe = fsch.rhs[1]
+        x = last(fe.terms)
+        @test x.contrasts isa ContrastsMatrix{DummyCoding}
+        re = fsch.rhs[2]
+        x_re = re.rhs.terms[1]
+        # this is less than ideal but it relates to the way interactions are computed in RE
+        @test x_re.contrasts isa ContrastsMatrix{DummyCoding}
+        # this is the desired behavior:
+        # even if the contrast matrix has to be small enough to invert,
+        # it's silly to store it and invert it again when we don't need it here
+        @test_broken x_rex.contrasts isa ContrastsMatrix{Grouping}
+        grp = re.rhs.terms[2]
+        @test grp.contrasts isa ContrastsMatrix{Grouping}
+    end
 
-    # @test_throws OutOfMemoryError fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d)
-    @test fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d, contrasts=Dict(:grp => Grouping()), progress=false) isa LinearMixedModel
+    @test_throws(ArgumentError("Same variable appears on both sides of |"),
+                 schematize(@formula(y ~ 1 + (x|x)), d, contrasts))
+    f1 = schematize(@formula(y ~ 1 + x + z), d, contrasts)
+    f2 = apply_schema(@formula(y ~ 1 + x + z), schema(d, contrasts))
+    # skip intercept term
+    @test all(a.contrasts == b.contrasts for (a, b) in zip(f1.rhs.terms[2:end], f2.rhs.terms[2:end]))
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,15 +6,8 @@ import InteractiveUtils: versioninfo
 import LinearAlgebra: BLAS
 
 # there seem to be processor-specific issues and knowing this is helpful
-println(versioninfo())
-@static if VERSION ≥ v"1.7.0-DEV.620"
-    println(BLAS.get_config())
-else
-    @show BLAS.vendor()
-    if startswith(string(BLAS.vendor()), "openblas")
-        println(BLAS.openblas_get_config())
-    end
-end
+@info sprint(versioninfo)
+@info BLAS.get_config()
 
 @testset "Aqua" begin
     # we can't check for unbound type parameters
@@ -29,13 +22,13 @@ include("UniformBlockDiagonal.jl")
 include("linalg.jl")
 include("matrixterm.jl")
 include("FactorReTerm.jl")
+include("grouping.jl")
 include("pls.jl")
 include("pirls.jl")
 include("gausshermite.jl")
 include("fit.jl")
 include("missing.jl")
 include("likelihoodratiotest.jl")
-include("grouping.jl")
 include("bootstrap.jl")
 include("mime.jl")
 include("optsummary.jl")