Skip to content

Commit

Permalink
auto apply Grouping() to grouping variables (#652)
Browse files Browse the repository at this point in the history
  • Loading branch information
palday committed Sep 1, 2023
1 parent d2a05aa commit 9ee4a0a
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 46 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
MixedModels v4.21.0 Release Notes
==============================
* Auto apply `Grouping()` to grouping variables that don't already have an explicit contrast. [#652]

MixedModels v4.20.0 Release Notes
==============================
* The `.tbl` property of a `MixedModelBootstrap` now includes the correlation parameters for lower triangular elements of the `λ` field. [#702]
Expand Down Expand Up @@ -452,6 +456,7 @@ Package dependencies
[#639]: https://github.com/JuliaStats/MixedModels.jl/issues/639
[#648]: https://github.com/JuliaStats/MixedModels.jl/issues/648
[#651]: https://github.com/JuliaStats/MixedModels.jl/issues/651
[#652]: https://github.com/JuliaStats/MixedModels.jl/issues/652
[#653]: https://github.com/JuliaStats/MixedModels.jl/issues/653
[#657]: https://github.com/JuliaStats/MixedModels.jl/issues/657
[#663]: https://github.com/JuliaStats/MixedModels.jl/issues/663
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MixedModels"
uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
author = ["Phillip Alday <me@phillipalday.com>", "Douglas Bates <dmbates@gmail.com>", "Jose Bayoan Santiago Calderon <jbs3hp@virginia.edu>"]
version = "4.20.0"
version = "4.21.0"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Expand Down
3 changes: 2 additions & 1 deletion src/MixedModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ using PrecompileTools
fit(MixedModel,
@formula(use ~ 1 + age + abs2(age) + urban + livch + (1 | urban & dist)),
contra,
Bernoulli(); progress)
Bernoulli();
progress)
end
end

Expand Down
28 changes: 27 additions & 1 deletion src/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
struct Grouping <: StatsModels.AbstractContrasts end
Expand Down Expand Up @@ -34,3 +33,30 @@ function StatsModels.ContrastsMatrix(
)
return StatsModels.ContrastsMatrix(zeros(0, 0), levels, levels, contrasts)
end

# this arises when there's an interaction as a grouping variable without a corresponding
# non-interaction grouping, e.g. urban&dist in the contra dataset
# adapted from https://github.com/JuliaStats/StatsModels.jl/blob/463eb0acb49bc5428374d749c4da90ea2a6c74c4/src/schema.jl#L355-L372
function StatsModels.apply_schema(
t::CategoricalTerm{Grouping},
schema::FullRank,
::Type{<:MixedModel},
context::AbstractTerm,
)
aliased = drop_term(context, t)
@debug "$t in context of $context: aliases $aliased\n seen already: $(schema.already)"
for seen in schema.already
if StatsModels.symequal(aliased, seen)
@debug " aliased term already present: $seen"
return t
end
end
# aliased term not seen already:
# add aliased term to already seen:
push!(schema.already, aliased)
# repair:
new_contrasts = StatsModels.ContrastsMatrix(Grouping(), t.contrasts.levels)
t = CategoricalTerm(t.sym, new_contrasts)
@debug " aliased term absent, repairing: $t"
return t
end
12 changes: 1 addition & 11 deletions src/linearmixedmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,22 +69,12 @@ function LinearMixedModel(
# TODO: perform missing_omit() after apply_schema() when improved
# missing support is in a StatsModels release
tbl, _ = StatsModels.missing_omit(tbl, f)
sch = try
schema(f, tbl, contrasts)
catch e
if isa(e, OutOfMemoryError)
@warn "Random effects grouping variables with many levels can cause out-of-memory errors. Try manually specifying `Grouping()` contrasts for those variables."
end
rethrow(e)
end
form = apply_schema(f, sch, LinearMixedModel)

form = schematize(f, tbl, contrasts)
if form.rhs isa MatrixTerm || !any(x -> isa(x, AbstractReTerm), form.rhs)
throw(_MISSING_RE_ERROR)
end

# tbl, _ = StatsModels.missing_omit(tbl, form)

y, Xs = modelcols(form, tbl)

return LinearMixedModel(y, Xs, form, wts, σ, amalgamate)
Expand Down
25 changes: 24 additions & 1 deletion src/randomeffectsterm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,26 @@ function StatsModels.terms(t::RandomEffectsTerm)
return union(StatsModels.terms(t.lhs), StatsModels.terms(t.rhs))
end

schema(t, data, hints) = StatsModels.schema(t, data, hints)

function schema(t::FunctionTerm{typeof(|)}, data, hints::Dict{Symbol})
sch = schema(t.args[1], data, hints)
vars = StatsModels.termvars.(t.args[2])
# in the event that someone has x|x, then the Grouping()
# gets overwrriten by the broader schema BUT
# that doesn't matter because we detect and throw an error
# for that in apply_schema
grp_hints = Dict(rr => Grouping() for rr in vars)
return merge(schema(t.args[2], data, grp_hints), sch)
end

is_randomeffectsterm(::Any) = false
is_randomeffectsterm(::AbstractReTerm) = true
# RE with free covariance structure
is_randomeffectsterm(::FunctionTerm{typeof(|)}) = true
# not zerocorr() or the like
is_randomeffectsterm(tt::FunctionTerm) = is_randomeffectsterm(tt.args[1])

# | in MixedModel formula -> RandomEffectsTerm
function StatsModels.apply_schema(
t::FunctionTerm{typeof(|)},
Expand Down Expand Up @@ -81,7 +101,6 @@ function StatsModels.apply_schema(
)
lhs = InterceptTerm{true}() + lhs
end

lhs, rhs = apply_schema.((lhs, rhs), Ref(schema), Mod)

# check whether grouping terms are categorical or interaction of categorical
Expand Down Expand Up @@ -230,6 +249,10 @@ function Base.show(io::IO, ::MIME"text/plain", t::ZeroCorr)
return print(io, "zerocorr", t.term)
end

function schema(t::FunctionTerm{typeof(zerocorr)}, data, hints::Dict{Symbol})
return schema(only(t.args), data, hints)
end

function StatsModels.apply_schema(
t::FunctionTerm{typeof(zerocorr)}, sch::MultiSchema, Mod::Type{<:MixedModel}
)
Expand Down
35 changes: 35 additions & 0 deletions src/schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,38 @@ function StatsModels.apply_schema(t::FormulaTerm, schema::Schema, Mod::Type{<:Mi
collect_matrix_terms(apply_schema(t.rhs, MultiSchema(schema), Mod)),
)
end

"""
schematize(f, tbl, contrasts::Dict{Symbol}, Mod=LinearMixedModel)
Find and apply the schema for f in a way that automatically uses `Grouping()`
contrasts when appropriate.
!!! warn
This is an internal method.
"""
function schematize(f, tbl, contrasts::Dict{Symbol}, Mod=LinearMixedModel)
# if there is only one term on the RHS, then you don't have an iterator
# also we want this to be a vector so we can sort later
rhs = f.rhs isa AbstractTerm ? [f.rhs] : collect(f.rhs)
fe = filter(!is_randomeffectsterm, rhs)
# init with lhs so we don't need an extra merge later
# and so that things work even when we have empty fixed effects
init = schema(f.lhs, tbl, contrasts)
sch_fe = mapfoldl(merge, fe; init) do tt
return schema(tt, tbl, contrasts)
end
re = filter(is_randomeffectsterm, rhs)
sch_re = mapfoldl(merge, re; init) do tt
# this allows us to control dispatch on a more subtle level
# and force things to use the schema
return schema(tt, tbl, contrasts)
end
# we want to make sure we don't overwrite any schema
# determined on the basis of the fixed effects
# recall: merge prefers the entry in the second argument when there's a duplicate key
# XXX could we take advantage of MultiSchema here?
sch = merge(sch_re, sch_fe)

return apply_schema(f, sch, Mod)
end
17 changes: 0 additions & 17 deletions test/FactorReTerm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -165,23 +165,6 @@ end
end
end

@testset "Categorical Blocking Variable" begin
# deepcopy because we're going to modify it. Don't need the copy if dataset returns an Arrow.Table
#slp = deepcopy(DataFrame(dataset("sleepstudy")))
slp = DataFrame(dataset("sleepstudy"))
contrasts = Dict{Symbol,Any}()
f = @formula(reaction ~ 1 + (1|subj))

# String blocking-variables work fine because StatsModels is smart enough to
# treat strings as Categorical. Note however that this is a
# far less efficient to store the original dataframe, although it doesn't
# matter for the contrast matrix
slp[!,:subj] = convert.(String, slp[!, :subj])
# @test_throws ArgumentError LinearMixedModel(f, slp)
slp.subj = parse.(Int, getindex.(slp.subj, Ref(2:4)))
@test_throws ArgumentError LinearMixedModel(f, slp)
end

@testset "random effects term syntax" begin

dat = (y = rand(18),
Expand Down
94 changes: 90 additions & 4 deletions test/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
using Test
using MixedModels
using StatsModels
using Test

using MixedModels: schematize
using StatsModels: ContrastsMatrix, FullDummyCoding

@testset "Grouping" begin
g = Grouping()
@test isnothing(g.levels)
end

@testset "Grouping pseudo-contrasts" begin
d = (y = rand(2_000_000), grp=string.([1:1_000_000; 1:1_000_000]))
d = (; y=rand(2_000_000),
grp=string.([1:1_000_000; 1:1_000_000]),
outer=rand('A':'z', 2_000_000))
## OOM seems to result in the process being killed on Mac so this messes up CI
# @test_throws OutOfMemoryError schema(d)
sch = schema(d, Dict(:grp => Grouping()))
Expand All @@ -21,7 +27,87 @@ end

@test all(t.contrasts.invindex[lev] == i for (i,lev) in enumerate(levs))
@test all(t.contrasts.levels[i] == lev for (i,lev) in enumerate(levs))
end

@testset "Auto application of Grouping()" begin

d = (; y=rand(100),
x=rand('A':'Z', 100),
z=rand('A':'Z', 100),
grp=rand('a':'z', 100))
contrasts = Dict{Symbol, Any}()

@testset "blocking variables are grouping" for f in [@formula(y ~ 1 + x + (1|grp)),
@formula(y ~ 1 + x + zerocorr(1|grp))]
fsch = schematize(f, d, contrasts)
fe = fsch.rhs[1]
x = last(fe.terms)
@test x.contrasts isa ContrastsMatrix{DummyCoding}
re = fsch.rhs[2]
grp = re.rhs
@test grp.contrasts isa ContrastsMatrix{Grouping}
end

@testset "FE contrasts take priority" for f in [@formula(y ~ 1 + x + (1|x)),
@formula(y ~ 1 + x + zerocorr(1|x))]
fsch = schematize(f, d, contrasts)
fe = fsch.rhs[1]
x = last(fe.terms)
@test x.contrasts isa ContrastsMatrix{DummyCoding}
re = fsch.rhs[2]
grp = re.rhs
@test grp.contrasts isa ContrastsMatrix{DummyCoding}

fsch = schematize(@formula(y ~ 1 + x + (1|x)), d, Dict(:x => EffectsCoding()))
fe = fsch.rhs[1]
x = last(fe.terms)
@test x.contrasts isa ContrastsMatrix{EffectsCoding}
re = fsch.rhs[2]
grp = re.rhs
@test grp.contrasts isa ContrastsMatrix{EffectsCoding}
end

@testset "Nesting and interactions" for f in [@formula(y ~ 1 + x + (1|grp/z))]
# XXX zerocorr(1|grp/z) doesn't work!
fsch = schematize(f, d, contrasts)
fe = fsch.rhs[1]
x = last(fe.terms)
@test x.contrasts isa ContrastsMatrix{DummyCoding}
re = fsch.rhs[2:end]
grp = re[1].rhs
@test grp.contrasts isa ContrastsMatrix{Grouping}
interaction = re[2].rhs
# this is less than ideal but we need it for now to get the nesting logic to work
@test interaction.terms[1].contrasts isa ContrastsMatrix{FullDummyCoding}
# this is the desired behavior
@test_broken interaction.terms[1].contrasts isa ContrastsMatrix{Grouping}
@test interaction.terms[2].contrasts isa ContrastsMatrix{Grouping}
end

@testset "Interactions where one component is FE" for f in [@formula(y ~ 1 + x + (1|x&grp)),
@formula(y ~ 1 + x + zerocorr(1|x&grp))]
# occurs in e.g. the contra models
# @formula(use ~ 1+age+abs2(age)+urban+livch+(1|urban&dist)
fsch = schematize(f, d, contrasts)
fe = fsch.rhs[1]
x = last(fe.terms)
@test x.contrasts isa ContrastsMatrix{DummyCoding}
re = fsch.rhs[2]
x_re = re.rhs.terms[1]
# this is less than ideal but it relates to the way interactions are computed in RE
@test x_re.contrasts isa ContrastsMatrix{DummyCoding}
# this is the desired behavior:
# even if the contrast matrix has to be small enough to invert,
# it's silly to store it and invert it again when we don't need it here
@test_broken x_rex.contrasts isa ContrastsMatrix{Grouping}
grp = re.rhs.terms[2]
@test grp.contrasts isa ContrastsMatrix{Grouping}
end

# @test_throws OutOfMemoryError fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d)
@test fit(MixedModel, @formula(y ~ 1 + (1 | grp)), d, contrasts=Dict(:grp => Grouping()), progress=false) isa LinearMixedModel
@test_throws(ArgumentError("Same variable appears on both sides of |"),
schematize(@formula(y ~ 1 + (x|x)), d, contrasts))
f1 = schematize(@formula(y ~ 1 + x + z), d, contrasts)
f2 = apply_schema(@formula(y ~ 1 + x + z), schema(d, contrasts))
# skip intercept term
@test all(a.contrasts == b.contrasts for (a, b) in zip(f1.rhs.terms[2:end], f2.rhs.terms[2:end]))
end
13 changes: 3 additions & 10 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,8 @@ import InteractiveUtils: versioninfo
import LinearAlgebra: BLAS

# there seem to be processor-specific issues and knowing this is helpful
println(versioninfo())
@static if VERSION v"1.7.0-DEV.620"
println(BLAS.get_config())
else
@show BLAS.vendor()
if startswith(string(BLAS.vendor()), "openblas")
println(BLAS.openblas_get_config())
end
end
@info sprint(versioninfo)
@info BLAS.get_config()

@testset "Aqua" begin
# we can't check for unbound type parameters
Expand All @@ -29,13 +22,13 @@ include("UniformBlockDiagonal.jl")
include("linalg.jl")
include("matrixterm.jl")
include("FactorReTerm.jl")
include("grouping.jl")
include("pls.jl")
include("pirls.jl")
include("gausshermite.jl")
include("fit.jl")
include("missing.jl")
include("likelihoodratiotest.jl")
include("grouping.jl")
include("bootstrap.jl")
include("mime.jl")
include("optsummary.jl")
Expand Down

2 comments on commit 9ee4a0a

@palday
Copy link
Member Author

@palday palday commented on 9ee4a0a Sep 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/90635

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v4.21.0 -m "<description of version>" 9ee4a0a9f371eddf9aa68118d96c8a006f4a4d58
git push origin v4.21.0

Please sign in to comment.