From 4e72e760cd490712af14d8e2e241f4a07624d91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sl=C3=A1me=C4=8Dka?= Date: Tue, 2 Nov 2021 16:40:44 +0000 Subject: [PATCH] Implement single-model `ftest` (#424) --- Project.toml | 2 +- src/GLM.jl | 3 +- src/ftest.jl | 47 ++++++++++++++++++++++++++++-- src/linpred.jl | 2 ++ test/runtests.jl | 76 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 124 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index 517aeb9f..ea73e8f5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "GLM" uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a" -version = "1.5.1" +version = "1.6.0" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" diff --git a/src/GLM.jl b/src/GLM.jl index 5421647e..768d9613 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -15,10 +15,11 @@ module GLM fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue import StatsFuns: xlogy import SpecialFunctions: erfc, erfcinv, digamma, trigamma + import StatsModels: hasintercept export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², - cooksdistance + cooksdistance, hasintercept export # types diff --git a/src/ftest.jl b/src/ftest.jl index c039a92e..c1719877 100644 --- a/src/ftest.jl +++ b/src/ftest.jl @@ -1,3 +1,10 @@ +struct SingleFTestResult + nobs::Int + dof::Int + fstat::Float64 + pval::Float64 +end + mutable struct FTestResult{N} nobs::Int ssr::NTuple{N, Float64} @@ -28,6 +35,37 @@ _diffn(t::NTuple{N, T}) where {N, T} = ntuple(i->t[i]-t[i+1], N-1) _diff(t::NTuple{N, T}) where {N, T} = ntuple(i->t[i+1]-t[i], N-1) +""" + ftest(mod::LinearModel) + +Perform an F-test to determine whether model `mod` fits significantly better +than the null model (i.e. which includes only the intercept). + +```jldoctest +julia> dat = DataFrame(Result=[1.1, 1.2, 1, 2.2, 1.9, 2, 0.9, 1, 1, 2.2, 2, 2], + Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]); + +julia> model = lm(@formula(Result ~ 1 + Treatment), dat); + +julia> ftest(model.model) +F-test against the null model: +F-statistic: 241.62 on 12 observations and 1 degrees of freedom, p-value: <1e-07 +``` +""" +function ftest(mod::LinearModel) + hasintercept(mod) || throw(ArgumentError("ftest only works for models with an intercept")) + + rss = deviance(mod) + tss = nulldeviance(mod) + + n = Int(nobs(mod)) + p = dof(mod) - 2 # -2 for intercept and dispersion parameter + fstat = ((tss - rss) / rss) * ((n - p - 1) / p) + fdist = FDist(p, dof_residual(mod)) + + SingleFTestResult(n, p, promote(fstat, ccdf(fdist, abs(fstat)))...) +end + """ ftest(mod::LinearModel...; atol::Real=0.0) @@ -85,9 +123,6 @@ F-test: 3 models fitted on 12 observations ``` """ function ftest(mods::LinearModel...; atol::Real=0.0) - if length(mods) < 2 - Base.depwarn("Passing less than two models to ftest is deprecated", :ftest) - end if !all(==(nobs(mods[1])), nobs.(mods)) throw(ArgumentError("F test is only valid for models fitted on the same data, " * "but number of observations differ")) @@ -127,6 +162,12 @@ function ftest(mods::LinearModel...; atol::Real=0.0) return FTestResult(Int(nobs(mods[1])), SSR, df, r2.(mods), fstat, pval) end +function show(io::IO, ftr::SingleFTestResult) + print(io, "F-test against the null model:\nF-statistic: ", StatsBase.TestStat(ftr.fstat), " ") + print(io, "on ", ftr.nobs, " observations and ", ftr.dof, " degrees of freedom, ") + print(io, "p-value: ", PValue(ftr.pval)) +end + function show(io::IO, ftr::FTestResult{N}) where N Δdof = _diff(ftr.dof) Δssr = _diff(ftr.ssr) diff --git a/src/linpred.jl b/src/linpred.jl index 7adecf94..412a71c9 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -255,3 +255,5 @@ coef(x::LinPred) = x.beta0 coef(obj::LinPredModel) = coef(obj.pp) dof_residual(obj::LinPredModel) = nobs(obj) - dof(obj) + 1 + +hasintercept(m::LinPredModel) = any(i -> all(==(1), view(m.pp.X , :, i)), 1:size(m.pp.X, 2)) diff --git a/test/runtests.jl b/test/runtests.jl index 3dcda77d..47c1682d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -643,7 +643,62 @@ end @test preds_delta.upper .- preds_delta.lower ≈ 2 .* 1.96 .* R_glm_se atol=1e-3 @test_throws ArgumentError predict(gm, newX, interval=:confidence, interval_method=:undefined_method) @test_throws ArgumentError predict(gm, newX, interval=:undefined) -end +end + +@testset "F test comparing to null model" begin + d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], + Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], + Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) + mod = lm(@formula(Result~Treatment), d).model + othermod = lm(@formula(Result~Other), d).model + nullmod = lm(@formula(Result~1), d).model + bothmod = lm(@formula(Result~Other+Treatment), d).model + nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result) + + ft1 = ftest(mod) + ft1base = ftest(nullmod, mod) + @test ft1.nobs == ft1base.nobs + @test ft1.dof ≈ dof(mod) - dof(nullmod) + @test ft1.fstat ≈ ft1base.fstat[2] + @test ft1.pval ≈ ft1base.pval[2] + if VERSION >= v"1.6.0" + @test sprint(show, ft1) == """ + F-test against the null model: + F-statistic: 241.62 on 12 observations and 1 degrees of freedom, p-value: <1e-07""" + else + @test sprint(show, ft1) == """ + F-test against the null model: + F-statistic: 241.62 on 12 observations and 1 degrees of freedom, p-value: <1e-7""" + end + + ft2 = ftest(othermod) + ft2base = ftest(nullmod, othermod) + @test ft2.nobs == ft2base.nobs + @test ft2.dof ≈ dof(othermod) - dof(nullmod) + @test ft2.fstat ≈ ft2base.fstat[2] + @test ft2.pval ≈ ft2base.pval[2] + @test sprint(show, ft2) == """ + F-test against the null model: + F-statistic: 1.12 on 12 observations and 2 degrees of freedom, p-value: 0.3690""" + + ft3 = ftest(bothmod) + ft3base = ftest(nullmod, bothmod) + @test ft3.nobs == ft3base.nobs + @test ft3.dof ≈ dof(bothmod) - dof(nullmod) + @test ft3.fstat ≈ ft3base.fstat[2] + @test ft3.pval ≈ ft3base.pval[2] + if VERSION >= v"1.6.0" + @test sprint(show, ft3) == """ + F-test against the null model: + F-statistic: 81.97 on 12 observations and 3 degrees of freedom, p-value: <1e-05""" + else + @test sprint(show, ft3) == """ + F-test against the null model: + F-statistic: 81.97 on 12 observations and 3 degrees of freedom, p-value: <1e-5""" + end + + @test_throws ArgumentError ftest(nointerceptmod) +end @testset "F test for model comparison" begin d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], @@ -895,3 +950,22 @@ end @test hash(NegativeBinomialLink(0.3)) == hash(NegativeBinomialLink(0.3)) @test hash(NegativeBinomialLink(0.31)) != hash(NegativeBinomialLink(0.3)) end + +@testset "hasintercept" begin + d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], + Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], + Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) + + mod = lm(@formula(Result~Treatment), d).model + @test hasintercept(mod) + + nullmod = lm(@formula(Result~1), d).model + @test hasintercept(nullmod) + + nointerceptmod = lm(reshape(d.Treatment, :, 1), d.Result) + @test !hasintercept(nointerceptmod) + + rng = StableRNG(1234321) + secondcolinterceptmod = glm([randn(rng, 5) ones(5)], ones(5), Binomial(), LogitLink()) + @test hasintercept(secondcolinterceptmod) +end