diff --git a/src/ftest.jl b/src/ftest.jl index 4f40c233..8ed0e3c3 100644 --- a/src/ftest.jl +++ b/src/ftest.jl @@ -1,14 +1,13 @@ mutable struct FTestResult{N} ssr::NTuple{N, Float64} - dof::NTuple{N, Int} dof_resid::NTuple{N, Int} r2::NTuple{N, Float64} - fstat::Tuple{Vararg{Float64}} - pval::Tuple{Vararg{PValue}} + fstat::NTuple{N, Float64} + pval::NTuple{N, Float64} end """A helper function to determine if mod1 is nested in mod2""" -function issubmodel(mod1::LinPredModel, mod2::LinPredModel; atol=0::Real) +function issubmodel(mod1::LinPredModel, mod2::LinPredModel; atol::Real=0.0) mod1.rr.y != mod2.rr.y && return false # Response variables must be equal # Test that models are nested @@ -28,15 +27,14 @@ _diffn(t::NTuple{N, T}) where {N, T} = ntuple(i->t[i]-t[i+1], N-1) _diff(t::NTuple{N, T}) where {N, T} = ntuple(i->t[i+1]-t[i], N-1) -dividetuples(t1::NTuple{N}, t2::NTuple{N}) where {N} = ntuple(i->t1[i]/t2[i], N) - """ - ftest(mod::LinearModel...; atol=0::Real) + ftest(mod::LinearModel...; atol::Real=0.0) -For each sequential pair of linear predictors in `mod`, perform an F-test to determine if -the first one fits significantly better than the next. +For each sequential pair of linear models in `mod...`, perform an F-test to determine if +the one model fits significantly better than the other. Models must have been fitted +on the same data, and be nested either in forward or backward direction. -A table is returned containing residual degrees of freedom (DOF), degrees of freedom, +A table is returned containing residual degrees of freedom (DOF), difference in DOF from the preceding model, sum of squared residuals (SSR), difference in SSR from the preceding model, R², difference in R² from the preceding model, and F-statistic and p-value for the comparison between the two models. @@ -57,75 +55,95 @@ this is an ANOVA, our null hypothesis is that `Result ~ 1` fits the data as well ```jldoctest julia> dat = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], - Other=[1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1]); - -julia> model = lm(@formula(Result ~ 1 + Treatment), dat); + Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])); julia> nullmodel = lm(@formula(Result ~ 1), dat); -julia> bigmodel = lm(@formula(Result ~ 1 + Treatment + Other), dat); - -julia> ft = ftest(model.model, nullmodel.model) - Res. DOF DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) -Model 1 10 3 0.1283 0.9603 -Model 2 11 2 -1 3.2292 -3.1008 0.0000 0.9603 241.6234 <1e-7 +julia> model = lm(@formula(Result ~ 1 + Treatment), dat); +julia> bigmodel = lm(@formula(Result ~ 1 + Treatment + Other), dat); -julia> ftest(bigmodel.model, model.model, nullmodel.model) - Res. DOF DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) -Model 1 9 4 0.1038 0.9678 -Model 2 10 3 -1 0.1283 -0.0245 0.9603 0.0076 2.1236 0.1790 -Model 3 11 2 -1 3.2292 -3.1008 0.0000 0.9603 241.6234 <1e-7 +julia> ftest(nullmodel.model, model.model) +────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) +────────────────────────────────────────────────────────────────────── +[1] 11 3.2292 -0.0000 +[2] 10 -1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 +────────────────────────────────────────────────────────────────────── + +julia> ftest(nullmodel.model, model.model, bigmodel.model) +─────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) +─────────────────────────────────────────────────────────────────────── +[1] 11 3.2292 -0.0000 +[2] 10 -1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 +[3] 8 -2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 +─────────────────────────────────────────────────────────────────────── ``` """ -function ftest(mods::LinearModel...; atol=0::Real) - nmodels = length(mods) - for i in 2:nmodels - issubmodel(mods[i], mods[i-1], atol=atol) || - throw(ArgumentError("F test $i is only valid if model $i is nested in model $(i-1)")) +function ftest(mods::LinearModel...; atol::Real=0.0) + forward = length(mods) == 1 || dof(mods[1]) <= dof(mods[2]) + if forward + for i in 2:length(mods) + if dof(mods[i-1]) >= dof(mods[i]) || !issubmodel(mods[i-1], mods[i], atol=atol) + throw(ArgumentError("F test is only valid for nested models")) + end + end + else + for i in 2:length(mods) + if dof(mods[i]) >= dof(mods[i-1]) || !issubmodel(mods[i], mods[i-1], atol=atol) + throw(ArgumentError("F test is only valid for nested models")) + end + end end SSR = deviance.(mods) - nparams = dof.(mods) - - df2 = _diffn(nparams) - df1 = Int.(dof_residual.(mods)) - - MSR1 = dividetuples(_diff(SSR), df2) - MSR2 = dividetuples(SSR, df1)[1:nmodels-1] + Δdf = _diff(dof.(mods)) + dfr = Int.(dof_residual.(mods)) + MSR1 = _diffn(SSR) ./ Δdf + MSR2 = (SSR ./ dfr) + if forward + MSR2 = MSR2[2:end] + dfr_big = dfr[2:end] + else + MSR2 = MSR2[1:end-1] + dfr_big = dfr[1:end-1] + end - fstat = dividetuples(MSR1, MSR2) - pval = PValue.(ccdf.(FDist.(df2, df1[1:nmodels-1]), fstat)) - return FTestResult(SSR, nparams, df1, r2.(mods), fstat, pval) + fstat = (NaN, (MSR1 ./ MSR2)...) + pval = (NaN, ccdf.(FDist.(abs.(Δdf), dfr_big), abs.(fstat[2:end]))...) + return FTestResult(SSR, dfr, r2.(mods), fstat, pval) end function show(io::IO, ftr::FTestResult{N}) where N - Δdof = _diffn(ftr.dof_resid) - Δssr = _diffn(ftr.ssr) - ΔR² = _diffn(ftr.r2) + Δdof = _diff(ftr.dof_resid) + Δssr = _diff(ftr.ssr) + ΔR² = _diff(ftr.r2) - nc = 10 + nc = 9 nr = N outrows = Matrix{String}(undef, nr+1, nc) - outrows[1, :] = ["", "Res. DOF", "DOF", "ΔDOF", "SSR", "ΔSSR", + outrows[1, :] = ["", "Res. DOF", "ΔDOF", "SSR", "ΔSSR", "R²", "ΔR²", "F*", "p(>F)"] - outrows[2, :] = ["Model 1", @sprintf("%.0d", ftr.dof_resid[1]), - @sprintf("%.0d", ftr.dof[1]), " ", + outrows[2, :] = ["[1]", @sprintf("%.0d", ftr.dof_resid[1]), " ", @sprintf("%.4f", ftr.ssr[1]), " ", @sprintf("%.4f", ftr.r2[1]), " ", " ", " "] for i in 2:nr - outrows[i+1, :] = ["Model $i", @sprintf("%.0d", ftr.dof_resid[i]), - @sprintf("%.0d", ftr.dof[i]), @sprintf("%.0d", Δdof[i-1]), + outrows[i+1, :] = ["[$i]", + @sprintf("%.0d", ftr.dof_resid[i]), @sprintf("%.0d", Δdof[i-1]), @sprintf("%.4f", ftr.ssr[i]), @sprintf("%.4f", Δssr[i-1]), @sprintf("%.4f", ftr.r2[i]), @sprintf("%.4f", ΔR²[i-1]), - @sprintf("%.4f", ftr.fstat[i-1]), string(ftr.pval[i-1]) ] + @sprintf("%.4f", ftr.fstat[i]), string(PValue(ftr.pval[i])) ] end colwidths = length.(outrows) max_colwidths = [maximum(view(colwidths, :, i)) for i in 1:nc] + totwidth = sum(max_colwidths) + 2*8 + + println(io, '─'^totwidth) for r in 1:nr+1 for c in 1:nc @@ -134,12 +152,14 @@ function show(io::IO, ftr::FTestResult{N}) where N padding = " "^(max_colwidths[c]-cur_cell_len) if c > 1 - padding = " "*padding + padding = " "*padding end print(io, padding) print(io, cur_cell) end print(io, "\n") + r == 1 && println(io, '─'^totwidth) end + print(io, '─'^totwidth) end diff --git a/test/runtests.jl b/test/runtests.jl index 625d4708..93f84c88 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -472,7 +472,7 @@ end @testset "F test for model comparison" begin d = DataFrame(Treatment=[1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2.], Result=[1.1, 1.2, 1, 2.2, 1.9, 2, .9, 1, 1, 2.2, 2, 2], - Other=[1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1]) + Other=categorical([1, 1, 2, 1, 2, 1, 3, 1, 1, 2, 2, 1])) mod = lm(@formula(Result~Treatment), d).model othermod = lm(@formula(Result~Other), d).model nullmod = lm(@formula(Result~1), d).model @@ -483,32 +483,66 @@ end @test !GLM.issubmodel(bothmod, mod) @test GLM.issubmodel(othermod, bothmod) - @test_throws ArgumentError ftest(mod, othermod) - - d[:Sum] = d[:Treatment] + d[:Other] + d[:Sum] = d[:Treatment] + (d[:Other] .== 1) summod = lm(@formula(Result~Sum), d).model @test GLM.issubmodel(summod, bothmod) - ft = ftest(mod, nullmod) - @test isapprox(ft.pval[1].v, 2.481215056713184e-8) - @test sprint(show, ftest(mod, nullmod)) == - """ - Res. DOF DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) - Model 1 10 3 0.1283 0.9603 - Model 2 11 2 -1 3.2292 -3.1008 -0.0000 0.9603 241.6234 <1e-7 - """ + ft0 = ftest(mod) + @test sprint(show, ft0) == """ + ───────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) + ───────────────────────────────────────────────────────── + [1] 10 0.1283 0.9603 + ─────────────────────────────────────────────────────────""" + + ft1a = ftest(mod, nullmod) + @test isnan(ft1a.pval[1]) + @test ft1a.pval[2] ≈ 2.481215056713184e-8 + @test sprint(show, ft1a) == """ + ────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) + ────────────────────────────────────────────────────────────────────── + [1] 10 0.1283 0.9603 + [2] 11 1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 + ──────────────────────────────────────────────────────────────────────""" + + ft1b = ftest(nullmod, mod) + @test isnan(ft1b.pval[1]) + @test ft1b.pval[2] ≈ 2.481215056713184e-8 + @test sprint(show, ft1b) == """ + ────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) + ────────────────────────────────────────────────────────────────────── + [1] 11 3.2292 -0.0000 + [2] 10 -1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 + ──────────────────────────────────────────────────────────────────────""" bigmod = lm(@formula(Result~Treatment+Other), d).model - ft2 = ftest(bigmod, mod, nullmod) - @test isapprox(ft2.pval[2].v, 2.481215056713184e-8) - @test isapprox(ft2.pval[1].v, 0.17903437900958952) - @test sprint(show, ftest(bigmod, mod, nullmod)) == - """ - Res. DOF DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) - Model 1 9 4 0.1038 0.9678 - Model 2 10 3 -1 0.1283 -0.0245 0.9603 0.0076 2.1236 0.1790 - Model 3 11 2 -1 3.2292 -3.1008 -0.0000 0.9603 241.6234 <1e-7 - """ + ft2a = ftest(nullmod, mod, bigmod) + @test isnan(ft2a.pval[1]) + @test ft2a.pval[2] ≈ 2.481215056713184e-8 + @test ft2a.pval[3] ≈ 0.3949973540194818 + @test sprint(show, ft2a) == """ + ─────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) + ─────────────────────────────────────────────────────────────────────── + [1] 11 3.2292 -0.0000 + [2] 10 -1 0.1283 -3.1008 0.9603 0.9603 241.6234 <1e-7 + [3] 8 -2 0.1017 -0.0266 0.9685 0.0082 1.0456 0.3950 + ───────────────────────────────────────────────────────────────────────""" + + ft2b = ftest(bigmod, mod, nullmod) + @test isnan(ft2b.pval[1]) + @test ft2b.pval[2] ≈ 0.3949973540194818 + @test ft2b.pval[3] ≈ 2.481215056713184e-8 + @test sprint(show, ft2b) == """ + ─────────────────────────────────────────────────────────────────────── + Res. DOF ΔDOF SSR ΔSSR R² ΔR² F* p(>F) + ─────────────────────────────────────────────────────────────────────── + [1] 8 0.1017 0.9685 + [2] 10 2 0.1283 0.0266 0.9603 -0.0082 1.0456 0.3950 + [3] 11 1 3.2292 3.1008 -0.0000 -0.9603 241.6234 <1e-7 + ───────────────────────────────────────────────────────────────────────""" end @testset "F test rounding error" begin