# Hypothesis Testing

Lukas Graz  
February 13, 2025

In [None]:
# Disabled override of interactive(); left commented out.
# interactive <- function() FALSE
# Run the data-preparation script.
# NOTE(review): presumably this is what creates the objects used by the later
# cells (D, D_trn, D_tst and the *_vars name vectors) -- confirm in
# R/data_prep.R.
source("R/data_prep.R")

Number of matches per filter criteria (not disjoint)
  Headphone  PRS_all_NA    Distance Activity_NA    Duration  HMNoise_NA 
        303         226         221         102          96          96 
JourneyTime 
         20 
Keep  1494 of 2206 observations

Imputing PRS_orig_vars

TODO: Remove PCA?

Imputing mediators & GIS_vars for MLR

## Linear Modeling - Testing HM_NOISE (Normalized)

### Imputation with MissForest on Training Data

In [None]:
#| code-fold: true
#| code-summary: "Number of NAs in Mediators and GIS variables"
# Count missing values per column of D, first for the mediators and then for
# the GIS predictors. vapply() pins the result to exactly one integer per
# column, so the return type cannot silently change the way it can with
# sapply().
vapply(D[Mediator_vars], \(x) sum(is.na(x)), integer(1))
vapply(D[GIS_vars], \(x) sum(is.na(x)), integer(1))

 FEELNAT   LNOISE LOC_SENS LOC_SOUN LOC_SCEN LOC_VISE LOC_VEGE LOC_FAUN 
      16      291       28       30       36       62       69       88 

  LCARTIF_sqrt  LCFOREST_sqrt          HETER    OVDIST_sqrt     VIS5K_sqrt 
             0              0              0              0              0 
       RL_NDVI       RL_NOISE    DISTKM_sqrt   JNYTIME_sqrt STRIMP123_sqrt 
             0              0              0             86              0 
STRIMP999_sqrt 
             0 

In [None]:
#| code-fold: true
#| code-summary: "Impute missing values using MissForest"
# Both steps below overwrite columns of D_trn only; D_tst is not modified in
# this cell. Each step runs missForest() on the selected columns (as a matrix),
# takes the `ximp` element of the result, and converts it back to a data frame.
# The missForest() call is wrapped in xfun::cache_rds(), which stores the
# result under cache/ and is given the input matrix via `hash`.
# NOTE(review): cache_rds appears to key the cache on the text of the wrapped
# expression as well as on `hash`, so editing the expression would trigger a
# fresh run. No seed is set in this cell, so a fresh run may give different
# imputations -- confirm before changing the code.
# Mediator imputation
D_trn[Mediator_vars] <- xfun::cache_rds({
  missForest(as.matrix(D_trn[Mediator_vars]))
  }, 
  file = "Mediator_imputation.rds", 
  dir = "cache/",
  hash = list(as.matrix(D_trn[Mediator_vars]))
)$ximp |> as.data.frame()

# GIS imputation (missForest)
# Same pattern as above, applied to the GIS predictor columns and cached in a
# separate file.
D_trn[GIS_vars] <- xfun::cache_rds({
  missForest(as.matrix(D_trn[GIS_vars]))
  }, 
  file = "GIS_imputation.rds", 
  dir = "cache/",
  hash = list(as.matrix(D_trn[GIS_vars]))
)$ximp |> as.data.frame()

### Testing VIF

In [None]:
#| code-fold: true
#| code-summary: "VIF: PRS ~ Mediators (main effects only, HM_NOISE not included)"
# Variance inflation factors for a main-effects model of the first PRS outcome
# on all mediators, fitted on D_trn. The formula contains no HM_NOISE term, so
# the code-summary above states that explicitly.
car::vif(lm(as.formula(paste0(
  PRS_vars[1], " ~ ", paste(Mediator_vars, collapse = " + ")
)), D_trn))

 FEELNAT   LNOISE LOC_SENS LOC_SOUN LOC_SCEN LOC_VISE LOC_VEGE LOC_FAUN 
    1.55     1.43     1.48     1.96     2.16     1.68     1.85     1.40 

In [None]:
#| code-fold: true
#| code-summary: "VIF: Mediators ~ GIS (main effects only, HM_NOISE not included)"
# Variance inflation factors for a main-effects model of the first mediator on
# all GIS predictors, fitted on D_trn. The formula contains no HM_NOISE term,
# so the code-summary above states that explicitly.
car::vif(lm(as.formula(paste0(
  Mediator_vars[1], " ~ ", paste(GIS_vars, collapse = " + ")
)), D_trn))

  LCARTIF_sqrt  LCFOREST_sqrt          HETER    OVDIST_sqrt     VIS5K_sqrt 
          4.88           2.20           1.14           2.00           1.24 
       RL_NDVI       RL_NOISE    DISTKM_sqrt   JNYTIME_sqrt STRIMP123_sqrt 
          2.79           2.29           1.35           1.37           1.93 
STRIMP999_sqrt 
          2.65 

In [None]:
#| code-fold: true
#| code-summary: "VIF: PRS ~ Mediators * HM_NOISE (with interaction)"
# Regress the first PRS outcome on HM_NOISE_nrm, every mediator, and every
# HM_NOISE_nrm:mediator interaction, using D_trn. The fit is stored as
# `fit_PRS_MED`, and its variance inflation factors are printed.
fit_PRS_MED <- lm(
  as.formula(sprintf(
    "%s ~ HM_NOISE_nrm * (%s)",
    PRS_vars[1], paste(Mediator_vars, collapse = " + ")
  )),
  data = D_trn
)
car::vif(fit_PRS_MED)

there are higher-order terms (interactions) in this model
consider setting type = 'predictor'; see ?vif

         HM_NOISE_nrm               FEELNAT                LNOISE 
                60.63                  1.59                  1.53 
             LOC_SENS              LOC_SOUN              LOC_SCEN 
                 1.54                  1.98                  2.17 
             LOC_VISE              LOC_VEGE              LOC_FAUN 
                 1.71                  1.88                  1.42 
 HM_NOISE_nrm:FEELNAT   HM_NOISE_nrm:LNOISE HM_NOISE_nrm:LOC_SENS 
                51.26                 51.52                 22.91 
HM_NOISE_nrm:LOC_SOUN HM_NOISE_nrm:LOC_SCEN HM_NOISE_nrm:LOC_VISE 
                45.89                 33.97                 38.10 
HM_NOISE_nrm:LOC_VEGE HM_NOISE_nrm:LOC_FAUN 
                47.68                 10.68 

In [None]:
#| code-fold: true
#| code-summary: "VIF: Mediators ~ GIS * HM_NOISE (with interaction)"
# Regress the first mediator on HM_NOISE_nrm, every GIS predictor, and every
# HM_NOISE_nrm:GIS interaction, using D_trn. The fit is stored as
# `fit_MED_GIS`, and its variance inflation factors are printed.
fit_MED_GIS <- lm(
  as.formula(sprintf(
    "%s ~ HM_NOISE_nrm * (%s)",
    Mediator_vars[1], paste(GIS_vars, collapse = " + ")
  )),
  data = D_trn
)
car::vif(fit_MED_GIS)

there are higher-order terms (interactions) in this model
consider setting type = 'predictor'; see ?vif

               HM_NOISE_nrm                LCARTIF_sqrt 
                     108.08                        5.01 
              LCFOREST_sqrt                       HETER 
                       2.22                        1.17 
                OVDIST_sqrt                  VIS5K_sqrt 
                       2.06                        1.25 
                    RL_NDVI                    RL_NOISE 
                       2.82                        2.44 
                DISTKM_sqrt                JNYTIME_sqrt 
                       1.40                        1.40 
             STRIMP123_sqrt              STRIMP999_sqrt 
                       2.00                        2.71 
  HM_NOISE_nrm:LCARTIF_sqrt  HM_NOISE_nrm:LCFOREST_sqrt 
                       8.60                        6.29 
         HM_NOISE_nrm:HETER    HM_NOISE_nrm:OVDIST_sqrt 
                      11.55                       11.52 
    HM_NOISE_nrm:VIS5K_sqrt        HM_NOISE_nrm:RL_NDVI 
                       7.87    

Since we model with interactions later, these are also the p-values we want to use.

### Mediators ~ GIS

In [None]:
#| code-fold: true
# One model per mediator: terms are selected on D_trn, then the selected
# formula is refitted on D_tst and that refit is what gets summarised
# (presumably so the reported p-values are not affected by the selection step).
Res1 <- list()
for (mediator in Mediator_vars) {
  # Starting model: HM_NOISE_nrm, all GIS predictors, and all
  # HM_NOISE_nrm:GIS interactions.
  full_model <- lm(
    as.formula(sprintf(
      "%s ~ HM_NOISE_nrm * (%s)",
      mediator, paste(GIS_vars, collapse = " + ")
    )),
    data = D_trn
  )
  # Stepwise search from the full model with penalty k = log(n), i.e. BIC.
  small_model <- step(full_model, k = log(nrow(D_trn)), trace = FALSE)
  Res1[[mediator]] <- lm(formula(small_model), D_tst)
}
lapply(Res1, summary)

$FEELNAT

Call:
lm(formula = formula(small_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-5.159 -0.439  0.271  0.653  2.015 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                5.84700    0.25292   23.12  < 2e-16 ***
HM_NOISE_nrm              -0.05734    0.05243   -1.09    0.274    
LCARTIF_sqrt              -0.87932    0.19991   -4.40  1.3e-05 ***
HETER                      0.13549    0.09619    1.41    0.159    
OVDIST_sqrt                0.00216    0.00477    0.45    0.650    
RL_NDVI                    0.49972    0.21719    2.30    0.022 *  
HM_NOISE_nrm:LCARTIF_sqrt -0.25672    0.13600   -1.89    0.059 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.995 on 731 degrees of freedom
  (9 observations deleted due to missingness)
Multiple R-squared:  0.118, Adjusted R-squared:  0.111 
F-statistic: 16.3 on 6 and 731 DF,  p-value: <2e-16


$LNOISE

Call:
lm(f

### PRS ~ Mediators

In [None]:
#| code-fold: true
# One model per PRS outcome, with the mediators (and their interactions with
# HM_NOISE_nrm) as predictors. Terms are selected on D_trn; the selected
# formula is then refitted on D_tst and that refit is summarised.
# The loop runs over PRS_vars with Mediator_vars on the right-hand side, so the
# cell fits the "PRS ~ Mediators" models of this section rather than repeating
# the mediator ~ GIS models.
Res2 <- list()
for (prs in PRS_vars) {
  # Starting model: HM_NOISE_nrm, all mediators, and all
  # HM_NOISE_nrm:mediator interactions.
  full_model <- lm(as.formula(paste0(
    prs, " ~ ",
    "HM_NOISE_nrm * (", paste(Mediator_vars, collapse = " + "), ")"
  )), D_trn)
  # Stepwise search from the full model with penalty k = log(n), i.e. BIC.
  small_model <- step(full_model, trace = FALSE, k = log(nrow(D_trn)))
  Res2[[prs]] <- lm(formula(small_model), D_tst)
}
lapply(Res2, summary)

$FEELNAT

Call:
lm(formula = formula(small_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-5.159 -0.439  0.271  0.653  2.015 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                5.84700    0.25292   23.12  < 2e-16 ***
HM_NOISE_nrm              -0.05734    0.05243   -1.09    0.274    
LCARTIF_sqrt              -0.87932    0.19991   -4.40  1.3e-05 ***
HETER                      0.13549    0.09619    1.41    0.159    
OVDIST_sqrt                0.00216    0.00477    0.45    0.650    
RL_NDVI                    0.49972    0.21719    2.30    0.022 *  
HM_NOISE_nrm:LCARTIF_sqrt -0.25672    0.13600   -1.89    0.059 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.995 on 731 degrees of freedom
  (9 observations deleted due to missingness)
Multiple R-squared:  0.118, Adjusted R-squared:  0.111 
F-statistic: 16.3 on 6 and 731 DF,  p-value: <2e-16


$LNOISE

Call:
lm(f

### All Interactions: Mediators ~ (GIS + HM_NOISE)^2

In [None]:
# One model per mediator, built by stepwise search that starts from the
# intercept-only model. The scope allows HM_NOISE_nrm, all GIS predictors and
# all of their pairwise interactions. Selection runs on D_trn; the selected
# formula is refitted on D_tst.
Res3 <- list()
for (mediator in Mediator_vars) {
  intercept_model <- lm(as.formula(sprintf("%s ~ 1", mediator)), data = D_trn)
  step_model <- step(
    intercept_model,
    scope = as.formula(sprintf(
      "%s ~ (HM_NOISE_nrm + %s)^2",
      mediator, paste(GIS_vars, collapse = " + ")
    )),
    # Penalty k = log(n), i.e. BIC.
    k = log(nrow(D_trn)),
    trace = FALSE
  )
  Res3[[mediator]] <- lm(formula(step_model), D_tst)
}
# NOTE(review): signif.legend = FALSE is passed to summary() here, yet the
# rendered output still shows the "Signif. codes" legend -- the argument looks
# like it has no effect at this call site; confirm and move it to the print
# step if the legend should be hidden.
lapply(Res3, summary, signif.legend = FALSE)

$FEELNAT

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-5.231 -0.412  0.278  0.641  1.777 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           6.19417    0.23315   26.57  < 2e-16 ***
LCARTIF_sqrt         -2.00802    0.44187   -4.54  6.4e-06 ***
RL_NDVI               0.17949    0.25803    0.70   0.4869    
OVDIST_sqrt           0.00281    0.00470    0.60   0.5511    
LCARTIF_sqrt:RL_NDVI  2.22177    0.77894    2.85   0.0045 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 733 degrees of freedom
  (9 observations deleted due to missingness)
Multiple R-squared:  0.106, Adjusted R-squared:  0.101 
F-statistic: 21.8 on 4 and 733 DF,  p-value: <2e-16


$LNOISE

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9861 -0.3934  0.0089  0.5050  1.2110 

Coefficients:
             Est

### All Interactions: PRS ~ (Mediators + GIS + HM_NOISE)^2

In [None]:
# One model per PRS outcome, built by stepwise search that starts from the
# intercept-only model. The scope allows HM_NOISE_nrm, all GIS predictors, all
# mediators, and all pairwise interactions among them. Selection runs on
# D_trn; the selected formula is refitted on D_tst.
Res4 <- list()
for (prs in PRS_vars) {
  intercept_model <- lm(as.formula(sprintf("%s ~ 1", prs)), data = D_trn)
  step_model <- step(
    intercept_model,
    scope = as.formula(sprintf(
      "%s ~ (HM_NOISE_nrm + %s + %s)^2",
      prs,
      paste(GIS_vars, collapse = " + "),
      paste(Mediator_vars, collapse = " + ")
    )),
    # Penalty k = log(n), i.e. BIC.
    k = log(nrow(D_trn)),
    trace = FALSE
  )
  Res4[[prs]] <- lm(formula(step_model), D_tst)
}
lapply(Res4, summary)

$LA

Call:
lm(formula = formula(step_model), data = D_tst)

Residuals:
   Min     1Q Median     3Q    Max 
-3.766 -0.576  0.103  0.657  2.851 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       2.35411    0.70674    3.33  0.00092 ***
LOC_SCEN          0.10364    0.19350    0.54  0.59246    
LOC_VISE          0.13862    0.04460    3.11  0.00198 ** 
FEELNAT           0.18417    0.11824    1.56  0.11991    
RL_NDVI          -1.22692    0.69103   -1.78  0.07636 .  
LOC_FAUN          0.14348    0.03391    4.23  2.7e-05 ***
LNOISE            0.19811    0.05974    3.32  0.00097 ***
LOC_SCEN:FEELNAT -0.00175    0.03140   -0.06  0.95556    
LOC_SCEN:RL_NDVI  0.12534    0.17737    0.71  0.48007    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.96 on 556 degrees of freedom
  (182 observations deleted due to missingness)
Multiple R-squared:  0.256, Adjusted R-squared:  0.245 
F-statistic: 23.9 on 8 and 556 DF,

### Legacy Code

In [None]:
#| code-fold: true
#| code-summary: "Variable selection over multiple y (not desired)"
#| echo: false
#| eval: false

# Disabled experiment kept for reference: every statement below is commented
# out and the cell is also marked eval: false. As written, it would call
# cv.grpreg() with the model matrix of fit_MED_GIS minus its first column
# (presumably the intercept) as X and the first two mediator columns of D_trn
# as y, then inspect the coefficients and plot the fit.
# library(grpreg)
# fit <- cv.grpreg(X = model.matrix(fit_MED_GIS)[,-1], y = D_trn[Mediator_vars[1:2]])
# coef(fit) |> t()
# fit$beta
# plot(fit)

In [None]:
#| code-fold: true
#| code-summary: "Linear models with mice"
#| echo: false
#| eval: false

# Legacy scratch code (cell is marked eval: false). Apart from the first
# assignment it works on the nhanes2 example data shipped with mice, not on
# the project data.

# NOTE(review): Y is assigned here but not used anywhere below in this cell.
Y <- D[]

library(mice, quietly = TRUE)
library(car, quietly = TRUE)
library(miceadds, quietly = TRUE)
data(nhanes2, package = "mice")
# Fix the RNG state before the imputation below.
set.seed(9090)

# Multiple imputation of nhanes2 via miceadds::mice.1chain (burnin = 4,
# iter = 20, Nimp = 8), followed by an ANOVA of bmi ~ age * chl on the
# imputation result.
mi.res <- miceadds::mice.1chain(nhanes2, burnin = 4, iter = 20, Nimp = 8)
an2a <- miceadds::mi.anova(mi.res = mi.res, formula = "bmi ~ age * chl")

# Fit the interaction model (mod1) and the additive model (mod0) through
# with() on the imputation result.
mod1 <- with(mi.res, stats::lm(bmi ~ age * chl))
mod0 <- with(mi.res, stats::lm(bmi ~ age + chl))

# Compare mod1 against mod0 with mitml::testModels, once per method (D1, D2).
mitml::testModels(model = mod1$analyses, null.model = mod0$analyses, method = "D1")
mitml::testModels(model = mod1$analyses, null.model = mod0$analyses, method = "D2")

# Same ANOVA as an2a, but with type = 3.
an2b <- miceadds::mi.anova(mi.res = mi.res, formula = "bmi ~ age * chl", type = 3)