Merge pull request #108 from NetCoupler/regress-on-for-standardize

Regress on for standardize fixed
NetCoupler · Apr 2, 2020 · b87a822 · b87a822
2 parents d6babe3 + e6fb09b
commit b87a822
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 33 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -18,5 +18,5 @@ after_success:
 before_cache: Rscript -e 'remotes::install_cran("pkgdown")'
 deploy:
   provider: script
-  script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""))'
+  script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""), verbose = TRUE)'
   skip_cleanup: true
diff --git a/R/standardize.R b/R/standardize.R
@@ -23,27 +23,29 @@
 #'
 #' @examples
 #'
-#' # Don't regress on any variable.
+#' # Don't regress on any variable
 #' simulated_data %>%
-#'   nc_standardize(vars(matches("metabolite_"))) %>%
-#'   tibble::as_tibble()
+#'   nc_standardize(vars(matches("metabolite_")))
 #'
-#' # Don't regress on any variable.
+#' # Extract residuals by regressing on a variable
 #' simulated_data %>%
-#'   nc_standardize(vars(matches("metabolite_")), "age") %>%
-#'   tibble::as_tibble()
+#'   nc_standardize(vars(matches("metabolite_")), "age")
 nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
     if (!is.null(.regressed_on)) {
         assertive.types::assert_is_character(.regressed_on)
-        .tbl %>%
-            dplyr::mutate_at(.vars, .funs = .log_regress_standardize,
-                             regressed_on = .tbl[.regressed_on])
+        standardized_data <- .replace_with_residuals(
+            .tbl = .tbl,
+            .vars = .vars,
+            .regressed_on = .regressed_on
+        )
     } else {
-        .tbl %>%
+        standardized_data <- .tbl %>%
             dplyr::mutate_at(.vars, .funs = .log_standardize)
     }
+    return(standardized_data)
 }
 
+
 .log_standardize <- function(x) {
     as.numeric(scale(log(x)))
 }
@@ -54,3 +56,47 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
     residual_x <- stats::residuals(stats::glm.fit(y = logged_x, x = regressed_on))
     as.numeric(scale(residual_x))
 }
+
+.replace_with_residuals <- function(.tbl, .vars, .regressed_on) {
+    metabolic_names <- .tbl %>%
+        select_at(.vars) %>%
+        names()
+
+    data_with_id <- .tbl %>%
+        # TODO: Check that no id variable exists
+        mutate(.id_variable = dplyr::row_number())
+
+    data_with_other_vars <- data_with_id %>%
+        select_at(vars(-metabolic_names))
+
+    standardized_data <- metabolic_names %>%
+        purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>%
+        purrr::reduce(dplyr::full_join, by = ".id_variable") %>%
+        dplyr::full_join(data_with_other_vars, by = ".id_variable") %>%
+        dplyr::arrange_at(".id_variable") %>%
+        # To put in original ordering
+        select_at(names(data_with_id)) %>%
+        select(-".id_variable")
+
+    return(standardized_data)
+}
+
+.extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") {
+    no_missing <- .tbl %>%
+        select_at(c(.var, .regressed_on, .id_var)) %>%
+        stats::na.omit()
+
+    metabolic_var <- no_missing[[.var]]
+    regress_on_vars <- no_missing[.regressed_on]
+
+    metabolic_residuals <-
+        .log_regress_standardize(metabolic_var,
+                                 regress_on_vars)
+
+    no_missing[.var] <- metabolic_residuals
+
+    data_with_residuals <- no_missing %>%
+        select_at(c(.var, ".id_variable"))
+
+    return(data_with_residuals)
+}
diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd
diff --git a/tests/testthat/test-standardize.R b/tests/testthat/test-standardize.R
@@ -38,3 +38,8 @@ test_that("standardization with residuals works", {
     expect_false(identical(simulated_data, standardized_with_residuals))
     expect_false(identical(standardized, standardized_with_residuals))
 })
+
+#' simulated_data %>%
+#'   mutate(Random = rnorm(n(), 10, 2)) %>%
+#'   .insert_random_missingness() %>%
+#'   nc_standardize(vars(matches("metabolite_")), c("age", "Random"))
diff --git a/vignettes/NetCoupler.Rmd b/vignettes/NetCoupler.Rmd
@@ -279,14 +279,62 @@ library(NetCoupler)
 library(dplyr)
 ```
 
-```{r example-use, cache=TRUE}
+### Estimating the metabolic network
+
+For estimating the network, it's (basically) required to standardize
+the metabolic variables before inputting into `nc_create_network()`.
+If you intend to also adjust for potential confounders when estimating
+the exposure or outcome side connections,
+you can include the potential impact these confounders may have on 
+the network by regressing the confounders on the metabolic variables.
+Then the residuals can be extracted and used when constructing the network.
+You do this with the `nc_standardize()` function. 
+This function also log-transforms and scales 
+(mean-center and z-score normalize) the values of the metabolic variables.
+We do this because the network estimation algorithm can sometimes be finicky
+about differences in variable numerical scale (mean of 1 vs mean of 1000).
+
+```{r metabolic-residuals}
+std_metabolic_data <- simulated_data %>% 
+    nc_standardize(vars(starts_with("metabolite")), 
+                   .regressed_on = "age") %>% 
+    select(starts_with("metabolite"))
+```
+
+After that, you can estimate the network.
 
+```{r create-network}
 # Make partial independence network from metabolite data
-metabolite_network <- simulated_data %>% 
-    select(matches("metabolite")) %>% 
+metabolite_network <- std_metabolic_data %>% 
     nc_create_network()
+```
+
+To see what the network looks like,
+use the function `nc_plot_network()`.
+
+```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5}
+std_metabolic_data %>%
+    nc_plot_network(metabolite_network)
+```
+
+The plot is a bit crowded, but it provides a base to start tidying up from.
+
+### Estimating exposure and outcome-side connections
 
-outcome_estimates <- simulated_data %>%
+For the exposure and outcome side, 
+you should standardize the metabolic variables, 
+but this time, we don't regress on the confounders 
+since they will be included in the models.
+
+```{r standardize-data}
+standardized_data <- simulated_data %>% 
+    nc_standardize(vars(starts_with("metabolite")))
+```
+
+Then estimate the outcome or exposure:
+
+```{r example-use, cache=TRUE}
+outcome_estimates <- standardized_data %>%
     nc_outcome_estimates(
         .graph = metabolite_network,
         .outcome = "survival::Surv(survival_time, case_status)",
@@ -296,7 +344,7 @@ outcome_estimates <- simulated_data %>%
 
 outcome_estimates
 
-exposure_estimates <- simulated_data %>%
+exposure_estimates <- standardized_data %>%
     nc_exposure_estimates(
         .graph = metabolite_network,
         .exposure = "exposure",
@@ -307,17 +355,6 @@ exposure_estimates <- simulated_data %>%
 exposure_estimates
 ```
 
-### Visualizing the network
-
-To see the network of only the metabolic variables, use the function
-`nc_plot_network()`.
-
-```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5}
-simulated_data %>% 
-    select(matches("metabolite")) %>% 
-    nc_plot_network(metabolite_network)
-```
-
 ### Classifying direct effects on exposure or outcome side
 
 You can classify direct effects by using `nc_classify_effects()`,