From c96604f33afb51368bdef5e53072530eb0a711db Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Sun, 29 Mar 2020 15:50:42 +0200
Subject: [PATCH 1/6] fixed bug with nc_standardize that stopped .regressed_on
 from working

---
 NEWS.md               | 11 ++++++--
 R/standardize.R       | 66 ++++++++++++++++++++++++++++++++++++-------
 man/nc_standardize.Rd | 10 +++----
 3 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 5f2a846..831c4bf 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,7 @@
 # NetCoupler 0.0.4.9000 (development version)
 
-* Input dataset can include missingness. Input data is treated as complete case
-for only the variables used in the modelling (#88).
+## Added features
+
 * For `lm` and `glm` models, model summary statistics are added (#88).
 * Add a function to classify the direct effects between outcome or exposure and 
 the network (#98).
@@ -14,6 +14,13 @@ to help create the weights for the network plot.
 results too much, see #60 for details (#83).
 * Removed stringr dependency (#65, #83).
 
+## Fixed bugs and other problems
+
+* Fix problem with `nc_standardize()` that prevented the ability to use the `.regressed_on`
+argument to extract residuals (#).
+* Input dataset can include missingness. Input data is treated as complete case
+for only the variables used in the modelling (#88).
+
 # NetCoupler 0.0.3.9000
 
 * Add `nc_standardize()` function to standardize the metabolic variables (#73).
diff --git a/R/standardize.R b/R/standardize.R
index bbce5ab..acb288d 100644
--- a/R/standardize.R
+++ b/R/standardize.R
@@ -23,27 +23,29 @@
 #'
 #' @examples
 #'
-#' # Don't regress on any variable.
+#' # Don't regress on any variable
 #' simulated_data %>%
-#'   nc_standardize(vars(matches("metabolite_"))) %>%
-#'   tibble::as_tibble()
+#'   nc_standardize(matches("metabolite_"))
 #'
-#' # Don't regress on any variable.
+#' # Extract residuals by regressing on a variable
 #' simulated_data %>%
-#'   nc_standardize(vars(matches("metabolite_")), "age") %>%
-#'   tibble::as_tibble()
+#'   nc_standardize(vars(matches("metabolite_")), "age")
 nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
     if (!is.null(.regressed_on)) {
         assertive.types::assert_is_character(.regressed_on)
-        .tbl %>%
-            dplyr::mutate_at(.vars, .funs = .log_regress_standardize,
-                             regressed_on = .tbl[.regressed_on])
+        standardized_data <- .replace_with_residuals(
+            .tbl = .tbl,
+            .vars = .vars,
+            .regressed_on = .regressed_on
+        )
     } else {
-        .tbl %>%
+        standardized_data <- .tbl %>%
             dplyr::mutate_at(.vars, .funs = .log_standardize)
     }
+    return(standardized_data)
 }
 
+
 .log_standardize <- function(x) {
     as.numeric(scale(log(x)))
 }
@@ -54,3 +56,47 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
     residual_x <- stats::residuals(stats::glm.fit(y = logged_x, x = regressed_on))
     as.numeric(scale(residual_x))
 }
+
+.replace_with_residuals <- function(.tbl, .vars, .regressed_on) {
+    metabolic_names <- .tbl %>%
+        select_at(.vars) %>%
+        names()
+
+    data_with_id <- .tbl %>%
+        # TODO: Check that no id variable exists
+        mutate(.id_variable = row_number())
+
+    data_with_other_vars <- data_with_id %>%
+        select_at(vars(-metabolic_names))
+
+    standardized_data <- metabolic_names %>%
+        purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>%
+        purrr::reduce(full_join, by = ".id_variable") %>%
+        dplyr::full_join(data_with_other_vars, by = ".id_variable") %>%
+        dplyr::arrange_at(".id_variable") %>%
+        # To put in original ordering
+        select_at(names(data_with_id)) %>%
+        select(-".id_variable")
+
+    return(standardized_data)
+}
+
+.extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") {
+    no_missing <- .tbl %>%
+        select_at(c(.var, .regressed_on, .id_var)) %>%
+        na.omit()
+
+    metabolic_var <- no_missing[[.var]]
+    regress_on_vars <- no_missing[.regressed_on]
+
+    metabolic_residuals <-
+        .log_regress_standardize(metabolic_var,
+                                 regress_on_vars)
+
+    no_missing[.var] <- metabolic_residuals
+
+    data_with_residuals <- no_missing %>%
+        select_at(c(.var, ".id_variable"))
+
+    return(data_with_residuals)
+}
diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd
index c9e16e4..3d6af9d 100644
--- a/man/nc_standardize.Rd
+++ b/man/nc_standardize.Rd
@@ -31,13 +31,11 @@ remove influence of potential confounding.
 }
 \examples{
 
-# Don't regress on any variable.
+# Don't regress on any variable
 simulated_data \%>\%
-  nc_standardize(vars(matches("metabolite_"))) \%>\%
-  tibble::as_tibble()
+  nc_standardize(matches("metabolite_"))
 
-# Don't regress on any variable.
+# Extract residuals by regressing on a variable
 simulated_data \%>\%
-  nc_standardize(vars(matches("metabolite_")), "age") \%>\%
-  tibble::as_tibble()
+  nc_standardize(vars(matches("metabolite_")), "age")
 }

From c07e29ea249f68ade0ed3ce20e3aea1fc85631eb Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Sun, 29 Mar 2020 15:51:32 +0200
Subject: [PATCH 2/6] start of unit test for dealing with missingness in
 standardizing

---
 tests/testthat/test-standardize.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/testthat/test-standardize.R b/tests/testthat/test-standardize.R
index a8c3b33..71347be 100644
--- a/tests/testthat/test-standardize.R
+++ b/tests/testthat/test-standardize.R
@@ -38,3 +38,8 @@ test_that("standardization with residuals works", {
     expect_false(identical(simulated_data, standardized_with_residuals))
     expect_false(identical(standardized, standardized_with_residuals))
 })
+
+#' simulated_data %>%
+#'   mutate(Random = rnorm(n(), 10, 2)) %>%
+#'   .insert_random_missingness() %>%
+#'   nc_standardize(vars(matches("metabolite_")), c("age", "Random"))

From a7b6f894505f3448a29089226e0272a46d842410 Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Sun, 29 Mar 2020 15:52:23 +0200
Subject: [PATCH 3/6] add pr number [skip ci]

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 831c4bf..5b66467 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -17,7 +17,7 @@ results too much, see #60 for details (#83).
 ## Fixed bugs and other problems
 
 * Fix problem with `nc_standardize()` that prevented the ability to use the `.regressed_on`
-argument to extract residuals (#).
+argument to extract residuals (#108).
 * Input dataset can include missingness. Input data is treated as complete case
 for only the variables used in the modelling (#88).
 

From b857b36d442fe87361f0cd6ee86d6cb02d4a0aad Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Thu, 2 Apr 2020 19:11:46 +0200
Subject: [PATCH 4/6] fix build errors

---
 R/standardize.R       | 8 ++++----
 man/nc_standardize.Rd | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/standardize.R b/R/standardize.R
index acb288d..14666b8 100644
--- a/R/standardize.R
+++ b/R/standardize.R
@@ -25,7 +25,7 @@
 #'
 #' # Don't regress on any variable
 #' simulated_data %>%
-#'   nc_standardize(matches("metabolite_"))
+#'   nc_standardize(vars(matches("metabolite_")))
 #'
 #' # Extract residuals by regressing on a variable
 #' simulated_data %>%
@@ -64,14 +64,14 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
 
     data_with_id <- .tbl %>%
         # TODO: Check that no id variable exists
-        mutate(.id_variable = row_number())
+        mutate(.id_variable = dplyr::row_number())
 
     data_with_other_vars <- data_with_id %>%
         select_at(vars(-metabolic_names))
 
     standardized_data <- metabolic_names %>%
         purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>%
-        purrr::reduce(full_join, by = ".id_variable") %>%
+        purrr::reduce(dplyr::full_join, by = ".id_variable") %>%
         dplyr::full_join(data_with_other_vars, by = ".id_variable") %>%
         dplyr::arrange_at(".id_variable") %>%
         # To put in original ordering
@@ -84,7 +84,7 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) {
 .extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") {
     no_missing <- .tbl %>%
         select_at(c(.var, .regressed_on, .id_var)) %>%
-        na.omit()
+        stats::na.omit()
 
     metabolic_var <- no_missing[[.var]]
     regress_on_vars <- no_missing[.regressed_on]
diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd
index 3d6af9d..c2acddb 100644
--- a/man/nc_standardize.Rd
+++ b/man/nc_standardize.Rd
@@ -33,7 +33,7 @@ remove influence of potential confounding.
 
 # Don't regress on any variable
 simulated_data \%>\%
-  nc_standardize(matches("metabolite_"))
+  nc_standardize(vars(matches("metabolite_")))
 
 # Extract residuals by regressing on a variable
 simulated_data \%>\%

From 459aa76ed299f1a2df64282c80dd60d413339a24 Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Thu, 2 Apr 2020 19:12:01 +0200
Subject: [PATCH 5/6] add examples of standardize in vignette

---
 vignettes/NetCoupler.Rmd | 69 ++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/vignettes/NetCoupler.Rmd b/vignettes/NetCoupler.Rmd
index 51d7c94..db159a9 100644
--- a/vignettes/NetCoupler.Rmd
+++ b/vignettes/NetCoupler.Rmd
@@ -279,14 +279,62 @@ library(NetCoupler)
 library(dplyr)
 ```
 
-```{r example-use, cache=TRUE}
+### Estimating the metabolic network
+
+For estimating the network, it's (basically) required to standardize
+the metabolic variables before inputting into `nc_create_network()`.
+If you intend to also adjust for potential confounders when estimating
+the exposure or outcome side connections,
+you can account for the potential impact these confounders may have on
+the network by regressing the metabolic variables on the confounders.
+Then the residuals can be extracted and used when constructing the network.
+You do this with the `nc_standardize()` function. 
+This function also log-transforms and scales 
+(mean-center and z-score normalize) the values of the metabolic variables.
+We do this because the network estimation algorithm can sometimes be finicky
+about differences in variable numerical scale (mean of 1 vs mean of 1000).
+
+```{r metabolic-residuals}
+std_metabolic_data <- simulated_data %>% 
+    nc_standardize(vars(starts_with("metabolite")), 
+                   .regressed_on = "age") %>% 
+    select(starts_with("metabolite"))
+```
+
+After that, you can estimate the network.
 
+```{r create-network}
 # Make partial independence network from metabolite data
-metabolite_network <- simulated_data %>% 
-    select(matches("metabolite")) %>% 
+metabolite_network <- std_metabolic_data %>% 
     nc_create_network()
+```
+
+To see what the network looks like,
+use the function `nc_plot_network()`.
+
+```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5}
+std_metabolic_data %>%
+    nc_plot_network(metabolite_network)
+```
+
+The plot is a bit crowded, but it provides a base to start tidying up from.
+
+### Estimating exposure and outcome-side connections
 
-outcome_estimates <- simulated_data %>%
+For the exposure and outcome side,
+you should again standardize the metabolic variables,
+but this time don't regress on the confounders,
+since they will be included in the models.
+
+```{r standardize-data}
+standardized_data <- simulated_data %>% 
+    nc_standardize(vars(starts_with("metabolite")))
+```
+
+Then estimate the outcome or exposure:
+
+```{r example-use, cache=TRUE}
+outcome_estimates <- standardized_data %>%
     nc_outcome_estimates(
         .graph = metabolite_network,
         .outcome = "survival::Surv(survival_time, case_status)",
@@ -296,7 +344,7 @@ outcome_estimates <- simulated_data %>%
 
 outcome_estimates
 
-exposure_estimates <- simulated_data %>%
+exposure_estimates <- standardized_data %>%
     nc_exposure_estimates(
         .graph = metabolite_network,
         .exposure = "exposure",
@@ -307,17 +355,6 @@ exposure_estimates <- simulated_data %>%
 exposure_estimates
 ```
 
-### Visualizing the network
-
-To see the network of only the metabolic variables, use the function
-`nc_plot_network()`.
-
-```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5}
-simulated_data %>% 
-    select(matches("metabolite")) %>% 
-    nc_plot_network(metabolite_network)
-```
-
 ### Classifying direct effects on exposure or outcome side
 
 You can classify direct effects by using `nc_classify_effects()`,

From e6fb09bc2ffc7492f62f602b85d5bff4cf0a9687 Mon Sep 17 00:00:00 2001
From: Luke Johnston <lwjohnst@gmail.com>
Date: Thu, 2 Apr 2020 19:12:09 +0200
Subject: [PATCH 6/6] to find travis errors

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 3b80b19..f752b3a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,5 +18,5 @@ after_success:
 before_cache: Rscript -e 'remotes::install_cran("pkgdown")'
 deploy:
   provider: script
-  script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""))'
+  script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""), verbose = TRUE)'
   skip_cleanup: true