From c96604f33afb51368bdef5e53072530eb0a711db Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Sun, 29 Mar 2020 15:50:42 +0200 Subject: [PATCH 1/6] fixed bug with nc_standardize the stopped regressed on to work --- NEWS.md | 11 ++++++-- R/standardize.R | 66 ++++++++++++++++++++++++++++++++++++------- man/nc_standardize.Rd | 10 +++---- 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5f2a846..831c4bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,7 @@ # NetCoupler 0.0.4.9000 (development version) -* Input dataset can include missingness. Input data is treated as complete case -for only the variables used in the modelling (#88). +## Added features + * For `lm` and `glm` models, model summary statistics are added (#88). * Add a function to classify the direct effects between outcome or exposure and the network (#98). @@ -14,6 +14,13 @@ to help create the weights for the network plot. results too much, see #60 for details (#83). * Removed stringr dependency (#65, #83). +## Fixed bugs and other problems + +* Fix problem with `nc_standardize()` that prevented the ability to use the `.regressed_on` +argument to extract residuals (#). +* Input dataset can include missingness. Input data is treated as complete case +for only the variables used in the modelling (#88). + # NetCoupler 0.0.3.9000 * Add `nc_standardize()` function to standardize the metabolic variables (#73). diff --git a/R/standardize.R b/R/standardize.R index bbce5ab..acb288d 100644 --- a/R/standardize.R +++ b/R/standardize.R @@ -23,27 +23,29 @@ #' #' @examples #' -#' # Don't regress on any variable. +#' # Don't regress on any variable #' simulated_data %>% -#' nc_standardize(vars(matches("metabolite_"))) %>% -#' tibble::as_tibble() +#' nc_standardize(matches("metabolite_")) #' -#' # Don't regress on any variable. +#' # Extract residuals by regressing on a variable #' simulated_data %>% -#' nc_standardize(vars(matches("metabolite_")), "age") %>% -#' tibble::as_tibble() +#' nc_standardize(vars(matches("metabolite_")), "age") nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { if (!is.null(.regressed_on)) { assertive.types::assert_is_character(.regressed_on) - .tbl %>% - dplyr::mutate_at(.vars, .funs = .log_regress_standardize, - regressed_on = .tbl[.regressed_on]) + standardized_data <- .replace_with_residuals( + .tbl = .tbl, + .vars = .vars, + .regressed_on = .regressed_on + ) } else { - .tbl %>% + standardized_data <- .tbl %>% dplyr::mutate_at(.vars, .funs = .log_standardize) } + return(standardized_data) } + .log_standardize <- function(x) { as.numeric(scale(log(x))) } @@ -54,3 +56,47 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { residual_x <- stats::residuals(stats::glm.fit(y = logged_x, x = regressed_on)) as.numeric(scale(residual_x)) } + +.replace_with_residuals <- function(.tbl, .vars, .regressed_on) { + metabolic_names <- .tbl %>% + select_at(.vars) %>% + names() + + data_with_id <- .tbl %>% + # TODO: Check that no id variable exists + mutate(.id_variable = row_number()) + + data_with_other_vars <- data_with_id %>% + select_at(vars(-metabolic_names)) + + standardized_data <- metabolic_names %>% + purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>% + purrr::reduce(full_join, by = ".id_variable") %>% + dplyr::full_join(data_with_other_vars, by = ".id_variable") %>% + dplyr::arrange_at(".id_variable") %>% + # To put in original ordering + select_at(names(data_with_id)) %>% + select(-".id_variable") + + return(standardized_data) +} + +.extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") { + no_missing <- .tbl %>% + select_at(c(.var, .regressed_on, .id_var)) %>% + na.omit() + + metabolic_var <- no_missing[[.var]] + regress_on_vars <- no_missing[.regressed_on] + + metabolic_residuals <- + .log_regress_standardize(metabolic_var, + regress_on_vars) + + no_missing[.var] <- metabolic_residuals + + data_with_residuals <- no_missing %>% + select_at(c(.var, ".id_variable")) + + return(data_with_residuals) +} diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd index c9e16e4..3d6af9d 100644 --- a/man/nc_standardize.Rd +++ b/man/nc_standardize.Rd @@ -31,13 +31,11 @@ remove influence of potential confounding. } \examples{ -# Don't regress on any variable. +# Don't regress on any variable simulated_data \%>\% - nc_standardize(vars(matches("metabolite_"))) \%>\% - tibble::as_tibble() + nc_standardize(matches("metabolite_")) -# Don't regress on any variable. +# Extract residuals by regressing on a variable simulated_data \%>\% - nc_standardize(vars(matches("metabolite_")), "age") \%>\% - tibble::as_tibble() + nc_standardize(vars(matches("metabolite_")), "age") } From c07e29ea249f68ade0ed3ce20e3aea1fc85631eb Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Sun, 29 Mar 2020 15:51:32 +0200 Subject: [PATCH 2/6] start of unit test for dealing with missingness in standardizing --- tests/testthat/test-standardize.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test-standardize.R b/tests/testthat/test-standardize.R index a8c3b33..71347be 100644 --- a/tests/testthat/test-standardize.R +++ b/tests/testthat/test-standardize.R @@ -38,3 +38,8 @@ test_that("standardization with residuals works", { expect_false(identical(simulated_data, standardized_with_residuals)) expect_false(identical(standardized, standardized_with_residuals)) }) + +#' simulated_data %>% +#' mutate(Random = rnorm(n(), 10, 2)) %>% +#' .insert_random_missingness() %>% +#' nc_standardize(vars(matches("metabolite_")), c("age", "Random")) From a7b6f894505f3448a29089226e0272a46d842410 Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Sun, 29 Mar 2020 15:52:23 +0200 Subject: [PATCH 3/6] add pr number [skip ci] --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 831c4bf..5b66467 100644 --- a/NEWS.md +++ b/NEWS.md @@ -17,7 +17,7 @@ results too much, see #60 for details (#83). ## Fixed bugs and other problems * Fix problem with `nc_standardize()` that prevented the ability to use the `.regressed_on` -argument to extract residuals (#). +argument to extract residuals (#108). * Input dataset can include missingness. Input data is treated as complete case for only the variables used in the modelling (#88). From b857b36d442fe87361f0cd6ee86d6cb02d4a0aad Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Thu, 2 Apr 2020 19:11:46 +0200 Subject: [PATCH 4/6] fix build errors --- R/standardize.R | 8 ++++---- man/nc_standardize.Rd | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/standardize.R b/R/standardize.R index acb288d..14666b8 100644 --- a/R/standardize.R +++ b/R/standardize.R @@ -25,7 +25,7 @@ #' #' # Don't regress on any variable #' simulated_data %>% -#' nc_standardize(matches("metabolite_")) +#' nc_standardize(vars(matches("metabolite_"))) #' #' # Extract residuals by regressing on a variable #' simulated_data %>% @@ -64,14 +64,14 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { data_with_id <- .tbl %>% # TODO: Check that no id variable exists - mutate(.id_variable = row_number()) + mutate(.id_variable = dplyr::row_number()) data_with_other_vars <- data_with_id %>% select_at(vars(-metabolic_names)) standardized_data <- metabolic_names %>% purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>% - purrr::reduce(full_join, by = ".id_variable") %>% + purrr::reduce(dplyr::full_join, by = ".id_variable") %>% dplyr::full_join(data_with_other_vars, by = ".id_variable") %>% dplyr::arrange_at(".id_variable") %>% # To put in original ordering @@ -84,7 +84,7 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { .extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") { no_missing <- .tbl %>% select_at(c(.var, .regressed_on, .id_var)) %>% - na.omit() + stats::na.omit() metabolic_var <- no_missing[[.var]] regress_on_vars <- no_missing[.regressed_on] diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd index 3d6af9d..c2acddb 100644 --- a/man/nc_standardize.Rd +++ b/man/nc_standardize.Rd @@ -33,7 +33,7 @@ remove influence of potential confounding. # Don't regress on any variable simulated_data \%>\% - nc_standardize(matches("metabolite_")) + nc_standardize(vars(matches("metabolite_"))) # Extract residuals by regressing on a variable simulated_data \%>\% From 459aa76ed299f1a2df64282c80dd60d413339a24 Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Thu, 2 Apr 2020 19:12:01 +0200 Subject: [PATCH 5/6] add examples of standardize in vignette --- vignettes/NetCoupler.Rmd | 69 ++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/vignettes/NetCoupler.Rmd b/vignettes/NetCoupler.Rmd index 51d7c94..db159a9 100644 --- a/vignettes/NetCoupler.Rmd +++ b/vignettes/NetCoupler.Rmd @@ -279,14 +279,62 @@ library(NetCoupler) library(dplyr) ``` -```{r example-use, cache=TRUE} +### Estimating the metabolic network + +For estimating the network, it's (basically) required to standardize +the metabolic variables before inputting into `nc_create_network()`. +If you intend to also adjust for potential confounders when estimating +the exposure or outcome side connections, +you can include the potential impact these confounders may have on +the network by regressing the confounders on the metabolic variables. +Then the residuals can be extracted and used when constructing the network. +You do this with the `nc_standardize()` function. +This function also log-transforms and scales +(mean-center and z-score normalize) the values of the metabolic variables. +We do this because the network estimation algorithm can sometimes be finicky +about differences in variable numerical scale (mean of 1 vs mean of 1000). + +```{r metabolic-residuals} +std_metabolic_data <- simulated_data %>% + nc_standardize(vars(starts_with("metabolite")), + .regressed_on = "age") %>% + select(starts_with("metabolite")) +``` + +After that, you can estimate the network. +```{r create-network} # Make partial independence network from metabolite data -metabolite_network <- simulated_data %>% - select(matches("metabolite")) %>% +metabolite_network <- std_metabolic_data %>% nc_create_network() +``` + +To see what the network looks like, +use the function `nc_plot_network()`. + +```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5} +std_metabolic_data %>% + nc_plot_network(metabolite_network) +``` + +The plot is a bit crowded, but it provides a base to start tidying up from. + +### Estimating exposure and outcome-side connections -outcome_estimates <- simulated_data %>% +For the exposure and outcome side, +you should standardize the metabolic variables, +but this time, we don't regress on the confounders +since they will be included in the models. + +```{r standardize-data} +standardized_data <- simulated_data %>% + nc_standardize(vars(starts_with("metabolite"))) +``` + +Then estimate the outcome or exposure: + +```{r example-use, cache=TRUE} +outcome_estimates <- standardized_data %>% nc_outcome_estimates( .graph = metabolite_network, .outcome = "survival::Surv(survival_time, case_status)", @@ -296,7 +344,7 @@ outcome_estimates <- simulated_data %>% outcome_estimates -exposure_estimates <- simulated_data %>% +exposure_estimates <- standardized_data %>% nc_exposure_estimates( .graph = metabolite_network, .exposure = "exposure", @@ -307,17 +355,6 @@ exposure_estimates <- simulated_data %>% exposure_estimates ``` -### Visualizing the network - -To see the network of only the metabolic variables, use the function -`nc_plot_network()`. - -```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5} -simulated_data %>% - select(matches("metabolite")) %>% - nc_plot_network(metabolite_network) -``` - ### Classifying direct effects on exposure or outcome side You can classify direct effects by using `nc_classify_effects()`, From e6fb09bc2ffc7492f62f602b85d5bff4cf0a9687 Mon Sep 17 00:00:00 2001 From: Luke Johnston Date: Thu, 2 Apr 2020 19:12:09 +0200 Subject: [PATCH 6/6] to find travis errors --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3b80b19..f752b3a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,5 +18,5 @@ after_success: before_cache: Rscript -e 'remotes::install_cran("pkgdown")' deploy: provider: script - script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""))' + script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""), verbose = TRUE)' skip_cleanup: true