diff --git a/.travis.yml b/.travis.yml index 3b80b19..f752b3a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,5 +18,5 @@ after_success: before_cache: Rscript -e 'remotes::install_cran("pkgdown")' deploy: provider: script - script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""))' + script: Rscript -e 'pkgdown::deploy_site_github(ssh_id = Sys.getenv("TRAVIS_DEPLOY_KEY", ""), verbose = TRUE)' skip_cleanup: true diff --git a/R/standardize.R b/R/standardize.R index bbce5ab..14666b8 100644 --- a/R/standardize.R +++ b/R/standardize.R @@ -23,27 +23,29 @@ #' #' @examples #' -#' # Don't regress on any variable. +#' # Don't regress on any variable #' simulated_data %>% -#' nc_standardize(vars(matches("metabolite_"))) %>% -#' tibble::as_tibble() +#' nc_standardize(vars(matches("metabolite_"))) #' -#' # Don't regress on any variable. +#' # Extract residuals by regressing on a variable #' simulated_data %>% -#' nc_standardize(vars(matches("metabolite_")), "age") %>% -#' tibble::as_tibble() +#' nc_standardize(vars(matches("metabolite_")), "age") nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { if (!is.null(.regressed_on)) { assertive.types::assert_is_character(.regressed_on) - .tbl %>% - dplyr::mutate_at(.vars, .funs = .log_regress_standardize, - regressed_on = .tbl[.regressed_on]) + standardized_data <- .replace_with_residuals( + .tbl = .tbl, + .vars = .vars, + .regressed_on = .regressed_on + ) } else { - .tbl %>% + standardized_data <- .tbl %>% dplyr::mutate_at(.vars, .funs = .log_standardize) } + return(standardized_data) } + .log_standardize <- function(x) { as.numeric(scale(log(x))) } @@ -54,3 +56,47 @@ nc_standardize <- function(.tbl, .vars, .regressed_on = NULL) { residual_x <- stats::residuals(stats::glm.fit(y = logged_x, x = regressed_on)) as.numeric(scale(residual_x)) } + +.replace_with_residuals <- function(.tbl, .vars, .regressed_on) { + metabolic_names <- .tbl %>% + select_at(.vars) %>% + names() + + data_with_id <- .tbl %>% + # TODO: Check that no id variable exists + mutate(.id_variable = dplyr::row_number()) + + data_with_other_vars <- data_with_id %>% + select_at(vars(-metabolic_names)) + + standardized_data <- metabolic_names %>% + purrr::map(~ .extract_residuals(.x, data_with_id, .regressed_on)) %>% + purrr::reduce(dplyr::full_join, by = ".id_variable") %>% + dplyr::full_join(data_with_other_vars, by = ".id_variable") %>% + dplyr::arrange_at(".id_variable") %>% + # To put in original ordering + select_at(names(data_with_id)) %>% + select(-".id_variable") + + return(standardized_data) +} + +.extract_residuals <- function(.var, .tbl, .regressed_on, .id_var = ".id_variable") { + no_missing <- .tbl %>% + select_at(c(.var, .regressed_on, .id_var)) %>% + stats::na.omit() + + metabolic_var <- no_missing[[.var]] + regress_on_vars <- no_missing[.regressed_on] + + metabolic_residuals <- + .log_regress_standardize(metabolic_var, + regress_on_vars) + + no_missing[.var] <- metabolic_residuals + + data_with_residuals <- no_missing %>% + select_at(c(.var, ".id_variable")) + + return(data_with_residuals) +} diff --git a/man/nc_standardize.Rd b/man/nc_standardize.Rd index c9e16e4..c2acddb 100644 --- a/man/nc_standardize.Rd +++ b/man/nc_standardize.Rd @@ -31,13 +31,11 @@ remove influence of potential confounding. } \examples{ -# Don't regress on any variable. +# Don't regress on any variable simulated_data \%>\% - nc_standardize(vars(matches("metabolite_"))) \%>\% - tibble::as_tibble() + nc_standardize(vars(matches("metabolite_"))) -# Don't regress on any variable. +# Extract residuals by regressing on a variable simulated_data \%>\% - nc_standardize(vars(matches("metabolite_")), "age") \%>\% - tibble::as_tibble() + nc_standardize(vars(matches("metabolite_")), "age") } diff --git a/tests/testthat/test-standardize.R b/tests/testthat/test-standardize.R index a8c3b33..71347be 100644 --- a/tests/testthat/test-standardize.R +++ b/tests/testthat/test-standardize.R @@ -38,3 +38,8 @@ test_that("standardization with residuals works", { expect_false(identical(simulated_data, standardized_with_residuals)) expect_false(identical(standardized, standardized_with_residuals)) }) + +#' simulated_data %>% +#' mutate(Random = rnorm(n(), 10, 2)) %>% +#' .insert_random_missingness() %>% +#' nc_standardize(vars(matches("metabolite_")), c("age", "Random")) diff --git a/vignettes/NetCoupler.Rmd b/vignettes/NetCoupler.Rmd index 51d7c94..db159a9 100644 --- a/vignettes/NetCoupler.Rmd +++ b/vignettes/NetCoupler.Rmd @@ -279,14 +279,62 @@ library(NetCoupler) library(dplyr) ``` -```{r example-use, cache=TRUE} +### Estimating the metabolic network + +For estimating the network, it's (basically) required to standardize +the metabolic variables before inputting into `nc_create_network()`. +If you intend to also adjust for potential confounders when estimating +the exposure or outcome side connections, +you can include the potential impact these confounders may have on +the network by regressing the confounders on the metabolic variables. +Then the residuals can be extracted and used when constructing the network. +You do this with the `nc_standardize()` function. +This function also log-transforms and scales +(mean-center and z-score normalize) the values of the metabolic variables. +We do this because the network estimation algorithm can sometimes be finicky +about differences in variable numerical scale (mean of 1 vs mean of 1000). + +```{r metabolic-residuals} +std_metabolic_data <- simulated_data %>% + nc_standardize(vars(starts_with("metabolite")), + .regressed_on = "age") %>% + select(starts_with("metabolite")) +``` + +After that, you can estimate the network. +```{r create-network} # Make partial independence network from metabolite data -metabolite_network <- simulated_data %>% - select(matches("metabolite")) %>% +metabolite_network <- std_metabolic_data %>% nc_create_network() +``` + +To see what the network looks like, +use the function `nc_plot_network()`. + +```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5} +std_metabolic_data %>% + nc_plot_network(metabolite_network) +``` + +The plot is a bit crowded, but it provides a base to start tidying up from. + +### Estimating exposure and outcome-side connections -outcome_estimates <- simulated_data %>% +For the exposure and outcome side, +you should standardize the metabolic variables, +but this time, we don't regress on the confounders +since they will be included in the models. + +```{r standardize-data} +standardized_data <- simulated_data %>% + nc_standardize(vars(starts_with("metabolite"))) +``` + +Then estimate the outcome or exposure: + +```{r example-use, cache=TRUE} +outcome_estimates <- standardized_data %>% nc_outcome_estimates( .graph = metabolite_network, .outcome = "survival::Surv(survival_time, case_status)", @@ -296,7 +344,7 @@ outcome_estimates <- simulated_data %>% outcome_estimates -exposure_estimates <- simulated_data %>% +exposure_estimates <- standardized_data %>% nc_exposure_estimates( .graph = metabolite_network, .exposure = "exposure", @@ -307,17 +355,6 @@ exposure_estimates <- simulated_data %>% exposure_estimates ``` -### Visualizing the network - -To see the network of only the metabolic variables, use the function -`nc_plot_network()`. - -```{r visualize-metabolic-network, fig.width=5.6, fig.height=4.5} -simulated_data %>% - select(matches("metabolite")) %>% - nc_plot_network(metabolite_network) -``` - ### Classifying direct effects on exposure or outcome side You can classify direct effects by using `nc_classify_effects()`,