diff --git a/.Rbuildignore b/.Rbuildignore index 05ee2c7..54208b3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,3 +14,5 @@ ^\.github$ ^check_long_long_int\.R$ ^draft_vignettes\.Rmd$ +^.*\.log$ +^other_stuff$ diff --git a/.gitignore b/.gitignore index b4c1e70..077e745 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ Meta /Meta/ inst/doc *ICIKendallTau_run.log +/other_stuff/ +/large_test/ diff --git a/DESCRIPTION b/DESCRIPTION index 93ec860..100fde8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ICIKendallTau Title: Calculates information-content-informed Kendall-tau -Version: 0.3.22 +Version: 1.0.2 Date: 2024-01-26 Authors@R: c(person(given = c("Robert", "M"), family = "Flight", role = c("aut", "cre"), email = "rflight79@gmail.com", comment = @@ -16,9 +16,9 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 LinkingTo: Rcpp -Imports: Rcpp, purrr, utils +Imports: Rcpp, purrr, utils, stringr Suggests: furrr, future, testthat (>= 3.0.0), microbenchmark, - rmarkdown, knitr, dplyr + rmarkdown, knitr, dplyr, logger URL: https://moseleybioinformaticslab.github.io/ICIKendallTau https://github.com/moseleybioinformaticslab/ICIKendallTau Config/testthat/edition: 3 diff --git a/NEWS.md b/NEWS.md index f412138..6881600 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# ICIKendallTau 1.0.0 + +- Calculates correlation between columns of the matrix, **not** the rows. + # ICIKendallTau 0.3.20 - `kt_fast` now uses the same data.frame format for output as `ici_kendalltau`, but returns a matrix by default. The data.frame is useful when large amounts of comparisons are run. diff --git a/R/kendalltau.R b/R/kendalltau.R index 9d096ea..a493b50 100644 --- a/R/kendalltau.R +++ b/R/kendalltau.R @@ -3,7 +3,7 @@ #' Given a data-matrix, computes the information-content-informed (ICI) Kendall-tau-b between #' all samples. 
#' -#' @param data_matrix samples are rows, features are columns +#' @param data_matrix samples are columns, features are rows #' @param global_na what values should be treated as missing (NA)? #' @param zero_value what is the actual zero value? #' @param perspective how to treat missing data in denominator and ties, see details @@ -25,9 +25,7 @@ ici_kendalltau_ref = function(data_matrix, diag_good = TRUE, progress = FALSE){ - # assume row-wise (because that is what the description states), so need to transpose - # because `cor` actually does things columnwise. - data_matrix = t(data_matrix) + exclude_loc = matrix(FALSE, nrow = nrow(data_matrix), ncol = ncol(data_matrix)) # Actual NA and Inf values are special cases, so we do @@ -102,7 +100,7 @@ missing_either = function(in_x, in_y){ #' Calculates the completeness between any two samples using "or", is an #' entry missing in either X "or" Y. #' -#' @param data_matrix samples are rows, features are columns +#' @param data_matrix samples are columns, features are rows #' @param global_na globally, what should be treated as NA? #' @param include_only is there certain comparisons to do? #' @param return_matrix should the matrix or data.frame be returned? @@ -115,7 +113,6 @@ pairwise_completeness = function(data_matrix, include_only = NULL, return_matrix = TRUE){ - data_matrix = t(data_matrix) if (is.null(colnames(data_matrix))) { stop("rownames of data_matrix cannot be NULL!") @@ -248,7 +245,7 @@ pairwise_completeness = function(data_matrix, #' Given a data-matrix, computes the information-theoretic Kendall-tau-b between #' all samples. #' -#' @param data_matrix samples are rows, features are columns +#' @param data_matrix samples are columns, features are rows #' @param global_na globally, what should be treated as NA? #' @param perspective how to treat missing data in denominator and ties, see details #' @param scale_max should everything be scaled compared to the maximum correlation? 
@@ -285,7 +282,7 @@ pairwise_completeness = function(data_matrix, #' #' matrix_1 = cbind(s1, s2) #' -#' r_1 = ici_kendalltau(t(matrix_1)) +#' r_1 = ici_kendalltau(matrix_1) #' r_1$cor #' #' # s1 s2 @@ -301,7 +298,7 @@ pairwise_completeness = function(data_matrix, #' s4[sample(100, 50)] = NA #' #' matrix_2 = cbind(s3, s4) -#' r_2 = ici_kendalltau(t(matrix_2)) +#' r_2 = ici_kendalltau(matrix_2) #' r_2$cor #' # s3 s4 #' # s3 1.0000000 0.9944616 @@ -309,8 +306,8 @@ pairwise_completeness = function(data_matrix, #' #' # using include_only #' set.seed(1234) -#' x = matrix(rnorm(5000), nrow = 100, ncol = 50) -#' rownames(x) = paste0("s", seq(1, nrow(x))) +#' x = t(matrix(rnorm(5000), nrow = 100, ncol = 50)) +#' colnames(x) = paste0("s", seq(1, ncol(x))) #' #' # only calculate correlations of other columns with "s1" #' include_s1 = "s1" @@ -342,11 +339,14 @@ ici_kendalltau = function(data_matrix, do_log_memory = get("memory", envir = icikt_logger) # assume row-wise (because that is what the description states), so need to transpose - data_matrix = t(data_matrix) exclude_loc = matrix(FALSE, nrow = nrow(data_matrix), ncol = ncol(data_matrix)) + if (is.data.frame(data_matrix)) { + data_matrix = as.matrix(data_matrix) + } + if (is.null(colnames(data_matrix))) { - stop("rownames of data_matrix cannot be NULL!") + stop("colnames of data_matrix cannot be NULL!") } log_message("Processing missing values ...\n") diff --git a/R/reshaping.R b/R/reshaping.R index a35dada..ccb0390 100644 --- a/R/reshaping.R +++ b/R/reshaping.R @@ -1,8 +1,10 @@ +utils::globalVariables(c("stack", "ind", "values")) + #' convert matrix to data.frame #' #' Given a square correlation matrix, converts it to a long data.frame, with three columns. 
#' -#' @param cor_matrix the correlation matrix +#' @param in_matrix the correlation matrix #' #' @details The data.frame contains three columns: #' * s1: the first entry of comparison @@ -32,6 +34,7 @@ cor_matrix_2_long_df = function(in_matrix) #' Given a long data.frame, converts it to a possibly square correlation matrix #' #' @param long_df the long data.frame +#' @param is_square should it be a square matrix? #' #' #' @export diff --git a/README.Rmd b/README.Rmd index 0034985..a2757ab 100644 --- a/README.Rmd +++ b/README.Rmd @@ -80,7 +80,7 @@ s2 = s1 + 10 matrix_1 = cbind(s1, s2) -r_1 = ici_kendalltau(t(matrix_1)) +r_1 = ici_kendalltau(matrix_1) r_1$cor ``` @@ -95,7 +95,7 @@ s4 = s2 s4[sample(100, 50)] = NA matrix_2 = cbind(s3, s4) -r_2 = ici_kendalltau(t(matrix_2)) +r_2 = ici_kendalltau(matrix_2) r_2$cor ``` @@ -150,7 +150,7 @@ If you have {future} and the {furrr} packages installed, then it is also possibl #| eval: false library(furrr) future::plan(multicore, workers = 4) -r_3 = ici_kendalltau(t(matrix_2)) +r_3 = ici_kendalltau(matrix_2) ``` ## Many Many Comparisons @@ -163,7 +163,7 @@ To keep the `data.frame` output, add the argument `return_matrix=FALSE` to the c ```{r} #| label: matrix -r_4 = ici_kendalltau(t(matrix_2), return_matrix = FALSE) +r_4 = ici_kendalltau(matrix_2, return_matrix = FALSE) r_4 ``` diff --git a/README.md b/README.md index f3551a5..56fb0b6 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ s2 = s1 + 10 matrix_1 = cbind(s1, s2) -r_1 = ici_kendalltau(t(matrix_1)) +r_1 = ici_kendalltau(matrix_1) r_1$cor #> s1 s2 #> s1 1 1 @@ -101,7 +101,7 @@ s4 = s2 s4[sample(100, 50)] = NA matrix_2 = cbind(s3, s4) -r_2 = ici_kendalltau(t(matrix_2)) +r_2 = ici_kendalltau(matrix_2) r_2$cor #> s3 s4 #> s3 1.0000000 0.9944616 @@ -136,10 +136,10 @@ microbenchmark( times = 5 ) #> Unit: microseconds -#> expr min lq mean median uq max -#> cor(x, y, method = "kendall") 13379.183 13401.532 13493.0316 13401.829 13595.070 13687.544 -#> ici_kt(x, y, "global") 
271.586 279.007 369.9088 314.779 357.086 627.086 -#> ici_kt(x2, y2, "global") 15983.902 16484.666 16952.9788 16667.885 17124.932 18503.509 +#> expr min lq mean median uq max +#> cor(x, y, method = "kendall") 13568.01 13818.243 14017.885 14090.436 14174.800 14437.933 +#> ici_kt(x, y, "global") 261.84 272.733 304.271 287.571 335.669 363.542 +#> ici_kt(x2, y2, "global") 14631.30 15614.509 16419.494 16340.620 17646.365 17864.673 #> neval #> 5 #> 5 @@ -177,7 +177,7 @@ resources for any multiprocessing engine registered with {future}. ``` r library(furrr) future::plan(multicore, workers = 4) -r_3 = ici_kendalltau(t(matrix_2)) +r_3 = ici_kendalltau(matrix_2) ``` ## Many Many Comparisons @@ -193,7 +193,7 @@ are converted to a matrix form if requested (the default).s To keep the of `ici_kendalltau`. ``` r -r_4 = ici_kendalltau(t(matrix_2), return_matrix = FALSE) +r_4 = ici_kendalltau(matrix_2, return_matrix = FALSE) r_4 #> $cor #> s1 s2 core raw pvalue taumax cor @@ -202,7 +202,7 @@ r_4 #> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000 #> #> $run_time -#> [1] 0.02034521 +#> [1] 0.0196352 ``` ## Code of Conduct diff --git a/docs/404.html b/docs/404.html index 397fa0b..5c1ff30 100644 --- a/docs/404.html +++ b/docs/404.html @@ -24,7 +24,7 @@ ICIKendallTau - 0.3.21 + 1.0.1
set.seed(1234)
@@ -351,7 +350,7 @@ Running Manymatrix_1 = cbind(s1, s2, s3)
-r_1 = ici_kendalltau(t(matrix_1))
+r_1 = ici_kendalltau(matrix_1)
r_1$cor
#> s1 s2 s3
#> s1 1.0000000 0.8049209 0.9907488
@@ -367,7 +366,7 @@ Parallelism
library(furrr)
future::plan(multicore, workers = 4)
-r_2 = ici_kendalltau(t(matrix_1))
ici_kendalltau
.
-r_3 = ici_kendalltau(t(matrix_1), return_matrix = FALSE)
+r_3 = ici_kendalltau(matrix_1, return_matrix = FALSE)
r_3$cor
#> s1 s2 core raw pvalue taumax cor
#> 1 s1 s2 1 0.8049209 0 1.0000000 0.8049209
@@ -392,6 +391,24 @@ Many Many Comparisons#> 4 s1 s1 0 1.0000000 0 1.0000000 1.0000000
#> 5 s2 s2 0 1.0000000 0 1.0000000 1.0000000
#> 6 s3 s3 0 0.9850000 0 1.0000000 0.9850000
It is possible to log the steps being done and how much memory is +being used (on Linux at least) as correlations are calculated. This can +be useful when running very large sets of correlations and making sure +too much memory isn’t being used, for example.
+To enable logging, the {logger} package must be installed. If a
+log_file
is not supplied, one will be created with the
+current date and time.
+enable_logging()
+enable_logging("/tmp/my_ici_run.log")
By default, ici_kendalltau
also shows progress messages,
+if you want to turn them off, you can do:
+show_progress(FALSE)
Flight RM, Moseley HN (2024). ICIKendallTau: Calculates information-content-informed Kendall-tau. -R package version 0.3.21, https://moseleybioinformaticslab.github.io/ICIKendallTauhttps://github.com/moseleybioinformaticslab/ICIKendallTau. +R package version 1.0.1, https://moseleybioinformaticslab.github.io/ICIKendallTauhttps://github.com/moseleybioinformaticslab/ICIKendallTau.
@Manual{, title = {ICIKendallTau: Calculates information-content-informed Kendall-tau}, author = {Robert M Flight and Hunter NB Moseley}, year = {2024}, - note = {R package version 0.3.21}, + note = {R package version 1.0.1}, url = {https://moseleybioinformaticslab.github.io/ICIKendallTau https://github.com/moseleybioinformaticslab/ICIKendallTau}, }diff --git a/docs/index.html b/docs/index.html index 1adddc1..9e161dc 100644 --- a/docs/index.html +++ b/docs/index.html @@ -30,7 +30,7 @@ ICIKendallTau - 0.3.21 + 1.0.1
library(furrr)
future::plan(multicore, workers = 4)
-r_3 = ici_kendalltau(t(matrix_2))
+r_3 = ici_kendalltau(matrix_2)
In the case of hundreds of thousands of comparisons to be done, the result matrices can become very, very large, and require lots of memory for storage. They are also inefficient, as both the lower and upper triangular components are stored. An alternative storage format is as a data.frame
, where there is a single row for each comparison performed. This is actually how the results are stored internally, and then they are converted to a matrix form if requested (the default). To keep the data.frame
output, add the argument return_matrix=FALSE
to the call of ici_kendalltau
.
-r_4 = ici_kendalltau(t(matrix_2), return_matrix = FALSE)
+r_4 = ici_kendalltau(matrix_2, return_matrix = FALSE)
r_4
#> $cor
#> s1 s2 core raw pvalue taumax cor
@@ -231,7 +231,7 @@ Many Many Comparisons#> 3 s4 s4 0 1.0000000 0 1.000000 1.0000000
#>
#> $run_time
-#> [1] 0.02034521
NEWS.md
samples are rows, features are columns
samples are columns, features are rows
samples are rows, features are columns
samples are columns, features are rows
samples are rows, features are columns
samples are columns, features are rows