Skip to content

Commit

Permalink
Add LowCon and IES.
Browse files Browse the repository at this point in the history
  • Loading branch information
JieYinStat committed Mar 21, 2024
1 parent 98f987c commit af22019
Show file tree
Hide file tree
Showing 38 changed files with 933 additions and 42 deletions.
11 changes: 10 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,24 @@ RoxygenNote: 7.2.3
URL: https://github.com/JieYinStat/dbsubsampling, https://jieyinstat.github.io/dbsubsampling/
BugReports: https://github.com/JieYinStat/dbsubsampling/issues
Suggests:
bench,
copula,
dplyr,
gam,
ggplot2,
knitr,
mvtnorm,
parallel,
purrr,
RcppArmadillo,
rmarkdown,
testthat (>= 3.0.0),
tidyr
tidyr,
TruncatedNormal
Config/testthat/edition: 3
Imports:
lhs,
RANN,
Rcpp,
withr
Depends:
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
# Generated by roxygen2: do not edit by hand

export(IBOSS)
export(IES)
export(LowCon)
export(OSMAC)
export(OSS)
export(Unif)
export(c_IES_compare)
export(myArma_IBOSS)
export(myArma_OSS)
export(myR_IBOSS)
export(myR_OSS)
export(myRcpp_IBOSS)
export(myRcpp_cstyle_IBOSS)
export(r_IES)
export(r_IES_compare)
export(scale01)
export(scale_neg_pos_1)
export(subsampling)
importFrom(Rcpp,sourceCpp)
useDynLib(dbsubsampling, .registration = TRUE)
173 changes: 173 additions & 0 deletions R/IES.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------
##
## R main function `IES`. `RcppArmadillo`-Version.
##
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------

#' Scale each column of a matrix to \eqn{[0, 1]}
#'
#' Applies an independent min-max transformation to every column, so that
#' each column's minimum maps to 0 and its maximum maps to 1.
#' A constant column produces `NaN` (zero range in the denominator).
#'
#' @param X A data.frame or matrix.
#'
#' @return Scaled X.
#' @export
#'
#' @examples
#' X <- matrix(1:20, 5, 4)
#' scale01(X)
scale01 <- function(X) {
  apply(X, 2, function(v) {
    rng <- range(v)
    (v - rng[1]) / (rng[2] - rng[1])
  })
}

#' Independence-Encouraging Subsampling for Nonparametric Additive Models (IES, Proposed by Zhang et al. (2024))
#'
#' A subsampling method for nonparametric additive models based on
#' Orthogonal Arrays: columns are min-max scaled to \eqn{[0, 1]} and the
#' subsample is selected by the compiled `RcppArmadillo` core.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes. Default to 16.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @references Yi Zhang, Lin Wang, Xiaoke Zhang & HaiYing Wang (2024)
#' \emph{Independence-Encouraging Subsampling for Nonparametric Additive Models,
#' Journal of Computational and Graphical Statistics},
#' \url{https://www.tandfonline.com/doi/full/10.1080/10618600.2024.2326136}.
#'
#' @examples
#' data <- data_numeric_regression
#' X <- data[-which(names(data) == "y")]
#' IES(X, n = 100, q = 16, seed = NULL)
#' @export
IES <- function(X, n, q = 16, seed = NULL) {
  if (!is.null(seed)) {
    withr::local_seed(seed)
  }
  # Hand the C++ core a bare numeric matrix in [0, 1]: keep only `dim`.
  mat <- scale01(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]
  as.vector(armaIES(mat, n, q))
}


#' IES C++-Version for Benchmarking (R-Wrap Code)
#'
#' There is no randomness, all parts that need to be randomly selected are
#' selected first indexed.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#'
#' @export
c_IES_compare <- function(X, n, q) {
  # Bare [0, 1]-scaled matrix; keep only the `dim` attribute for the C++ core.
  mat <- scale01(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]
  as.vector(armaIES_compare(mat, n, q))
}

## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------
##
## IES R-Version.
##
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------

# Bin-collision loss used by the R implementation of IES.
#
# For each remaining candidate row, counts in how many coordinates the
# candidate falls into the same of the q axis bins as the most recently
# selected row, and squares that count.
#
# Args:
#   candi: integer vector of candidate row indices still available.
#   last_index: row index of the point selected in the previous step.
#   X: numeric matrix with all columns scaled to [0, 1].
#   q: number of bins per axis.
# Returns: numeric vector of losses, one per element of `candi`.
r_Compute_IES_Loss <- function(candi, last_index, X, q) {
  x_last <- floor(X[last_index, ]*q)
  # drop = FALSE keeps a one-row matrix when length(candi) == 1, so apply()
  # still iterates over rows instead of erroring on a bare vector
  # (this case is reached whenever the subsample size approaches nrow(X)).
  loss <- apply(X[candi, , drop = FALSE], 1,
                function(.row) (sum(floor(.row*q) == x_last))^2)
  return(loss)
}

#' R Version of IES for Testing
#'
#' Pure-R reference implementation of Independence-Encouraging Subsampling:
#' starts from one uniformly drawn row, then repeatedly adds the candidate
#' with the smallest accumulated bin-collision loss (ties broken at random).
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @export
r_IES <- function(X, n, q, seed = NULL) {
  if (!is.null(seed)) withr::local_seed(seed)
  X <- scale01(as.matrix(X))
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  candi <- seq_len(N)

  # Initial point: chosen uniformly at random.
  index[1] <- sample(N, 1)
  candi <- candi[-index[1]]
  loss <- r_Compute_IES_Loss(candi, index[1], X, q)

  # Guard against n == 1: a bare `2:n` would iterate c(2, 1) and write past
  # the end of `index`.
  if (n >= 2) {
    for (i in 2:n) {
      # Election: among candidates with minimal loss, pick one at random.
      temp_vec <- which(loss == min(loss))
      temp <- temp_vec[sample(length(temp_vec), 1)]
      index[i] <- candi[temp]

      # Update: remove the chosen point and accumulate its collision loss.
      candi <- candi[-temp]
      loss <- loss[-temp]
      loss <- loss + r_Compute_IES_Loss(candi, index[i], X, q)
    }
  }

  return(index)
}

#' IES R-Version for Benchmarking
#'
#' Deterministic variant of `r_IES()` used for timing comparisons:
#' there is no randomness, all parts that need to be randomly selected are
#' selected first indexed.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#'
#' @return Subsample index.
#' @export
r_IES_compare <- function(X, n, q) {
  X <- scale01(as.matrix(X))
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  candi <- seq_len(N)

  # Initial point: always the first row (deterministic stand-in for sampling).
  index[1] <- 1
  candi <- candi[-1]
  loss <- r_Compute_IES_Loss(candi, index[1], X, q)

  # Guard against n == 1: a bare `2:n` would iterate c(2, 1) and write past
  # the end of `index`.
  if (n >= 2) {
    for (i in 2:n) {
      # Election: first candidate achieving the minimal loss
      # (equivalent to which(loss == min(loss))[1]).
      temp <- which.min(loss)
      index[i] <- candi[temp]

      # Update: remove the chosen point and accumulate its collision loss.
      candi <- candi[-temp]
      loss <- loss[-temp]
      loss <- loss + r_Compute_IES_Loss(candi, index[i], X, q)
    }
  }

  return(index)
}
60 changes: 60 additions & 0 deletions R/LowCon.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#' Transform data to \eqn{[-1,1]^p}
#'
#' Min-max scales every column to \eqn{[0, 1]} and then linearly maps that
#' unit range onto \eqn{[-1, 1]}.
#'
#' @param X A data.frame or matrix of explanatory variables.
#'
#' @return Scaled `X` to \eqn{[-1,1]^p}.
#' @export
#'
#' @examples
#' X <- matrix(rnorm(100), nrow = 20, ncol = 5)
#' scaled_X <- scale_neg_pos_1(X)
#' apply(scaled_X, 2, range)
scale_neg_pos_1 <- function(X) {
  apply(X, 2, function(v) {
    unit <- (v - min(v)) / (max(v) - min(v))
    2 * unit - 1
  })
}

# Rescale a Latin hypercube design onto the theta-trimmed range of X.
#
# Column j of `LHD_data` (values in [0, 1]) is mapped linearly onto the
# interval between the theta/100 and 1 - theta/100 empirical quantiles of
# column j of `X`.
trans_LHD_adapt_X <- function(LHD_data, X, theta) {
  probs <- c(theta / 100, 1 - theta / 100)
  space <- apply(X, 2, stats::quantile, probs = probs)
  width <- space[2, ] - space[1, ]
  for (j in seq_len(ncol(X))) {
    LHD_data[, j] <- LHD_data[, j] * width[j] + space[1, j]
  }
  return(LHD_data)
}

#' LowCon: A Design-based Subsampling Approach in a Misspecified Linear Model
#'
#' A subsampling method based on space-filling design for a misspecified
#' linear model, proposed by Meng et al. (2021): generate a Latin hypercube
#' design over the (theta-trimmed) range of the predictors and keep each
#' design point's nearest data point.
#'
#' @param X A data.frame or matrix of explanatory variables.
#' @param n Subsample size.
#' @param theta Percentage of data shrinkage. Default to 1.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @references Cheng Meng, Rui Xie, Abhyuday Mandal, Xinlian Zhang, Wenxuan Zhong & Ping Ma (2021)
#' \emph{LowCon: A Design-based Subsampling Approach in a Misspecified Linear Model,
#' Journal of Computational and Graphical Statistics, 30:3, 694-708},
#' \url{https://www.tandfonline.com/doi/full/10.1080/10618600.2020.1844215}.
#'
#' @examples
#' data <- data_numeric_regression
#' X <- data[-which(names(data) == "y")]
#' LowCon(X, n = 100, theta = 1, seed = NULL)
#' @export
LowCon <- function(X, n, theta = 1, seed = NULL){
  if (!is.null(seed)) {
    withr::local_seed(seed)
  }

  # Bare numeric matrix scaled to [-1, 1]; keep only the `dim` attribute.
  mat <- scale_neg_pos_1(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]

  # Latin hypercube design matched to the trimmed range of each predictor.
  design <- lhs::randomLHS(n, ncol(mat))
  design <- trans_LHD_adapt_X(design, mat, theta)

  # Subsample = nearest data point (kd-tree search) to each design point.
  nearest <- RANN::nn2(mat, design, k = 1, treetype = "kd")$nn.idx
  as.vector(nearest)
}


34 changes: 27 additions & 7 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Get subsample index of other column(except the first column) (IBOSS core code, `Rcpp-C++-style by Wang`)
#' Get subsample index of other column(except the first column) (IBOSS core code, `Rcpp`-C++-style by `Wang`)
#'
#' @param r Subsample size of the column.
#' @param z A numeric vector. the column.
Expand All @@ -11,7 +11,7 @@ getIdxR_cpp <- function(r, z, rdel) {
.Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel)
}

#' Get subsample index of the first column (IBOSS core code, Rcpp-C++-style by Wang)
#' Get subsample index of the first column (IBOSS core code, `Rcpp`-C++-style by `Wang`)
#'
#' @param r Subsample size of the first column.
#' @param z A numeric vector. the first column.
Expand All @@ -32,30 +32,50 @@ Get_Candi_Quan <- function(x, next_x, candi, lower, upper, r) {
.Call(`_dbsubsampling_Get_Candi_Quan`, x, next_x, candi, lower, upper, r)
}

#' IBOSS with `Rcpp-C++-style` by myself (`myRcpp_cstyle_IBOSS` core c++ code)
#' IBOSS with `Rcpp`-C++-style by `the package itself` (`myRcpp_cstyle_IBOSS` core c++ code)
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#' @return Subsample index.
rcpp_cstyle_IBOSS <- function(X, n) {
.Call(`_dbsubsampling_rcpp_cstyle_IBOSS`, X, n)
}

#' IBOSS with `Rcpp` (`myRcppIBOSS` core c++ code)
#' IBOSS with `Rcpp`-r-style by `the package itself` (`myRcppIBOSS` core c++ code)
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#' @return Subsample index.
rcppIBOSS <- function(X, n) {
.Call(`_dbsubsampling_rcppIBOSS`, X, n)
}

#' IBOSS with `RcppArmadillo` by `the package itself` (`myArma_IBOSS` core c++ code)
#'
#' Auto-generated thin wrapper (see the `compileAttributes()` header of this
#' file): forwards its arguments unchanged to the compiled routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @return Subsample index.
armarcppIBOSS <- function(X, n) {
  .Call(`_dbsubsampling_armarcppIBOSS`, X, n)
}

#' IES Core Code Using `RcppArmadillo`.
#'
#' Auto-generated thin wrapper: forwards its arguments unchanged to the
#' compiled `armaIES` routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @return Subsample index (flattened with `as.vector()` by the exported `IES()` caller).
armaIES <- function(X, n, q) {
  .Call(`_dbsubsampling_armaIES`, X, n, q)
}

#' IES C++-Version for Benchmarking (C++ Core Code)
#'
#' There is no randomness, all parts that need to be randomly selected are
#' selected first indexed. Auto-generated thin wrapper: forwards its
#' arguments unchanged to the compiled routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @return Subsample index (flattened with `as.vector()` by `c_IES_compare()`).
armaIES_compare <- function(X, n, q) {
  .Call(`_dbsubsampling_armaIES_compare`, X, n, q)
}

#' Get L2 norm
#'
#' Get L2 norm of a matrix or data frame.
Expand Down Expand Up @@ -89,7 +109,7 @@ ComputeLoss <- function(candi, last_index, X, norm) {
.Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm)
}

#' Rcpp version OSS (`OSS` core code)
#' OSS `Rcpp`-version by `the package itself` (`OSS` core code)
#'
#' @param X A matrix.
#' @param n Subsample size.
Expand Down Expand Up @@ -131,7 +151,7 @@ armaComputeLoss <- function(X, xa, y, ya, tPow) {
.Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow)
}

#' OSS (RcppArmadillo-version, `myArma_OSS` core code)
#' OSS `RcppArmadillo`-version by `Zhu` (`myArma_OSS` core code)
#' @param x A matrix.
#' @param k Subsample size.
#' @param tPow The power of the loss function.
Expand Down

0 comments on commit af22019

Please sign in to comment.