Skip to content

Commit

Permalink
Add LowCon and IES.
Browse files Browse the repository at this point in the history
  • Loading branch information
JieYinStat committed Mar 21, 2024
1 parent 98f987c commit af22019
Show file tree
Hide file tree
Showing 38 changed files with 933 additions and 42 deletions.
11 changes: 10 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,24 @@ RoxygenNote: 7.2.3
URL: https://github.com/JieYinStat/dbsubsampling, https://jieyinstat.github.io/dbsubsampling/
BugReports: https://github.com/JieYinStat/dbsubsampling/issues
Suggests:
bench,
copula,
dplyr,
gam,
ggplot2,
knitr,
mvtnorm,
parallel,
purrr,
RcppArmadillo,
rmarkdown,
testthat (>= 3.0.0),
tidyr
tidyr,
TruncatedNormal
Config/testthat/edition: 3
Imports:
lhs,
RANN,
Rcpp,
withr
Depends:
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
# Generated by roxygen2: do not edit by hand

export(IBOSS)
export(IES)
export(LowCon)
export(OSMAC)
export(OSS)
export(Unif)
export(c_IES_compare)
export(myArma_IBOSS)
export(myArma_OSS)
export(myR_IBOSS)
export(myR_OSS)
export(myRcpp_IBOSS)
export(myRcpp_cstyle_IBOSS)
export(r_IES)
export(r_IES_compare)
export(scale01)
export(scale_neg_pos_1)
export(subsampling)
importFrom(Rcpp,sourceCpp)
useDynLib(dbsubsampling, .registration = TRUE)
173 changes: 173 additions & 0 deletions R/IES.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------
##
## R main function `IES`. `RcppArmadillo`-Version.
##
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------

#' Scale each column of a matrix to \eqn{[0, 1]}
#'
#' Applies an independent min-max transformation to every column, so that
#' each column's minimum maps to 0 and its maximum maps to 1.
#' A constant column produces `NaN` (zero range in the denominator).
#'
#' @param X A data.frame or matrix.
#'
#' @return Scaled X.
#' @export
#'
#' @examples
#' X <- matrix(1:20, 5, 4)
#' scale01(X)
scale01 <- function(X) {
  apply(X, 2, function(v) {
    rng <- range(v)
    (v - rng[1]) / (rng[2] - rng[1])
  })
}

#' Independence-Encouraging Subsampling for Nonparametric Additive Models (IES, Proposed by Zhang et al. (2024))
#'
#' A subsampling method for nonparametric additive models based on
#' Orthogonal Arrays: columns are min-max scaled to \eqn{[0, 1]} and the
#' subsample is selected by the compiled `RcppArmadillo` core.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes. Default to 16.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @references Yi Zhang, Lin Wang, Xiaoke Zhang & HaiYing Wang (2024)
#' \emph{Independence-Encouraging Subsampling for Nonparametric Additive Models,
#' Journal of Computational and Graphical Statistics},
#' \url{https://www.tandfonline.com/doi/full/10.1080/10618600.2024.2326136}.
#'
#' @examples
#' data <- data_numeric_regression
#' X <- data[-which(names(data) == "y")]
#' IES(X, n = 100, q = 16, seed = NULL)
#' @export
IES <- function(X, n, q = 16, seed = NULL) {
  if (!is.null(seed)) {
    withr::local_seed(seed)
  }
  # Hand the C++ core a bare numeric matrix in [0, 1]: keep only `dim`.
  mat <- scale01(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]
  as.vector(armaIES(mat, n, q))
}


#' IES C++-Version for Benchmarking (R-Wrap Code)
#'
#' There is no randomness, all parts that need to be randomly selected are
#' selected first indexed.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#'
#' @export
c_IES_compare <- function(X, n, q) {
  # Bare [0, 1]-scaled matrix; keep only the `dim` attribute for the C++ core.
  mat <- scale01(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]
  as.vector(armaIES_compare(mat, n, q))
}

## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------
##
## IES R-Version.
##
## ---------------------------------------------------------------------------
## ---------------------------------------------------------------------------

# Bin-collision loss used by the R implementation of IES.
#
# For each remaining candidate row, counts in how many coordinates the
# candidate falls into the same of the q axis bins as the most recently
# selected row, and squares that count.
#
# Args:
#   candi: integer vector of candidate row indices still available.
#   last_index: row index of the point selected in the previous step.
#   X: numeric matrix with all columns scaled to [0, 1].
#   q: number of bins per axis.
# Returns: numeric vector of losses, one per element of `candi`.
r_Compute_IES_Loss <- function(candi, last_index, X, q) {
  x_last <- floor(X[last_index, ]*q)
  # drop = FALSE keeps a one-row matrix when length(candi) == 1, so apply()
  # still iterates over rows instead of erroring on a bare vector
  # (this case is reached whenever the subsample size approaches nrow(X)).
  loss <- apply(X[candi, , drop = FALSE], 1,
                function(.row) (sum(floor(.row*q) == x_last))^2)
  return(loss)
}

#' R Version of IES for Testing
#'
#' Pure-R reference implementation of Independence-Encouraging Subsampling:
#' starts from one uniformly drawn row, then repeatedly adds the candidate
#' with the smallest accumulated bin-collision loss (ties broken at random).
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @export
r_IES <- function(X, n, q, seed = NULL) {
  if (!is.null(seed)) withr::local_seed(seed)
  X <- scale01(as.matrix(X))
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  candi <- seq_len(N)

  # Initial point: chosen uniformly at random.
  index[1] <- sample(N, 1)
  candi <- candi[-index[1]]
  loss <- r_Compute_IES_Loss(candi, index[1], X, q)

  # Guard against n == 1: a bare `2:n` would iterate c(2, 1) and write past
  # the end of `index`.
  if (n >= 2) {
    for (i in 2:n) {
      # Election: among candidates with minimal loss, pick one at random.
      temp_vec <- which(loss == min(loss))
      temp <- temp_vec[sample(length(temp_vec), 1)]
      index[i] <- candi[temp]

      # Update: remove the chosen point and accumulate its collision loss.
      candi <- candi[-temp]
      loss <- loss[-temp]
      loss <- loss + r_Compute_IES_Loss(candi, index[i], X, q)
    }
  }

  return(index)
}

#' IES R-Version for Benchmarking
#'
#' Deterministic variant of `r_IES()` used for timing comparisons:
#' there is no randomness, all parts that need to be randomly selected are
#' selected first indexed.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#'
#' @return Subsample index.
#' @export
r_IES_compare <- function(X, n, q) {
  X <- scale01(as.matrix(X))
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  candi <- seq_len(N)

  # Initial point: always the first row (deterministic stand-in for sampling).
  index[1] <- 1
  candi <- candi[-1]
  loss <- r_Compute_IES_Loss(candi, index[1], X, q)

  # Guard against n == 1: a bare `2:n` would iterate c(2, 1) and write past
  # the end of `index`.
  if (n >= 2) {
    for (i in 2:n) {
      # Election: first candidate achieving the minimal loss
      # (equivalent to which(loss == min(loss))[1]).
      temp <- which.min(loss)
      index[i] <- candi[temp]

      # Update: remove the chosen point and accumulate its collision loss.
      candi <- candi[-temp]
      loss <- loss[-temp]
      loss <- loss + r_Compute_IES_Loss(candi, index[i], X, q)
    }
  }

  return(index)
}
60 changes: 60 additions & 0 deletions R/LowCon.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#' Transform data to \eqn{[-1,1]^p}
#'
#' Min-max scales every column to \eqn{[0, 1]} and then linearly maps that
#' unit range onto \eqn{[-1, 1]}.
#'
#' @param X A data.frame or matrix of explanatory variables.
#'
#' @return Scaled `X` to \eqn{[-1,1]^p}.
#' @export
#'
#' @examples
#' X <- matrix(rnorm(100), nrow = 20, ncol = 5)
#' scaled_X <- scale_neg_pos_1(X)
#' apply(scaled_X, 2, range)
scale_neg_pos_1 <- function(X) {
  apply(X, 2, function(v) {
    unit <- (v - min(v)) / (max(v) - min(v))
    2 * unit - 1
  })
}

# Rescale a Latin hypercube design onto the theta-trimmed range of X.
#
# Column j of `LHD_data` (values in [0, 1]) is mapped linearly onto the
# interval between the theta/100 and 1 - theta/100 empirical quantiles of
# column j of `X`.
trans_LHD_adapt_X <- function(LHD_data, X, theta) {
  probs <- c(theta / 100, 1 - theta / 100)
  space <- apply(X, 2, stats::quantile, probs = probs)
  width <- space[2, ] - space[1, ]
  for (j in seq_len(ncol(X))) {
    LHD_data[, j] <- LHD_data[, j] * width[j] + space[1, j]
  }
  return(LHD_data)
}

#' LowCon: A Design-based Subsampling Approach in a Misspecified Linear Model
#'
#' A subsampling method based on space-filling design for a misspecified
#' linear model, proposed by Meng et al. (2021): generate a Latin hypercube
#' design over the (theta-trimmed) range of the predictors and keep each
#' design point's nearest data point.
#'
#' @param X A data.frame or matrix of explanatory variables.
#' @param n Subsample size.
#' @param theta Percentage of data shrinkage. Default to 1.
#' @param seed Random seed for the sampling.
#'
#' @return Subsample index.
#' @references Cheng Meng, Rui Xie, Abhyuday Mandal, Xinlian Zhang, Wenxuan Zhong & Ping Ma (2021)
#' \emph{LowCon: A Design-based Subsampling Approach in a Misspecified Linear Model,
#' Journal of Computational and Graphical Statistics, 30:3, 694-708},
#' \url{https://www.tandfonline.com/doi/full/10.1080/10618600.2020.1844215}.
#'
#' @examples
#' data <- data_numeric_regression
#' X <- data[-which(names(data) == "y")]
#' LowCon(X, n = 100, theta = 1, seed = NULL)
#' @export
LowCon <- function(X, n, theta = 1, seed = NULL){
  if (!is.null(seed)) {
    withr::local_seed(seed)
  }

  # Bare numeric matrix scaled to [-1, 1]; keep only the `dim` attribute.
  mat <- scale_neg_pos_1(as.matrix(X))
  attributes(mat) <- attributes(mat)["dim"]

  # Latin hypercube design matched to the trimmed range of each predictor.
  design <- lhs::randomLHS(n, ncol(mat))
  design <- trans_LHD_adapt_X(design, mat, theta)

  # Subsample = nearest data point (kd-tree search) to each design point.
  nearest <- RANN::nn2(mat, design, k = 1, treetype = "kd")$nn.idx
  as.vector(nearest)
}


34 changes: 27 additions & 7 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Get subsample index of other column(except the first column) (IBOSS core code, `Rcpp-C++-style by Wang`)
#' Get subsample index of other column(except the first column) (IBOSS core code, `Rcpp`-C++-style by `Wang`)
#'
#' @param r Subsample size of the column.
#' @param z A numeric vector. the column.
Expand All @@ -11,7 +11,7 @@ getIdxR_cpp <- function(r, z, rdel) {
.Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel)
}

#' Get subsample index of the first column (IBOSS core code, Rcpp-C++-style by Wang)
#' Get subsample index of the first column (IBOSS core code, `Rcpp`-C++-style by `Wang`)
#'
#' @param r Subsample size of the first column.
#' @param z A numeric vector. the first column.
Expand All @@ -32,30 +32,50 @@ Get_Candi_Quan <- function(x, next_x, candi, lower, upper, r) {
.Call(`_dbsubsampling_Get_Candi_Quan`, x, next_x, candi, lower, upper, r)
}

#' IBOSS with `Rcpp-C++-style` by myself (`myRcpp_cstyle_IBOSS` core c++ code)
#' IBOSS with `Rcpp`-C++-style by `the package itself` (`myRcpp_cstyle_IBOSS` core c++ code)
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#' @return Subsample index.
rcpp_cstyle_IBOSS <- function(X, n) {
.Call(`_dbsubsampling_rcpp_cstyle_IBOSS`, X, n)
}

#' IBOSS with `Rcpp` (`myRcppIBOSS` core c++ code)
#' IBOSS with `Rcpp`-r-style by `the package itself` (`myRcppIBOSS` core c++ code)
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#' @return Subsample index.
rcppIBOSS <- function(X, n) {
.Call(`_dbsubsampling_rcppIBOSS`, X, n)
}

#' IBOSS with `RcppArmadillo` by `the package itself` (`myArma_IBOSS` core c++ code)
#'
#' Auto-generated thin wrapper (see the `compileAttributes()` header of this
#' file): forwards its arguments unchanged to the compiled routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @return Subsample index.
armarcppIBOSS <- function(X, n) {
  .Call(`_dbsubsampling_armarcppIBOSS`, X, n)
}

#' IES Core Code Using `RcppArmadillo`.
#'
#' Auto-generated thin wrapper: forwards its arguments unchanged to the
#' compiled `armaIES` routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @return Subsample index (flattened with `as.vector()` by the exported `IES()` caller).
armaIES <- function(X, n, q) {
  .Call(`_dbsubsampling_armaIES`, X, n, q)
}

#' IES C++-Version for Benchmarking (C++ Core Code)
#'
#' There is no randomness, all parts that need to be randomly selected are
#' selected first indexed. Auto-generated thin wrapper: forwards its
#' arguments unchanged to the compiled routine via `.Call()`.
#'
#' @param X A data.frame or matrix consists of explanatory variables.
#' @param n Subsample size.
#' @param q Hyperparameter of how to divide the axes.
#' @return Subsample index (flattened with `as.vector()` by `c_IES_compare()`).
armaIES_compare <- function(X, n, q) {
  .Call(`_dbsubsampling_armaIES_compare`, X, n, q)
}

#' Get L2 norm
#'
#' Get L2 norm of a matrix or data frame.
Expand Down Expand Up @@ -89,7 +109,7 @@ ComputeLoss <- function(candi, last_index, X, norm) {
.Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm)
}

#' Rcpp version OSS (`OSS` core code)
#' OSS `Rcpp`-version by `the package itself` (`OSS` core code)
#'
#' @param X A matrix.
#' @param n Subsample size.
Expand Down Expand Up @@ -131,7 +151,7 @@ armaComputeLoss <- function(X, xa, y, ya, tPow) {
.Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow)
}

#' OSS (RcppArmadillo-version, `myArma_OSS` core code)
#' OSS `RcppArmadillo`-version by `Zhu` (`myArma_OSS` core code)
#' @param x A matrix.
#' @param k Subsample size.
#' @param tPow The power of the loss function.
Expand Down

0 comments on commit af22019

Please sign in to comment.