From 58908bc7262aaaf3719b77ad981829318748c7f6 Mon Sep 17 00:00:00 2001 From: JieYinStat Date: Fri, 16 Feb 2024 11:18:22 +0800 Subject: [PATCH] Add OSS --- DESCRIPTION | 4 +- NAMESPACE | 1 + R/IBOSS.R | 4 +- R/OSMAC.R | 6 +- R/OSS.R | 135 +++++++++++++++++++++++++++++++ R/RcppExports.R | 96 ++++++++++++++++++++++ R/Unif.R | 4 +- R/subsampling.R | 10 ++- README.Rmd | 8 +- README.md | 18 ++++- man/ComputeLoss.Rd | 23 ++++++ man/IBOSS.Rd | 4 +- man/L2norm.Rd | 17 ++++ man/OSS.Rd | 31 ++++++++ man/Unif.Rd | 4 +- man/armaComputeLoss.Rd | 25 ++++++ man/armaOSS.Rd | 21 +++++ man/armaScaleMatrix.Rd | 17 ++++ man/armabottom_k.Rd | 19 +++++ man/bottom_t_index.Rd | 19 +++++ man/getIdxR_cpp.Rd | 21 +++++ man/getIdx_cpp.Rd | 19 +++++ man/get_Logistic_MLE.Rd | 6 +- man/rComputeLoss.Rd | 25 ++++++ man/rL2norm.Rd | 17 ++++ man/rOSS.Rd | 19 +++++ man/rbottom_t_index.Rd | 19 +++++ man/rcppOSS.Rd | 19 +++++ man/subsampling.Rd | 6 +- src/IBOSS.cpp | 49 ++---------- src/Makevars | 2 + src/Makevars.win | 2 + src/OSS.cpp | 146 ++++++++++++++++++++++++++++++++++ src/RcppExports.cpp | 109 +++++++++++++++++++++++++ src/armaOSS.cpp | 132 ++++++++++++++++++++++++++++++ tests/testthat/test-OSS.R | 12 +++ tests/testthat/test-armaOSS.R | 11 +++ vignettes/Subsampling.Rmd | 20 +++-- 38 files changed, 1026 insertions(+), 74 deletions(-) create mode 100644 R/OSS.R create mode 100644 man/ComputeLoss.Rd create mode 100644 man/L2norm.Rd create mode 100644 man/OSS.Rd create mode 100644 man/armaComputeLoss.Rd create mode 100644 man/armaOSS.Rd create mode 100644 man/armaScaleMatrix.Rd create mode 100644 man/armabottom_k.Rd create mode 100644 man/bottom_t_index.Rd create mode 100644 man/getIdxR_cpp.Rd create mode 100644 man/getIdx_cpp.Rd create mode 100644 man/rComputeLoss.Rd create mode 100644 man/rL2norm.Rd create mode 100644 man/rOSS.Rd create mode 100644 man/rbottom_t_index.Rd create mode 100644 man/rcppOSS.Rd create mode 100644 src/Makevars create mode 100644 src/Makevars.win create mode 100644 
src/OSS.cpp create mode 100644 src/armaOSS.cpp create mode 100644 tests/testthat/test-OSS.R create mode 100644 tests/testthat/test-armaOSS.R diff --git a/DESCRIPTION b/DESCRIPTION index ac31561..56a78a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,7 @@ BugReports: https://github.com/JieYinStat/dbsubsampling/issues Suggests: knitr, mvtnorm, + RcppArmadillo, rmarkdown, testthat (>= 3.0.0) Config/testthat/edition: 3 @@ -26,4 +27,5 @@ Depends: LazyData: true VignetteBuilder: knitr LinkingTo: - Rcpp + Rcpp, + RcppArmadillo diff --git a/NAMESPACE b/NAMESPACE index e82fb7d..9c53540 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(IBOSS) export(OSMAC) +export(OSS) export(Unif) export(subsampling) importFrom(Rcpp,sourceCpp) diff --git a/R/IBOSS.R b/R/IBOSS.R index e0a373c..33354bc 100644 --- a/R/IBOSS.R +++ b/R/IBOSS.R @@ -3,10 +3,10 @@ #' A subsampling method based on D-optiaml criterion inspired by optimal experimental design #' used for linear regression. #' -#' @param n subsample size. +#' @param n Subsample size. #' @param X A data.frame or matrix consists of explanatory variables. #' -#' @return subsample index. +#' @return Subsample index. #' @references HaiYing Wang, Min Yang & John Stufken (2019) #' \emph{Information-Based Optimal Subdata Selection for Big Data Linear Regression, #' Journal of the American Statistical Association, 114:525, 393-405}, diff --git a/R/OSMAC.R b/R/OSMAC.R index 81bc212..357a55b 100644 --- a/R/OSMAC.R +++ b/R/OSMAC.R @@ -7,9 +7,9 @@ #' @param w A numeric vector. The weight of each sample. #' #' @return A list. -#' * `par` : parameter estimation. -#' * `message` : message during iteration. -#' * `iter` : iteration times. +#' * `par` : Parameter estimation. +#' * `message` : Message during iteration. +#' * `iter` : Iteration times. 
get_Logistic_MLE <- function(x, y, w) {
   d <- ncol(x)
   beta <- rep(0, d)
diff --git a/R/OSS.R b/R/OSS.R
new file mode 100644
index 0000000..c564d5b
--- /dev/null
+++ b/R/OSS.R
@@ -0,0 +1,135 @@
+#' Orthogonal subsampling for big data linear regression(OSS)
+#'
+#' A subsampling method based on orthogonal array for linear model.
+#'
+#' @param n Subsample size.
+#' @param X A matrix or data frame.
+#'
+#' @return Subsample index.
+#'
+#' @examples
+#' data_numeric_regression["y"] <- NULL
+#' X <- as.matrix(data_numeric_regression)
+#' OSS(100, X)
+#'
+#' @references Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021)
+#' \emph{Orthogonal subsampling for big data linear regression,
+#' The Annals of Applied Statistics, 15(3), 1273-1290},
+#' \url{https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink}.
+#'
+#' @export
+OSS <- function(n, X) {
+  # Standardise the columns (need scale), then drop every attribute except
+  # `dim` so that a bare numeric matrix is handed to `rcppOSS()`.
+  X <- scale(as.matrix(X))
+  attributes(X) <- attributes(X)["dim"]
+  rcppOSS(X = X, n = n)
+}
+
+#' Get L2 norm (r-version)
+#'
+#' Get L2 norm of a matrix or data frame.
+#' @param X A matrix or data.frame.
+#'
+#' @return L2 norm of `X`(every row).
+#'
+# @examples
+# X <- matrix(1:12, 4, 3)
+# X <- scale(X)
+# rL2norm(X)
+rL2norm <- function(X) {
+  # Row-wise sum of squares, i.e. the squared Euclidean norm (no square root).
+  rowSums(X^2)
+}
+
+#' Compute loss function for OSS (r-version)
+#'
+#' @param candi The index of the candidate set.
+#' @param last_index The index of the seleted point in last iteration.
+#' @param X The whole data.
+#' @param norm Norm of the whole data.
+#' @param p Numbers of columns of the data.
+#'
+#' @return Loss of every point in candidate set.
+#'
+# @examples
+# X <- matrix(1:20, 5, 4)
+# X <- scale(X)
+# norm <- rL2norm(X)
+# rComputeLoss(c(1,3,4), 2, X, norm)
+rComputeLoss <- function(candi, last_index, X, norm, p = ncol(X)) {
+  # `drop = FALSE` keeps a single remaining candidate as a 1 x p matrix.
+  candi_sign <- sign(X[candi, , drop = FALSE])
+  last_sign <- sign(X[last_index, ])
+  # delta[k]: number of columns in which candidate k has the same sign as the
+  # last selected point (`rep(each = )` lines the signs up column by column).
+  delta <- rowSums(candi_sign == rep(last_sign, each = nrow(candi_sign)))
+  (p - norm[candi] / 2 - norm[last_index] / 2 + delta)^2
+}
+
+#' Find t smallest index of a vector.
+#'
+#' @param loss A vector.
+#' @param t A int
+#'
+#' @return The index of the t smallest element of the vector.
+#'
+# @examples
+# loss <- rnorm(10)
+# rbottom_t_index(loss, 3)
+rbottom_t_index <- function(loss, t) {
+  # `t` may be fractional (see `rOSS`), so truncate it and cap it at the
+  # vector length. `order()` yields exactly `t` positions even with ties;
+  # `sort()` returns them in increasing position order.
+  t <- min(floor(t), length(loss))
+  sort(order(loss)[seq_len(t)])
+}
+
+
+#' OSS (r-version)
+#'
+#' @param n Subsample size.
+#' @param X A matrix.
+#'
+#' @return Subsample index.
+#'
+# @examples
+# data_numeric_regression["y"] <- NULL
+# X <- as.matrix(data_numeric_regression)
+# rOSS(100, X)
+rOSS <- function(n, X) {
+  X <- scale(as.matrix(X))
+  attributes(X) <- attributes(X)["dim"]
+  N <- nrow(X)
+  stopifnot(n >= 1, n <= N)
+
+  norm <- rL2norm(X)
+  r <- log(N) / log(n)
+
+  # Initial: start from the point with the largest norm.
+  index <- numeric(n)
+  index[1] <- which.max(norm)
+  candi <- seq_len(N)[-index[1]]
+  loss <- rComputeLoss(candi, index[1], X, norm)
+
+  for (i in seq_len(n)[-1]) {
+    # Election: take the candidate with the smallest accumulated loss.
+    tmp <- which.min(loss)
+    index[i] <- candi[tmp]
+    candi <- candi[-tmp]
+    loss <- loss[-tmp]
+
+    # Elimination: keep at most `t` candidates with the smallest loss.
+    t <- if (N > n^2) N / i else N / i^(r - 1)
+    if (length(candi) > t) {
+      keep <- rbottom_t_index(loss, t)
+      candi <- candi[keep]
+      loss <- loss[keep]
+    }
+
+    # Update loss
+    loss <- loss + rComputeLoss(candi, index[i], X, norm)
+  }
+
+  index
+}
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 803384c..e84eb44 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,11 +1,107 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+#' Get subsample index of
other column(except the first column) (IBOSS) +#' +#' @param r Subsample size of the column. +#' @param z A numeric vector. the column. +#' @param rdel Subsample index of the first column. +#' @return Subsample index of the column. getIdxR_cpp <- function(r, z, rdel) { .Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel) } +#' Get subsample index of the first column(IBOSS) +#' +#' @param r Subsample size of the first column. +#' @param z A numeric vector. the first column. +#' @return Subsample index of the first column. getIdx_cpp <- function(r, z) { .Call(`_dbsubsampling_getIdx_cpp`, r, z) } +#' Get L2 norm +#' +#' Get L2 norm of a matrix or data frame. +#' +#' @param X A matrix or data.frame. +#' +#' @return L2 norm of `X`(every row). +L2norm <- function(X) { + .Call(`_dbsubsampling_L2norm`, X) +} + +#' Find t smallest index of a vector +#' +#' @param loss A vector. +#' @param t A int. +#' +#' @return The index of the t smallest element of the vector. +bottom_t_index <- function(loss, t) { + .Call(`_dbsubsampling_bottom_t_index`, loss, t) +} + +#' Compute loss function for OSS +#' +#' @param candi The index of the candidate set. +#' @param last_index The index of the seleted point in last iteration. +#' @param X The whole data. +#' @param norm Norm of the whole data. +#' +#' @return Loss of every point in candidate set. +ComputeLoss <- function(candi, last_index, X, norm) { + .Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm) +} + +#' Rcpp version OSS (core code of `OSS`) +#' +#' @param X A matrix. +#' @param n Subsample size. +#' +#' @return Subsample index. +rcppOSS <- function(X, n) { + .Call(`_dbsubsampling_rcppOSS`, X, n) +} + +#' Find t smallest index of a vector (RcppArmadillo-version) +#' +#' @param x A vector. +#' @param k A int. +#' +#' @return The index of the t smallest element of the vector. 
+armabottom_k <- function(x, k) { + .Call(`_dbsubsampling_armabottom_k`, x, k) +} + +#' Scale a matrix (RcppArmadillo-version) +#' +#' @param X A matrix. +#' +#' @return Scaled matrix. +armaScaleMatrix <- function(X) { + .Call(`_dbsubsampling_armaScaleMatrix`, X) +} + +#' Compute loss function for OSS (RcppArmadillo-version) +#' +#' @param X Matrix of the candidate set. +#' @param xa Norm of the candidate set. +#' @param y A vector. The point which be selected last iteration. +#' @param ya Norm of `y`. +#' @param tPow The power of the loss function. +#' +#' @return Loss of the candidate set. +armaComputeLoss <- function(X, xa, y, ya, tPow) { + .Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow) +} + +#' OSS (RcppArmadillo-version) +#' @param x A matrix. +#' @param k Subsample size. +#' @param tPow The power of the loss function. +#' +#' @return Subsample index. +armaOSS <- function(x, k, tPow = 2) { + .Call(`_dbsubsampling_armaOSS`, x, k, tPow) +} + diff --git a/R/Unif.R b/R/Unif.R index 9edae03..b61403d 100644 --- a/R/Unif.R +++ b/R/Unif.R @@ -5,8 +5,8 @@ #' @param N Total sample size. #' @param n Subsample size. #' @param replace A boolean. -#' * `TRUE` (the default): sampling with replace. -#' * `FALSE`: sampling without replace +#' * `TRUE` (the default): Sampling with replace. +#' * `FALSE`: Sampling without replace #' @param seed Random seed which is an integer (default NULL). This random seed is only valid for this sampling and #' will not affect the external environment #' diff --git a/R/subsampling.R b/R/subsampling.R index 95bb433..77f2b98 100644 --- a/R/subsampling.R +++ b/R/subsampling.R @@ -13,9 +13,10 @@ #' * `OSMAC_A`: A subsampling method based on A-optimal for logistic regression proposed by Wang et.al. (2018). #' * `OSMAC_L`: A subsampling method based on L-optimal for logistic regression proposed by Wang et.al. (2018). #' * `IBOSS`: A subsampling method based on D-optimal for linear regression proposed by Wang et.al. (2019). 
+#' * `OSS` : A subsampling method based on Orthogonal Array proposed by Wang et.al.(2021). #' @param replace A boolean. -#' * `TRUE` (the default): sampling with replace. -#' * `FALSE`: sampling without replace +#' * `TRUE` (the default): Sampling with replace. +#' * `FALSE`: Sampling without replace #' @param seed_1 Random seed for the first stage sampling or Unif. #' @param seed_2 Random seed for the second stage sampling. #' @param na_method Method to handle NA. @@ -33,6 +34,7 @@ #' #' data_numeric <- data_numeric_regression #' subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS") +#' subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS") subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method = "Unif", replace = TRUE, seed_1 = NULL, seed_2 = NULL, na_method = NULL) { @@ -47,10 +49,10 @@ subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method = Unif = Unif(N = N, n = n, seed = seed_1, replace = TRUE), IBOSS = IBOSS(n = n, X = x), OSMAC_A = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mmse", seed_1 = seed_1, seed_2 = seed_2), - OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2) + OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2), + OSS = OSS(n = n, X = x) # Support = # Lowcon = - # OSS = # DDS = ) return(subsample_index) diff --git a/README.Rmd b/README.Rmd index 389b0b0..1a6fde5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -32,7 +32,8 @@ devtools::install_github("JieYinStat/dbsubsampling") ## Example -This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC and IBOSS: +This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC, IBOSS and +OSS: ```{r example} library(dbsubsampling) @@ -52,7 +53,10 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O # IBOSS 
data_numeric <- data_numeric_regression -subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS") +subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS") + +# OSS +subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS") ``` You can get more detailed examples from the article column on the [website](jieyinstat.github.io/dbsubsampling/). diff --git a/README.md b/README.md index f357838..bd81dba 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ devtools::install_github("JieYinStat/dbsubsampling") ## Example This is a basic example which shows you how to get subsample index, such -as uniform sampling, OSMAC and IBOSS: +as uniform sampling, OSMAC, IBOSS and OSS: ``` r library(dbsubsampling) @@ -47,9 +47,19 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O # IBOSS data_numeric <- data_numeric_regression -subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS") -#> [1] 419 1144 3395 3484 3896 5121 6203 7915 7967 8026 8156 8694 8841 9117 8438 -#> [16] 3121 +subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS") +#> [1] 183 226 395 419 584 666 711 758 1027 1144 1324 1445 1940 1946 1978 +#> [16] 2018 2673 2982 3190 3395 3484 3612 3632 3638 3696 3816 3835 3896 3921 4256 +#> [31] 4312 4405 4523 4551 4729 4938 5121 5226 5342 5410 5679 5770 5995 6089 6163 +#> [46] 6170 6203 6250 6525 6964 6979 7053 7198 7407 7564 7633 7915 7935 7967 7992 +#> [61] 8026 8088 8106 8156 8161 8267 8306 8501 8503 8521 8534 8694 8805 8841 9117 +#> [76] 9211 9302 9364 9398 9456 9676 9946 9971 9989 1173 2344 5394 8438 8567 9239 +#> [91] 1787 2104 2215 3121 7159 9133 + +# OSS +subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS") +#> [1] 8841 8961 1902 7512 48 9867 6547 9784 3392 3622 5780 6594 1890 1850 8335 +#> [16] 1254 6204 1257 4611 3831 4782 4919 1579 3404 718 7189 2060 4899 590 1800 ``` You can get more detailed examples from the article column on the diff 
--git a/man/ComputeLoss.Rd b/man/ComputeLoss.Rd new file mode 100644 index 0000000..558e5c5 --- /dev/null +++ b/man/ComputeLoss.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{ComputeLoss} +\alias{ComputeLoss} +\title{Compute loss function for OSS} +\usage{ +ComputeLoss(candi, last_index, X, norm) +} +\arguments{ +\item{candi}{The index of the candidate set.} + +\item{last_index}{The index of the seleted point in last iteration.} + +\item{X}{The whole data.} + +\item{norm}{Norm of the whole data.} +} +\value{ +Loss of every point in candidate set. +} +\description{ +Compute loss function for OSS +} diff --git a/man/IBOSS.Rd b/man/IBOSS.Rd index c33af84..791c4fe 100644 --- a/man/IBOSS.Rd +++ b/man/IBOSS.Rd @@ -7,12 +7,12 @@ IBOSS(n, X) } \arguments{ -\item{n}{subsample size.} +\item{n}{Subsample size.} \item{X}{A data.frame or matrix consists of explanatory variables.} } \value{ -subsample index. +Subsample index. } \description{ A subsampling method based on D-optiaml criterion inspired by optimal experimental design diff --git a/man/L2norm.Rd b/man/L2norm.Rd new file mode 100644 index 0000000..e7f2000 --- /dev/null +++ b/man/L2norm.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{L2norm} +\alias{L2norm} +\title{Get L2 norm} +\usage{ +L2norm(X) +} +\arguments{ +\item{X}{A matrix or data.frame.} +} +\value{ +L2 norm of \code{X}(every row). +} +\description{ +Get L2 norm of a matrix or data frame. 
+} diff --git a/man/OSS.Rd b/man/OSS.Rd new file mode 100644 index 0000000..f8eefaf --- /dev/null +++ b/man/OSS.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OSS.R +\name{OSS} +\alias{OSS} +\title{Orthogonal subsampling for big data linear regression(OSS)} +\usage{ +OSS(n, X) +} +\arguments{ +\item{n}{Subsample size.} + +\item{X}{A matrix or data frame.} +} +\value{ +Subsample index. +} +\description{ +A subsampling method based on orthogonal array for linear model. +} +\examples{ +data_numeric_regression["y"] <- NULL +X <- as.matrix(data_numeric_regression) +OSS(100, X) + +} +\references{ +Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021) +\emph{Orthogonal subsampling for big data linear regression, +The Annals of Applied Statistics, 15(3), 1273-1290}, +\url{https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink}. +} diff --git a/man/Unif.Rd b/man/Unif.Rd index b6270d4..9d31f08 100644 --- a/man/Unif.Rd +++ b/man/Unif.Rd @@ -16,8 +16,8 @@ will not affect the external environment} \item{replace}{A boolean. \itemize{ -\item \code{TRUE} (the default): sampling with replace. -\item \code{FALSE}: sampling without replace +\item \code{TRUE} (the default): Sampling with replace. +\item \code{FALSE}: Sampling without replace }} } \value{ diff --git a/man/armaComputeLoss.Rd b/man/armaComputeLoss.Rd new file mode 100644 index 0000000..e434330 --- /dev/null +++ b/man/armaComputeLoss.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{armaComputeLoss} +\alias{armaComputeLoss} +\title{Compute loss function for OSS (RcppArmadillo-version)} +\usage{ +armaComputeLoss(X, xa, y, ya, tPow) +} +\arguments{ +\item{X}{Matrix of the candidate set.} + +\item{xa}{Norm of the candidate set.} + +\item{y}{A vector. 
The point which be selected last iteration.} + +\item{ya}{Norm of \code{y}.} + +\item{tPow}{The power of the loss function.} +} +\value{ +Loss of the candidate set. +} +\description{ +Compute loss function for OSS (RcppArmadillo-version) +} diff --git a/man/armaOSS.Rd b/man/armaOSS.Rd new file mode 100644 index 0000000..998cc3f --- /dev/null +++ b/man/armaOSS.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{armaOSS} +\alias{armaOSS} +\title{OSS (RcppArmadillo-version)} +\usage{ +armaOSS(x, k, tPow = 2) +} +\arguments{ +\item{x}{A matrix.} + +\item{k}{Subsample size.} + +\item{tPow}{The power of the loss function.} +} +\value{ +Subsample index. +} +\description{ +OSS (RcppArmadillo-version) +} diff --git a/man/armaScaleMatrix.Rd b/man/armaScaleMatrix.Rd new file mode 100644 index 0000000..884579a --- /dev/null +++ b/man/armaScaleMatrix.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{armaScaleMatrix} +\alias{armaScaleMatrix} +\title{Scale a matrix (RcppArmadillo-version)} +\usage{ +armaScaleMatrix(X) +} +\arguments{ +\item{X}{A matrix.} +} +\value{ +Scaled matrix. +} +\description{ +Scale a matrix (RcppArmadillo-version) +} diff --git a/man/armabottom_k.Rd b/man/armabottom_k.Rd new file mode 100644 index 0000000..b76ac53 --- /dev/null +++ b/man/armabottom_k.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{armabottom_k} +\alias{armabottom_k} +\title{Find t smallest index of a vector (RcppArmadillo-version)} +\usage{ +armabottom_k(x, k) +} +\arguments{ +\item{x}{A vector.} + +\item{k}{A int.} +} +\value{ +The index of the t smallest element of the vector. 
+} +\description{ +Find t smallest index of a vector (RcppArmadillo-version) +} diff --git a/man/bottom_t_index.Rd b/man/bottom_t_index.Rd new file mode 100644 index 0000000..24f84fc --- /dev/null +++ b/man/bottom_t_index.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{bottom_t_index} +\alias{bottom_t_index} +\title{Find t smallest index of a vector} +\usage{ +bottom_t_index(loss, t) +} +\arguments{ +\item{loss}{A vector.} + +\item{t}{A int.} +} +\value{ +The index of the t smallest element of the vector. +} +\description{ +Find t smallest index of a vector +} diff --git a/man/getIdxR_cpp.Rd b/man/getIdxR_cpp.Rd new file mode 100644 index 0000000..6ad8f47 --- /dev/null +++ b/man/getIdxR_cpp.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{getIdxR_cpp} +\alias{getIdxR_cpp} +\title{Get subsample index of other column(except the first column) (IBOSS)} +\usage{ +getIdxR_cpp(r, z, rdel) +} +\arguments{ +\item{r}{Subsample size of the column.} + +\item{z}{A numeric vector. the column.} + +\item{rdel}{Subsample index of the first column.} +} +\value{ +Subsample index of the column. +} +\description{ +Get subsample index of other column(except the first column) (IBOSS) +} diff --git a/man/getIdx_cpp.Rd b/man/getIdx_cpp.Rd new file mode 100644 index 0000000..79ade97 --- /dev/null +++ b/man/getIdx_cpp.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{getIdx_cpp} +\alias{getIdx_cpp} +\title{Get subsample index of the first column(IBOSS)} +\usage{ +getIdx_cpp(r, z) +} +\arguments{ +\item{r}{Subsample size of the first column.} + +\item{z}{A numeric vector. the first column.} +} +\value{ +Subsample index of the first column. 
+} +\description{ +Get subsample index of the first column(IBOSS) +} diff --git a/man/get_Logistic_MLE.Rd b/man/get_Logistic_MLE.Rd index 64dc541..d5e7faa 100644 --- a/man/get_Logistic_MLE.Rd +++ b/man/get_Logistic_MLE.Rd @@ -16,9 +16,9 @@ get_Logistic_MLE(x, y, w) \value{ A list. \itemize{ -\item \code{par} : parameter estimation. -\item \code{message} : message during iteration. -\item \code{iter} : iteration times. +\item \code{par} : Parameter estimation. +\item \code{message} : Message during iteration. +\item \code{iter} : Iteration times. } } \description{ diff --git a/man/rComputeLoss.Rd b/man/rComputeLoss.Rd new file mode 100644 index 0000000..ea6ecf7 --- /dev/null +++ b/man/rComputeLoss.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OSS.R +\name{rComputeLoss} +\alias{rComputeLoss} +\title{Compute loss function for OSS (r-version)} +\usage{ +rComputeLoss(candi, last_index, X, norm, p = ncol(X)) +} +\arguments{ +\item{candi}{The index of the candidate set.} + +\item{last_index}{The index of the seleted point in last iteration.} + +\item{X}{The whole data.} + +\item{norm}{Norm of the whole data.} + +\item{p}{Numbers of columns of the data.} +} +\value{ +Loss of every point in candidate set. +} +\description{ +Compute loss function for OSS (r-version) +} diff --git a/man/rL2norm.Rd b/man/rL2norm.Rd new file mode 100644 index 0000000..a8afc98 --- /dev/null +++ b/man/rL2norm.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OSS.R +\name{rL2norm} +\alias{rL2norm} +\title{Get L2 norm (r-version)} +\usage{ +rL2norm(X) +} +\arguments{ +\item{X}{A matrix or data.frame.} +} +\value{ +L2 norm of \code{X}(every row). +} +\description{ +Get L2 norm of a matrix or data frame. 
+} diff --git a/man/rOSS.Rd b/man/rOSS.Rd new file mode 100644 index 0000000..b0e7db0 --- /dev/null +++ b/man/rOSS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OSS.R +\name{rOSS} +\alias{rOSS} +\title{OSS (r-version)} +\usage{ +rOSS(n, X) +} +\arguments{ +\item{n}{Subsample size.} + +\item{X}{A matrix.} +} +\value{ +Subsample index. +} +\description{ +OSS (r-version) +} diff --git a/man/rbottom_t_index.Rd b/man/rbottom_t_index.Rd new file mode 100644 index 0000000..1012482 --- /dev/null +++ b/man/rbottom_t_index.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OSS.R +\name{rbottom_t_index} +\alias{rbottom_t_index} +\title{Find t smallest index of a vector.} +\usage{ +rbottom_t_index(loss, t) +} +\arguments{ +\item{loss}{A vector.} + +\item{t}{A int} +} +\value{ +The index of the t smallest element of the vector. +} +\description{ +Find t smallest index of a vector. +} diff --git a/man/rcppOSS.Rd b/man/rcppOSS.Rd new file mode 100644 index 0000000..474f1e5 --- /dev/null +++ b/man/rcppOSS.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{rcppOSS} +\alias{rcppOSS} +\title{Rcpp version OSS (core code of \code{OSS})} +\usage{ +rcppOSS(X, n) +} +\arguments{ +\item{X}{A matrix.} + +\item{n}{Subsample size.} +} +\value{ +Subsample index. +} +\description{ +Rcpp version OSS (core code of \code{OSS}) +} diff --git a/man/subsampling.Rd b/man/subsampling.Rd index ace558e..8e2be3d 100644 --- a/man/subsampling.Rd +++ b/man/subsampling.Rd @@ -35,12 +35,13 @@ Default to all variables except the response variable} \item \code{OSMAC_A}: A subsampling method based on A-optimal for logistic regression proposed by Wang et.al. (2018). \item \code{OSMAC_L}: A subsampling method based on L-optimal for logistic regression proposed by Wang et.al. (2018). 
\item \code{IBOSS}: A subsampling method based on D-optimal for linear regression proposed by Wang et.al. (2019). +\item \code{OSS} : A subsampling method based on Orthogonal Array proposed by Wang et.al.(2021). }} \item{replace}{A boolean. \itemize{ -\item \code{TRUE} (the default): sampling with replace. -\item \code{FALSE}: sampling without replace +\item \code{TRUE} (the default): Sampling with replace. +\item \code{FALSE}: Sampling without replace }} \item{seed_1}{Random seed for the first stage sampling or Unif.} @@ -65,4 +66,5 @@ subsampling(y_name = "y", data = data_binary, n = 30, pilot_n = 100, method = "O data_numeric <- data_numeric_regression subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS") +subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS") } diff --git a/src/IBOSS.cpp b/src/IBOSS.cpp index c7e7c38..cb9e8f6 100644 --- a/src/IBOSS.cpp +++ b/src/IBOSS.cpp @@ -6,12 +6,10 @@ using namespace Rcpp; //' Get subsample index of other column(except the first column) (IBOSS) //' -//' @param r subsample size of the column. +//' @param r Subsample size of the column. //' @param z A numeric vector. the column. -//' @param rdel subsample index of the first column. -//' @return subsample index of the column. -//' @export -// [[Rcpp::plugins("cpp99")]] +//' @param rdel Subsample index of the first column. +//' @return Subsample index of the column. 
// [[Rcpp::export]] IntegerVector getIdxR_cpp(int r, NumericVector z, IntegerVector rdel) { int m = rdel.size(), n = z.size(); @@ -46,27 +44,7 @@ IntegerVector getIdxR_cpp(int r, NumericVector z, IntegerVector rdel) { std::nth_element(y, y + r - 1, y + n - m); double yru = -y[r-1]; delete [] y; - // /********************************************/ - // // This code use twice memory - // double* yl = new double [n-m]; - // double* yu = new double [n-m]; - // int j = 0, k=0, kl = 0, ku = 0; - // for ( int i = 0; i < n; i++) { - // if ( j >= m) { - // yl[kl++] = z[i]; - // yu[ku++] = -z[i]; - // } - // else if ( del[j] != i + 1) { - // yl[kl++] = z[i]; - // yu[ku++] = -z[i]; - // } - // else - // j++; - // } - // std::nth_element(yl, yl + r - 1, yl + n - m); - // std::nth_element(yu, yu + r - 1, yu + n - m); - // double yrl = yl[r-1], yru = -yu[r-1]; - // /********************************************/ + int jl = 0, ju = 0; std::vector locl(r); std::vector locu(r); @@ -100,11 +78,9 @@ IntegerVector getIdxR_cpp(int r, NumericVector z, IntegerVector rdel) { //' Get subsample index of the first column(IBOSS) //' -//' @param r subsample size of the first column. +//' @param r Subsample size of the first column. //' @param z A numeric vector. the first column. -//' @return subsample index of the first column. -//' @export - // [[Rcpp::plugins("cpp99")]] +//' @return Subsample index of the first column. 
// [[Rcpp::export]] IntegerVector getIdx_cpp(int r, NumericVector z) { int n = z.size(); @@ -119,18 +95,7 @@ IntegerVector getIdx_cpp(int r, NumericVector z) { std::nth_element(y, y + r - 1, y + n); double yru = -y[r-1]; delete [] y; - // /********************************************/ - // // This code use twice memory - // double* yl = new double [n]; - // double* yu = new double [n]; - // for ( int i = 0; i < n; i++) { - // yl[i] = z[i]; - // yu[i] = -z[i]; - // } - // std::nth_element(yl, yl + r - 1, yl + n); - // std::nth_element(yu, yu + r - 1, yu + n); - // double yrl = yl[r-1], yru = -yu[r-1]; - // /*******************************************/ + int jl = 0, ju = 0; std::vector locl(r); std::vector locu(r); diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..3a7f8ac --- /dev/null +++ b/src/Makevars @@ -0,0 +1,2 @@ +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win new file mode 100644 index 0000000..3a7f8ac --- /dev/null +++ b/src/Makevars.win @@ -0,0 +1,2 @@ +PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/OSS.cpp b/src/OSS.cpp new file mode 100644 index 0000000..3f695b4 --- /dev/null +++ b/src/OSS.cpp @@ -0,0 +1,146 @@ +#include +using namespace Rcpp; + +// Scale a matrix +// +// @param X A matrix. +// +// @return Scaled matrix. +// +// NumericMatrix ScaleMatrix(NumericMatrix X){ +// int p = X.cols(); +// for(int j = 0; j < p; j++){ +// X(_,j) = (X(_,j) - mean(X(_,j))) / sd(X(_,j)); +// } +// return X; +// } + +//' Get L2 norm +//' +//' Get L2 norm of a matrix or data frame. +//' +//' @param X A matrix or data.frame. +//' +//' @return L2 norm of `X`(every row). 
+// [[Rcpp::export]] +NumericVector L2norm(NumericMatrix X){ + int N = X.rows(); + NumericVector norm(N); + for(int i=0; i pow(n,2)) { + t = N / (i+1); + } else { + t = N / pow(i+1, r-1); + } + // Rcout << "t = " << t << "\n"; + if (candi.length() > t) { + IntegerVector remain = bottom_t_index(loss, floor(t)); + // Rcout << "length of remain: " << remain.length() << "\n"; + // Rcout << "remain:" << remain << "\n"; + candi = candi[remain]; + loss = loss[remain]; + } + +// if (candi.length() == 0) { +// index = index[seq(0,i)]; +// break; +// } + // Rcout << "After eliminate:" << "\n"; + // Rcout << "length of candi: " << candi.length() << "\n"; + // Rcout << "candi: " << candi << "\n"; + // Rcout << "length of loss: " << loss.length() << "\n"; + // Rcout << "loss: " << loss << "\n"; + loss = loss + ComputeLoss(candi, index[i], X, norm); // loss can't located. + // Rcout << "After Update loss:" << "\n"; + // Rcout << "length of loss: " << loss.length() << "\n"; + // Rcout << "loss: " << loss << "\n" << "\n"; + } + return index + 1; +} + + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 746aa78..fb208b6 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,6 +1,7 @@ // Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#include #include using namespace Rcpp; @@ -35,10 +36,118 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// L2norm +NumericVector L2norm(NumericMatrix X); +RcppExport SEXP _dbsubsampling_L2norm(SEXP XSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP); + rcpp_result_gen = Rcpp::wrap(L2norm(X)); + return rcpp_result_gen; +END_RCPP +} +// bottom_t_index +IntegerVector bottom_t_index(NumericVector loss, int t); +RcppExport SEXP _dbsubsampling_bottom_t_index(SEXP lossSEXP, SEXP tSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope 
rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericVector >::type loss(lossSEXP); + Rcpp::traits::input_parameter< int >::type t(tSEXP); + rcpp_result_gen = Rcpp::wrap(bottom_t_index(loss, t)); + return rcpp_result_gen; +END_RCPP +} +// ComputeLoss +NumericVector ComputeLoss(IntegerVector candi, int last_index, NumericMatrix X, NumericVector norm); +RcppExport SEXP _dbsubsampling_ComputeLoss(SEXP candiSEXP, SEXP last_indexSEXP, SEXP XSEXP, SEXP normSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type candi(candiSEXP); + Rcpp::traits::input_parameter< int >::type last_index(last_indexSEXP); + Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP); + Rcpp::traits::input_parameter< NumericVector >::type norm(normSEXP); + rcpp_result_gen = Rcpp::wrap(ComputeLoss(candi, last_index, X, norm)); + return rcpp_result_gen; +END_RCPP +} +// rcppOSS +IntegerVector rcppOSS(NumericMatrix X, int n); +RcppExport SEXP _dbsubsampling_rcppOSS(SEXP XSEXP, SEXP nSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP); + Rcpp::traits::input_parameter< int >::type n(nSEXP); + rcpp_result_gen = Rcpp::wrap(rcppOSS(X, n)); + return rcpp_result_gen; +END_RCPP +} +// armabottom_k +arma::vec armabottom_k(arma::vec x, unsigned int k); +RcppExport SEXP _dbsubsampling_armabottom_k(SEXP xSEXP, SEXP kSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::vec >::type x(xSEXP); + Rcpp::traits::input_parameter< unsigned int >::type k(kSEXP); + rcpp_result_gen = Rcpp::wrap(armabottom_k(x, k)); + return rcpp_result_gen; +END_RCPP +} +// armaScaleMatrix +arma::mat armaScaleMatrix(arma::mat X); +RcppExport SEXP _dbsubsampling_armaScaleMatrix(SEXP XSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; 
+ Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); + rcpp_result_gen = Rcpp::wrap(armaScaleMatrix(X)); + return rcpp_result_gen; +END_RCPP +} +// armaComputeLoss +arma::vec armaComputeLoss(arma::mat X, arma::vec xa, arma::mat y, double ya, double tPow); +RcppExport SEXP _dbsubsampling_armaComputeLoss(SEXP XSEXP, SEXP xaSEXP, SEXP ySEXP, SEXP yaSEXP, SEXP tPowSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); + Rcpp::traits::input_parameter< arma::vec >::type xa(xaSEXP); + Rcpp::traits::input_parameter< arma::mat >::type y(ySEXP); + Rcpp::traits::input_parameter< double >::type ya(yaSEXP); + Rcpp::traits::input_parameter< double >::type tPow(tPowSEXP); + rcpp_result_gen = Rcpp::wrap(armaComputeLoss(X, xa, y, ya, tPow)); + return rcpp_result_gen; +END_RCPP +} +// armaOSS +arma::uvec armaOSS(arma::mat x, int k, double tPow); +RcppExport SEXP _dbsubsampling_armaOSS(SEXP xSEXP, SEXP kSEXP, SEXP tPowSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); + Rcpp::traits::input_parameter< int >::type k(kSEXP); + Rcpp::traits::input_parameter< double >::type tPow(tPowSEXP); + rcpp_result_gen = Rcpp::wrap(armaOSS(x, k, tPow)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_dbsubsampling_getIdxR_cpp", (DL_FUNC) &_dbsubsampling_getIdxR_cpp, 3}, {"_dbsubsampling_getIdx_cpp", (DL_FUNC) &_dbsubsampling_getIdx_cpp, 2}, + {"_dbsubsampling_L2norm", (DL_FUNC) &_dbsubsampling_L2norm, 1}, + {"_dbsubsampling_bottom_t_index", (DL_FUNC) &_dbsubsampling_bottom_t_index, 2}, + {"_dbsubsampling_ComputeLoss", (DL_FUNC) &_dbsubsampling_ComputeLoss, 4}, + {"_dbsubsampling_rcppOSS", (DL_FUNC) &_dbsubsampling_rcppOSS, 2}, + {"_dbsubsampling_armabottom_k", (DL_FUNC) &_dbsubsampling_armabottom_k, 2}, + {"_dbsubsampling_armaScaleMatrix", (DL_FUNC) 
&_dbsubsampling_armaScaleMatrix, 1}, + {"_dbsubsampling_armaComputeLoss", (DL_FUNC) &_dbsubsampling_armaComputeLoss, 5}, + {"_dbsubsampling_armaOSS", (DL_FUNC) &_dbsubsampling_armaOSS, 3}, {NULL, NULL, 0} }; diff --git a/src/armaOSS.cpp b/src/armaOSS.cpp new file mode 100644 index 0000000..cce7aa8 --- /dev/null +++ b/src/armaOSS.cpp @@ -0,0 +1,132 @@ +#include +using namespace arma; + +//' Find t smallest index of a vector (RcppArmadillo-version) +//' +//' @param x A vector. +//' @param k A int. +//' +//' @return The index of the t smallest element of the vector. +// [[Rcpp::export]] +arma::vec armabottom_k(arma::vec x, unsigned int k) { + arma::vec x2 = x; // save a copy of x + arma::vec ind(k); // save the indexes of the smallest k numbers + std::nth_element(x.begin(), x.begin() + k - 1, x.end()); // std::greater()); + for(int ii=0, i=0; i(n); + for(int i=0; i(1,n,n); + arma::uvec ind=linspace(1,k,k); + arma::vec L=sum(pow(x,2),1); + arma::vec xa=L; + uword mm=L.index_max(); + ind(0)=candi(mm); + candi.shed_row(mm); + L.shed_row(mm); + + // Rcout << "i = 0" << "\n"; + // Rcout << "index: " << ind(0)-1 << "\n"; + // Rcout << "length of candi: " << candi.n_elem << "\n"; + // Rcout << "candi: " << candi.t()-1 << "\n"; + + arma::mat sx=sign(x); + + /* GOSS original: + double r=log(n/k)/log(k); + */ + double r=log(n)/log(k); // modified + for(int i=1; i1) & (L.n_elem>double(nc))){ + arma::uvec tt=arma::conv_to::from(bottom_k(L,nc)); + L=L.elem(tt); + candi=candi.elem(tt); + */ + if( L.n_elem > nc ){ //modified + arma::uvec tt=arma::conv_to::from(armabottom_k(L,floor(nc))); + L=L.elem(tt); + candi=candi.elem(tt); + + // Rcout << "t = " << nc << "\n"; + // Rcout << "length of remain: " << tt.n_elem << "\n"; + // Rcout << "remain: " << tt.t() << "\n"; + // Rcout << "After eliminatie" << "\n"; + // Rcout << "length of candi: " << candi.n_elem << "\n"; + // Rcout << "candi: " << candi.t()-1 << "\n"; + // Rcout << "length of loss: " << L.n_elem << "\n"; + // Rcout << "loss: 
" << L.t() << "\n"; + + } + } + return ind; +} diff --git a/tests/testthat/test-OSS.R b/tests/testthat/test-OSS.R new file mode 100644 index 0000000..10b4c78 --- /dev/null +++ b/tests/testthat/test-OSS.R @@ -0,0 +1,12 @@ +test_that("OSS with Rcpp works well and get the same result with r-version", { + data_numeric_regression["y"] <- NULL + X <- data_numeric_regression + + # X <- scale(as.matrix(data_numeric_regression)) + # attributes(X) <- attributes(X)["dim"] + # expect_equal(L2norm(X), rowSums(X^2)) + # expect_equal(bottom_t_index(X[,1], 20) + 1, which(X[,1] <= sort(X[,1])[20])) + + expect_equal(OSS(100, X), rOSS(100, X)) + expect_equal(OSS(100, X), OSS(100, X)) +}) diff --git a/tests/testthat/test-armaOSS.R b/tests/testthat/test-armaOSS.R new file mode 100644 index 0000000..5df9bd5 --- /dev/null +++ b/tests/testthat/test-armaOSS.R @@ -0,0 +1,11 @@ +test_that("OSS with RcppArmadill get the same result with Rcpp", { + data_numeric_regression["y"] <- NULL + X <- as.matrix(data_numeric_regression) + attributes(X) <- attributes(X)["dim"] + + expect_equal(as.vector(armaOSS(X, 100)), OSS(100, X)) + expect_equal(as.vector(armaOSS(X, 100)), as.vector(armaOSS(X, 100))) +}) + + + diff --git a/vignettes/Subsampling.Rmd b/vignettes/Subsampling.Rmd index 3da72f1..31179dd 100644 --- a/vignettes/Subsampling.Rmd +++ b/vignettes/Subsampling.Rmd @@ -45,7 +45,7 @@ x <- data_binary[-which(names(data_binary) == "y")] OSMAC(X = x, Y = y, r1 = 100, r2 = 10, method="mmse", seed_1 = 123, seed_2 = 456) ``` -or you can use a unified interface(recommended): +or you can use a unified interface (recommended): ```{r} subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "OSMAC_A", seed_1 = 123, seed_2 = 456) @@ -56,7 +56,7 @@ L-optimal minimise the trace of the covariance matrix of the linear combination ```{r OSMAC-L} OSMAC(X = x, Y = y, r1 = 100, r2 = 10, method="mvc", seed_1 = 123, seed_2 = 456) ``` -or you can use a unified interface(recommended): +or you can use a 
unified interface (recommended): ```{r} subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "OSMAC_L", seed_1 = 123, seed_2 = 456) @@ -72,11 +72,21 @@ A subsampling method based on D-optimal for linear regression proposed by [Wang ```{r} data_numeric <- data_numeric_regression X <- data_numeric[-which(names(data_numeric) == "y")] -IBOSS(n = 30, X = X) +IBOSS(n = 100, X = X) ``` -or you can use a unified interface(recommended): +or you can use a unified interface (recommended): ```{r} -subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS") +subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS") +``` + +# OSS +A subsampling method based on Orthogonal Array proposed by [Wang et.al.(2021)](https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink)^[Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021) Orthogonal subsampling for big data linear regression, The Annals of Applied Statistics, 15(3), 1273-1290.]. +```{r} +OSS(n = 10, X = X) +``` +or you can use a unified interface (recommended): +```{r} +subsampling(y_name = "y", data = data_numeric, n = 10, method = "OSS") ``` **We're working on more features,such as subsampling based on OSS, Lowcon, support point, etc. **