Skip to content

Commit

Permalink
Add OSS
Browse files Browse the repository at this point in the history
  • Loading branch information
JieYinStat committed Feb 16, 2024
1 parent 46a040b commit 58908bc
Show file tree
Hide file tree
Showing 38 changed files with 1,026 additions and 74 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ BugReports: https://github.com/JieYinStat/dbsubsampling/issues
Suggests:
knitr,
mvtnorm,
RcppArmadillo,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Expand All @@ -26,4 +27,5 @@ Depends:
LazyData: true
VignetteBuilder: knitr
LinkingTo:
Rcpp
Rcpp,
RcppArmadillo
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(IBOSS)
export(OSMAC)
export(OSS)
export(Unif)
export(subsampling)
importFrom(Rcpp,sourceCpp)
Expand Down
4 changes: 2 additions & 2 deletions R/IBOSS.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#' A subsampling method based on D-optimal criterion inspired by optimal experimental design
#' used for linear regression.
#'
#' @param n subsample size.
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#'
#' @return subsample index.
#' @return Subsample index.
#' @references HaiYing Wang, Min Yang & John Stufken (2019)
#' \emph{Information-Based Optimal Subdata Selection for Big Data Linear Regression,
#' Journal of the American Statistical Association, 114:525, 393-405},
Expand Down
6 changes: 3 additions & 3 deletions R/OSMAC.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
#' @param w A numeric vector. The weight of each sample.
#'
#' @return A list.
#' * `par` : parameter estimation.
#' * `message` : message during iteration.
#' * `iter` : iteration times.
#' * `par` : Parameter estimation.
#' * `message` : Message during iteration.
#' * `iter` : Iteration times.
get_Logistic_MLE <- function(x, y, w) {
d <- ncol(x)
beta <- rep(0, d)
Expand Down
135 changes: 135 additions & 0 deletions R/OSS.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#' Orthogonal subsampling for big data linear regression (OSS)
#'
#' A subsampling method based on orthogonal arrays for linear models.
#'
#' @param n Subsample size.
#' @param X A matrix or data frame.
#'
#' @return Subsample index.
#'
#' @examples
#' data_numeric_regression["y"] <- NULL
#' X <- as.matrix(data_numeric_regression)
#' OSS(100, X)
#'
#' @references Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021)
#' \emph{Orthogonal subsampling for big data linear regression,
#' The Annals of Applied Statistics, 15(3), 1273-1290},
#' \url{https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink}.
#'
#' @export
OSS <- function(n, X) {
  # OSS works on standardized data; scale() also coerces data frames to matrix.
  scaled <- scale(as.matrix(X))
  # Keep only the "dim" attribute, dropping scale()'s
  # "scaled:center"/"scaled:scale" attributes before passing to C++.
  attributes(scaled) <- attributes(scaled)["dim"]
  rcppOSS(X = scaled, n = n)
}

#' Get L2 norm (r-version)
#'
#' Compute the squared Euclidean (L2) norm of every row of a matrix
#' or data frame.
#'
#' @param X A matrix or data.frame.
#'
#' @return L2 norm of `X` (one value per row).
#'
# @examples
# X <- matrix(1:12, 4, 3)
# X <- scale(X)
# rL2norm(X)
rL2norm <- function(X) {
  rowSums(X * X)
}

#' Compute loss function for OSS (r-version)
#'
#' @param candi The index of the candidate set.
#' @param last_index The index of the selected point in the last iteration.
#' @param X The whole data.
#' @param norm Norm of the whole data.
#' @param p Number of columns of the data.
#'
#' @return Loss of every point in the candidate set.
# @examples
# X <- matrix(1:20, 5, 4)
# X <- scale(X)
# norm <- rL2norm(X)
# rComputeLoss(c(1,3,4), 2, X, norm)
rComputeLoss <- function(candi, last_index, X, norm, p = ncol(X)) {
  # Sign pattern of the point selected in the last iteration.
  sign_last <- sign(X[last_index, ])
  # drop = FALSE keeps a one-row matrix when length(candi) == 1; the
  # original apply()-based version errored in that case because the
  # subset collapsed to a plain vector.
  sign_candi <- sign(X[candi, , drop = FALSE])
  # delta[i]: number of coordinates where candidate i shares the same
  # sign as the last selected point.
  delta <- rowSums(sign_candi == matrix(sign_last, nrow = length(candi),
                                        ncol = p, byrow = TRUE))
  loss <- (p - norm[candi] / 2 - norm[last_index] / 2 + delta)^2
  return(loss)
}

#' Find t smallest index of a vector.
#'
#' @param loss A vector.
#' @param t An integer.
#'
#' @return The indices of the t smallest elements of the vector
#'   (in case of ties at the cutoff, more than t indices may be returned).
#'
# @examples
# loss <- rnorm(10)
# rbottom_t_index(loss, 3)
rbottom_t_index <- function(loss, t) {
  # Value of the t-th smallest element; everything at or below it is kept.
  cutoff <- sort(loss)[t]
  which(loss <= cutoff)
}


#' OSS (r-version)
#'
#' Pure-R reference implementation of orthogonal subsampling (OSS).
#'
#' @param n Subsample size.
#' @param X A matrix.
#'
#' @return Subsample index. If the candidate set is exhausted before `n`
#'   points are selected, the indices selected so far are returned.
#'
# @examples
# data_numeric_regression["y"] <- NULL
# X <- as.matrix(data_numeric_regression)
# rOSS(X, 100)
rOSS <- function(n, X) {
  X <- scale(as.matrix(X))
  # Keep only "dim"; drop scale()'s center/scale attributes.
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  norm <- rL2norm(X)
  r <- log(N) / log(n)

  # Initialization: start from the point with the largest norm.
  index[1] <- which.max(norm)
  candi <- (1:N)[-index[1]]
  loss <- rComputeLoss(candi, index[1], X, norm)

  for (i in seq_len(n)[-1]) {
    # Selection: take the candidate with the smallest accumulated loss.
    pos <- which.min(loss)
    index[i] <- candi[pos]
    candi <- candi[-pos]
    loss <- loss[-pos]

    # Candidate set exhausted: return what has been selected so far.
    # (The original left this guard commented out and would fail in
    # which.min() on the next iteration.)
    if (length(candi) == 0) {
      return(index[1:i])
    }

    # Elimination: keep only the `thresh` candidates with smallest loss.
    # Scalar condition, so plain if/else instead of ifelse().
    thresh <- if (N > n^2) N / i else N / i^(r - 1)
    if (length(candi) > thresh) {
      # Compute the surviving positions once and reuse for both vectors
      # (the original called rbottom_t_index() twice).
      keep <- rbottom_t_index(loss, thresh)
      candi <- candi[keep]
      loss <- loss[keep]
    }

    # Update accumulated loss against the newly selected point.
    loss <- loss + rComputeLoss(candi, index[i], X, norm)
  }

  index
}




96 changes: 96 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,107 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Get subsample index of other columns (except the first column) (IBOSS)
#'
#' @param r Subsample size of the column.
#' @param z A numeric vector. The column.
#' @param rdel Subsample index of the first column.
#' @return Subsample index of the column.
getIdxR_cpp <- function(r, z, rdel) {
  .Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel)
}

#' Get subsample index of the first column (IBOSS)
#'
#' @param r Subsample size of the first column.
#' @param z A numeric vector. The first column.
#' @return Subsample index of the first column.
getIdx_cpp <- function(r, z) {
  .Call(`_dbsubsampling_getIdx_cpp`, r, z)
}

#' Get L2 norm
#'
#' Get the L2 norm of a matrix or data frame.
#'
#' @param X A matrix or data.frame.
#'
#' @return L2 norm of `X` (every row).
L2norm <- function(X) {
  .Call(`_dbsubsampling_L2norm`, X)
}

#' Find t smallest index of a vector
#'
#' @param loss A vector.
#' @param t An integer.
#'
#' @return The index of the t smallest element of the vector.
bottom_t_index <- function(loss, t) {
  .Call(`_dbsubsampling_bottom_t_index`, loss, t)
}

#' Compute loss function for OSS
#'
#' @param candi The index of the candidate set.
#' @param last_index The index of the selected point in the last iteration.
#' @param X The whole data.
#' @param norm Norm of the whole data.
#'
#' @return Loss of every point in the candidate set.
ComputeLoss <- function(candi, last_index, X, norm) {
  .Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm)
}

#' Rcpp version OSS (core code of `OSS`)
#'
#' @param X A matrix.
#' @param n Subsample size.
#'
#' @return Subsample index.
rcppOSS <- function(X, n) {
  .Call(`_dbsubsampling_rcppOSS`, X, n)
}

#' Find k smallest index of a vector (RcppArmadillo-version)
#'
#' @param x A vector.
#' @param k An integer.
#'
#' @return The index of the k smallest elements of the vector.
armabottom_k <- function(x, k) {
  .Call(`_dbsubsampling_armabottom_k`, x, k)
}

#' Scale a matrix (RcppArmadillo-version)
#'
#' @param X A matrix.
#'
#' @return Scaled matrix.
armaScaleMatrix <- function(X) {
  .Call(`_dbsubsampling_armaScaleMatrix`, X)
}

#' Compute loss function for OSS (RcppArmadillo-version)
#'
#' @param X Matrix of the candidate set.
#' @param xa Norm of the candidate set.
#' @param y A vector. The point selected in the last iteration.
#' @param ya Norm of `y`.
#' @param tPow The power of the loss function.
#'
#' @return Loss of the candidate set.
armaComputeLoss <- function(X, xa, y, ya, tPow) {
  .Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow)
}

#' OSS (RcppArmadillo-version)
#' @param x A matrix.
#' @param k Subsample size.
#' @param tPow The power of the loss function.
#'
#' @return Subsample index.
armaOSS <- function(x, k, tPow = 2) {
  .Call(`_dbsubsampling_armaOSS`, x, k, tPow)
}

4 changes: 2 additions & 2 deletions R/Unif.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#' @param N Total sample size.
#' @param n Subsample size.
#' @param replace A boolean.
#' * `TRUE` (the default): sampling with replace.
#' * `FALSE`: sampling without replace
#' * `TRUE` (the default): Sampling with replacement.
#' * `FALSE`: Sampling without replacement.
#' @param seed Random seed which is an integer (default NULL). This random seed is only valid for this sampling and
#' will not affect the external environment
#'
Expand Down
10 changes: 6 additions & 4 deletions R/subsampling.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
#' * `OSMAC_A`: A subsampling method based on A-optimal for logistic regression proposed by Wang et.al. (2018).
#' * `OSMAC_L`: A subsampling method based on L-optimal for logistic regression proposed by Wang et.al. (2018).
#' * `IBOSS`: A subsampling method based on D-optimal for linear regression proposed by Wang et.al. (2019).
#' * `OSS`: A subsampling method based on Orthogonal Array proposed by Wang et.al. (2021).
#' @param replace A boolean.
#' * `TRUE` (the default): sampling with replace.
#' * `FALSE`: sampling without replace
#' * `TRUE` (the default): Sampling with replacement.
#' * `FALSE`: Sampling without replacement.
#' @param seed_1 Random seed for the first stage sampling or Unif.
#' @param seed_2 Random seed for the second stage sampling.
#' @param na_method Method to handle NA.
Expand All @@ -33,6 +34,7 @@
#'
#' data_numeric <- data_numeric_regression
#' subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
#' subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method = "Unif",
replace = TRUE, seed_1 = NULL, seed_2 = NULL, na_method = NULL) {

Expand All @@ -47,10 +49,10 @@ subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method =
Unif = Unif(N = N, n = n, seed = seed_1, replace = TRUE),
IBOSS = IBOSS(n = n, X = x),
OSMAC_A = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mmse", seed_1 = seed_1, seed_2 = seed_2),
OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2)
OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2),
OSS = OSS(n = n, X = x)
# Support =
# Lowcon =
# OSS =
# DDS =
)
return(subsample_index)
Expand Down
8 changes: 6 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ devtools::install_github("JieYinStat/dbsubsampling")

## Example

This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC and IBOSS:
This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC, IBOSS and
OSS:

```{r example}
library(dbsubsampling)
Expand All @@ -52,7 +53,10 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O
# IBOSS
data_numeric <- data_numeric_regression
subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS")
subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
# OSS
subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
```

You can get more detailed examples from the article column on the [website](https://jieyinstat.github.io/dbsubsampling/).
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ devtools::install_github("JieYinStat/dbsubsampling")
## Example

This is a basic example which shows you how to get subsample index, such
as uniform sampling, OSMAC and IBOSS:
as uniform sampling, OSMAC, IBOSS and OSS:

``` r
library(dbsubsampling)
Expand All @@ -47,9 +47,19 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O

# IBOSS
data_numeric <- data_numeric_regression
subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS")
#> [1] 419 1144 3395 3484 3896 5121 6203 7915 7967 8026 8156 8694 8841 9117 8438
#> [16] 3121
subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
#> [1] 183 226 395 419 584 666 711 758 1027 1144 1324 1445 1940 1946 1978
#> [16] 2018 2673 2982 3190 3395 3484 3612 3632 3638 3696 3816 3835 3896 3921 4256
#> [31] 4312 4405 4523 4551 4729 4938 5121 5226 5342 5410 5679 5770 5995 6089 6163
#> [46] 6170 6203 6250 6525 6964 6979 7053 7198 7407 7564 7633 7915 7935 7967 7992
#> [61] 8026 8088 8106 8156 8161 8267 8306 8501 8503 8521 8534 8694 8805 8841 9117
#> [76] 9211 9302 9364 9398 9456 9676 9946 9971 9989 1173 2344 5394 8438 8567 9239
#> [91] 1787 2104 2215 3121 7159 9133

# OSS
subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
#> [1] 8841 8961 1902 7512 48 9867 6547 9784 3392 3622 5780 6594 1890 1850 8335
#> [16] 1254 6204 1257 4611 3831 4782 4919 1579 3404 718 7189 2060 4899 590 1800
```

You can get more detailed examples from the article column on the
Expand Down
Loading

0 comments on commit 58908bc

Please sign in to comment.