Skip to content

Commit

Permalink
Add OSS
Browse files Browse the repository at this point in the history
  • Loading branch information
JieYinStat committed Feb 16, 2024
1 parent 46a040b commit 58908bc
Show file tree
Hide file tree
Showing 38 changed files with 1,026 additions and 74 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ BugReports: https://github.com/JieYinStat/dbsubsampling/issues
Suggests:
knitr,
mvtnorm,
RcppArmadillo,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Expand All @@ -26,4 +27,5 @@ Depends:
LazyData: true
VignetteBuilder: knitr
LinkingTo:
Rcpp
Rcpp,
RcppArmadillo
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(IBOSS)
export(OSMAC)
export(OSS)
export(Unif)
export(subsampling)
importFrom(Rcpp,sourceCpp)
Expand Down
4 changes: 2 additions & 2 deletions R/IBOSS.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
#' A subsampling method based on D-optimal criterion inspired by optimal experimental design
#' used for linear regression.
#'
#' @param n subsample size.
#' @param n Subsample size.
#' @param X A data.frame or matrix consists of explanatory variables.
#'
#' @return subsample index.
#' @return Subsample index.
#' @references HaiYing Wang, Min Yang & John Stufken (2019)
#' \emph{Information-Based Optimal Subdata Selection for Big Data Linear Regression,
#' Journal of the American Statistical Association, 114:525, 393-405},
Expand Down
6 changes: 3 additions & 3 deletions R/OSMAC.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
#' @param w A numeric vector. The weight of each sample.
#'
#' @return A list.
#' * `par` : parameter estimation.
#' * `message` : message during iteration.
#' * `iter` : iteration times.
#' * `par` : Parameter estimation.
#' * `message` : Message during iteration.
#' * `iter` : Iteration times.
get_Logistic_MLE <- function(x, y, w) {
d <- ncol(x)
beta <- rep(0, d)
Expand Down
135 changes: 135 additions & 0 deletions R/OSS.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#' Orthogonal subsampling for big data linear regression (OSS)
#'
#' A subsampling method based on orthogonal arrays for linear models.
#'
#' @param n Subsample size.
#' @param X A matrix or data frame.
#'
#' @return Subsample index.
#'
#' @examples
#' data_numeric_regression["y"] <- NULL
#' X <- as.matrix(data_numeric_regression)
#' OSS(100, X)
#'
#' @references Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021)
#' \emph{Orthogonal subsampling for big data linear regression,
#' The Annals of Applied Statistics, 15(3), 1273-1290},
#' \url{https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink}.
#'
#' @export
OSS <- function(n, X) {
  # OSS works on standardized data; scale() also coerces data frames to matrix.
  scaled <- scale(as.matrix(X))
  # Keep only the "dim" attribute, dropping scale()'s
  # "scaled:center"/"scaled:scale" attributes before passing to C++.
  attributes(scaled) <- attributes(scaled)["dim"]
  rcppOSS(X = scaled, n = n)
}

#' Get L2 norm (r-version)
#'
#' Compute the squared Euclidean (L2) norm of every row of a matrix
#' or data frame.
#'
#' @param X A matrix or data.frame.
#'
#' @return L2 norm of `X` (one value per row).
#'
# @examples
# X <- matrix(1:12, 4, 3)
# X <- scale(X)
# rL2norm(X)
rL2norm <- function(X) {
  rowSums(X * X)
}

#' Compute loss function for OSS (r-version)
#'
#' @param candi The index of the candidate set.
#' @param last_index The index of the selected point in the last iteration.
#' @param X The whole data.
#' @param norm Norm of the whole data.
#' @param p Number of columns of the data.
#'
#' @return Loss of every point in the candidate set.
# @examples
# X <- matrix(1:20, 5, 4)
# X <- scale(X)
# norm <- rL2norm(X)
# rComputeLoss(c(1,3,4), 2, X, norm)
rComputeLoss <- function(candi, last_index, X, norm, p = ncol(X)) {
  # Sign pattern of the point selected in the last iteration.
  sign_last <- sign(X[last_index, ])
  # drop = FALSE keeps a one-row matrix when length(candi) == 1; the
  # original apply()-based version errored in that case because the
  # subset collapsed to a plain vector.
  sign_candi <- sign(X[candi, , drop = FALSE])
  # delta[i]: number of coordinates where candidate i shares the same
  # sign as the last selected point.
  delta <- rowSums(sign_candi == matrix(sign_last, nrow = length(candi),
                                        ncol = p, byrow = TRUE))
  loss <- (p - norm[candi] / 2 - norm[last_index] / 2 + delta)^2
  return(loss)
}

#' Find t smallest index of a vector.
#'
#' @param loss A vector.
#' @param t An integer.
#'
#' @return The indices of the t smallest elements of the vector
#'   (in case of ties at the cutoff, more than t indices may be returned).
#'
# @examples
# loss <- rnorm(10)
# rbottom_t_index(loss, 3)
rbottom_t_index <- function(loss, t) {
  # Value of the t-th smallest element; everything at or below it is kept.
  cutoff <- sort(loss)[t]
  which(loss <= cutoff)
}


#' OSS (r-version)
#'
#' Pure-R reference implementation of orthogonal subsampling (OSS).
#'
#' @param n Subsample size.
#' @param X A matrix.
#'
#' @return Subsample index. If the candidate set is exhausted before `n`
#'   points are selected, the indices selected so far are returned.
#'
# @examples
# data_numeric_regression["y"] <- NULL
# X <- as.matrix(data_numeric_regression)
# rOSS(X, 100)
rOSS <- function(n, X) {
  X <- scale(as.matrix(X))
  # Keep only "dim"; drop scale()'s center/scale attributes.
  attributes(X) <- attributes(X)["dim"]
  N <- nrow(X)

  index <- numeric(n)
  norm <- rL2norm(X)
  r <- log(N) / log(n)

  # Initialization: start from the point with the largest norm.
  index[1] <- which.max(norm)
  candi <- (1:N)[-index[1]]
  loss <- rComputeLoss(candi, index[1], X, norm)

  for (i in seq_len(n)[-1]) {
    # Selection: take the candidate with the smallest accumulated loss.
    pos <- which.min(loss)
    index[i] <- candi[pos]
    candi <- candi[-pos]
    loss <- loss[-pos]

    # Candidate set exhausted: return what has been selected so far.
    # (The original left this guard commented out and would fail in
    # which.min() on the next iteration.)
    if (length(candi) == 0) {
      return(index[1:i])
    }

    # Elimination: keep only the `thresh` candidates with smallest loss.
    # Scalar condition, so plain if/else instead of ifelse().
    thresh <- if (N > n^2) N / i else N / i^(r - 1)
    if (length(candi) > thresh) {
      # Compute the surviving positions once and reuse for both vectors
      # (the original called rbottom_t_index() twice).
      keep <- rbottom_t_index(loss, thresh)
      candi <- candi[keep]
      loss <- loss[keep]
    }

    # Update accumulated loss against the newly selected point.
    loss <- loss + rComputeLoss(candi, index[i], X, norm)
  }

  index
}




96 changes: 96 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,107 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Get subsample index of other columns (except the first column) (IBOSS)
#'
#' @param r Subsample size of the column.
#' @param z A numeric vector. The column.
#' @param rdel Subsample index of the first column.
#' @return Subsample index of the column.
getIdxR_cpp <- function(r, z, rdel) {
  .Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel)
}

#' Get subsample index of the first column (IBOSS)
#'
#' @param r Subsample size of the first column.
#' @param z A numeric vector. The first column.
#' @return Subsample index of the first column.
getIdx_cpp <- function(r, z) {
  .Call(`_dbsubsampling_getIdx_cpp`, r, z)
}

#' Get L2 norm
#'
#' Get the L2 norm of a matrix or data frame.
#'
#' @param X A matrix or data.frame.
#'
#' @return L2 norm of `X` (every row).
L2norm <- function(X) {
  .Call(`_dbsubsampling_L2norm`, X)
}

#' Find t smallest index of a vector
#'
#' @param loss A vector.
#' @param t An integer.
#'
#' @return The index of the t smallest element of the vector.
bottom_t_index <- function(loss, t) {
  .Call(`_dbsubsampling_bottom_t_index`, loss, t)
}

#' Compute loss function for OSS
#'
#' @param candi The index of the candidate set.
#' @param last_index The index of the selected point in the last iteration.
#' @param X The whole data.
#' @param norm Norm of the whole data.
#'
#' @return Loss of every point in the candidate set.
ComputeLoss <- function(candi, last_index, X, norm) {
  .Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm)
}

#' Rcpp version OSS (core code of `OSS`)
#'
#' @param X A matrix.
#' @param n Subsample size.
#'
#' @return Subsample index.
rcppOSS <- function(X, n) {
  .Call(`_dbsubsampling_rcppOSS`, X, n)
}

#' Find k smallest index of a vector (RcppArmadillo-version)
#'
#' @param x A vector.
#' @param k An integer.
#'
#' @return The index of the k smallest elements of the vector.
armabottom_k <- function(x, k) {
  .Call(`_dbsubsampling_armabottom_k`, x, k)
}

#' Scale a matrix (RcppArmadillo-version)
#'
#' @param X A matrix.
#'
#' @return Scaled matrix.
armaScaleMatrix <- function(X) {
  .Call(`_dbsubsampling_armaScaleMatrix`, X)
}

#' Compute loss function for OSS (RcppArmadillo-version)
#'
#' @param X Matrix of the candidate set.
#' @param xa Norm of the candidate set.
#' @param y A vector. The point selected in the last iteration.
#' @param ya Norm of `y`.
#' @param tPow The power of the loss function.
#'
#' @return Loss of the candidate set.
armaComputeLoss <- function(X, xa, y, ya, tPow) {
  .Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow)
}

#' OSS (RcppArmadillo-version)
#' @param x A matrix.
#' @param k Subsample size.
#' @param tPow The power of the loss function.
#'
#' @return Subsample index.
armaOSS <- function(x, k, tPow = 2) {
  .Call(`_dbsubsampling_armaOSS`, x, k, tPow)
}

4 changes: 2 additions & 2 deletions R/Unif.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#' @param N Total sample size.
#' @param n Subsample size.
#' @param replace A boolean.
#' * `TRUE` (the default): sampling with replace.
#' * `FALSE`: sampling without replace
#' * `TRUE` (the default): Sampling with replacement.
#' * `FALSE`: Sampling without replacement.
#' @param seed Random seed which is an integer (default NULL). This random seed is only valid for this sampling and
#' will not affect the external environment
#'
Expand Down
10 changes: 6 additions & 4 deletions R/subsampling.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
#' * `OSMAC_A`: A subsampling method based on A-optimal for logistic regression proposed by Wang et.al. (2018).
#' * `OSMAC_L`: A subsampling method based on L-optimal for logistic regression proposed by Wang et.al. (2018).
#' * `IBOSS`: A subsampling method based on D-optimal for linear regression proposed by Wang et.al. (2019).
#' * `OSS`: A subsampling method based on Orthogonal Array proposed by Wang et.al. (2021).
#' @param replace A boolean.
#' * `TRUE` (the default): sampling with replace.
#' * `FALSE`: sampling without replace
#' * `TRUE` (the default): Sampling with replacement.
#' * `FALSE`: Sampling without replacement.
#' @param seed_1 Random seed for the first stage sampling or Unif.
#' @param seed_2 Random seed for the second stage sampling.
#' @param na_method Method to handle NA.
Expand All @@ -33,6 +34,7 @@
#'
#' data_numeric <- data_numeric_regression
#' subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
#' subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method = "Unif",
replace = TRUE, seed_1 = NULL, seed_2 = NULL, na_method = NULL) {

Expand All @@ -47,10 +49,10 @@ subsampling <- function(y_name, x_name = NULL, data, n, pilot_n = NULL, method =
Unif = Unif(N = N, n = n, seed = seed_1, replace = TRUE),
IBOSS = IBOSS(n = n, X = x),
OSMAC_A = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mmse", seed_1 = seed_1, seed_2 = seed_2),
OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2)
OSMAC_L = OSMAC(X = x, Y = y, r1 = pilot_n, r2 = n, method = "mvc", seed_1 = seed_1, seed_2 = seed_2),
OSS = OSS(n = n, X = x)
# Support =
# Lowcon =
# OSS =
# DDS =
)
return(subsample_index)
Expand Down
8 changes: 6 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ devtools::install_github("JieYinStat/dbsubsampling")

## Example

This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC and IBOSS:
This is a basic example which shows you how to get subsample index, such as uniform sampling, OSMAC, IBOSS and
OSS:

```{r example}
library(dbsubsampling)
Expand All @@ -52,7 +53,10 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O
# IBOSS
data_numeric <- data_numeric_regression
subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS")
subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
# OSS
subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
```

You can get more detailed examples from the article column on the [website](https://jieyinstat.github.io/dbsubsampling/).
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ devtools::install_github("JieYinStat/dbsubsampling")
## Example

This is a basic example which shows you how to get subsample index, such
as uniform sampling, OSMAC and IBOSS:
as uniform sampling, OSMAC, IBOSS and OSS:

``` r
library(dbsubsampling)
Expand All @@ -47,9 +47,19 @@ subsampling(y_name = "y", data = data_binary, n = 10, pilot_n = 100, method = "O

# IBOSS
data_numeric <- data_numeric_regression
subsampling(y_name = "y", data = data_numeric, n = 30, method = "IBOSS")
#> [1] 419 1144 3395 3484 3896 5121 6203 7915 7967 8026 8156 8694 8841 9117 8438
#> [16] 3121
subsampling(y_name = "y", data = data_numeric, n = 100, method = "IBOSS")
#> [1] 183 226 395 419 584 666 711 758 1027 1144 1324 1445 1940 1946 1978
#> [16] 2018 2673 2982 3190 3395 3484 3612 3632 3638 3696 3816 3835 3896 3921 4256
#> [31] 4312 4405 4523 4551 4729 4938 5121 5226 5342 5410 5679 5770 5995 6089 6163
#> [46] 6170 6203 6250 6525 6964 6979 7053 7198 7407 7564 7633 7915 7935 7967 7992
#> [61] 8026 8088 8106 8156 8161 8267 8306 8501 8503 8521 8534 8694 8805 8841 9117
#> [76] 9211 9302 9364 9398 9456 9676 9946 9971 9989 1173 2344 5394 8438 8567 9239
#> [91] 1787 2104 2215 3121 7159 9133

# OSS
subsampling(y_name = "y", data = data_numeric, n = 30, method = "OSS")
#> [1] 8841 8961 1902 7512 48 9867 6547 9784 3392 3622 5780 6594 1890 1850 8335
#> [16] 1254 6204 1257 4611 3831 4782 4919 1579 3404 718 7189 2060 4899 590 1800
```

You can get more detailed examples from the article column on the
Expand Down
Loading

0 comments on commit 58908bc

Please sign in to comment.