-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
46a040b
commit 58908bc
Showing
38 changed files
with
1,026 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
|
||
export(IBOSS) | ||
export(OSMAC) | ||
export(OSS) | ||
export(Unif) | ||
export(subsampling) | ||
importFrom(Rcpp,sourceCpp) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#' Orthogonal subsampling for big data linear regression(OSS) | ||
#' | ||
#' A subsampling method based on orthogonal array for linear model. | ||
#' | ||
#' @param n Subsample size. | ||
#' @param X A matrix or data frame. | ||
#' | ||
#' @return Subsample index. | ||
#' | ||
#' @examples | ||
#' data_numeric_regression["y"] <- NULL | ||
#' X <- as.matrix(data_numeric_regression) | ||
#' OSS(100, X) | ||
#' | ||
#' @references Lin Wang, Jake Elmstedt, Weng Kee Wong & Hongquan Xu (2021) | ||
#' \emph{Orthogonal subsampling for big data linear regression, | ||
#' The Annals of Applied Statistics, 15(3), 1273-1290}, | ||
#' \url{https://projecteuclid.org/journals/annals-of-applied-statistics/volume-15/issue-3/Orthogonal-subsampling-for-big-data-linear-regression/10.1214/21-AOAS1462.short?tab=ArticleLink}. | ||
#' | ||
#' @export | ||
OSS <- function(n, X){ | ||
X <- scale(as.matrix(X)) # need scale | ||
attributes(X) <- attributes(X)["dim"] | ||
subindex <- rcppOSS(X = X, n = n) | ||
return(subindex) | ||
} | ||
|
||
#' Get L2 norm (r-version) | ||
#' | ||
#' Get L2 norm of a matrix or data frame. | ||
#' @param X A matrix or data.frame. | ||
#' | ||
#' @return L2 norm of `X`(every row). | ||
#' | ||
# @examples | ||
# X <- matrix(1:12, 4, 3) | ||
# X <- scale(X) | ||
# rL2norm(X) | ||
rL2norm <- function(X) { | ||
return(rowSums(X^2)) | ||
} | ||
|
||
#' Compute loss function for OSS (r-version) | ||
#' | ||
#' @param candi The index of the candidate set. | ||
#' @param last_index The index of the seleted point in last iteration. | ||
#' @param X The whole data. | ||
#' @param norm Norm of the whole data. | ||
#' @param p Numbers of columns of the data. | ||
#' | ||
#' @return Loss of every point in candidate set. | ||
# @examples | ||
# X <- matrix(1:20, 5, 4) | ||
# X <- scale(X) | ||
# norm <- rL2norm(X) | ||
# rComputeLoss(c(1,3,4), 2, X, norm) | ||
rComputeLoss <- function(candi, last_index, X, norm, p = ncol(X)){ | ||
delta <- rowSums(t(apply(X[candi, ], 1, function(.row) sign(.row) == sign(X[last_index,])))) | ||
loss <- (p - norm[candi]/2 - norm[last_index]/2 + delta)^2 | ||
return(loss) | ||
} | ||
|
||
#' Find t smallest index of a vector. | ||
#' | ||
#' @param loss A vector. | ||
#' @param t A int | ||
#' | ||
#' @return The index of the t smallest element of the vector. | ||
#' | ||
# @examples | ||
# loss <- rnorm(10) | ||
# rbottom_t_index(loss, 3) | ||
rbottom_t_index <- function(loss, t){ | ||
return(which(loss <= sort(loss)[t])) | ||
} | ||
|
||
|
||
#' OSS (r-version) | ||
#' | ||
#' @param n Subsample size. | ||
#' @param X A matrix. | ||
#' | ||
#' @return Subsample index. | ||
#' | ||
# @examples | ||
# data_numeric_regression["y"] <- NULL | ||
# X <- as.matrix(data_numeric_regression) | ||
# rOSS(X, 100) | ||
rOSS <- function(n, X){ | ||
X <- scale(as.matrix(X)) | ||
attributes(X) <- attributes(X)["dim"] | ||
N <- nrow(X) | ||
|
||
index <- numeric(n) | ||
candi <- 1:N | ||
|
||
norm <- rL2norm(X) | ||
r <- log(N)/log(n) | ||
|
||
for (i in 1:n) { | ||
# Initial | ||
if (i == 1) { | ||
index[1] <- which.max(norm) | ||
candi <- candi[-index[1]] | ||
loss <- rComputeLoss(candi, index[1], X, norm) | ||
next | ||
} | ||
|
||
# Election | ||
tmp <- which.min(loss) | ||
index[i] <- candi[tmp] | ||
candi <- candi[-tmp] | ||
loss <- loss[-tmp] | ||
|
||
# Elimination | ||
t <- ifelse(N > (n^2), N/i, N/(i^(r-1))) | ||
if (length(candi) > t) { | ||
candi <- candi[rbottom_t_index(loss,t)] | ||
loss <- loss[rbottom_t_index(loss,t)] | ||
} | ||
|
||
# if (length(candi) == 0) { | ||
# index <- index[1:i] | ||
# break | ||
# } | ||
# Update loss | ||
loss <- loss + rComputeLoss(candi, index[i], X, norm) | ||
} | ||
|
||
return(index) | ||
} | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,107 @@ | ||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#' Get subsample index of other column(except the first column) (IBOSS) | ||
#' | ||
#' @param r Subsample size of the column. | ||
#' @param z A numeric vector. the column. | ||
#' @param rdel Subsample index of the first column. | ||
#' @return Subsample index of the column. | ||
getIdxR_cpp <- function(r, z, rdel) { | ||
.Call(`_dbsubsampling_getIdxR_cpp`, r, z, rdel) | ||
} | ||
|
||
#' Get subsample index of the first column(IBOSS) | ||
#' | ||
#' @param r Subsample size of the first column. | ||
#' @param z A numeric vector. the first column. | ||
#' @return Subsample index of the first column. | ||
getIdx_cpp <- function(r, z) { | ||
.Call(`_dbsubsampling_getIdx_cpp`, r, z) | ||
} | ||
|
||
#' Get L2 norm | ||
#' | ||
#' Get L2 norm of a matrix or data frame. | ||
#' | ||
#' @param X A matrix or data.frame. | ||
#' | ||
#' @return L2 norm of `X`(every row). | ||
L2norm <- function(X) { | ||
.Call(`_dbsubsampling_L2norm`, X) | ||
} | ||
|
||
#' Find t smallest index of a vector | ||
#' | ||
#' @param loss A vector. | ||
#' @param t A int. | ||
#' | ||
#' @return The index of the t smallest element of the vector. | ||
bottom_t_index <- function(loss, t) { | ||
.Call(`_dbsubsampling_bottom_t_index`, loss, t) | ||
} | ||
|
||
#' Compute loss function for OSS | ||
#' | ||
#' @param candi The index of the candidate set. | ||
#' @param last_index The index of the seleted point in last iteration. | ||
#' @param X The whole data. | ||
#' @param norm Norm of the whole data. | ||
#' | ||
#' @return Loss of every point in candidate set. | ||
ComputeLoss <- function(candi, last_index, X, norm) { | ||
.Call(`_dbsubsampling_ComputeLoss`, candi, last_index, X, norm) | ||
} | ||
|
||
#' Rcpp version OSS (core code of `OSS`) | ||
#' | ||
#' @param X A matrix. | ||
#' @param n Subsample size. | ||
#' | ||
#' @return Subsample index. | ||
rcppOSS <- function(X, n) { | ||
.Call(`_dbsubsampling_rcppOSS`, X, n) | ||
} | ||
|
||
#' Find t smallest index of a vector (RcppArmadillo-version) | ||
#' | ||
#' @param x A vector. | ||
#' @param k A int. | ||
#' | ||
#' @return The index of the t smallest element of the vector. | ||
armabottom_k <- function(x, k) { | ||
.Call(`_dbsubsampling_armabottom_k`, x, k) | ||
} | ||
|
||
#' Scale a matrix (RcppArmadillo-version) | ||
#' | ||
#' @param X A matrix. | ||
#' | ||
#' @return Scaled matrix. | ||
armaScaleMatrix <- function(X) { | ||
.Call(`_dbsubsampling_armaScaleMatrix`, X) | ||
} | ||
|
||
#' Compute loss function for OSS (RcppArmadillo-version) | ||
#' | ||
#' @param X Matrix of the candidate set. | ||
#' @param xa Norm of the candidate set. | ||
#' @param y A vector. The point which be selected last iteration. | ||
#' @param ya Norm of `y`. | ||
#' @param tPow The power of the loss function. | ||
#' | ||
#' @return Loss of the candidate set. | ||
armaComputeLoss <- function(X, xa, y, ya, tPow) { | ||
.Call(`_dbsubsampling_armaComputeLoss`, X, xa, y, ya, tPow) | ||
} | ||
|
||
#' OSS (RcppArmadillo-version) | ||
#' @param x A matrix. | ||
#' @param k Subsample size. | ||
#' @param tPow The power of the loss function. | ||
#' | ||
#' @return Subsample index. | ||
armaOSS <- function(x, k, tPow = 2) { | ||
.Call(`_dbsubsampling_armaOSS`, x, k, tPow) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.