<a href="https://colab.research.google.com/github/Jinzhao-Yu/BioStat615/blob/main/BIOSTAT615_Lecture_15_Fall_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BIOSTAT615 Lecture 15 - R

## 1. Evaluating implementations with microbenchmark

In [1]:
## Install the two packages used in this lecture:
##   microbenchmark : helps evaluate computational efficiency
##   Rcpp           : compiles and calls C++ code from R
invisible(lapply(c("microbenchmark", "Rcpp"), install.packages))

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [2]:
## Try out microbenchmark: time three equivalent ways of taking an
## element-wise square root of 1000 uniform random numbers
library(microbenchmark)
x <- runif(1000)
print(
  microbenchmark(
    sqrt(x),
    x^0.5,
    exp(0.5 * log(x))
  )
)

Unit: microseconds
              expr    min      lq     mean  median      uq     max neval
           sqrt(x)  2.760  3.7440  5.07596  4.5970  5.6405  22.637   100
             x^0.5 76.777 77.8660 80.41870 79.0940 80.3265 113.130   100
 exp(0.5 * log(x)) 46.074 47.3465 50.62273 48.8075 50.1845 134.198   100


In [3]:
## Loopy implementation of cumulative sum
## cumsum_1: cumulative sum computed with an explicit for loop.
##   x       : a numeric vector (may be empty or of length one)
##   returns : a vector of the same length as x whose i-th element is
##             x[1] + ... + x[i]
cumsum_1 <- function(x) {
  # seq_along(x)[-1] is 2, 3, ..., length(x) and is empty when
  # length(x) < 2. The previous 2:length(x) counted *down* in that case
  # (2, 1 or 2, 1, 0) and failed with "replacement has length zero".
  for (i in seq_along(x)[-1]) {
    x[i] <- x[i] + x[i - 1]
  }
  x
}

In [4]:
## Cumulative sum with global update
## cumsum_2: cumulative sum computed with sapply() and superassignment.
##   x       : a numeric vector (may be empty or of length one)
##   returns : a vector of the same length as x whose i-th element is
##             x[1] + ... + x[i]
cumsum_2 <- function(x) {
  # `<<-` is evaluated inside the anonymous function, so it updates the `x`
  # that belongs to this enclosing call of cumsum_2 (the argument), not a
  # variable in the global environment.
  # seq_along(x)[-1] is empty when length(x) < 2; the previous 2:length(x)
  # counted down in that case and failed.
  sapply(seq_along(x)[-1], function(i) x[i] <<- x[i] + x[i - 1])
  x
}

In [5]:
## Compare three cumulative-sum implementations on a vector of 1000 ones
##   cumsum_1 : for loop with local assignment
##   cumsum_2 : sapply() with superassignment (<<-)
##   cumsum   : built-in function in R
x <- rep(1, length.out = 1000)
print(
  microbenchmark(
    cumsum_1(x),
    cumsum_2(x),
    cumsum(x)
  )
)

Unit: microseconds
        expr     min       lq       mean   median        uq      max neval
 cumsum_1(x) 100.578 102.4185  149.56806  104.023  107.0025 4412.782   100
 cumsum_2(x) 926.497 969.3525 1114.13391 1006.050 1055.2270 6403.558   100
   cumsum(x)   1.279   1.6285    2.29112    2.038    2.5640   11.036   100


## 2. Comparing `for` loop, `sapply`, and `apply`

In [6]:
## x is a 100 x 10,000 matrix filled with standard normal draws
x <- matrix(
  rnorm(1e6),
  nrow = 100,
  ncol = 1e4
)

## sapply_cumsum : use sapply() to run cumsum for each column
##   X       : a numeric matrix
##   returns : for a matrix with two or more rows, a matrix of the same
##             dimensions holding the column-wise cumulative sums
##             (sapply() simplifies to a plain vector when nrow(X) == 1 and
##             gives an empty list when X has no columns)
sapply_cumsum <- function(X) {
  # seq_len() gives an empty index set for a zero-column matrix, whereas
  # 1:ncol(X) would be c(1, 0) and fail with "subscript out of bounds".
  sapply(seq_len(ncol(X)), function(i) cumsum(X[, i]))
}

## apply_cumsum : column-wise cumulative sums of a matrix X, obtained by
## handing each column (MARGIN = 2) to cumsum() through apply()
apply_cumsum <- function(X) {
  apply(X, MARGIN = 2, FUN = cumsum)
}

## loop_cumsum : use a for loop to run cumsum for each column
##   X       : a numeric matrix
##   returns : a matrix with the same dimensions as X whose columns are the
##             cumulative sums of the corresponding columns of X
loop_cumsum <- function(X) {
  # Preallocate the result; every NA is overwritten column by column below.
  ret <- matrix(NA, nrow(X), ncol(X))
  # seq_len() iterates zero times for a zero-column matrix, whereas
  # 1:ncol(X) would visit columns 1 and 0 and fail with
  # "subscript out of bounds".
  for (i in seq_len(ncol(X))) {
    ret[, i] <- cumsum(X[, i])
  }
  ret
}

## Time the three column-wise implementations on the same matrix x
print(microbenchmark(sapply_cumsum(x), apply_cumsum(x), loop_cumsum(x)))

Unit: milliseconds
             expr      min       lq     mean   median       uq      max neval
 sapply_cumsum(x) 27.27676 37.56194 43.54816 40.17622 43.32988 111.5174   100
  apply_cumsum(x) 29.53987 40.85985 53.79164 44.33717 52.03735 128.4414   100
   loop_cumsum(x) 24.22735 31.74534 39.51864 34.56488 38.19176 115.2343   100


## 3. Introducing `Rcpp`

In [7]:
## cumsum_3: cumulative sum implemented in C++ through Rcpp.
##   x       : a numeric (double) vector
##   returns : x after it has been overwritten with its cumulative sums
##             (previously nothing was returned, so the result was only
##             visible through the in-place update)
## NOTE: x is taken by reference and updated in place, so a double vector
## passed from R is modified for the caller as well. An input that is not
## stored as double (e.g. an integer vector) is converted to a temporary
## copy by Rcpp first; in that case only the returned value holds the result.
library(Rcpp)
cppFunction('NumericVector cumsum_3(NumericVector& x) {
  // R_xlen_t is the type returned by x.size(), so no narrowing to int
  const R_xlen_t n = x.size();
  for (R_xlen_t i = 1; i < n; i++) {
    x[i] += x[i - 1];
  }
  return x;
}')

In [8]:
## fn() builds a fresh vector of 1000 ones on every call, so each timed
## expression below receives its own input vector
fn <- function() rep(1, length.out = 1000)
print(
  microbenchmark(
    cumsum_1(fn()),
    cumsum_2(fn()),
    cumsum_3(fn()),
    cumsum(fn()),
    times = 1000
  )
)

Unit: microseconds
           expr     min       lq        mean   median        uq      max neval
 cumsum_1(fn()) 102.520 104.4335  111.900320 106.7125  111.2725  243.510  1000
 cumsum_2(fn()) 916.805 957.0465 1082.620494 975.6150 1011.9340 5791.649  1000
 cumsum_3(fn())   7.531   8.3745   13.095580  10.0100   12.6620 1600.343  1000
   cumsum(fn())   3.475   4.1585    5.058874   4.6020    5.3095   27.862  1000


## 4. Using `cppFunction()`

In [9]:
## add: returns the sum of three integers; compiled from C++ by cppFunction()
library(Rcpp)
cppFunction('int add(int x, int y, int z) {
  return x + y + z;
}')

In [10]:
# add works like a regular R function (but you cannot see inside).
# Evaluating the bare name displays the function object itself.
add

In [11]:
# Call the compiled function; 1 + 2 + 3 should give 6
add(1, 2, 3)

## 5. No input, scalar output

In [12]:
## oneR: takes no arguments and returns the integer constant 1
oneR <- function() {
  1L
}

In [13]:
## oneC: C++ counterpart of oneR(); takes no arguments and returns 1 (int)
cppFunction("int oneC() { return 1; }")

In [14]:
## The R version and the C++ version should both print 1
print(oneR())
print(oneC())

[1] 1
[1] 1


## 6. Scalar input, scalar output

In [15]:
## signR: sign of a scalar x
##   returns 1 when x is positive, 0 when x equals zero, and -1 otherwise
signR <- function(x) {
  if (x > 0) {
    return(1)
  }
  if (x == 0) {
    return(0)
  }
  -1
}

In [16]:
## signC: C++ version of signR(); returns 1, 0 or -1 as an int.
## x is declared double so that fractional values (e.g. 0.5) and values
## outside the int range are compared as given. With an int parameter the
## argument was coerced to integer first, so signC(0.5) returned 0 and
## very large values did not return 1.
cppFunction('int signC(double x) {
  if (x > 0) {
    return 1;
  } else if (x == 0) {
    return 0;
  } else {
    return -1;
  }
}')

In [17]:
## Print the sign of a positive, a zero and a negative input, first with the
## R implementation and then with the C++ implementation
local({
  inputs <- c(100, 0, -100)
  for (value in inputs) print(signR(value))
  for (value in inputs) print(signC(value))
})

[1] 1
[1] 0
[1] -1
[1] 1
[1] 0
[1] -1


## 7. Vector input, scalar output

In [18]:
## sumR: sum of the elements of x, accumulated one element at a time
##   x       : a numeric vector
##   returns : 0 for an empty x, otherwise x[1] + ... + x[length(x)]
sumR <- function(x) {
  running <- 0
  for (pos in seq_along(x)) {
    running <- running + x[pos]
  }
  return(running)
}

In [19]:
## sumC: C++ version of sumR(); adds up the elements of a numeric vector
## from first to last and returns the total as a double
cppFunction('double sumC(NumericVector x) {
  double total = 0;
  const int n = x.size();
  for (int i = 0; i != n; i++) {
    total += x[i];
  }
  return total;
}')

In [20]:
## Which one do you think is faster?
x <- runif(1e3)
print(microbenchmark(sum(x), sumC(x), sumR(x)))

Unit: microseconds
    expr    min      lq     mean  median      uq      max neval
  sum(x)  1.359  1.4105  1.74690  1.4715  1.5515   14.263   100
 sumC(x)  3.421  3.5220 16.12354  3.6320  3.9440 1071.779   100
 sumR(x) 40.317 40.6480 76.82605 40.8205 41.8950 3529.761   100


## 8. Vector input, vector output

In [21]:
## pdistR: distance between x and every element of ys,
## computed element-wise as sqrt((x - ys)^2)
pdistR <- function(x, ys) {
  squared_diff <- (x - ys)^2
  sqrt(squared_diff)
}

In [22]:
## pdistC: C++ version of pdistR(); x is a scalar and ys a numeric vector.
## Returns a numeric vector with sqrt((ys[i] - x)^2) for every element of ys.
cppFunction('NumericVector pdistC(double x, NumericVector ys) {
  const int len = ys.size();
  NumericVector result(len);
  for (int k = 0; k < len; k++) {
    const double diff = ys[k] - x;
    result[k] = sqrt(pow(diff, 2.0));
  }
  return result;
}')

In [23]:
## Which one do you think is faster?
y <- runif(1e6)
print(
  microbenchmark(
    pdistR(0, y),
    pdistC(0, y),
    times = 1000L
  )
)

Unit: milliseconds
         expr      min       lq     mean   median       uq      max neval
 pdistR(0, y) 5.624995 5.785878 6.589199 5.835555 5.957583 74.22311  1000
 pdistC(0, y) 8.267597 8.335563 9.133033 8.368965 8.465787 81.52314  1000


## 9. Matrix input, vector output

In [24]:
## rowSumsC: C++ version of rowSums(); takes a numeric matrix and returns a
## numeric vector with one total per row (columns are added left to right)
cppFunction('NumericVector rowSumsC(NumericMatrix x) {
  const int n_rows = x.nrow();
  const int n_cols = x.ncol();
  NumericVector sums(n_rows);
  for (int r = 0; r < n_rows; ++r) {
    double acc = 0;
    for (int c = 0; c < n_cols; ++c) {
      acc += x(r, c);
    }
    sums[r] = acc;
  }
  return sums;
}')

In [25]:
## Fix the RNG seed, then time base rowSums() against rowSumsC(); fn() builds
## a fresh 10-row matrix from a random permutation of 1..100 on every call
set.seed(2022)
fn <- function() matrix(sample(100), nrow = 10)
print(
  microbenchmark(
    rowSums(fn()),
    rowSumsC(fn()),
    times = 10000L
  )
)

Unit: microseconds
           expr    min     lq     mean  median      uq      max neval
  rowSums(fn()) 19.071 20.684 25.19209 21.5990 23.4680 9987.783 10000
 rowSumsC(fn()) 15.970 17.640 22.17528 18.5865 20.5175 9721.043 10000


## 10. Using `sourceCpp()`

In [26]:
## This is just to write a C++ file from within R.
## Usually, you would write the C++ file in a separate editor.
## The file defines meanC(), which sums the elements of x and divides the
## total by the number of elements; the "// [[Rcpp::export]]" line is the
## Rcpp attribute placed directly above the function definition.
## The literal "\n" after the loop's closing brace is expanded by R inside
## the string, so the written file has one extra blank line at that point.
cat("#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double meanC(NumericVector x) {
  int n = x.size();
  double total = 0;
  for(int i = 0; i < n; ++i) { 
    total += x[i];
  }\n
  return total / n;
}",file="example.cpp")

In [27]:
## Show that the file was written and saved: run the shell command `ls -l`,
## capture its output lines (intern=TRUE) and print them one per line.
## NOTE(review): assumes an `ls` program is available on the system.
cat(system("ls -l",intern=TRUE),sep="\n")

total 8
-rw-r--r-- 1 root root  209 Nov 30 05:14 example.cpp
drwxr-xr-x 1 root root 4096 Nov 22 00:14 sample_data


In [28]:
## Show the contents of the file, one line of the file per printed line.
## readLines() reads the file directly in R, so no external `cat` program is
## required; warn = FALSE silences the "incomplete final line" warning for a
## file that does not end with a newline.
cat(readLines("example.cpp", warn = FALSE), sep = "\n")

#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double meanC(NumericVector x) {
  int n = x.size();
  double total = 0;
  for(int i = 0; i < n; ++i) { 
    total += x[i];
  }

  return total / n;
}


In [29]:
## Compile the C++ file written above with Rcpp's sourceCpp(); afterwards
## meanC() can be called from R (it is used in the benchmark that follows)
sourceCpp("example.cpp")

In [30]:
## Time the compiled meanC() against base R mean() on one million uniforms
x <- runif(1e6)
print(
  microbenchmark(
    meanC(x),
    mean(x)
  )
)

Unit: milliseconds
     expr      min       lq     mean   median       uq      max neval
 meanC(x) 1.521025 1.549222 1.591125 1.565568 1.584999 3.072309   100
  mean(x) 2.304500 2.352338 2.422401 2.385559 2.435685 3.294785   100


## 11. Attributes and other classes

All R objects have attributes, which can be queried and modified in `Rcpp` with `.attr()`. `Rcpp` also provides `.names()` as an alias for the `names` attribute.

In [31]:
## attribs: builds the vector (1, 2, 3) in C++ and attaches, in this order,
## element names a/b/c, a custom attribute "my-attr" and a class attribute
cppFunction('NumericVector attribs() {
  NumericVector values = NumericVector::create(1, 2, 3);
  CharacterVector labels = CharacterVector::create("a", "b", "c");
  values.names() = labels;
  values.attr("my-attr") = "my-value";
  values.attr("class") = "my-class";
  return values;
}')

In [32]:
## Display the returned vector together with the attributes set in C++
attribs()

a b c 
1 2 3 
attr(,"my-attr")
[1] "my-value"
attr(,"class")
[1] "my-class"

## 12. Lists and data frames

The following code illustrates how you might extract the mean percentage error `mpe()` of a linear model. 


In [33]:
## mpe: mean percentage error of a fitted linear model.
##   mod     : an object inheriting from class "lm"; any other input raises
##             the error "Input must be a linear model"
##   returns : the average over all observations of
##             residual / (fitted value + residual)
cppFunction('double mpe(List mod) {
  if (!mod.inherits("lm")) {
    stop("Input must be a linear model");
  }
  NumericVector residuals = as<NumericVector>(mod["residuals"]);
  NumericVector fitted_values = as<NumericVector>(mod["fitted.values"]);

  const int n_obs = residuals.size();
  double total = 0;
  for (int k = 0; k < n_obs; k++) {
    // denominator: fitted value plus residual for observation k
    const double denom = fitted_values[k] + residuals[k];
    total += residuals[k] / denom;
  }
  return total / n_obs;
}')

Note the use of `.inherits()` and `stop()` to check that the object really is a linear model.


In [34]:
## Simulate y = 1 + 2x + noise for 1000 observations, fit a linear model,
## and compute its mean percentage error with the C++ function
set.seed(2021)
x <- rnorm(1000)
y <- 1 + 2 * x + rnorm(1000)
fit <- lm(y ~ x)
mpe(fit)

### Functions

You can put `R` functions in an object of type `Function`. This makes calling an `R` function from `C++` straightforward. 

In [35]:
## callfun: calls the R function f from C++ with the single argument n and
## hands back whatever f returns (held in an RObject, so any type fits)
cppFunction(
'RObject callfun(Function f, int n) {
  RObject result = f(n);
  return result;
}'
)

What if we don't know what type of object an `R` function returns?
Use `RObject`, which can hold an object of any type.

In [36]:
## Time calling seq_along() through the C++ wrapper against calling it
## directly from R.
## NOTE(review): seq_along(100) is the sequence along a length-one vector,
## i.e. 1L, not 1:100 -- seq_len(100) may have been intended; confirm.
print(microbenchmark(callfun(seq_along,100),seq_along(100)))

Unit: nanoseconds
                    expr   min      lq     mean median      uq     max neval
 callfun(seq_along, 100) 14706 15669.5 31364.34  16096 16586.5 1422721   100
          seq_along(100)    70    80.0   106.94     91    99.0     560   100


In [37]:
## The RObject returned by callfun() keeps the type produced by the R
## function; for seq_along this prints "integer"
print(typeof(callfun(seq_along,100)))

[1] "integer"


In [38]:
## Time drawing 100 normal variates through the C++ wrapper against calling
## rnorm() directly from R
print(microbenchmark(callfun(rnorm,100),rnorm(100)))

Unit: microseconds
                expr    min      lq     mean  median      uq    max neval
 callfun(rnorm, 100) 24.800 26.2145 28.53419 27.1665 28.4345 91.392   100
          rnorm(100)  8.878  9.7765 10.93568 10.3930 11.3110 27.663   100
