# Common pitfalls

  * The benefit of using R is that coding time is greatly
reduced.
  * But it's very easy to write programs that are
incredibly slow.

In [1]:
# BAD

bad_hit <- function(n = 100) {
    hit = NULL
    for(i in 1:n) {
        if(runif(1) < 0.3) 
            hit[i] = TRUE
        else
            hit[i] = FALSE
    }
}

# GOOD

good_hit <- function(n = 100) {
    hit = runif(n) < 0.3
}

rbenchmark::benchmark(
    good_hit(1000), bad_hit(1000), order = "relative", replications = 100)

test,replications,elapsed,relative,user.self,sys.self,user.child,sys.child
good_hit(1000),100,0.006,1.0,0.005,0.0,0,0
bad_hit(1000),100,0.148,24.667,0.141,0.007,0,0


# AVOID RBIND AT ALL COSTS
### FIRST RULE OF R CLUB IS TO NEVER GROW A VECTOR

* However, a reasonable upper bound on the size of the final object is often known
* So pre-allocate!

In [4]:
# THIS IS BAD

bad_df_add <- function(n) {
    df1 = data.frame(a = character(0), b = numeric(0))
    for(i in 1:n)
        df1 = rbind(df1, 
                data.frame(a = sample(letters, 1), b = runif(1)))
    return(df1)
}

# THIS IS GOOD

good_df_add <- function(n) {
    df2 <- tibble::tibble(a = character(n), b = numeric(n))
    
    for(i in 1:10) {
        df2[i, "a"] <- sample(letters, 1)
        df2[i, "b"] <- runif(1)
    }
    return(df2)
}

z <- rbenchmark::benchmark(
    good_df_add(1000), bad_df_add(1000), 
    order = "relative", replications = 5)

print(z)

               test replications elapsed relative user.self sys.self user.child
1 good_df_add(1000)            5   0.021    1.000     0.021    0.000          0
2  bad_df_add(1000)            5   2.278  108.476     2.259    0.015          0
  sys.child
1         0
2         0


## Always Vectorize

In [5]:
x = rnorm(10)
ans = NULL
for(i in 1:length(x)) {
    if(x[i] < 0) 
        ans = c(ans, x[i])
}

# GOOD VECTORIZE

x = x[x < 0]