In [12]:
library(vegan)
library(microbiome)
library(tidyverse)
library(here)
library(ranger)
library(caret)

In [13]:
# load data and helper functions
source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/bayesian_helper.R")
source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/mb_helper.R")
source("https://raw.githubusercontent.com/HenrikEckermann/in_use/master/reporting.R")

In [14]:
load(here("data/data_transfer.RData"))
source(here("R/read.R"))
# take over the meta variables I created in other docs
meta_new <- data_transfer[, 1:9] 
# create new pseq object (read.R results in the object "genus" Leo created)
otu <- otu_to_df(genus, transpose = FALSE)
otu <- otu %>% 
    select(species, meta_new$sample_id) %>% 
    df_to_otu()
pseq <- phyloseq(otu, df_to_sd(meta_new), tax_table(genus))
# clr and relative abundance transformation to deal with compositionality of mb data
pseq.clr <- microbiome::transform(pseq, transform = "clr")
otus.clr <- otu_to_df(pseq.clr)
colnames(otus.clr)[which(colnames(otus.clr) == "Clostridium \\(sensu stricto\\)")] <- "Clostridium_sensu_stricto"
colnames(otus.clr) <- c("sample_id", gsub("_", "", colnames(otus.clr)[-1]))
colnames(otus.clr) <- gsub("\\.", "", colnames(otus.clr))
colnames(otus.clr) <- gsub(" ", "", colnames(otus.clr))
# final object for analysis:
df <- sd_to_df(pseq.clr) %>%
    select(cc, time, sample_id) %>%
    left_join(otus.clr, by = "sample_id") 

# next we randomly select 80% of each groups/timepoint as training data
ccpre <- df %>% filter(cc == "yes", time == "pre") %>% select(sample_id)
ccpost <- df %>% filter(cc == "yes", time == "post") %>% select(sample_id)
noccpre <- df %>% filter(cc == "no", time == "pre") %>% select(sample_id)
noccpost <- df %>% filter(cc == "no", time == "post") %>% select(sample_id)
# splitting
train_id <- c(
  sample(ccpre$sample_id, 39, replace = FALSE),
  sample(ccpost$sample_id, 39, replace = FALSE),
  sample(noccpre$sample_id, 39, replace = FALSE),
  sample(noccpost$sample_id, 39, replace = FALSE)
)
test_id <- df$sample_id[!df$sample_id %in% train_id]
# final object for analysis:
df_train <- df %>%
    filter(sample_id %in% train_id) %>%
    select(-sample_id) %>%
    group_by(time) %>%
    nest()
df_test <- df %>%
    filter(sample_id %in% test_id) %>%
    select(-sample_id) %>%
    group_by(time) %>%
    nest()   
df_train$data <- df_train$data %>% setNames(c("post", "pre"))
df_test$data <- df_test$data %>% setNames(c("post", "pre"))

“Setting class(x) to multiple strings ("tbl_df", "tbl", ...); result will no longer be an S4 object”

In [9]:
# tune hyperparameters using grid_search 
hyper_grid <- expand.grid(
  mtry = seq(2, 130, by = 4),
  node_size = seq(1, 11, by =2),
  samp_size = c(0.55, 0.632, 0.7, 0.8)
)
hyper_grid$rn <- 1:nrow(hyper_grid)
hyper_grid <- hyper_grid %>% group_by(rn) %>% nest()

rf_ranger <- function(x, time = "pre") {
  model <- ranger(
    data = df_train$data[[time]],
    formula = cc ~ .,
    num.trees = 60000,
    mtry = x$mtry,
    min.node.size = x$node_size,
    sample.fraction = x$samp_size,
    importance = "impurity",
    probability = FALSE,
    keep.inbag = TRUE,
    seed = 123)
  return(model$prediction.error)
}
hyper_grid$oob_post <- map_dbl(hyper_grid$data, rf_ranger, "post")
hyper_grid$oob_pre <- map_dbl(hyper_grid$data, rf_ranger, "pre")

# what is the index of the model that resulted in lowest OOB?
post_id <- which.min(hyper_grid$oob_post)
pre_id <- which.min(hyper_grid$oob_pre)
hyper_grid$data[post_id]
hyper_grid$data[pre_id]
# what are the actual OOBs?
hyper_grid$oob_post[post_id]
hyper_grid$oob_pre[pre_id]

mtry,node_size,samp_size
26,7,0.55


mtry,node_size,samp_size
30,3,0.55


In [1]:
# fit models with the tuned parameters and we use high tree size
models <- map2(df_train$data, c(post_id, pre_id), ~ranger(
  data = .x,
  formula = cc ~ .,
  num.trees = 100000,
  mtry = hyper_grid$data[[.y]]$mtry,
  min.node.size = hyper_grid$data[[.y]]$node_size,
  sample.fraction = hyper_grid$data[[.y]]$samp_size,
  importance = "impurity",
  probability = FALSE,
  keep.inbag = TRUE,
))
models$post
imp_post <- models$post$variable.importance %>% 
  as.data.frame() %>%
  rownames_to_column("genus")
models$pre 
imp_pre <- models$pre$variable.importance %>% 
  as.data.frame() %>%
  rownames_to_column("genus")
imp_pre
map(list(imp_post, imp_pre), function(x){
  colnames(x) <- c("genus", "importance")
  x %>% arrange(desc(importance)) %>%
  top_n(10) %>%
  ggplot(aes(reorder(genus, importance), importance)) +
    geom_col() +
    coord_flip()
})

ERROR: Error in map2(df_train$data, c(post_id, pre_id), ~ranger(data = .x, formula = cc ~ : could not find function "map2"


In [42]:
preds <- map2(
    .x = models, 
    .y = df_test$data, 
    ~predict(.x, data = .y))

In [51]:
cm <- map2(
    preds, 
    df_test$data,
    ~caret::confusionMatrix(
        data = .x$predictions,
        reference = .y$cc)
)
map(cm, print)

Confusion Matrix and Statistics

          Reference
Prediction no yes
       no   4   4
       yes  6   6
                                        
               Accuracy : 0.5           
                 95% CI : (0.272, 0.728)
    No Information Rate : 0.5           
    P-Value [Acc > NIR] : 0.5881        
                                        
                  Kappa : 0             
 Mcnemar's Test P-Value : 0.7518        
                                        
            Sensitivity : 0.4           
            Specificity : 0.6           
         Pos Pred Value : 0.5           
         Neg Pred Value : 0.5           
             Prevalence : 0.5           
         Detection Rate : 0.2           
   Detection Prevalence : 0.4           
      Balanced Accuracy : 0.5           
                                        
       'Positive' Class : no            
                                        
Confusion Matrix and Statistics

          Reference
Prediction no yes
  

$post
Confusion Matrix and Statistics

          Reference
Prediction no yes
       no   4   4
       yes  6   6
                                        
               Accuracy : 0.5           
                 95% CI : (0.272, 0.728)
    No Information Rate : 0.5           
    P-Value [Acc > NIR] : 0.5881        
                                        
                  Kappa : 0             
 Mcnemar's Test P-Value : 0.7518        
                                        
            Sensitivity : 0.4           
            Specificity : 0.6           
         Pos Pred Value : 0.5           
         Neg Pred Value : 0.5           
             Prevalence : 0.5           
         Detection Rate : 0.2           
   Detection Prevalence : 0.4           
      Balanced Accuracy : 0.5           
                                        
       'Positive' Class : no            
                                        

$pre
Confusion Matrix and Statistics

          Reference
Predicti

In [52]:
cm

$post
Confusion Matrix and Statistics

          Reference
Prediction no yes
       no   4   4
       yes  6   6
                                        
               Accuracy : 0.5           
                 95% CI : (0.272, 0.728)
    No Information Rate : 0.5           
    P-Value [Acc > NIR] : 0.5881        
                                        
                  Kappa : 0             
 Mcnemar's Test P-Value : 0.7518        
                                        
            Sensitivity : 0.4           
            Specificity : 0.6           
         Pos Pred Value : 0.5           
         Neg Pred Value : 0.5           
             Prevalence : 0.5           
         Detection Rate : 0.2           
   Detection Prevalence : 0.4           
      Balanced Accuracy : 0.5           
                                        
       'Positive' Class : no            
                                        

$pre
Confusion Matrix and Statistics

          Reference
Predicti

In [27]:
library(randomForest)
rftest <- randomForest(
    formula = cc ~.,
    data = df_train$data$pre,
    ntree = 50000
)
rftest


Call:
 randomForest(formula = cc ~ ., data = df_train$data$pre, ntree = 50000) 
               Type of random forest: classification
                     Number of trees: 50000
No. of variables tried at each split: 11

        OOB estimate of  error rate: 62.82%
Confusion matrix:
    no yes class.error
no  13  26   0.6666667
yes 23  16   0.5897436

In [16]:
df_train$data

cc,Actinomycetaceae,Aerococcus,Aeromonas,Akkermansia,Alcaligenesfaecalisetrel,Allistipesetrel,Anaerobiospirillum,Anaerofustis,Anaerostipescaccaeetrel,⋯,UnculturedClostridialesI,UnculturedClostridialesII,UnculturedMollicutes,UnculturedSelenomonadaceae,Veillonella,Vibrio,Weissellaetrel,Wissellaetrel,Xanthomonadaceae,Yersiniaetrel
yes,-1.1487469,-1.3362337,-1.893312,2.3960052,-0.5114675,0.8545251,-1.893489,-1.478015,0.9067342,⋯,1.520613,1.402049,0.7529889,-1.886635,-0.42762701,-0.48058497,-0.63053966,-1.871706,-1.1036079,-0.46743007
yes,-1.2804591,-2.0095689,-2.061294,-1.2705896,0.5855784,2.1011975,-2.045376,-1.970758,0.2746978,⋯,1.358991,1.214345,0.5915842,-2.070707,1.40981337,-0.58935189,-0.76588664,-2.065654,-1.1978460,-0.63461592
yes,-1.0078686,-1.6348779,-1.800270,-1.0039236,-0.4235226,0.9541429,-1.814272,-1.604841,0.6328799,⋯,1.610125,1.466770,0.8462107,-1.808475,-0.31503858,-0.41845211,-0.64204932,-1.821146,-0.9773829,-0.39607439
no,-0.8338622,-1.6955862,-1.803449,-0.9452251,-0.3970180,1.0473193,-1.810424,-1.715313,0.5662785,⋯,1.621011,1.462914,0.8533308,-1.818444,0.41719950,-0.39016152,-0.66813024,-1.779084,-0.9487641,-0.39551659
no,-0.9602640,-1.4372530,-1.763603,-1.0048991,-0.4772602,1.0778727,-1.864315,-1.718843,0.5284222,⋯,1.552633,1.429354,0.7843715,-1.855473,-0.47524421,-0.36170173,-0.51666182,-1.781343,-0.8140790,-0.37485597
no,-1.0818240,-1.2844102,-2.002802,-1.1437791,-0.5125062,0.9840218,-2.010761,-1.138843,1.0516797,⋯,1.413921,1.328260,0.6589038,-2.026250,-0.49128878,-0.50023207,-0.69399722,-1.998362,-1.0717794,-0.55184971
yes,-1.0603349,-1.3446468,-1.956028,-1.1231504,-0.5570571,0.8910310,-1.949803,-1.669304,0.4548830,⋯,1.536855,1.352708,0.7043035,-1.943443,0.96036986,0.08709808,-0.55608884,-1.696683,-1.0782454,-0.54139356
yes,-0.8041767,-1.4324196,-1.858773,-0.9700204,-0.4884715,0.9868843,-1.856663,-1.669955,0.5887133,⋯,1.564167,1.418973,0.7955048,-1.857459,-0.48203082,-0.46252934,-0.65584524,-1.792957,-0.8334733,-0.43635471
no,-1.2054569,-1.6282165,-1.988563,-1.2642101,-0.6255138,0.7502474,-1.994022,-1.586523,0.7698923,⋯,1.428328,1.352994,0.6726102,-1.991499,1.79970313,-0.53638358,-0.57842372,-1.987427,-1.1429772,-0.33670459
yes,-1.2261784,-1.1740109,-2.000033,-1.2105582,-0.6167662,0.9791550,-1.849814,-1.578022,0.3957620,⋯,1.375699,1.317883,0.6109129,-2.040470,-0.24510900,-0.42235522,-0.63036469,-2.008523,-1.0344941,-0.23879870

cc,Actinomycetaceae,Aerococcus,Aeromonas,Akkermansia,Alcaligenesfaecalisetrel,Allistipesetrel,Anaerobiospirillum,Anaerofustis,Anaerostipescaccaeetrel,⋯,UnculturedClostridialesI,UnculturedClostridialesII,UnculturedMollicutes,UnculturedSelenomonadaceae,Veillonella,Vibrio,Weissellaetrel,Wissellaetrel,Xanthomonadaceae,Yersiniaetrel
yes,-0.4083877,-1.823766,-1.744455,-0.8834496,-0.3742030,1.2337616,-1.794903,-1.804735,0.6119538,⋯,1.615525,1.462349,0.8428856,-1.878538,-0.484741672,-0.071064396,-0.60915710,-1.7189240,-0.6601576,-0.27228076
no,-1.4983331,-1.152917,-2.040809,-1.5114909,-0.7205450,0.6260844,-1.920393,-1.579053,0.4225294,⋯,1.212792,1.253614,0.4562151,-2.194052,0.300600456,-0.005020428,-0.46916711,-2.1003279,0.6002183,3.29158199
no,-1.1190081,-1.367617,-1.938402,-0.7168266,-0.5405521,0.9888573,-1.689446,-1.669559,0.4280865,⋯,1.473211,1.367651,0.7107346,-1.943172,-0.391413166,-0.411624418,-0.65865618,-1.9082991,-0.9779541,0.59558408
yes,-1.2510910,-1.829195,-2.035452,-1.2135305,-0.6556321,1.6164157,-2.035418,-1.928480,0.3106494,⋯,1.381603,1.248213,0.5994117,-2.058903,-0.301134150,-0.517800824,-0.81479029,-2.0037250,-1.2150723,-0.62280905
yes,-0.8328623,-1.861506,-1.938176,-1.1469005,-0.6467697,0.9812469,-2.055411,-1.810261,0.6227536,⋯,1.678290,1.372299,0.6733259,-1.860790,-0.446094759,0.843936538,-0.84010573,1.2517813,-0.8614286,-0.37252481
no,-0.8317233,-1.625681,-1.818459,-1.0412487,-0.3583937,1.1188984,-1.819416,-1.790179,0.5473381,⋯,3.652751,1.455171,0.8322639,-1.833505,-0.256969533,-0.296050844,-0.66475829,-1.7864341,-0.8669146,-0.37261309
no,-1.1701427,-1.513191,-2.131774,-1.3121934,-0.8006378,0.6242998,-1.432673,-1.885243,0.2779425,⋯,1.208580,1.191847,0.4814394,-2.223704,-0.136003384,-0.182737794,-0.71045993,-2.1546242,-0.8239231,1.12336646
yes,-0.4120358,-1.697833,-1.794272,-0.9193139,-0.4397619,1.1287813,-1.840462,-1.720021,0.5919487,⋯,1.621543,1.463607,0.8291749,-1.864633,-0.365900193,-0.420281228,-0.63945242,-1.8258182,-0.8718936,-0.37866905
yes,-1.1081240,-1.346810,-1.919874,-0.9141335,-0.5218606,0.8783488,-1.894901,-1.627176,0.8363335,⋯,1.501996,1.392677,0.7462862,-1.913614,4.814101491,-0.428918028,-0.01420404,-1.8818390,-1.1382395,-0.49413590
yes,-0.9777561,-1.832647,-1.932068,-1.0857427,-0.5548277,0.8525952,-1.936831,-1.333395,1.0639072,⋯,1.485420,1.487334,0.7692611,-1.948444,-0.390778696,-0.477433818,-0.41691346,-1.9166916,-1.1243628,-0.51069471
