In [1]:
library('tidyverse'); packageVersion('tidyverse')
library("phyloseq"); packageVersion('phyloseq')
library("ggpubr"); packageVersion('ggpubr')
library("vegan"); packageVersion('vegan')
library("MASS"); packageVersion('MASS')
library("scales"); packageVersion('scales')
library("picante"); packageVersion('picante')
library("caret"); packageVersion('caret')
library("AppliedPredictiveModeling"); packageVersion('AppliedPredictiveModeling')
library("ranger"); packageVersion('ranger')
library("e1071"); packageVersion('e1071')
library("randomForest"); packageVersion('randomForest')
library("alluvial"); packageVersion('alluvial')

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



[1] ‘1.3.1’

[1] ‘1.36.0’

[1] ‘0.4.0’

Loading required package: permute

Loading required package: lattice

This is vegan 2.5-7



[1] ‘2.5.7’


Attaching package: ‘MASS’


The following object is masked from ‘package:dplyr’:

    select




[1] ‘7.3.54’


Attaching package: ‘scales’


The following object is masked from ‘package:purrr’:

    discard


The following object is masked from ‘package:readr’:

    col_factor




[1] ‘1.1.1’

Loading required package: ape


Attaching package: ‘ape’


The following object is masked from ‘package:ggpubr’:

    rotate


Loading required package: nlme


Attaching package: ‘nlme’


The following object is masked from ‘package:dplyr’:

    collapse




[1] ‘1.8.2’


Attaching package: ‘caret’


The following object is masked from ‘package:vegan’:

    tolerance


The following object is masked from ‘package:purrr’:

    lift




[1] ‘6.0.90’

[1] ‘1.1.7’

[1] ‘0.13.1’

[1] ‘1.7.9’

randomForest 4.6-14

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ranger’:

    importance


The following object is masked from ‘package:dplyr’:

    combine


The following object is masked from ‘package:ggplot2’:

    margin




[1] ‘4.6.14’

[1] ‘0.1.2’

In [2]:
# Global plot theme and colour palettes shared by the figures in this notebook
theme_set(theme_pubr())

# One vector of hex colours per experimental factor
rootstock_palette <- c(
  '#1b9e77',
  '#f0a4af',
  '#7570b3'
)
scion_palette <- c(
  '#ed254e',
  '#0e79b2'
)
site_palette <- c(
  '#e6ab02',
  '#281c39',
  '#12664c'
)
# Compartment colours taken from https://lospec.com/palette-list/famicube
compartment_palette <- c(
  "#5a1991",
  "#139d08",
  "#5c3c0d"
)
# General-purpose 12-colour palette (named "safe_colorblind" by the author)
safe_colorblind_palette <- c(
  "#88CCEE", "#CC6677", "#DDCC77", "#AA4499",
  "#332288", "#117733", "#661100", "#999933",
  "#44AA99", "#882255", "#6699CC", "#888888"
)

# Fix the RNG state so the random steps of the analysis are reproducible
set.seed(1154829343)

In [3]:
# Functions
# Return the sample metadata of a phyloseq object as a plain data.frame,
# e.g. for use with vegan.
#   physeq: a phyloseq object
# Approach from:
# https://jacobrprice.github.io/2017/08/26/phyloseq-to-vegan-and-back.html
pssd2veg <- function(physeq) {
  as(sample_data(physeq), "data.frame")
}
# Boxplot (with jittered points) of the abundance of one ASV per compartment,
# filled by rootstock.
#   physeq_obj: phyloseq object; the OTU table is indexed by row below, so
#               taxa are expected to be rows -- TODO confirm for new inputs
#   ASV_number: row index (or row name) of the ASV in the OTU table
# Uses the global `rootstock_palette` for the fill colours and returns the
# ggplot object.
plot_deseq2_DiffAbunMicob <- function(physeq_obj, ASV_number){
  sample_info <- pssd2veg(physeq_obj)
  asv_counts <- as(otu_table(physeq_obj), "matrix")[ASV_number,]
  plot_data <- cbind(sample_info, data.frame(ASV_count = asv_counts))
  # Optional linear model kept for reference (not run):
  #Anova(lm(ASV_count ~ Compartment*Rootstock + Compartment*Irrigation + Rootstock*Irrigation + Block, data = TEMP2), type = "III")
  text_theme <- theme(
    legend.position = "right",
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    plot.title = element_text(size = 22)
  )
  ggplot(plot_data, aes(plant_body_site, ASV_count, fill = rootstock)) +
    # Outliers are hidden on the boxes because every point is drawn anyway
    geom_boxplot(outlier.shape = NA) +
    geom_point(position = position_jitterdodge(jitter.width = 0.2)) +
    scale_fill_manual(name = "Rootstock", values = rootstock_palette) +
    scale_y_continuous(name = "Abundance") +
    xlab("Compartment") +
    text_theme
}

# Plot a confusion matrix as a ggplot tile heatmap.
#
# Adapted from an answer by Enrique Perez Herrero on
# https://stackoverflow.com/questions/46063234/how-to-produce-a-confusion-matrix-and-find-the-misclassification-rate-of-the-na%C3%AF
#
# Arguments:
#   m  a confusion-matrix object exposing `$table` (contingency table whose
#      data.frame form has `Prediction` and `Reference` columns) and
#      `$overall` (first element: accuracy, second: kappa), e.g. the result
#      of caret::confusionMatrix().
#
# Returns a ggplot object. Tiles are filled by the proportion of each cell
# within its row of `m$table`; the title reports accuracy and kappa.
ggplotConfusionMatrix <- function(m){
  mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
                   "Kappa", percent_format()(m$overall[2]))

  # Divide every cell by its row total. prop.table() replaces the deprecated
  # dplyr::funs()/mutate_if()/gather() pipeline; as.vector() unrolls the
  # result column by column, which is the same order as.data.frame() uses
  # for a table, so the proportions line up with the Freq rows.
  # Rows that sum to zero give NaN and are drawn with `na.value` below.
  row_proportions <- as.vector(prop.table(m$table, margin = 1))
  Y <- cbind(as.data.frame(m$table), Proportion = row_proportions)
  Y$Reference <- fct_rev(Y$Reference) # Reverse levels to get a downward diagonal

  p <-
    ggplot(data = Y, aes(x = Reference, y = Prediction, fill = Proportion)) +
    geom_tile(colour = "white") +
    scale_fill_gradient(low = "white", high = "#14A02E", na.value = "white", limits = c(0, 1)) +
    ggtitle(mytitle) +
    theme(legend.position = "right", axis.text.x = element_text(angle = 60, hjust = 1)) +
    guides(fill = guide_colorbar(frame.colour = "black", ticks = FALSE))
  return(p)
}

In [5]:
# Random-forest classification (ranger via caret) of one sample-metadata
# factor from the ASV table of a phyloseq object.
#
# The samples are split at random into a training and a test set, a ranger
# random forest is tuned on the training set with 10-fold cross-validation,
# and the tuned model is evaluated on the held-out test samples.
#
# Arguments:
#   PHYSEQ_OBJ_1    phyloseq object. Its OTU table is transposed before use,
#                   so taxa are expected to be rows -- TODO confirm for new
#                   inputs. Its sample data must provide the columns year,
#                   scion, rootstock, plant_body_site, site and brix_2_breaks.
#   GROUPING        name of the label to predict; one of "Compartment",
#                   "Site", "Scion", "Year", "Rootstock",
#                   "Compartment_Rootstock" or "Brix".
#   TREES           number of trees, forwarded to ranger as `num.trees`.
#   TRAIN_FRACTION  fraction of samples used for training. The default 0.8
#                   reproduces the previously hard-coded 475 of 594 samples.
#
# Returns a list with elements
#   RF_model      the caret `train` object
#   CMatrix       caret confusionMatrix on the test set
#   CMatrixPLOT   ggplot heatmap of CMatrix (via ggplotConfusionMatrix)
#   VarImporance  caret varImp of the model (key spelling kept for callers)
#
# Interactions between factors are modelled by pasting two metadata columns
# into one label, as done for Compartment_Rootstock below.
MachineLearning_RF_ranger <- function(PHYSEQ_OBJ_1, GROUPING, TREES, TRAIN_FRACTION = 0.8) {
    meta_cols <- c("Compartment", "Site", "Scion", "Year", "Rootstock",
                   "Compartment_Rootstock", "Brix")
    stopifnot(
        is.character(GROUPING), length(GROUPING) == 1, GROUPING %in% meta_cols,
        is.numeric(TRAIN_FRACTION), length(TRAIN_FRACTION) == 1,
        TRAIN_FRACTION > 0, TRAIN_FRACTION < 1
    )

    # Pull the ASV table and the sample metadata out of the phyloseq object
    ASV.df <- as.data.frame(otu_table(PHYSEQ_OBJ_1))
    ASV_metadata.df <- as.data.frame(sample_data(PHYSEQ_OBJ_1))
    # Samples in rows, ASVs in columns for the machine learning functions
    ASV.df <- t(ASV.df)
    ASV_meta.df <- data.frame(Sample = rownames(ASV_metadata.df), Year = ASV_metadata.df$year, Scion = ASV_metadata.df$scion,
                              Rootstock = ASV_metadata.df$rootstock, Compartment = ASV_metadata.df$plant_body_site, Site = ASV_metadata.df$site,
                              Brix = ASV_metadata.df$brix_2_breaks,
                              Compartment_Rootstock = paste(ASV_metadata.df$plant_body_site, ASV_metadata.df$rootstock, sep = "_"))

    # Plain random train/test split. A per-level (stratified) split was
    # abandoned because the design is imbalanced for some factors.
    n_samples <- nrow(ASV.df)
    stopifnot(nrow(ASV_meta.df) == n_samples)
    n_train <- round(TRAIN_FRACTION * n_samples)
    stopifnot(n_train >= 1, n_train < n_samples)
    # Only the sample names are needed, so sample the (small) metadata table
    # instead of a cbind of the whole ASV table.
    train_samples <- sample_n(ASV_meta.df, n_train)$Sample
    train_index <- match(train_samples, rownames(ASV.df))
    # A miss means OTU-table and metadata sample names disagree
    stopifnot(!anyNA(train_index))
    test_index <- setdiff(seq_len(n_samples), train_index)

    # Build the feature table and the matching label table for a set of rows.
    # merge() orders both by Sample, so rows of `x` and `meta` correspond.
    build_set <- function(rows) {
        features <- as.data.frame(ASV.df[rows, ])
        features$Sample <- rownames(features)
        merged <- merge(features, ASV_meta.df, by = "Sample")
        # Set row names before Sample is dropped so both tables keep them
        rownames(merged) <- merged$Sample
        feature_cols <- setdiff(colnames(merged), c("Sample", meta_cols))
        list(x = merged[, feature_cols, drop = FALSE],
             meta = merged[, meta_cols, drop = FALSE])
    }
    training <- build_set(train_index)
    testing <- build_set(test_index)
    train_x <- training$x
    Training_meta.df <- training$meta
    test_y <- testing$x
    Testing_meta.df <- testing$meta

    # Tuning grid: mtry from 10 to the number of features in steps of ~10 %.
    # Start and step are clamped so tables with fewer than 10 features still
    # yield a valid sequence; with 10 or more the grid is unchanged.
    n_features <- length(train_x)
    mtry_values <- seq(min(10, n_features), n_features, by = max(1, round(n_features * 0.1)))
    Training_grid <- expand.grid(.mtry = mtry_values, .splitrule = "gini",
                                 .min.node.size = c(1, 5, 10))
    train_control <- trainControl(method = "cv", number = 10)

    RF_CM <- list()
    # Labels are coerced to factor here as they already are for the test set
    RF_CM[["RF_model"]] <- train(x = train_x, y = as.factor(Training_meta.df[[GROUPING]]), method = "ranger", importance = "impurity",
                                 tuneGrid = Training_grid, trControl = train_control, num.trees = TREES)
    RF_prediction_3 <- predict(RF_CM[["RF_model"]], test_y)
    RF_CM[["CMatrix"]] <- confusionMatrix(RF_prediction_3, as.factor(Testing_meta.df[[GROUPING]]), mode = "everything")
    RF_CM[["CMatrixPLOT"]] <- ggplotConfusionMatrix(RF_CM[["CMatrix"]])
    RF_CM[["VarImporance"]] <- varImp(RF_CM[["RF_model"]])
    return(RF_CM)
}

In [6]:
# Load the phyloseq dataset (presumably the variance-stabilised 16S data
# without soil samples, going by the file name)
phy_vst <- readRDS("phyloseq_16s_no_soil_filtered_vst_dataset.rds")

# Bin brix into two classes: values up to and including 7 are labelled
# "Pre-ripening", larger values "Ripening".
# TODO: check whether these are the terms to use.
phy_vst@sam_data$brix_2_breaks <- cut(
  phy_vst@sam_data$brix,
  breaks = c(-Inf, 7, Inf),
  labels = c("Pre-ripening", "Ripening")
)

# Overview of every sample-metadata column
summary(sample_data(phy_vst))

     barcode.sequence            LinkerPrimerSequence extraction_num
 AACACATGGGTT:  1     GTGTGYCAGCMGCCGCGGTAA:594       10     :  1   
 AACAGGTCTCTG:  1                                     100    :  1   
 AACATTGCAGGT:  1                                     101    :  1   
 AACGAATACCAC:  1                                     103    :  1   
 AACGACACGCTT:  1                                     104    :  1   
 AACGCGAAATTC:  1                                     105    :  1   
 (Other)     :588                                     (Other):588   
  date_collect          extract_date col_week           block    
 Min.   :2018-06-19   11212019: 24   1:115    089 OLCESE 24: 44  
 1st Qu.:2018-07-10   1302020 : 24   2:117    005 SEC D    : 43  
 Median :2018-08-01   7022020 : 24   4:123    103 SEC B    : 43  
 Mean   :2018-12-04   7072020 : 24   6:122    018 CARPENTER: 42  
 3rd Qu.:2019-06-27   7092020 : 24   7:117    RIP 760      : 42  
 Max.   :2019-07-25   7302020 : 24            (Other

In [None]:
# Sweep the number of trees (odd values from 1 to 501) and record, for each of
# the six classification targets, the error rate of the best tuning setting.
#
# The recorded value is 1 - max(Accuracy) over caret's resampling results,
# i.e. a cross-validation error, not a ranger out-of-bag error (the output
# file name is kept unchanged for downstream code).
# Each call to MachineLearning_RF_ranger draws its own random train/test
# split, so the six models of one iteration are not fitted on the same split.
tree_counts <- seq(1, 501, by = 2)

# Preallocated and indexed by tree count, so datalist[[i]] holds the result
# for i trees and the even positions stay NULL (same layout as before).
datalist <- vector("list", max(tree_counts))

for (i in tree_counts) {
    print(i)
    # Run models
    RF_1.1 <- MachineLearning_RF_ranger(phy_vst, "Site", TREES = i)
    RF_1.2 <- MachineLearning_RF_ranger(phy_vst, "Rootstock", TREES = i)
    RF_1.3 <- MachineLearning_RF_ranger(phy_vst, "Compartment", TREES = i)
    RF_1.4 <- MachineLearning_RF_ranger(phy_vst, "Year", TREES = i)
    RF_1.5 <- MachineLearning_RF_ranger(phy_vst, "Scion", TREES = i)
    RF_1.6 <- MachineLearning_RF_ranger(phy_vst, "Brix", TREES = i)

    # Record error rate of models as a 6 x 1 matrix, one row per model
    models <- list(RF1.1 = RF_1.1, RF1.2 = RF_1.2, RF1.3 = RF_1.3,
                   RF1.4 = RF_1.4, RF1.5 = RF_1.5, RF1.6 = RF_1.6)
    cv_error <- vapply(
        models,
        function(fit) 1 - max(fit[["RF_model"]]$results$Accuracy),
        numeric(1)
    )
    X <- matrix(cv_error, ncol = 1, dimnames = list(names(models), NULL))

    datalist[[i]] <- X
    message("Done with trees equal to ", i)
}

save(datalist, file="NumTree_OBBerror_6models.Rda")

[1] 1


Done with trees equal to1



[1] 3


Done with trees equal to3



[1] 5


Done with trees equal to5



[1] 7


Done with trees equal to7



[1] 9


Done with trees equal to9



[1] 11


Done with trees equal to11



[1] 13


Done with trees equal to13



[1] 15


Done with trees equal to15



[1] 17


Done with trees equal to17



[1] 19


Done with trees equal to19



[1] 21


Done with trees equal to21



[1] 23


Done with trees equal to23



[1] 25


Done with trees equal to25



[1] 27


Done with trees equal to27



[1] 29


Done with trees equal to29



[1] 31


Done with trees equal to31



[1] 33


Done with trees equal to33



[1] 35


Done with trees equal to35



[1] 37


Done with trees equal to37



[1] 39


Done with trees equal to39



[1] 41


Done with trees equal to41



[1] 43


Done with trees equal to43



[1] 45


Done with trees equal to45



[1] 47


Done with trees equal to47



[1] 49


Done with trees equal to49



[1] 51


Done with trees equal to51



[1] 53


Done with trees equal to53



[1] 55


Done with trees equal to55



[1] 57


Done with trees equal to57



[1] 59


Done with trees equal to59



[1] 61


Done with trees equal to61



[1] 63


Done with trees equal to63



[1] 65


Done with trees equal to65



[1] 67


Done with trees equal to67



[1] 69


Done with trees equal to69



[1] 71


Done with trees equal to71



[1] 73


Done with trees equal to73



[1] 75


Done with trees equal to75



[1] 77


Done with trees equal to77



[1] 79


Done with trees equal to79



[1] 81


Done with trees equal to81



[1] 83


Done with trees equal to83



[1] 85


Done with trees equal to85



[1] 87


Done with trees equal to87



[1] 89


Done with trees equal to89



[1] 91


Done with trees equal to91



[1] 93


Done with trees equal to93



[1] 95


Done with trees equal to95



[1] 97


Done with trees equal to97



[1] 99


Done with trees equal to99



[1] 101


Done with trees equal to101



[1] 103


Done with trees equal to103



[1] 105


Done with trees equal to105



[1] 107


Done with trees equal to107



[1] 109


Done with trees equal to109



[1] 111


Done with trees equal to111



[1] 113


Done with trees equal to113



[1] 115


Done with trees equal to115



[1] 117


Done with trees equal to117



[1] 119
