This part of the pipeline estimates the genomic divergence rate of each rRNA lineage using Panstripe.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Gene presence/absence tables (.Rtab) from the pangenome step:
# one per group, plus the merged set.
pa.1.file   <- "./05-pangenomes/group1/gene_presence_absence.Rtab"
pa.4.file   <- "./05-pangenomes/group4/gene_presence_absence.Rtab"
pa.14a.file <- "./05-pangenomes/group14a/gene_presence_absence.Rtab"
pa.14b.file <- "./05-pangenomes/group14b/gene_presence_absence.Rtab"
pa.m.file   <- "./05-pangenomes/merge/gene_presence_absence.Rtab"

# Subtree files (.contree) from the core-phylogeny step, matching the
# groups above.
tree.1.file   <- "./08-core-phylogeny/subtrees/group1.contree"
tree.4.file   <- "./08-core-phylogeny/subtrees/group4.contree"
tree.14a.file <- "./08-core-phylogeny/subtrees/group14a.contree"
tree.14b.file <- "./08-core-phylogeny/subtrees/group14b.contree"
tree.m.file   <- "./08-core-phylogeny/subtrees/merge.contree"

# Genome metadata table (read further down as tab-separated with a header).
metadata <- "./genomes_metadata"

#### Pipeline output folders

In [None]:
# Folder that receives every figure and the saved workspace of this step.
task_root <- "09-temporalAnalysis"

# Create the folder from R itself instead of shelling out to `mkdir -p`:
# this works on every platform and does not pass the path through a shell.
# showWarnings = FALSE keeps re-runs quiet when the folder already exists.
dir.create(task_root, recursive = TRUE, showWarnings = FALSE)

# Fail early and loudly if the folder could not be created, rather than
# letting the first svg()/save.image() call further down fail instead.
if (!dir.exists(task_root)) {
  stop("Could not create output folder: ", task_root, call. = FALSE)
}

#### Tool pointers and parameters

In [None]:
# Fix the random number generator state so that re-running the notebook
# reproduces the same random draws.
# NOTE(review): presumably needed for resampling inside panstripe - confirm.
set.seed(seed = 127)

In [None]:
library(panstripe)
library(ape)
library(ggplot2)

### Load files and metadata

#### Cluster annotation

In [None]:
# Read the tab-separated genome metadata table (first row is the header).
clusters <- read.table(metadata, sep = "\t", header = TRUE)

# Keep only the genome identifier and its taxonomic cluster assignment.
clusters <- clusters[, c("Genome_accession", "Taxonomic_cluster")]

# Show the resulting two-column table.
clusters

#### Presence/absence files

In [None]:
# Load the gene presence/absence table of each group, and of the merged set,
# with panstripe's read_rtab().
pa.1   <- read_rtab(pa.1.file)
pa.4   <- read_rtab(pa.4.file)
pa.14a <- read_rtab(pa.14a.file)
pa.14b <- read_rtab(pa.14b.file)
pa.m   <- read_rtab(pa.m.file)

In [None]:
# Sanity check: number of rows in each presence/absence table.
# NOTE(review): rows are presumably genomes (the row names are matched against
# Genome_accession further down) - confirm against read_rtab().
nrow(pa.1)
nrow(pa.4)
nrow(pa.14a)
nrow(pa.14b)
nrow(pa.m)

#### Cluster annotation for rRNA clusters

In [None]:
# Taxonomic cluster of every row of the merged presence/absence table, as a
# factor in the same order as rownames(pa.m).
# The lookup runs inside local() so its helper variables do not end up in the
# global workspace.
clusters_fac <- local({
  # Position of each pa.m row name in the metadata table (NA when absent).
  metadata_row <- match(rownames(pa.m), clusters[["Genome_accession"]])

  # match() returns NA silently for genomes missing from the metadata, which
  # would otherwise surface only as NA factor entries. Report them by name.
  unmatched <- is.na(metadata_row)
  if (any(unmatched)) {
    warning(
      sum(unmatched), " genome(s) in pa.m have no metadata entry: ",
      paste(rownames(pa.m)[unmatched], collapse = ", "),
      call. = FALSE
    )
  }

  as.factor(clusters[["Taxonomic_cluster"]][metadata_row])
})
clusters_fac

#### Phylogenies

In [None]:
# Read the subtree of each group, and of the merged set, with ape::read.tree().
tree.1   <- read.tree(tree.1.file)
tree.4   <- read.tree(tree.4.file)
tree.14a <- read.tree(tree.14a.file)
tree.14b <- read.tree(tree.14b.file)
tree.m   <- read.tree(tree.m.file)

### Fitting genomic divergence models

Gaussian GLMs are used for robustness and ease of convergence.

In [None]:
# Fit one panstripe model per group, plus one on the merged set. Each fit
# pairs a presence/absence table with the tree of the same group, and all of
# them use the Gaussian family.
fit.1   <- panstripe(pa.1,   tree.1,   family = "gaussian")
fit.4   <- panstripe(pa.4,   tree.4,   family = "gaussian")
fit.14a <- panstripe(pa.14a, tree.14a, family = "gaussian")
fit.14b <- panstripe(pa.14b, tree.14b, family = "gaussian")
fit.m   <- panstripe(pa.m,   tree.m,   family = "gaussian")

In [None]:
# Residual plot for the group 1 fit, as a visual check of the model.
plot_residuals(fit.1)

In [None]:
# Residual plot for the group 4 fit.
plot_residuals(fit.4)

In [None]:
# Residual plot for the group 14a fit.
plot_residuals(fit.14a)

In [None]:
# Residual plot for the group 14b fit.
plot_residuals(fit.14b)

In [None]:
# Residual plot for the fit on the merged set.
plot_residuals(fit.m)

In [None]:
# Show the summary element of the group 1 fit.
fit.1[["summary"]]

In [None]:
# Show the summary element of the group 4 fit.
fit.4[["summary"]]

In [None]:
# Show the summary element of the group 14a fit.
fit.14a[["summary"]]

In [None]:
# Show the summary element of the group 14b fit.
fit.14b[["summary"]]

In [None]:
# Show the summary element of the fit on the merged set.
fit.m[["summary"]]

In [None]:
# Write the cumulative pangenome plot of the four per-cluster fits to
# <task_root>/panstripe_cumulative_pangenome.svg.
svg(file.path(task_root, "panstripe_cumulative_pangenome.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes the plot function returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(
    plot_pangenome_cumulative(list(
      cluster_I = fit.1,
      cluster_IV = fit.4,
      cluster_XIVa = fit.14a,
      cluster_XIVb = fit.14b
    ))
  ),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

#### Statistically comparing the model fits

In [None]:
# Pairwise comparison: cluster I (fit.1) vs cluster IV (fit.4).
# NOTE(review): six pairwise comparisons are run in this section; no
# multiple-testing correction is visible here - confirm whether one is needed.
compare_pangenomes(fit.1, fit.4, family = "gaussian")

In [None]:
# Pairwise comparison: cluster XIVa (fit.14a) vs cluster IV (fit.4).
compare_pangenomes(fit.14a, fit.4, family = "gaussian")

In [None]:
# Pairwise comparison: cluster I (fit.1) vs cluster XIVa (fit.14a).
compare_pangenomes(fit.1, fit.14a, family = "gaussian")

In [None]:
# Pairwise comparison: cluster I (fit.1) vs cluster XIVb (fit.14b).
compare_pangenomes(fit.1, fit.14b, family = "gaussian")

In [None]:
# Pairwise comparison: cluster IV (fit.4) vs cluster XIVb (fit.14b).
compare_pangenomes(fit.4, fit.14b, family = "gaussian")

In [None]:
# Pairwise comparison: cluster XIVa (fit.14a) vs cluster XIVb (fit.14b).
compare_pangenomes(fit.14a, fit.14b, family = "gaussian")

#### Gene gain/loss fits from the Panstripe models

In [None]:
# Write the gain/loss plot of the group 14a fit (tip labels off) to
# <task_root>/panstripe_gain_loss_clust14a.svg.
svg(file.path(task_root, "panstripe_gain_loss_clust14a.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes plot_gain_loss() returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(plot_gain_loss(fit.14a, tip_label = FALSE)),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

In [None]:
# Write the gain/loss plot of the group 14b fit (tip labels off) to
# <task_root>/panstripe_gain_loss_clust14b.svg.
svg(file.path(task_root, "panstripe_gain_loss_clust14b.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes plot_gain_loss() returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(plot_gain_loss(fit.14b, tip_label = FALSE)),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

In [None]:
# Write the gain/loss plot of the group 1 fit (tip labels off) to
# <task_root>/panstripe_gain_loss_clust1.svg.
svg(file.path(task_root, "panstripe_gain_loss_clust1.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes plot_gain_loss() returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(plot_gain_loss(fit.1, tip_label = FALSE)),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

In [None]:
# Write the gain/loss plot of the group 4 fit (tip labels off) to
# <task_root>/panstripe_gain_loss_clust4.svg.
svg(file.path(task_root, "panstripe_gain_loss_clust4.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes plot_gain_loss() returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(plot_gain_loss(fit.4, tip_label = FALSE)),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

In [None]:
# Write the gain/loss plot of the merged-set fit (tip labels off) to
# <task_root>/panstripe_gain_loss_merge.svg.
svg(file.path(task_root, "panstripe_gain_loss_merge.svg"))
tryCatch(
  # Draw explicitly with print() instead of relying on top-level
  # auto-printing, which does not happen when this code runs via source().
  # NOTE(review): assumes plot_gain_loss() returns a ggplot-like object that
  # renders when printed - confirm against the panstripe documentation.
  print(plot_gain_loss(fit.m, tip_label = FALSE)),
  # Close the device even when plotting fails; an SVG device left open would
  # swallow every figure drawn afterwards.
  finally = dev.off()
)

In [None]:
# Save the whole global workspace (inputs, fits, helper tables) next to the
# figures so the results can be reloaded without refitting.
save.image(file = file.path(task_root, "environment.RData"))

In [2]:
# Record the R version, platform and attached package versions used for this
# run, for reproducibility.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Linux Mint 21.2

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_BE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_BE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_3.5.1   ape_5.8         panstripe_0.2.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.12       pillar_1.9.0      compiler_4.1.2    base64enc_0.1-3  
 [5] tools_4.1.2       digest_0.6.35     uuid_1.2-0        jsonlite_1.8.8  