This part of the pipeline produces the UMAP plots as well as the accumulation curves of all pangenomes.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Script that defines the plot_umap() function; it is source()d further down.
plot_umap <- "./utils/plot_umap.R"

# Tab-separated genome metadata table, read further down with read.table().
metadata <- "./genomes_metadata"

# Gene presence/absence tables (.Rtab): one per pangenome group, plus the
# merged pangenome.
pa.1.file <- "./05-pangenomes/group1/gene_presence_absence.Rtab"
pa.4.file <- "./05-pangenomes/group4/gene_presence_absence.Rtab"
pa.14a.file <- "./05-pangenomes/group14a/gene_presence_absence.Rtab"
pa.14b.file <- "./05-pangenomes/group14b/gene_presence_absence.Rtab"
pa.m.file <- "./05-pangenomes/merge/gene_presence_absence.Rtab"

#### Pipeline output folders

In [None]:
# Folder that receives every figure written by this notebook.
task_root <- "./06-pangenome-postprocessing"

# Create the folder (and any missing parents) from within R instead of
# shelling out to `mkdir -p`: no unquoted path is pasted into a shell command
# and no POSIX shell is required. An existing folder is left untouched, and
# dir.create() warns by itself if the folder cannot be created.
if (!dir.exists(task_root)) {
  dir.create(task_root, recursive = TRUE)
}

#### Tool pointers and parameters

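The `plot_umap` function is not part of an official panstripe release yet, so it is imported from a local script. Its environment is then set to the panstripe namespace so that the names it uses are resolved inside that package.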

In [None]:
# `plot_umap` initially holds the path of the script. Sourcing the script is
# expected to rebind the same name to the function it defines (the name is
# called as a function further down), so running this cell a second time
# would hand a function to source() and fail. Only source while the name
# still holds a path, which makes the cell safe to re-run.
if (is.character(plot_umap)) {
  source(plot_umap)
}

# Use the panstripe namespace as the function's enclosing environment, so the
# free variables in its body are looked up there, including objects the
# package does not export.
environment(plot_umap) <- asNamespace("panstripe")

In [None]:
# Seed R's random number generator so that any later step drawing from it
# gives the same result on every run of the notebook.
set.seed(seed = 127)

In [None]:
library(panstripe)
library(ape)
library(ggplot2)
library(umap)

### Load files and metadata

#### Cluster annotation

In [None]:
# Read the tab-separated metadata table; its first line holds the column names.
clusters <- read.table(file = metadata, header = TRUE, sep = "\t")

# Keep only the genome accession and its taxonomic cluster.
clusters <- clusters[, c("Genome_accession", "Taxonomic_cluster"), drop = FALSE]

# Show the resulting table in the notebook.
clusters

#### Presence/absence files

In [None]:
# Load every presence/absence table with read_rtab(). The resulting objects
# feed the row counts, the accumulation curves and the UMAP plots below.
# Per-group pangenomes:
pa.1 <- read_rtab(pa.1.file)
pa.4 <- read_rtab(pa.4.file)
pa.14a <- read_rtab(pa.14a.file)
pa.14b <- read_rtab(pa.14b.file)
# Merged pangenome:
pa.m <- read_rtab(pa.m.file)

In [None]:
# Display the number of rows of each presence/absence table, one value per
# pangenome. Row names of the merged table are matched against genome
# accessions further down, so rows presumably correspond to genomes
# (TODO confirm for the per-group tables).
nrow(pa.1)
nrow(pa.4)
nrow(pa.14a)
nrow(pa.14b)
nrow(pa.m)

#### Cluster annotation for rRNA clusters

In [None]:
# Position of every row of the merged table in the metadata, in the row order
# of `pa.m`, so the resulting factor lines up with the rows that are plotted.
genome_idx <- match(rownames(pa.m), clusters$Genome_accession)

# match() returns NA for accessions that are absent from the metadata; those
# genomes would silently get an NA cluster. Name them instead of hiding it.
# The factor itself is built exactly as before, NA entries included.
if (anyNA(genome_idx)) {
  warning(
    sum(is.na(genome_idx)),
    " genome(s) in pa.m have no entry in the metadata table: ",
    paste(rownames(pa.m)[is.na(genome_idx)], collapse = ", "),
    call. = FALSE
  )
}

clusters_fac <- as.factor(clusters$Taxonomic_cluster[genome_idx])

# Show the factor and its levels in the notebook.
clusters_fac

### Plotting pangenome curves

#### Accumulation curve

In [None]:
# Write the accumulation curves of all five pangenomes to one SVG file.
svg(file.path(task_root, "panstripe_accumulation_curve.svg"))

# print() renders the plot explicitly, so the figure does not depend on
# top-level auto-printing (which is skipped when the code is source()d or
# nested in another call). `finally` closes the device even if plotting
# fails, so later figures cannot end up in this file. invisible() stops the
# value from being printed a second time after the device is closed.
invisible(tryCatch(
  print(plot_acc(list(
    cluster_I = pa.1,
    cluster_IV = pa.4,
    cluster_XIVa = pa.14a,
    cluster_XIVb = pa.14b,
    merged = pa.m
  ))),
  finally = dev.off()
))

#### UMAP plots

In [None]:
# Write the UMAP plot of the group 1 presence/absence table to an SVG file.
svg(file.path(task_root, "panstripe_umap_clust1.svg"))

# print() renders the plot explicitly instead of relying on top-level
# auto-printing; `finally` closes the device even if plotting fails, so later
# figures cannot end up in this file. invisible() prevents a second print
# after the device is closed.
invisible(tryCatch(
  print(plot_umap(pa.1)),
  finally = dev.off()
))

In [None]:
# Write the UMAP plot of the group 4 presence/absence table to an SVG file.
svg(file.path(task_root, "panstripe_umap_clust4.svg"))

# print() renders the plot explicitly instead of relying on top-level
# auto-printing; `finally` closes the device even if plotting fails, so later
# figures cannot end up in this file. invisible() prevents a second print
# after the device is closed.
invisible(tryCatch(
  print(plot_umap(pa.4)),
  finally = dev.off()
))

In [None]:
# Write the UMAP plot of the group 14a presence/absence table to an SVG file.
svg(file.path(task_root, "panstripe_umap_clust14a.svg"))

# print() renders the plot explicitly instead of relying on top-level
# auto-printing; `finally` closes the device even if plotting fails, so later
# figures cannot end up in this file. invisible() prevents a second print
# after the device is closed.
invisible(tryCatch(
  print(plot_umap(pa.14a)),
  finally = dev.off()
))

In [None]:
# Write the UMAP plot of the group 14b presence/absence table to an SVG file.
svg(file.path(task_root, "panstripe_umap_clust14b.svg"))

# print() renders the plot explicitly instead of relying on top-level
# auto-printing; `finally` closes the device even if plotting fails, so later
# figures cannot end up in this file. invisible() prevents a second print
# after the device is closed.
invisible(tryCatch(
  print(plot_umap(pa.14b)),
  finally = dev.off()
))

In [None]:
# Write the UMAP plot of the merged presence/absence table to an SVG file,
# passing the per-genome taxonomic cluster factor as the `category` argument.
svg(file.path(task_root, "panstripe_umap_merged.svg"))

# print() renders the plot explicitly instead of relying on top-level
# auto-printing; `finally` closes the device even if plotting fails, so later
# figures cannot end up in this file. invisible() prevents a second print
# after the device is closed.
invisible(tryCatch(
  print(plot_umap(pa.m, category = clusters_fac)),
  finally = dev.off()
))

In [4]:
# Record the R version, platform, locale and package versions of this run.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Linux Mint 21.2

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_BE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_BE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] umap_0.2.10.0   ggplot2_3.5.1   ape_5.8         panstripe_0.2.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.12       RSpectra_0.16-1   pillar_1.9.0      compiler_4.1.2   
 [5] base64enc_0.1-3   tools_4.1.2       digest_0.6.35     