This part of the pipeline plots the COG count and frequency tables using ggplot2.

### Paths and parameters

#### Pipeline input files

In [None]:
# Locations of the tab-separated input tables, relative to the working directory.
partition_sizes <- "./07-PangenomeAnnotation/pangenome_partition_sizes.tsv"
cog_fractions <- "./07-PangenomeAnnotation/COG/processed_output/COG_fractions.tsv"
cog_cats <- "./utils/COG_cats.tsv"

#### Pipeline output folders

In [None]:
# Output locations: figures produced by this notebook go to <task_root>/plots.
task_root <- "./07-PangenomeAnnotation/COG"
plots <- file.path(task_root, "plots")

# Create the folder from within R instead of shelling out to `mkdir -p`:
# no shell is involved, so the path is never word-split or interpreted, and the
# call does not depend on a POSIX `mkdir` being available.
# recursive = TRUE creates missing parents; showWarnings = FALSE keeps re-runs
# quiet when the folder already exists.
dir.create(plots, recursive = TRUE, showWarnings = FALSE)

# Fail here, with a clear message, rather than later when the figure is written.
if (!dir.exists(plots)) {
  stop("Could not create output folder: ", plots, call. = FALSE)
}

#### Tool pointers and parameters

#### Libraries and other setup

In [None]:
library(ggplot2)
library(RColorBrewer)
library(pals)
library(IRdisplay)

In [None]:
# Colour interpolator built on the reversed 'Stepped' palette (n=24) from the
# pals package; getPalette(k) returns k interpolated colours.
getPalette <- colorRampPalette(colors = rev(stepped()))

### Reading metadata

In [None]:
## Pangenome partition sizes
pangenome.sizes <- read.table(
  file = partition_sizes,
  header = TRUE,
  sep = "\t",
  row.names = 1
)

# Fix the level order of both factors; the 'Merged' cluster is relabelled 'Full'.
pangenome.sizes$Partition <- factor(
  pangenome.sizes$Partition,
  levels = c("core", "accessory", "unique")
)
pangenome.sizes$Cluster <- factor(
  pangenome.sizes$Cluster,
  levels = c("Merged", "I", "IV", "XIVa", "XIVb"),
  labels = c("Full", "I", "IV", "XIVa", "XIVb")
)

# Show the resulting table in the notebook.
pangenome.sizes

### COGs plotting

In [None]:
## COG categories
# Read without `header = TRUE`; the first column becomes the row names.
cog.cats <- read.table(file = cog_cats, row.names = 1, sep = "\t")

In [None]:
## Data
cog <- read.table(file = cog_fractions, header = TRUE, sep = "\t")

# Fix the factor level order used for plotting later on; the 'Merged' cluster
# is relabelled 'Full'.
cog$Partition <- factor(
  cog$Partition,
  levels = c("core", "accessory", "unique")
)
cog$Cluster <- factor(
  cog$Cluster,
  levels = c("Merged", "I", "IV", "XIVa", "XIVb"),
  labels = c("Full", "I", "IV", "XIVa", "XIVb")
)

# Show the resulting table in the notebook.
cog

In [None]:
# Stacked bar chart of COG fractions: one bar per cluster (x), one facet per
# partition, one fill colour per COG annotation. Written to
# <plots>/COG_Annotation_Fractions.svg.

# Fill levels in the order ggplot2 lays out a discrete scale: the (used) factor
# levels when Annotation is a factor, otherwise the sorted unique values.
cog.levels <- if (is.factor(cog$Annotation)) {
  levels(droplevels(cog$Annotation))
} else {
  sort(unique(as.character(cog$Annotation)))
}
n.colors <- length(cog.levels)

# Annotations that have an entry in the COG category table.
present.cogs <- intersect(unique(cog$Annotation), rownames(cog.cats))

# Build one legend label per fill level, in the SAME order as cog.levels.
# Previously the labels followed first-appearance order in the data and were
# passed unnamed, so they could be attached to the wrong category, and any
# annotation missing from cog.cats made labels shorter than breaks (an error).
# Levels without a description fall back to the bare annotation code.
has.description <- cog.levels %in% present.cogs
cog.labels <- cog.levels
cog.labels[has.description] <- paste0(
  "(", cog.levels[has.description], ") ",
  # match() gives exact row-name look-up; `[` on a data.frame would
  # partially match row names.
  cog.cats[match(cog.levels[has.description], rownames(cog.cats)), 1]
)

cog.plot <- ggplot(cog, aes(x = Cluster, y = Fraction, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(
    # Naming the colours and giving explicit breaks pins colour, break and
    # label to the same annotation code.
    values = setNames(getPalette(n.colors), cog.levels),
    breaks = cog.levels,
    labels = cog.labels,
    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)
  ) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(x = "rRNA cluster", y = "% COGs", fill = "COG category") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, "COG_Annotation_Fractions.svg"), height = 5, width = 10)
# Print explicitly (does not rely on top-level auto-printing) and always close
# the device, even if rendering fails, so later plots do not land in a stale
# svg device.
invisible(tryCatch(print(cog.plot), finally = dev.off()))

In [None]:
# Show the saved figure inline in the notebook.
display_svg(file = file.path(plots, "COG_Annotation_Fractions.svg"))

In [2]:
# Print the R version, platform, locale and attached/loaded package versions
# of the session that produced this notebook.
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Linux Mint 21.2

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_BE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_BE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] IRdisplay_1.1      pals_1.8           RColorBrewer_1.1-3 ggplot2_3.5.1     

loaded via a namespace (and not attached):
 [1] pillar_1.9.0      compiler_4.1.2    base64enc_0.1-3   tools_4.1.2      
 [5] digest_0.6.35     uuid_1.2-0        jsonli