This part of the pipeline plots the KOG count and frequency tables using ggplot2. It also draws the pangenome partition size figure and generates the KRONA charts.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Input tables for this step: the pangenome partition size table and the
# processed KOG fraction tables (level A, plus one level-B table per
# level-A category).
partition_sizes = "./07-PangenomeAnnotation/pangenome_partition_sizes.tsv"
processed_output.folder = "./07-PangenomeAnnotation/KOG/processed_output"
kog_A_fractions = file.path(processed_output.folder, "KOG_A_fractions.tsv")
kog_B_A_MET_fractions = file.path(processed_output.folder, "KOG_B_A_MET_fractions.tsv")
kog_B_A_EIP_fractions = file.path(processed_output.folder, "KOG_B_A_EIP_fractions.tsv")
kog_B_A_GIP_fractions = file.path(processed_output.folder, "KOG_B_A_GIP_fractions.tsv")
kog_B_A_CPS_fractions = file.path(processed_output.folder, "KOG_B_A_CPS_fractions.tsv")
kog_B_A_NINC_fractions = file.path(processed_output.folder, "KOG_B_A_NINC_fractions.tsv")

#### Pipeline output folders

In [None]:
# Root folder of the KOG task; the figures go into its "plots" subfolder.
task_root = "./07-PangenomeAnnotation/KOG"
plots = file.path(task_root, "plots")

# Create the folder from R instead of shelling out to `mkdir -p`: this does
# not depend on a POSIX shell and is safe for paths containing spaces or
# shell metacharacters. An already existing folder is left untouched.
dir.create(plots, recursive = TRUE, showWarnings = FALSE)

#### Tool pointers and parameters

#### Libraries and other setup

In [None]:
library(ggplot2)
library(RColorBrewer)
library(pals)
library(IRdisplay)
library(dplyr)

In [None]:
# Colour generator used for all stacked bar charts: getPalette(n) returns n
# colours interpolated along the 'Stepped' palette (n=24) of the pals
# package, taken in reversed order.
getPalette = local({
  base.colors = rev(stepped())
  colorRampPalette(base.colors)
})

### Reading metadata

In [None]:
## Pangenome partition sizes (the first column of the table becomes the row names)
pangenome.sizes = read.table(partition_sizes, sep = "\t", header = TRUE, row.names = 1)
# Fix the display order of the partitions
pangenome.sizes$Partition = factor(pangenome.sizes$Partition,
                                   levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
pangenome.sizes$Cluster = factor(pangenome.sizes$Cluster,
                                 levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                                 labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
pangenome.sizes

### KOG plotting

#### Level A

In [None]:
# Level-A annotation fractions per cluster and pangenome partition
kog.A = read.table(kog_A_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.A$Partition = factor(kog.A$Partition,
                         levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.A$Cluster = factor(kog.A$Cluster,
                       levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                       labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
kog.A

In [None]:
# Stacked bar chart of the level-A annotation fractions per cluster, one
# panel per pangenome partition, written to an SVG file in the plots folder.
n.colors = length(unique(kog.A$Annotation))  # one fill colour per annotation

kog.A.plot = ggplot(kog.A, aes(x = Cluster, y = Fraction, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-A category") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, 'KOG_A_Annotation_Fractions.svg'), height = 3, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.A.plot)
dev.off()

In [None]:
# Show the saved level-A figure in the notebook
display_svg(file = file.path(plots, 'KOG_A_Annotation_Fractions.svg'))

#### Level B for A = Metabolism

In [None]:
# Level-B annotation fractions within the level-A category "Metabolism"
kog.B.met = read.table(kog_B_A_MET_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.B.met$Partition = factor(kog.B.met$Partition,
                             levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.B.met$Cluster = factor(kog.B.met$Cluster,
                           levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                           labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
# Rescale the fractions so that they sum to 100 within every
# Cluster x Partition combination
kog.B.met = kog.B.met %>%
  group_by(Cluster, Partition) %>%
  mutate(normFrac = Fraction / sum(Fraction) * 100) %>%
  ungroup()  # drop the grouping so it does not leak into later operations
kog.B.met

In [None]:
# Stacked bar chart of the normalised level-B "Metabolism" fractions per
# cluster, one panel per pangenome partition, written to an SVG file.
n.colors = length(unique(kog.B.met$Annotation))  # one fill colour per annotation

kog.B.met.plot = ggplot(kog.B.met, aes(x = Cluster, y = normFrac, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.75)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B Metabolism category \n(annotated genes only)") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, 'KOG_B_A_MET_Annotation_Fractions.svg'), height = 4, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.B.met.plot)
dev.off()

In [None]:
# Show the saved "Metabolism" figure in the notebook
display_svg(file = file.path(plots, 'KOG_B_A_MET_Annotation_Fractions.svg'))

#### Level B for A = Environmental Information Processing

In [None]:
# Level-B annotation fractions within the level-A category
# "Environmental Information Processing"
kog.B.eip = read.table(kog_B_A_EIP_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.B.eip$Partition = factor(kog.B.eip$Partition,
                             levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.B.eip$Cluster = factor(kog.B.eip$Cluster,
                           levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                           labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
# Rescale the fractions so that they sum to 100 within every
# Cluster x Partition combination
kog.B.eip = kog.B.eip %>%
  group_by(Cluster, Partition) %>%
  mutate(normFrac = Fraction / sum(Fraction) * 100) %>%
  ungroup()  # drop the grouping so it does not leak into later operations
kog.B.eip

In [None]:
# Stacked bar chart of the normalised level-B "Environmental Information
# Processing" fractions per cluster, one panel per pangenome partition,
# written to an SVG file.
n.colors = length(unique(kog.B.eip$Annotation))  # one fill colour per annotation

kog.B.eip.plot = ggplot(kog.B.eip, aes(x = Cluster, y = normFrac, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.75)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category (annotated genes only)") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, 'KOG_B_A_EIP_Annotation_Fractions.svg'), height = 6, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.B.eip.plot)
dev.off()

In [None]:
# Show the saved "Environmental Information Processing" figure in the notebook
display_svg(file = file.path(plots, 'KOG_B_A_EIP_Annotation_Fractions.svg'))

#### Level B for A = Genetic Information Processing

In [None]:
# Level-B annotation fractions within the level-A category
# "Genetic Information Processing"
kog.B.gip = read.table(kog_B_A_GIP_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.B.gip$Partition = factor(kog.B.gip$Partition,
                             levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.B.gip$Cluster = factor(kog.B.gip$Cluster,
                           levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                           labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
# Rescale the fractions so that they sum to 100 within every
# Cluster x Partition combination
kog.B.gip = kog.B.gip %>%
  group_by(Cluster, Partition) %>%
  mutate(normFrac = Fraction / sum(Fraction) * 100) %>%
  ungroup()  # drop the grouping so it does not leak into later operations
kog.B.gip

In [None]:
# Stacked bar chart of the normalised level-B "Genetic Information
# Processing" fractions per cluster, one panel per pangenome partition,
# written to an SVG file.
n.colors = length(unique(kog.B.gip$Annotation))  # one fill colour per annotation

kog.B.gip.plot = ggplot(kog.B.gip, aes(x = Cluster, y = normFrac, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.75)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category (annotated genes only)") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, "KOG_B_A_GIP_Annotation_Fractions.svg"), height = 6, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.B.gip.plot)
dev.off()

In [None]:
# Show the saved "Genetic Information Processing" figure in the notebook
display_svg(file = file.path(plots, "KOG_B_A_GIP_Annotation_Fractions.svg"))

#### Level B for A = Cellular Processes

In [None]:
# Level-B annotation fractions within the level-A category "Cellular Processes"
kog.B.cps = read.table(kog_B_A_CPS_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.B.cps$Partition = factor(kog.B.cps$Partition,
                             levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.B.cps$Cluster = factor(kog.B.cps$Cluster,
                           levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                           labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
# Rescale the fractions so that they sum to 100 within every
# Cluster x Partition combination
kog.B.cps = kog.B.cps %>%
  group_by(Cluster, Partition) %>%
  mutate(normFrac = Fraction / sum(Fraction) * 100) %>%
  ungroup()  # drop the grouping so it does not leak into later operations
kog.B.cps

In [None]:
# Stacked bar chart of the normalised level-B "Cellular Processes" fractions
# per cluster, one panel per pangenome partition, written to an SVG file.
n.colors = length(unique(kog.B.cps$Annotation))  # one fill colour per annotation

kog.B.cps.plot = ggplot(kog.B.cps, aes(x = Cluster, y = normFrac, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.75)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category (annotated genes only)") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, 'KOG_B_A_CPS_Annotation_Fractions.svg'), height = 6, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.B.cps.plot)
dev.off()

In [None]:
# Show the saved "Cellular Processes" figure in the notebook
display_svg(file = file.path(plots, 'KOG_B_A_CPS_Annotation_Fractions.svg'))

#### Level B unclassified fractions

In [None]:
# Level-B fractions of the unclassified (NINC) table
kog.B.ninc = read.table(kog_B_A_NINC_fractions, sep = "\t", header = TRUE)
# Fix the display order of the partitions
kog.B.ninc$Partition = factor(kog.B.ninc$Partition,
                              levels = c('core', 'accessory', 'unique'))
# Fix the display order of the clusters; 'Merged' is shown as 'Full'
kog.B.ninc$Cluster = factor(kog.B.ninc$Cluster,
                            levels = c('Merged', 'I', 'IV', 'XIVa', 'XIVb'),
                            labels = c('Full', 'I', 'IV', 'XIVa', 'XIVb'))
# Rescale the fractions so that they sum to 100 within every
# Cluster x Partition combination
kog.B.ninc = kog.B.ninc %>%
  group_by(Cluster, Partition) %>%
  mutate(normFrac = Fraction / sum(Fraction) * 100) %>%
  ungroup()  # drop the grouping so it does not leak into later operations
kog.B.ninc

In [None]:
# Stacked bar chart of the normalised level-B unclassified (NINC) fractions
# per cluster, one panel per pangenome partition, written to an SVG file.
n.colors = length(unique(kog.B.ninc$Annotation))  # one fill colour per annotation

kog.B.ninc.plot = ggplot(kog.B.ninc, aes(x = Cluster, y = normFrac, fill = Annotation)) +
  geom_col(position = "stack") +
  facet_grid(~ Partition) +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.75)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category (annotated genes only)") +
  theme(panel.spacing = unit(1, "lines"))

svg(file.path(plots, 'KOG_B_A_NINC_Annotation_Fractions.svg'), height = 6, width = 10)
# Explicit print(): a ggplot object is only drawn when it is auto-printed at
# top level, so without it the SVG stays empty when this code is run through
# source() or from inside a function.
print(kog.B.ninc.plot)
dev.off()

In [None]:
# Show the saved unclassified (NINC) figure in the notebook
display_svg(file = file.path(plots, 'KOG_B_A_NINC_Annotation_Fractions.svg'))

### Size figure

In [None]:
# Partition sizes
svg(paste(task_root, 'partition_sizes.svg', sep = "/"), height = 2, width = 6)
ggplot(pangenome.sizes, aes(x = Cluster, y = No.genes, fill = Cluster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Partition, scales = 'free') +
  scale_x_discrete(expand = c(0,0)) + 
  theme(legend.position = "none") +
  labs(x = '', y = "No. genes") +
  theme(panel.spacing = unit(.5, "lines"))
dev.off()

In [None]:
# Show the saved partition size figure in the notebook
display_svg(file = file.path(task_root, 'partition_sizes.svg'))

### Generating KRONA plots

In [None]:
# Build one KRONA chart (<name>.html) for every KRONA text table (<name>.tsv)
# found in the processed output folder.
# Only *.tsv files are selected: a plain `grep KRONA` over the folder listing
# also matches the .html charts left behind by a previous run, which makes a
# re-run call ktImportText on non-existent "<name>.html.tsv" inputs.
krona.tables = list.files(processed_output.folder, pattern = "KRONA.*\\.tsv$")
krona.names = sub("\\.tsv$", "", krona.tables)

# ktImportText is run from inside the folder so that inputs and outputs are
# addressed by their bare file names; every shell argument is quoted so that
# paths with spaces or shell metacharacters are handled correctly.
krona.log = lapply(krona.names, function(chart) {
  system(paste0("cd ", shQuote(processed_output.folder),
                " && $HOME/bin/KronaTools/bin/ktImportText -o ",
                shQuote(paste0(chart, ".html")), " ",
                shQuote(paste0(chart, ".tsv"))),
         intern = TRUE)
})
# Collected console output of all ktImportText calls
unlist(krona.log)

In [2]:
# Print the R version, platform, locale and attached package versions
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Linux Mint 21.2

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_BE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_BE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.1.4        IRdisplay_1.1      pals_1.8           RColorBrewer_1.1-3
[5] ggplot2_3.5.1     

loaded via a namespace (and not attached):
 [1] pillar_1.9.0      compiler_4.1.2    base64enc_0.1-3   tools_4.1.2      
 [5] digest_0.6.35     u