This part of the pipeline processes the results from the Ancestral State Reconstruction using the Count tool, and annotates the exchanged genes using COG and BRITE.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Input locations.
# apa.file, tree.file and cog.file are opened by bare name after setwd(task_root), so they
# are looked up inside the task folder; indices.path and kog.file are prefixed with the
# pipeline root (the working directory at notebook start) when they are read.
# Ancestral presence/absence table
apa.file = "families.tsv"
# Phylogeny, read with ape::read.tree
tree.file = "input_ready.tree"
# Folder holding one file per genome group (group1, group4, group14a, group14b)
indices.path = "02-QC/indices"
# COG annotations, read as three header-less columns: family, annotation, category
cog.file = "matrix_annotations.tsv"
# KOG/BRITE annotation table
kog.file = "07-PangenomeAnnotation/KOG/processed_output/merged_full_annot"

#### Pipeline output folders

In [None]:
# Output locations: the task folder and its "output" subfolder, both relative to the
# directory the notebook is started from.
task_root = "12-ASR-analysis"
ASR_output = file.path(task_root, "output")

# Create the folders from within R rather than shelling out to `mkdir -p`, so the cell does
# not depend on a POSIX shell. recursive = TRUE also creates task_root when it is missing;
# showWarnings = FALSE keeps re-runs quiet when the folders already exist.
dir.create(ASR_output, recursive = TRUE, showWarnings = FALSE)

#### Tool pointers and parameters

In [None]:
cog.cats.file = "utils/COG_cats.tsv"

In [None]:
library(data.table)
library(dplyr)
library(tidyr)
library(Matrix)
library(ggplot2)
library(ggh4x)
library(RColorBrewer)
library(pals)
library(IRdisplay)
library(ape)
library(ggtree)
library(stringr)

In [None]:
getPalette = colorRampPalette(rev(stepped()))

## Reading files

In [None]:
root = getwd()

In [None]:
setwd(task_root)

#### Ancestral P/A data

In [None]:
apa = as.data.frame(fread(apa.file))

In [None]:
head(apa)

Genes that are gained in the root node (with label 1) are the LCA genes.

In [None]:
common = apa$'1' != 0

#### Phylogeny

In [None]:
tree = read.tree(tree.file)

In [None]:
ggtree(tree) + geom_text(aes(label = label), nudge_x = -0.01)

#### Genome indices

In [None]:
# First column (V1) of each group file under <root>/<indices.path>; these vectors are
# passed to getMRCA() as the tips of each genome group.
index.1 = read.table(file.path(root, indices.path, "group1"))$V1
index.4 = read.table(file.path(root, indices.path, "group4"))$V1
index.14a = read.table(file.path(root, indices.path, "group14a"))$V1
index.14b = read.table(file.path(root, indices.path, "group14b"))$V1

In [None]:
# Node IDs of the most recent common ancestor of each genome group in the tree (ape::getMRCA).
# NOTE(review): index.14b is read in the previous cell but no MRCA is derived for it here —
# confirm this is intended.
mrca.1 = getMRCA(tree, tip = index.1)
mrca.4 = getMRCA(tree, tip = index.4)
mrca.14a = getMRCA(tree, tip = index.14a)

## Identifying the exchanged genes per node

### Auxiliary functions

Node IDs (numbers assigned internally by `ape`) and node labels (numbers assigned externally by `Count`) have been aligned by the script that prepares the inputs for `Count` (`convert_ASR_inputs.R`), so that one can be converted into the other by adding or subtracting the number of terminal leaves (i.e. the leaves that had a non-numerical label before input conversion).

In [None]:
## Converts an ape node ID into the corresponding Count node label by subtracting the number of tips.
nodeID_to_nodeLabel = function(nodeID, tree) {
    n.tips = length(tree$tip.label)
    nodeID - n.tips
}
## Converts a Count node label into the corresponding ape node ID by adding the number of tips.
nodeLabel_to_nodeID = function(nodeLabel, tree) {
    n.tips = length(tree$tip.label)
    nodeLabel + n.tips
}

In [None]:
## Returns the node ID of a given label in a given phylogeny object.
##
## Spaces in the label are replaced by underscores before matching. Tip labels are searched
## first; the internal node labels are only searched when no tip matches.
##
## PARAMS
## label      node label to get the ID of (a single value)
## tree       phylogeny in the form of an ape tree object
##
## OUTPUT
## node ID of the tree node with the given label; an error is raised when the label matches
## no node or more than one node, instead of handing a zero-length or ambiguous ID to the caller
##
findID = function(label, tree) {
    lab = gsub(' ', '_', label, fixed = TRUE)
    ID = which(tree$tip.label == lab)
    if (length(ID) == 0) {
        # Internal nodes are numbered after the tips: the i-th entry of node.label
        # belongs to node ID (number of tips + i)
        ID = length(tree$tip.label) + which(tree$node.label == lab)
    }
    if (length(ID) != 1) {
        stop(sprintf("findID: label '%s' matches %d nodes in the tree (expected exactly 1)",
                     paste(lab, collapse = ", "), length(ID)),
             call. = FALSE)
    }
    return(ID)
}

In [None]:
## Sorts the gene exchange events into gene gain and loss events, neglecting the number of genes exchanged.
## Also returns the full list of tree nodes and gene families.
##
## PARAMS
## apa      the gene presence table of the ASR produced by Count (presences.tsv); needs a 'name'
##          column with the gene family labels and one column per tree node
## tree     the phylogeny in the form of an ape tree object
##
## OUTPUT
## a list of the binary gained and lost sparse matrices, and the array of tree nodes and gene families.
## Both matrices have one row per gene family and one column per tree node, indexed by ape node ID.
## An error is raised when a node column cannot be tied to exactly one parent node, or when the
## parent's column is missing from apa.
##
identify_type = function(apa, tree) {
    families = apa$name
    # Node columns: names containing "GCF <digits>.<digit>", a number of two or more digits, or a digit 2-9.
    # A column named exactly "1" (the root label) is therefore not selected, so every selected node has a parent.
    nodes = colnames(apa)[str_detect(colnames(apa), "GCF [0-9]+\\.[0-9]|[1-9][0-9]+|[2-9]")]

    # Initialise two sparse matrices to represent whether a gene gain or loss, respectively, has taken place in a certain node.
    # The width is taken from the tree itself (tips + internal nodes) because columns are addressed by node ID;
    # deriving it from the number of matched column names would go out of bounds as soon as the two disagree.
    n.tree.nodes = length(tree$tip.label) + tree$Nnode
    gained.c = Matrix(FALSE, length(families), n.tree.nodes, sparse = TRUE)
    lost.c = gained.c

    # We'll compare the presence of gene families of each node with the one of its ancestor node
    for (node in nodes) {
        # Find the node label of the ancestor node using the IDs of the current node and its ancestor in the phylogeny object
        node.ID = findID(node, tree)
        ancestor.ID = tree$edge[tree$edge[,2] == node.ID, 1]
        if (length(node.ID) != 1 || length(ancestor.ID) != 1) {
            stop(sprintf("identify_type: could not find a unique parent for node '%s' in the tree", node), call. = FALSE)
        }
        ancestor = as.character(nodeID_to_nodeLabel(ancestor.ID, tree))
        if (!(ancestor %in% colnames(apa))) {
            stop(sprintf("identify_type: no column '%s' (parent of '%s') in the presence table", ancestor, node), call. = FALSE)
        }

        # Get the presence of gene families in both tree nodes from the ASR presence table from Count.
        # Columns are extracted by exact name; subset(select = ...) would evaluate `ancestor` and `node`
        # against the table's own column names first.
        in.ancestor = apa[[ancestor]]
        in.node = apa[[node]]

        # Gained genes are present in this node but not in the ancestor node
        gained.c[,node.ID] = in.ancestor == 0 & in.node > 0

        # Lost genes are present in the ancestor node but not in this node
        lost.c[,node.ID] = in.ancestor > 0 & in.node == 0
    }

    res = list('gained' = gained.c, 'lost' = lost.c, 'nodes' = nodes, 'families' = families)
    return(res)
}

### Determining the exchange type of all gene families

In [None]:
identified.c = identify_type(apa, tree)
gained.c = identified.c$gained
lost.c = identified.c$lost
nodes = identified.c$nodes
families = identified.c$families

In [None]:
rm(apa)

In [None]:
save(gained.c, lost.c, nodes, families, file = 'output/sorted_genes.RData')

### Overview trees

In [None]:
## Attaches the per-node number of gene exchange events to a phylogeny.
##
## PARAMS
## tree       phylogeny in the form of an ape tree object
## ggl_data   sparse binary matrix indicating whether a certain type of gene exchange event has taken place
##            in a certain tree node (one column per node); produced by identify_type()
##
## OUTPUT
## the fortified tree joined on 'node' with a 'trait' column holding the column sums of ggl_data
##
extend_tree_with_ggl = function(tree, ggl_data) {
    # Total number of nodes: internal (labelled) nodes plus tips
    n.total = length(tree$node.label) + length(tree$tip.label)
    events = data.frame(node = 1:n.total, trait = colSums(ggl_data))
    full_join(fortify(tree), events, by = "node")
}

In [None]:
gt.gained = extend_tree_with_ggl(tree, gained.c)
gt.gained

In [None]:
# Draw the tree coloured by the per-node 'trait' column of gt.gained (log10 colour scale,
# entries without a colour value drawn in black), write it to an SVG file and show that file inline.
# The plot is not assigned: it is rendered by top-level printing while the svg device is open,
# so the svg() / plot / dev.off() order must be kept.
dir.create('output/gained', recursive = TRUE)
svg('output/gained/overview_tree.svg', width = 8, height = 12)
ggtree(gt.gained, aes(color=.data$trait), size=0.5) +
    labs(colour='Total genes gained') +
    scale_color_gradientn(colours = magma(12), transform = "log10", na.value = "black")
dev.off()
display_svg(file = 'output/gained/overview_tree.svg')

In [None]:
gt.lost = extend_tree_with_ggl(tree, lost.c)
gt.lost

In [None]:
# Same figure as for the gained genes, using the per-node 'trait' column of gt.lost.
# The plot is rendered by top-level printing while the svg device is open, so the
# svg() / plot / dev.off() order must be kept.
dir.create('output/lost', recursive = TRUE)
svg('output/lost/overview_tree.svg', width = 8, height = 12)
ggtree(gt.lost, aes(color=.data$trait), size=0.5) +
    labs(colour='Total genes lost') +
    scale_colour_gradientn(colours = magma(12), transform = "log10", na.value = "black")
dev.off()
display_svg(file = 'output/lost/overview_tree.svg')

## Characterising the exchanged genes of some nodes in particular

### Reading the annotation files

In [None]:
cog = as.data.frame(fread(cog.file, header = FALSE, col.names = c('family', 'annotation', 'category')))

In [None]:
cog

In [None]:
kog = as.data.frame(fread(paste(root, kog.file, sep = "/")))

In [None]:
kog

In [None]:
cog.cats = read.table(paste(root, cog.cats.file, sep = "/"), sep = "\t", row.names = 1)

### Auxiliary functions

In [None]:
## Lists the gene families that were exchanged in one tree node.
##
## PARAMS
## var.c      binary matrix produced by identify_type() indicating in which tree node a certain gene family was exchanged
##            (rows: gene families, columns: tree nodes)
## families   array of gene family labels produced by identify_type(), in the row order of var.c
## mrca       tree node (column of var.c) to get the exchanged gene family labels for
##
## OUTPUT
## a single-column dataframe ('family') listing the gene families that were exchanged in the given tree node
##
gather_genes = function(var.c, families, mrca) {
    exchanged = var.c[,mrca]
    var.mrca = as.data.frame(families[exchanged])
    names(var.mrca) = 'family'
    var.mrca
}

In [None]:
## Aggregates and counts the COG annotations of an array of gene families
##
## Every family contributes one count to each letter of its COG category, so a plural annotation
## such as 'BE' counts once for 'B' and once for 'E'. Families without a record in full_cog are
## counted in the unannotated category ('-'), also when no other family in the set carries '-'
## (the loop-based version dropped them in that case).
##
## PARAMS
## var        a single-column dataframe ('family') with a list of gene families; produced by gather_genes()
## full_cog   the COG category annotations listed by gene family (columns 'family' and 'category')
##
## OUTPUT
## a dataframe with columns 'category', 'n' (count) and 'freq' (percentage of all counts),
## sorted by decreasing count with ties in category order
##
aggregate_and_count_cog = function(var, full_cog) {
    if (nrow(var) == 0) {
        categories = character(0)
    }
    else {
        # Left join to preserve the unknown gene families: they end up with an NA category
        annotated = merge(var, full_cog, by = 'family', all.x = TRUE)
        categories = as.character(annotated$category)
    }

    # Unknown gene families go to the unannotated category ('-')
    categories[is.na(categories)] = '-'

    # Redistribute the plural annotations (e.g. 'BE') over their single-letter categories
    plural = nchar(categories) > 1
    single = c(categories[!plural],
               unlist(strsplit(categories[plural], '', fixed = TRUE), use.names = FALSE))

    tally = table(single)
    cog.freq = data.frame(category = as.character(names(tally)), n = as.integer(tally), stringsAsFactors = FALSE)

    # Convert to relative frequencies
    cog.freq$freq = cog.freq$n / sum(cog.freq$n) * 100

    # Radix ordering compares strings bytewise, so the result does not depend on the session locale
    cog.freq = cog.freq[order(-cog.freq$n, cog.freq$category, method = 'radix'), , drop = FALSE]
    rownames(cog.freq) = NULL
    return(cog.freq)
}

In [None]:
## Aggregates and counts the KOG annotations at a given BRITE level of an array of gene families
##
## PARAMS
## var        a single-column dataframe ('family') with a list of gene families; produced by gather_genes()
## full_kog   the KOG category annotations listed by gene family (column 'ID') and BRITE level
## level      name of the BRITE level column to count (e.g. 'A')
##
## OUTPUT
## a dataframe with relative KOG category frequencies of a given BRITE level: the level column,
## 'n' (number of distinct families) and 'freq' (percentage), sorted by decreasing count
##
aggregate_and_count_kog = function(var, full_kog, level) {
    # Left join to preserve the unknown gene families
    kog = left_join(var, full_kog, by = join_by('family' == 'ID')) %>% arrange(family)

    # The pipe operator has to end a line: a line that starts with %>% is a parse error in R.
    # The level column is selected by name with all_of() so that a column of full_kog that
    # happens to be called 'level' cannot be picked up instead.
    kog.freq = kog %>%
        select(all_of(c('family', level))) %>%                 # filter
        distinct() %>%                                         # deduplicate: one count per family and category
        arrange(family) %>%
        count(.data[[level]]) %>%                              # count
        mutate(freq = n/sum(n)*100) %>%                        # convert to relative frequencies
        arrange(desc(n))
    return(kog.freq)
}

In [None]:
## Gathers and counts both COG and KOG annotations of both gene exchange events for a certain tree node
##
## PARAMS
## gained.c      sparse binary matrix indicating whether a gene gain event has taken place in a tree node; produced by identify_type()
## lost.c        sparse binary matrix indicating whether a gene loss event has taken place in a tree node; produced by identify_type()
## families      array of gene family labels; produced by identify_type()
## mrca          tree node for which the annotations of the exchanged genes need to be examined
## cog           full COG annotation table for this genome set; expects the converted COG annotation list that was the input for Count
## kog           full KOG annotation table for this genome set; expects the processed KOG-BRITE table produced by notebook 07b2
## write_file    flag indicating whether the frequency tables should be written away (default = FALSE)
## output        output directory in which the frequency tables will be saved as tsv files; ignored if write_file is FALSE
## prefix        prefix for the filename of the frequency tables to distinguish different genome sets; ignored if write_file is FALSE
##
## OUTPUT
## a nested list of both gained and lost gene families with their COG and KOG annotations at all levels:
## res$gained and res$lost each hold 'listed' (the families), 'cog' and 'kog' (a list with elements A, B, C, D).
## With write_file = TRUE the tables are also written to <output>/<gained|lost>/<prefix>.cog and
## <output>/<gained|lost>/<prefix>.kog.<level>.
##
gather_and_count = function(gained.c, lost.c, families, mrca, cog, kog, write_file = FALSE, output = NULL, prefix = NULL) {
    kog.levels = c('A', 'B', 'C', 'D')

    # Lists, annotates and optionally saves the families of one exchange type ('gained' or 'lost')
    summarise_exchange = function(var.c, type) {
        listed = gather_genes(var.c, families, mrca)
        writeLines(c(paste("Number of genes", type), nrow(listed)))
        cog.freq = aggregate_and_count_cog(listed, cog)
        kog.freq = lapply(kog.levels, function(level) aggregate_and_count_kog(listed, kog, level))
        names(kog.freq) = kog.levels

        # Saving results (the condition of `if` must be parenthesised in R)
        if (write_file) {
            out.dir = paste(output, type, sep = "/")
            # Only create the folder when needed, so repeated calls do not warn about an existing directory
            if (!dir.exists(out.dir)) {
                dir.create(out.dir, recursive = TRUE)
            }
            path = paste(output, type, prefix, sep = "/")
            write.table(cog.freq, paste(path, 'cog', sep = "."), sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
            for (level in kog.levels) {
                write.table(kog.freq[[level]], paste(path, 'kog', level, sep = "."), sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
            }
            writeLines(paste('Results for', type, 'genes saved!'))
        }

        list('listed' = listed, 'cog' = cog.freq, 'kog' = kog.freq)
    }

    ## Gained genes first, then lost genes, so messages and files appear in that order
    gained.res = summarise_exchange(gained.c, 'gained')
    lost.res = summarise_exchange(lost.c, 'lost')

    res = list('gained' = gained.res, 'lost' = lost.res)
    return(res)
}

### MRCA of cluster 1

In [None]:
counted = gather_and_count(gained.c, lost.c, families, mrca.1, cog, kog, TRUE, 'output', 'mrca.1')

#### Gained genes

In [None]:
gained.1 = counted$gained$listed
cog.1.gained.freq = counted$gained$cog
kog.1.gained.freq.A = counted$gained$kog$A
kog.1.gained.freq.B = counted$gained$kog$B
kog.1.gained.freq.C = counted$gained$kog$C
kog.1.gained.freq.D = counted$gained$kog$D

In [None]:
gained.1 %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.1.gained.freq.C %>% arrange(desc(n))

#### Lost genes

In [None]:
lost.1 = counted$lost$listed
cog.1.lost.freq = counted$lost$cog
kog.1.lost.freq.A = counted$lost$kog$A
kog.1.lost.freq.B = counted$lost$kog$B
kog.1.lost.freq.C = counted$lost$kog$C
kog.1.lost.freq.D = counted$lost$kog$D

In [None]:
lost.1 %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.1.lost.freq.C %>% arrange(desc(n))

### MRCA of cluster 4

In [None]:
counted = gather_and_count(gained.c, lost.c, families, mrca.4, cog, kog, TRUE, 'output', 'mrca.4')

#### Gained genes

In [None]:
gained.4 = counted$gained$listed
cog.4.gained.freq = counted$gained$cog
kog.4.gained.freq.A = counted$gained$kog$A
kog.4.gained.freq.B = counted$gained$kog$B
kog.4.gained.freq.C = counted$gained$kog$C
kog.4.gained.freq.D = counted$gained$kog$D

In [None]:
gained.4 %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.4.gained.freq.C %>% arrange(desc(n))

#### Lost genes

In [None]:
lost.4 = counted$lost$listed
cog.4.lost.freq = counted$lost$cog
kog.4.lost.freq.A = counted$lost$kog$A
kog.4.lost.freq.B = counted$lost$kog$B
kog.4.lost.freq.C = counted$lost$kog$C
kog.4.lost.freq.D = counted$lost$kog$D

In [None]:
lost.4 %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.4.lost.freq.C %>% arrange(desc(n))

### MRCA of cluster 14a

In [None]:
counted = gather_and_count(gained.c, lost.c, families, mrca.14a, cog, kog, TRUE, 'output', 'mrca.14a')

#### Gained genes

In [None]:
gained.14a = counted$gained$listed
cog.14a.gained.freq = counted$gained$cog
kog.14a.gained.freq.A = counted$gained$kog$A
kog.14a.gained.freq.B = counted$gained$kog$B
kog.14a.gained.freq.C = counted$gained$kog$C
kog.14a.gained.freq.D = counted$gained$kog$D

In [None]:
gained.14a %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.14a.gained.freq.C

#### Lost genes

In [None]:
lost.14a = counted$lost$listed
cog.14a.lost.freq = counted$lost$cog
kog.14a.lost.freq.A = counted$lost$kog$A
kog.14a.lost.freq.B = counted$lost$kog$B
kog.14a.lost.freq.C = counted$lost$kog$C
kog.14a.lost.freq.D = counted$lost$kog$D

In [None]:
lost.14a %>% left_join(cog, by = "family") %>% arrange(annotation)

In [None]:
kog.14a.lost.freq.C

### Plotting

#### Gained genes

In [None]:
# Stacked bar chart of the relative COG category frequencies of the genes gained in each cluster MRCA
gained.cog.toplot = bind_rows(list('1' = cog.1.gained.freq, '4' = cog.4.gained.freq, '14a' = cog.14a.gained.freq), .id = 'cluster')
n.colors = length(unique(gained.cog.toplot$category))
present.cogs = sort(intersect(unique(gained.cog.toplot$category), rownames(cog.cats)))

# The legend labels are named by category letter, so ggplot2 matches each description to its own
# category; a category without a row in cog.cats keeps its letter as label instead of shifting
# (or breaking) all the other labels. The plot is rendered by top-level printing into the open svg device.
svg('output/gained/COG_Freqs_Gained.svg', height = 4.5, width = 6)
ggplot(gained.cog.toplot, aes(x = cluster, y = freq, fill = category)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4),
                    labels = setNames(paste0("(", present.cogs, ") ", cog.cats[present.cogs,]), present.cogs)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% COGs", fill = "COG category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/gained/COG_Freqs_Gained.svg')

In [None]:
gained.kog.A.toplot = bind_rows(list('1' = kog.1.gained.freq.A, '4' = kog.4.gained.freq.A, '14a' = kog.14a.gained.freq.A), .id = 'cluster')
n.colors = length(unique(gained.kog.A.toplot$A))

svg('output/gained/KOG_A_Freqs_Gained.svg', height = 2, width = 4.5)
ggplot(gained.kog.A.toplot, aes(x = cluster, y = freq, fill = A)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-A category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/gained/KOG_A_Freqs_Gained.svg')

In [None]:
gained.kog.B.toplot = bind_rows(list('1' = kog.1.gained.freq.B, '4' = kog.4.gained.freq.B, '14a' = kog.14a.gained.freq.B), .id = 'cluster')
n.colors = length(unique(gained.kog.B.toplot$B))

svg('output/gained/KOG_B_Freqs_Gained.svg', height = 5, width = 5)
ggplot(gained.kog.B.toplot, aes(x = cluster, y = freq, fill = B)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/gained/KOG_B_Freqs_Gained.svg')

#### Lost genes

In [None]:
# Stacked bar chart of the relative COG category frequencies of the genes lost in each cluster MRCA
lost.cog.toplot = bind_rows(list('1' = cog.1.lost.freq, '4' = cog.4.lost.freq, '14a' = cog.14a.lost.freq), .id = 'cluster')
n.colors = length(unique(lost.cog.toplot$category))
present.cogs = sort(intersect(unique(lost.cog.toplot$category), rownames(cog.cats)))

# The legend labels are named by category letter, so ggplot2 matches each description to its own
# category; a category without a row in cog.cats keeps its letter as label instead of shifting
# (or breaking) all the other labels. The plot is rendered by top-level printing into the open svg device.
svg('output/lost/COG_Freqs_Lost.svg', height = 4, width = 5.5)
ggplot(lost.cog.toplot, aes(x = cluster, y = freq, fill = category)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4),
                    labels = setNames(paste0("(", present.cogs, ") ", cog.cats[present.cogs,]), present.cogs)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% COGs", fill = "COG category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/lost/COG_Freqs_Lost.svg')

In [None]:
lost.kog.A.toplot = bind_rows(list('1' = kog.1.lost.freq.A, '4' = kog.4.lost.freq.A, '14a' = kog.14a.lost.freq.A), .id = 'cluster')
n.colors = length(unique(lost.kog.A.toplot$A))

svg('output/lost/KOG_A_Freqs_Lost.svg', height = 2, width = 4)
ggplot(lost.kog.A.toplot, aes(x = cluster, y = freq, fill = A)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-A category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/lost/KOG_A_Freqs_Lost.svg')

In [None]:
lost.kog.B.toplot = bind_rows(list('1' = kog.1.lost.freq.B, '4' = kog.4.lost.freq.B, '14a' = kog.14a.lost.freq.B), .id = 'cluster')
n.colors = length(unique(lost.kog.B.toplot$B))

svg('output/lost/KOG_B_Freqs_Lost.svg', height = 5, width = 4.5)
ggplot(lost.kog.B.toplot, aes(x = cluster, y = freq, fill = B)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster", y = "% KOGs", fill = "BRITE-B category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/lost/KOG_B_Freqs_Lost.svg')

### Overview plot of the number of exchanged genes

In [None]:
numbers.exchanged.toplot = bind_rows(list(
    '1' = bind_cols(list('gained' = nrow(gained.1), 'lost' = nrow(lost.1))),
    '4' = bind_cols(list('gained' = nrow(gained.4), 'lost' = nrow(lost.4))),
    '14a' = bind_cols(list('gained' = nrow(gained.14a), 'lost' = nrow(lost.14a)))
    ), .id = 'cluster') %>% pivot_longer(cols = c('gained', 'lost'), names_to = "type", values_to = "counts")

In [None]:
svg('output/exchanged.svg', height = 2, width = 3)
ggplot(numbers.exchanged.toplot, aes(x = cluster, y = counts, fill = cluster)) +
    geom_bar(stat = "identity") +
    facet_grid2(~type, scales = "free", independent = 'y') +
    scale_x_discrete(expand = c(0,0)) +
    theme(panel.spacing = unit(1, "lines"), legend.position = "none") +
    labs(x = 'rRNA cluster MRCA', y = 'Number of genes')
dev.off()
display_svg(file = "output/exchanged.svg")

## Characterising the genes in the LCA

In [None]:
lca.genes = as.data.frame(families[common])
colnames(lca.genes) = c("family")

In [None]:
lca.genes

In [None]:
lca.cog.freq = aggregate_and_count_cog(lca.genes, cog)
lca.kog.freq.A = aggregate_and_count_kog(lca.genes, kog, 'A')
lca.kog.freq.B = aggregate_and_count_kog(lca.genes, kog, 'B')
lca.kog.freq.C = aggregate_and_count_kog(lca.genes, kog, 'C')
lca.kog.freq.D = aggregate_and_count_kog(lca.genes, kog, 'D')

In [None]:
lca.kog.freq.C

In [None]:
dir.create('output/common', recursive = TRUE)
write.table(lca.cog.freq, 'output/common/lca.cog', sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
write.table(lca.kog.freq.A, 'output/common/lca.kog.A', sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
write.table(lca.kog.freq.B, 'output/common/lca.kog.B', sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
write.table(lca.kog.freq.C, 'output/common/lca.kog.C', sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
write.table(lca.kog.freq.D, 'output/common/lca.kog.D', sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)

#### Plotting

In [None]:
# Stacked bar chart of the relative COG category frequencies of the LCA genes
lca.cog.toplot = bind_rows(list('LCA' = lca.cog.freq), .id = 'cluster')
n.colors = length(unique(lca.cog.toplot$category))
present.cogs = sort(intersect(unique(lca.cog.toplot$category), rownames(cog.cats)))

# The legend labels are named by category letter, so ggplot2 matches each description to its own
# category; a category without a row in cog.cats keeps its letter as label instead of shifting
# (or breaking) all the other labels. The plot is rendered by top-level printing into the open svg device.
svg('output/common/COG_Freqs.svg', height = 4, width = 5)
ggplot(lca.cog.toplot, aes(x = cluster, y = freq, fill = category)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4),
                    labels = setNames(paste0("(", present.cogs, ") ", cog.cats[present.cogs,]), present.cogs)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "", y = "% COGs", fill = "COG category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/common/COG_Freqs.svg')

In [None]:
lca.kog.A.toplot = bind_rows(list('LCA' = lca.kog.freq.A), .id = 'cluster')
n.colors = length(unique(lca.kog.A.toplot$A))

svg('output/common/KOG_A_Freqs.svg', height = 2, width = 3.5)
ggplot(lca.kog.A.toplot, aes(x = cluster, y = freq, fill = A)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "", y = "% KOGs", fill = "BRITE-A category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/common/KOG_A_Freqs.svg')

In [None]:
lca.kog.B.toplot = bind_rows(list('LCA' = lca.kog.freq.B), .id = 'cluster')
n.colors = length(unique(lca.kog.B.toplot$B))

svg('output/common/KOG_B_Freqs.svg', height = 5, width = 4)
ggplot(lca.kog.B.toplot, aes(x = cluster, y = freq, fill = B)) +
  geom_bar(stat = "identity", position="stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "", y = "% KOGs", fill = "BRITE-B category") +
  theme(panel.spacing = unit(1, "lines"))
dev.off()
display_svg(file = 'output/common/KOG_B_Freqs.svg')

## Overview figure for supplementary figures

#### COG

In [None]:
cog.toplot = bind_rows(list('Gained' = gained.cog.toplot, 'Lost' = lost.cog.toplot, 'LCA' = lca.cog.toplot), .id = 'type')
cog.toplot$type = factor(cog.toplot$type, levels = c('Gained', 'Lost', 'LCA'))
cog.toplot

In [None]:
# Overview figure: relative COG category frequencies, one facet per type (Gained, Lost, LCA)
n.colors = length(unique(cog.toplot$category))
present.cogs = sort(intersect(unique(cog.toplot$category), rownames(cog.cats)))

# The legend labels are named by category letter, so ggplot2 matches each description to its own
# category; a category without a row in cog.cats keeps its letter as label instead of shifting
# (or breaking) all the other labels. The plot is rendered by top-level printing into the open svg device.
svg('output/ASR_COGannotation_overview.svg', height = 4.5, width = 8.5)
ggplot(cog.toplot, aes(x = cluster, y = freq, fill = category)) +
  facet_grid2(~type, scales = 'free', space = 'free') +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = getPalette(n.colors),
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4),
                    labels = setNames(paste0("(", present.cogs, ") ", cog.cats[present.cogs,]), present.cogs)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster MRCA", y = "% COGs", fill = "COG category") +
  theme(panel.spacing = unit(1, "lines"), text = element_text(size = 13))
dev.off()
display_svg(file = 'output/ASR_COGannotation_overview.svg')

#### KOG

In [None]:
kog.B.toplot = bind_rows(list('Gained' = gained.kog.B.toplot, 'Lost' = lost.kog.B.toplot, 'LCA' = lca.kog.B.toplot), .id = 'type')
kog.B.toplot$type = factor(kog.B.toplot$type, levels = c('Gained', 'Lost', 'LCA'))
kog.B.toplot

In [None]:
n.colors = length(unique(kog.B.toplot$B))

svg('output/ASR_KOGBannotation_overview.svg', height = 5.25, width = 7.5)
ggplot(kog.B.toplot, aes(x = cluster, y = freq, fill = B)) +
  facet_grid2(~type, scales = 'free', space = 'free') +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = getPalette(n.colors), 
                    guide = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.4)) +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  labs(x = "rRNA cluster MRCA", y = "% KOGs", fill = "BRITE-B category") +
  theme(panel.spacing = unit(1, "lines"), text = element_text(size = 13))
dev.off()
display_svg(file = 'output/ASR_KOGBannotation_overview.svg')

In [None]:
gained.kog.B.toplot
lost.kog.B.toplot
lca.kog.B.toplot

## Saving and session info

In [None]:
save.image(file = 'output/env_output.RData')

In [2]:
sessionInfo()

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Linux Mint 21.2

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=nl_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=nl_BE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=nl_BE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] stringr_1.5.1      ggtree_3.11.1      ape_5.8            IRdisplay_1.1     
 [5] pals_1.8           RColorBrewer_1.1-3 ggh4x_0.2.8        ggplot2_3.5.1     
 [9] Matrix_1.6-5       tidyr_1.3.1        dplyr_1.1.4        data.table_1.15.4 

load