This part of the pipeline collects the genome sizes of all genomes in the genome set and statistically compares their sizes by rRNA cluster.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Path to the genome metadata table; it is parsed as a tab-separated file
# when the metadata is read further down in this notebook.
metadata = "./genomes_metadata"

#### Pipeline output folders

In [None]:
# Root output folder of this task; the figures produced by the plotting
# cells are saved into it.
task_root = "./10-MGEs/genome_sizes"

#### Tool pointers and parameters

#### Libraries and other setups

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it
import scipy.stats as sts
import numpy as np
from statannotations.Annotator import Annotator

Let's pick four colours from seaborn's HUSL colour palette to use in the plots.

In [None]:
# Start from seaborn's default HUSL palette and keep only the colours at
# positions 0, 2, 4 and 5; the resulting list is reused by every plot.
husl_colours = sns.husl_palette()
custom_palette = [husl_colours[position] for position in (0, 2, 4, 5)]
custom_palette

### Reading metadata

In [None]:
def _bp_to_mb(raw_size):
    """Convert a genome size read as text (in bp) into megabases (Mb)."""
    return int(raw_size) / 1000000


# Only the columns at positions 0, 2, 4 and 13 are loaded. Column 4 holds the
# genome size in bp; it is converted to Mb while parsing, so the column header
# is renamed afterwards to reflect the new unit.
data = pd.read_table(
    metadata,
    sep = "\t",
    usecols = [0, 2, 4, 13],
    converters = {4: _bp_to_mb},
).rename(columns = {'Size (bp)': 'Size (Mb)'})
data

In [None]:
# Count the non-missing entries of every remaining column for each
# (rRNA cluster, lifestyle) combination. groupby().count() already returns
# a DataFrame, so it is displayed directly.
grouping_columns = ['Taxonomic_cluster', 'Assumed_lifestyle']
data.groupby(grouping_columns).count()

### Stats

#### Barplot

In [None]:
# Horizontal barplot of the mean genome size per rRNA cluster; the error bars
# show the standard error of the mean.
fig, ax = plt.subplots(figsize = (5,2))
# NOTE(review): recent seaborn releases (>= 0.13) warn when `palette` is given
# without `hue` -- confirm against the installed seaborn/statannotations versions
# before changing this call.
ax = sns.barplot(ax = ax, data = data, estimator = "mean", errorbar = "se",
                 x = "Size (Mb)", y = "Taxonomic_cluster", palette = custom_palette,
                 width = 0.9, orient = "h")
ax.set(xlabel = 'Genome size (Mb)', ylabel = 'rRNA cluster', title = 'Genome sizes')

# Make sure the output folder exists so that saving the figure does not fail
# on a fresh checkout / first run of the pipeline.
os.makedirs(task_root, exist_ok = True)
fig.savefig(os.path.join(task_root, "av_genome_sizes.svg"))
plt.show()

#### Violinplot

In [None]:
# Horizontal violinplot of the genome size distribution per rRNA cluster.
# cut = 0 limits each violin to the observed data range.
fig, ax = plt.subplots(figsize = (5,3))
ax = sns.violinplot(ax = ax, data = data, x = 'Size (Mb)', y = 'Taxonomic_cluster', palette = custom_palette, orient = 'h', cut = 0)
ax.set(xlabel = 'Genome size (Mb)', ylabel = 'rRNA cluster', title = 'Genome sizes')

# Add statistical significance marks
pairs = list(it.combinations(data['Taxonomic_cluster'].unique(), 2)) # get all rRNA cluster pairs
# The Annotator receives the same plot parameters as the violinplot call so
# that it can position the marks on the existing axes.
annotator = Annotator(ax = ax, pairs = pairs, data = data, x = 'Size (Mb)', y = 'Taxonomic_cluster', orient = 'h', cut = 0)
# NOTE(review): every cluster pair is tested and no multiple-comparison
# correction is configured here -- confirm that this is intended.
annotator.configure(test = 'Mann-Whitney', text_format = 'star', loc = 'inside')
annotator.apply_and_annotate()

# Make sure the output folder exists so that saving the figure does not fail
# on a fresh checkout / first run of the pipeline.
os.makedirs(task_root, exist_ok = True)
fig.savefig(os.path.join(task_root, 'genome_sizes.svg'))
plt.show()

#### Stats values

Collect all genome sizes into one list per rRNA cluster.

In [None]:
# Turn the two relevant columns into (genome size, rRNA cluster) records ...
size = data[['Size (Mb)', 'Taxonomic_cluster']].to_dict(orient = 'list')
size_data = list(zip(*size.values()))

# ... and group the sizes by cluster: {cluster: [size, size, ...]}.
# Clusters appear in order of first occurrence, sizes in table order.
size_stats = {}
for genome_size, cluster in size_data:
    size_stats.setdefault(cluster, []).append(genome_size)
size_stats

Per-cluster mean and standard deviation, followed by pairwise Mann-Whitney U tests

In [None]:
# One (cluster, [mean, standard deviation]) entry per rRNA cluster.
# NOTE(review): np.std defaults to ddof=0 (population SD) -- confirm this is
# the intended estimator rather than the sample SD (ddof=1).
[
    (cluster, [np.mean(cluster_sizes), np.std(cluster_sizes)])
    for cluster, cluster_sizes in size_stats.items()
]

In [None]:
# Pairwise Mann-Whitney U tests between all rRNA clusters; one line with the
# p-value is printed per cluster pair.
# `alternative` is passed explicitly so that the printed p-values do not depend
# on the default of the installed SciPy version and correspond to a two-sided
# test, like the 'Mann-Whitney' test configured for the plot annotations.
tests = it.combinations(size_stats.keys(), 2) # get all rRNA cluster pairs
for comb in tests:
    first_cluster, second_cluster = comb
    result = sts.mannwhitneyu(size_stats[first_cluster],
                              size_stats[second_cluster],
                              alternative = 'two-sided')
    print(str(comb) + ': ' + str(result.pvalue))