This part of the pipeline processes the raw Pseudofinder output and statistically compares the normalised pseudogene counts by rRNA cluster.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Tab-separated genome metadata table. Later cells read the assembly IDs
# (column index 1), cluster labels (column index 2) and genome sizes
# (column index 4) from it.
metadata = './genomes_metadata'

#### Pipeline output folders

In [None]:
# Root folder of this task; the raw tool output and the processed results
# live in sibling sub-folders beneath it.
task_root = "./10-MGEs/pseudogenes"
output_folder = f"{task_root}/output"
results_folder = f"{task_root}/processed_output"

#### Tool pointers and parameters

#### Libraries and other setups

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it
import scipy.stats as sts
import numpy as np
from statannotations.Annotator import Annotator

In [None]:
# Fixed colour (hex code) for each cluster label, so that every plot uses the
# same colour for the same cluster. Hard-coded here in place of the earlier
# selection from sns.husl_palette().
custom_palette = {
    "1": "#ee6677",
    "4": "#4477aa",
    "14a": "#228833",
    "14b": "#ccbb44",
}
custom_palette

In [None]:
# Create the folder for processed results (including missing parents);
# exist_ok=True makes re-running this cell harmless.
os.makedirs(results_folder, exist_ok=True)

## Reading input files

### Parsing Pseudofinder results

In [None]:
# Idempotent; repeated here so this cell can be run on its own.
os.makedirs(results_folder, exist_ok=True)

# Pseudogene hits are listed in the "*_pseudos.fasta" file(s) inside each
# assembly's result folder; the folder name is used as the assembly ID.
# One output row is produced per matching FASTA file.
hits = []
# Sorted so that the row order does not depend on the arbitrary os.listdir order.
for assembly_id in sorted(os.listdir(output_folder)):
    assembly_dir = os.path.join(output_folder, assembly_id)
    # Skip stray files in the output folder: os.listdir() on a non-directory
    # would raise NotADirectoryError.
    if not os.path.isdir(assembly_dir):
        continue
    for file_name in sorted(os.listdir(assembly_dir)):
        if '_pseudos.fasta' not in file_name:
            continue
        with open(os.path.join(assembly_dir, file_name), "r") as handle:
            # A FASTA record starts with a '>' header line. Counting header
            # lines (rather than every '>' character) keeps a '>' inside a
            # description from inflating the count, and streams the file
            # instead of loading it whole.
            counts = sum(1 for line in handle if line.startswith('>'))
        hits.append({'assembly_ID': assembly_id, 'counts': counts})

# Explicit columns keep the frame well-formed even when no hits were found.
hits = pd.DataFrame(hits, columns=['assembly_ID', 'counts'])
hits

### Loading the cluster annotations

In [None]:
# Keep only the assembly-ID and cluster columns (zero-based file columns 1
# and 2) of the tab-separated metadata table and give them fixed names.
cluster_annotations_0 = pd.read_table(metadata, sep='\t', usecols=[1, 2])
cluster_annotations_0.columns = ['assemblyID', 'cluster']

# Lookup table: assembly ID -> cluster label.
cluster_annotations = dict(
    zip(
        cluster_annotations_0['assemblyID'].tolist(),
        cluster_annotations_0['cluster'].tolist(),
    )
)
cluster_annotations

### Reading genome sizes

The genome sizes are needed to normalise the pseudogene counts.

In [None]:
# Keep only the assembly-ID and genome-size columns (zero-based file columns
# 1 and 4) of the tab-separated metadata table and give them fixed names.
genome_sizes_0 = pd.read_table(
    metadata,
    sep='\t',
    usecols=[1, 4],
)
genome_sizes_0.columns = ['assemblyID', 'size']

In [None]:
# Lookup table: assembly ID -> genome size.
genome_sizes = dict(
    zip(
        genome_sizes_0['assemblyID'].tolist(),
        genome_sizes_0['size'].tolist(),
    )
)
genome_sizes

### Adding metadata columns

In [None]:
# Annotate every assembly with its cluster label and genome size.
# Plain dictionary indexing is used on purpose: an assembly ID missing from
# the metadata raises a KeyError instead of silently producing a gap.
assembly_ids = hits['assembly_ID']
hits['cluster'] = assembly_ids.map(lambda assembly: cluster_annotations[assembly])
hits['size'] = assembly_ids.map(lambda assembly: genome_sizes[assembly])
hits

In [None]:
# Save the annotated raw counts as a tab-separated table without the row index.
hits.to_csv(f"{results_folder}/counts", sep='\t', index=False)

### General count plots

Normalise the pseudogene counts by genome size (count divided by genome size, multiplied by 1,000,000).

In [None]:
# Work on a renamed copy so that the column names double as plot labels.
cluster_counts = hits.rename(columns={'counts': 'No. pseudogenes'})

# Pseudogene count divided by genome size, scaled by 1,000,000.
cluster_counts['Norm. no. pseudogenes'] = (
    cluster_counts['No. pseudogenes']
    .div(cluster_counts['size'])
    .mul(1000000)
)
cluster_counts

In [None]:
# Save the raw and normalised counts per assembly as a tab-separated table
# without the row index.
cluster_counts.to_csv(f"{results_folder}/counts_per_assembly_cluster", sep='\t', index=False)

#### Barplot

In [None]:
# Horizontal bar chart of the mean normalised pseudogene count per cluster,
# with standard-error bars, in a fixed cluster order.
fig, ax = plt.subplots(figsize=(5, 2))
ax = sns.barplot(
    data=cluster_counts,
    x="Norm. no. pseudogenes",
    y="cluster",
    order=['1', '14a', '4', '14b'],
    orient="h",
    estimator="mean",
    errorbar="se",
    palette=custom_palette,
    width=0.9,
    ax=ax,
)
ax.set_xlabel('Avg. norm. no. pseudogenes')
ax.set_ylabel('rRNA cluster')
ax.set_title('Pseudogenes')
fig.savefig(f"{results_folder}/av_counts_pseudogenes_cluster_bar.svg")
plt.show()

#### Violinplot

In [None]:
# Keyword arguments shared by the violin plot and the annotator, so that both
# are handed the same data, axes mapping, orientation and cluster order.
shared_plot_kwargs = dict(
    data=cluster_counts,
    x='Norm. no. pseudogenes',
    y='cluster',
    orient='h',
    cut=0,
    order=['1', '14a', '4', '14b'],
)

# Horizontal violin plot of the normalised counts per cluster.
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.violinplot(ax=ax, palette=custom_palette, **shared_plot_kwargs)
ax.set_xlabel('Norm. no. pseudogenes')
ax.set_ylabel('rRNA cluster')
ax.set_title('Pseudogenes')

# Add statistical significance markers: a Mann-Whitney test for every pair of
# clusters present in the data, drawn as stars inside the plot area.
pairs = list(it.combinations(cluster_counts['cluster'].unique(), 2))
annotator = Annotator(ax=ax, pairs=pairs, **shared_plot_kwargs)
annotator.configure(test='Mann-Whitney', text_format='star', loc='inside')
annotator.apply_and_annotate()

fig.savefig(f"{results_folder}/counts_pseudogenes_cluster_violin.svg")
plt.show()

#### Exact stats

Getting all the counts grouped by rRNA cluster

In [None]:
# (normalised count, cluster) pairs, one per row of cluster_counts.
cluster_counts_stats = list(
    zip(
        cluster_counts['Norm. no. pseudogenes'].tolist(),
        cluster_counts['cluster'].tolist(),
    )
)

# Group the normalised counts by cluster: {cluster label: [counts, ...]}.
# Clusters appear in the order in which they are first encountered.
counts_stats = {}
for norm_count, cluster_label in cluster_counts_stats:
    counts_stats.setdefault(cluster_label, []).append(norm_count)
counts_stats

In [None]:
# Mean and standard deviation of the normalised counts for each cluster.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
[
    (cluster_label, [np.mean(norm_counts), np.std(norm_counts)])
    for cluster_label, norm_counts in counts_stats.items()
]

In [None]:
# Mann-Whitney U test between every pair of clusters; print the pair together
# with the p-value (second field of the result) of its test.
for first, second in it.combinations(counts_stats.keys(), 2):
    _, p_value = sts.mannwhitneyu(counts_stats[first], counts_stats[second])
    print(f"{(first, second)!s}: {p_value!s}")