This part of the pipeline processes the raw PHASTEST output and statistically compares the normalised prophage counts by rRNA cluster.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Tab-separated genome metadata table; it is read further down to obtain the
# cluster and QC status of every assembly
metadata = "./genomes_metadata"

#### Pipeline output folders

In [None]:
# Root folder of the prophage task: raw PHASTEST results are read from "output",
# derived tables and figures are written to "processed_output"
task_root = "./10-MGEs/prophages"
output_folder = f"{task_root}/output"
results_folder = f"{task_root}/processed_output"

#### Tool pointers and parameters

#### Libraries and other setups

In [None]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
from supervenn import supervenn
import seaborn as sns
import itertools as it
import scipy.stats as sts
import numpy as np
from statannotations.Annotator import Annotator

In [None]:
# Fixed colour per cluster label; passed as `palette` to the seaborn plots below.
# Earlier husl-based alternative, kept for reference:
# custom_palette = sns.husl_palette()
# custom_palette = [custom_palette[0], custom_palette[2], custom_palette[4], custom_palette[5]]
custom_palette = {'1': '#ee6677',
                  '4': '#4477aa',
                  '14a': '#228833',
                  '14b': '#ccbb44'}
custom_palette

In [None]:
# Create the folder for processed results (including parents); a no-op if it already exists
os.makedirs(results_folder, exist_ok=True)

## Phage identifications

### Parsing PHASTEST phage regions output

In [None]:
# Parse the predicted prophage regions of every PHASTEST result directory.
# Prophage hit regions can be read directly from the json_input_regions file.
# Entries of the output folder whose name contains '.log' are skipped; the remaining
# ones are visited in sorted order so that the row order of the table is reproducible
# (os.listdir returns entries in arbitrary order).
result_dirs = sorted(entry for entry in os.listdir(output_folder) if '.log' not in entry)
hits = []
for assembly_dir in result_dirs:  # not named `dir`, which would shadow the builtin
    regions_path = os.path.join(output_folder, assembly_dir, "json_input_regions")
    with open(regions_path, "r") as handle:
        regions = json.load(handle)
    for r in regions:
        # 'most_common_phage' is split on '_': the last two fields form the phage genome ID,
        # the first field is dropped and the fields in between form the phage name
        fields = r['most_common_phage'].split('_')
        hits.append({'assembly_ID': assembly_dir,
                     'length': r['stop'] - r['start'],
                     'completeness': r['completeness'],
                     'name': '_'.join(fields[1:-2]),
                     'phage_genome_ID': '_'.join(fields[-2:])})
hits = pd.DataFrame(hits)
hits

In [None]:
# Save all predicted regions as a tab-separated table (the positional row index is not written)
hits.to_csv(results_folder + "/all_hits", sep = "\t", index = False)

#### Intact prophages only

In [None]:
# Keep only the regions whose completeness is 'intact', renumbering the rows from 0
is_intact = hits['completeness'].eq('intact')
hits_intact = hits.loc[is_intact].reset_index(drop=True)
hits_intact

In [None]:
# Save the intact regions as a tab-separated table (the positional row index is not written)
hits_intact.to_csv(results_folder + "/intact_hits", sep = "\t", index = False)

#### Decayed prophages only

In [None]:
# Every region whose completeness is anything other than 'intact' is treated as decayed
is_decayed = hits['completeness'].ne('intact')
hits_decayed = hits.loc[is_decayed].reset_index(drop=True)
hits_decayed

In [None]:
# Save the decayed regions as a tab-separated table (the positional row index is not written)
hits_decayed.to_csv(results_folder + "/decayed_hits", sep = "\t", index = False)

## Loading the cluster annotations

In [None]:
# Read three columns of the metadata table (file positions 1, 2 and 11) and name them
# assembly ID, cluster and QC flag
cluster_annotations_0 = pd.read_table(metadata, sep = '\t', usecols = [1,2,11])
cluster_annotations_0.columns = ['assemblyID', 'cluster', 'Failed_QC']
# Discard every row whose QC flag is anything other than False
# NOTE(review): this presumably relies on the flag being parsed as a boolean column — confirm
not_passed = cluster_annotations_0.index[cluster_annotations_0['Failed_QC'] != False]
cluster_annotations_0.drop(not_passed, inplace = True)
# Lookup table: assembly ID -> cluster
cluster_annotations = dict(zip(cluster_annotations_0['assemblyID'].tolist(),
                               cluster_annotations_0['cluster'].tolist()))
cluster_annotations

In [None]:
# The same assembly -> cluster mapping as a two-column dataframe, used for merging
annots = pd.DataFrame.from_records(list(cluster_annotations.items()),
                                   columns = ('assembly_ID', 'cluster'))
annots

## Setting up the count pivot tables

### All prophages

In [None]:
# Number of predicted regions per (assembly, phage name) pair
counts = hits.groupby(['assembly_ID', 'name'])['length'].count().rename('counts').reset_index()
# Wide table: one row per assembly, one column per phage name
counts = counts.pivot(index = 'assembly_ID', columns = 'name', values = 'counts')
# Right merge on the annotated assemblies: assemblies without any hit get an all-zero row,
# assemblies that are absent from the annotations are left out
counts = counts.merge(annots, on = "assembly_ID", how = "right").set_index('assembly_ID')
counts = counts.fillna(0).drop(columns = 'cluster').astype(int)
counts

In [None]:
# The assembly IDs live in the index of the count table, so the index is written out too;
# without it the rows of the file cannot be attributed to their genomes
counts.to_csv(results_folder + "/all_counts", sep = "\t", index = True)

### Intact prophages

In [None]:
# Number of intact regions per (assembly, phage name) pair
counts_intact = hits_intact.groupby(['assembly_ID', 'name'])['length'].count().rename('counts').reset_index()
# Wide table: one row per assembly, one column per phage name
counts_intact = counts_intact.pivot(index = 'assembly_ID', columns = 'name', values = 'counts')
# Right merge on the annotated assemblies: assemblies without any intact hit get an all-zero row
counts_intact = counts_intact.merge(annots, on = "assembly_ID", how = "right").set_index('assembly_ID')
counts_intact = counts_intact.fillna(0).drop(columns = 'cluster').astype(int)
counts_intact

In [None]:
# The assembly IDs live in the index of the count table, so the index is written out too;
# without it the rows of the file cannot be attributed to their genomes
counts_intact.to_csv(results_folder + "/intact_counts", sep = "\t", index = True)

### Decayed prophages

In [None]:
# Number of decayed regions per (assembly, phage name) pair
counts_decayed = hits_decayed.groupby(['assembly_ID', 'name'])['length'].count().rename('counts').reset_index()
# Wide table: one row per assembly, one column per phage name
counts_decayed = counts_decayed.pivot(index = 'assembly_ID', columns = 'name', values = 'counts')
# Right merge on the annotated assemblies: assemblies without any decayed hit get an all-zero row
counts_decayed = counts_decayed.merge(annots, on = "assembly_ID", how = "right").set_index('assembly_ID')
counts_decayed = counts_decayed.fillna(0).drop(columns = 'cluster').astype(int)
counts_decayed

In [None]:
# The assembly IDs live in the index of the count table, so the index is written out too;
# without it the rows of the file cannot be attributed to their genomes
counts_decayed.to_csv(results_folder + "/decayed_counts", sep = "\t", index = True)

## Count plotting

### all

Adding rRNA cluster annotations

In [None]:
# Total number of prophage regions per assembly, labelled with the assembly's cluster
counts_av = counts.sum(axis = 1).to_frame(name = 'counts')
counts_av['cluster'] = [cluster_annotations[assembly] for assembly in counts_av.index]
counts_av

#### Barplot

In [None]:
# Horizontal bars: mean prophage count per cluster with standard-error bars
fig, ax = plt.subplots(figsize = (5,2))
sns.barplot(data = counts_av, x = "counts", y = "cluster", orient = "h",
            estimator = "mean", errorbar = "se", palette = custom_palette,
            width = 0.9, ax = ax)
ax.set_xlabel('Avg. no. prophages')
ax.set_ylabel('Taxonomic cluster')
ax.set_title('Prophages')
fig.savefig(results_folder + "/" + "av_counts_bar.svg")
plt.show()

#### Violinplot

In [None]:
# Distribution of the per-assembly prophage counts for each cluster
fig, ax = plt.subplots(figsize = (5,3))
ax = sns.violinplot(ax = ax, data = counts_av, x = 'counts', y = 'cluster', orient = 'h', palette = custom_palette, cut = 0)
plt.xlabel('No. prophages')
plt.ylabel('rRNA cluster')
plt.title('Prophages')

# Adding statistical significance markers: Mann-Whitney test for every pair of clusters,
# drawn as stars inside the axes
# NOTE(review): `plot` is left at the Annotator default although the axes hold a violin plot,
# and no multiple-comparison correction is configured — confirm both are intended
pairs = list(it.combinations(counts_av['cluster'].unique(), 2))
annotator = Annotator(ax = ax, pairs = pairs, data = counts_av, x = 'counts', y = 'cluster', orient = 'h', cut = 0)
annotator.configure(test = 'Mann-Whitney', text_format = 'star', loc = 'inside')
annotator.apply_and_annotate()

plt.savefig(results_folder + "/" + 'av_counts_violin.svg')
plt.show()

#### Exact stats

Grouping the counts by rRNA cluster

In [None]:
# (count, cluster) pairs, one per assembly
counts_av_stats = list(zip(counts_av['counts'].tolist(), counts_av['cluster'].tolist()))
# One list of counts per cluster, clusters in order of first appearance
counts_av_stats_dict = {}
for n_prophages, cluster_label in counts_av_stats:
    counts_av_stats_dict.setdefault(cluster_label, []).append(n_prophages)

In [None]:
# Mean and standard deviation of the counts per cluster (np.std with its default ddof of 0)
[(cluster_label, [np.mean(values), np.std(values)])
 for cluster_label, values in counts_av_stats_dict.items()]

In [None]:
# Mann-Whitney U test (scipy defaults) for every pair of clusters; prints each pair with its p-value
tests = it.combinations(counts_av_stats_dict.keys(), 2)
for first, second in tests:
    p_value = sts.mannwhitneyu(counts_av_stats_dict[first], counts_av_stats_dict[second])[1]
    print(str((first, second)) + ': ' + str(p_value))

### intact

Adding rRNA cluster annotations

In [None]:
# Total number of intact prophage regions per assembly, labelled with the assembly's cluster
counts_intact_av = counts_intact.sum(axis = 1).to_frame(name = 'counts')
counts_intact_av['cluster'] = [cluster_annotations[assembly] for assembly in counts_intact_av.index]
counts_intact_av

#### Barplot

In [None]:
# Horizontal bars: mean intact-prophage count per cluster with standard-error bars
fig, ax = plt.subplots(figsize = (5,2))
sns.barplot(data = counts_intact_av, x = "counts", y = "cluster", orient = "h",
            estimator = "mean", errorbar = "se", palette = custom_palette,
            width = 0.9, ax = ax)
ax.set_xlabel('Avg. no. intact prophages')
ax.set_ylabel('Taxonomic cluster')
ax.set_title('Complete prophages')
fig.savefig(results_folder + "/" + "av_counts_intact_bar.svg")
plt.show()

#### Violinplot

In [None]:
# Distribution of the per-assembly intact-prophage counts for each cluster
fig, ax = plt.subplots(figsize = (5,3))
ax = sns.violinplot(ax = ax, data = counts_intact_av, x = 'counts', y = 'cluster', orient = 'h', palette = custom_palette, cut = 0)
plt.xlabel('No. intact prophages')
plt.ylabel('rRNA cluster')
plt.title('Complete prophages')

# Adding statistical significance markers: Mann-Whitney test for every pair of clusters,
# drawn as stars inside the axes
# NOTE(review): `plot` is left at the Annotator default although the axes hold a violin plot,
# and no multiple-comparison correction is configured — confirm both are intended
pairs = list(it.combinations(counts_intact_av['cluster'].unique(), 2))
annotator = Annotator(ax = ax, pairs = pairs, data = counts_intact_av, x = 'counts', y = 'cluster', orient = 'h', cut = 0)
annotator.configure(test = 'Mann-Whitney', text_format = 'star', loc = 'inside')
annotator.apply_and_annotate()

plt.savefig(results_folder + "/" + 'av_counts_intact_violin.svg')
plt.show()

#### Exact stats

Grouping the counts by rRNA cluster

In [None]:
# (count, cluster) pairs, one per assembly
counts_intact_av_stats = list(zip(counts_intact_av['counts'].tolist(), counts_intact_av['cluster'].tolist()))
# One list of counts per cluster, clusters in order of first appearance
counts_intact_av_stats_dict = {}
for n_prophages, cluster_label in counts_intact_av_stats:
    counts_intact_av_stats_dict.setdefault(cluster_label, []).append(n_prophages)

In [None]:
# Mean and standard deviation of the intact counts per cluster (np.std with its default ddof of 0)
[(cluster_label, [np.mean(values), np.std(values)])
 for cluster_label, values in counts_intact_av_stats_dict.items()]

In [None]:
# Mann-Whitney U test (scipy defaults) for every pair of clusters; prints each pair with its p-value
tests = it.combinations(counts_intact_av_stats_dict.keys(), 2)
for first, second in tests:
    p_value = sts.mannwhitneyu(counts_intact_av_stats_dict[first], counts_intact_av_stats_dict[second])[1]
    print(str((first, second)) + ': ' + str(p_value))

### decayed

Adding rRNA cluster annotations

In [None]:
# Total number of decayed prophage regions per assembly, labelled with the assembly's cluster
counts_decayed_av = counts_decayed.sum(axis = 1).to_frame(name = 'counts')
counts_decayed_av['cluster'] = [cluster_annotations[assembly] for assembly in counts_decayed_av.index]
counts_decayed_av

#### Barplot

In [None]:
# Horizontal bars: mean decayed-prophage count per cluster with standard-error bars
fig, ax = plt.subplots(figsize = (5,2))
sns.barplot(data = counts_decayed_av, x = "counts", y = "cluster", orient = "h",
            estimator = "mean", errorbar = "se", palette = custom_palette,
            width = 0.9, ax = ax)
ax.set_xlabel('Avg. no. decayed prophages')
ax.set_ylabel('Taxonomic cluster')
ax.set_title('Decayed prophages')
fig.savefig(results_folder + "/" + "av_counts_decayed_bar.svg")
plt.show()

#### Violinplot

In [None]:
# Distribution of the per-assembly decayed-prophage counts for each cluster
fig, ax = plt.subplots(figsize = (5,3))
ax = sns.violinplot(ax = ax, data = counts_decayed_av, x = 'counts', y = 'cluster', orient = 'h', palette = custom_palette, cut = 0)
plt.xlabel('No. decayed prophages')
plt.ylabel('rRNA cluster')
plt.title('Decayed prophages')

# Adding statistical significance markers: Mann-Whitney test for every pair of clusters,
# drawn as stars inside the axes
# NOTE(review): `plot` is left at the Annotator default although the axes hold a violin plot,
# and no multiple-comparison correction is configured — confirm both are intended
pairs = list(it.combinations(counts_decayed_av['cluster'].unique(), 2))
annotator = Annotator(ax = ax, pairs = pairs, data = counts_decayed_av, x = 'counts', y = 'cluster', orient = 'h', cut = 0)
annotator.configure(test = 'Mann-Whitney', text_format = 'star', loc = 'inside')
annotator.apply_and_annotate()

plt.savefig(results_folder + "/" + 'av_counts_decayed_violin.svg')
plt.show()

#### Exact stats

Grouping the counts by rRNA cluster

In [None]:
# (count, cluster) pairs, one per assembly
counts_decayed_av_stats = list(zip(counts_decayed_av['counts'].tolist(), counts_decayed_av['cluster'].tolist()))
# One list of counts per cluster, clusters in order of first appearance
counts_decayed_av_stats_dict = {}
for n_prophages, cluster_label in counts_decayed_av_stats:
    counts_decayed_av_stats_dict.setdefault(cluster_label, []).append(n_prophages)

In [None]:
# Mean and standard deviation of the decayed counts per cluster (np.std with its default ddof of 0)
[(cluster_label, [np.mean(values), np.std(values)])
 for cluster_label, values in counts_decayed_av_stats_dict.items()]

In [None]:
# Mann-Whitney U test (scipy defaults) for every pair of clusters; prints each pair with its p-value
tests = it.combinations(counts_decayed_av_stats_dict.keys(), 2)
for first, second in tests:
    p_value = sts.mannwhitneyu(counts_decayed_av_stats_dict[first], counts_decayed_av_stats_dict[second])[1]
    print(str((first, second)) + ': ' + str(p_value))

## Type and infectivity plotting

### all

Adding rRNA cluster annotations

In [None]:
# Long format: one row per (assembly, phage name) pair with its count, plus the assembly's cluster
counts_wide = counts.reset_index()
hits_molten = counts_wide.melt(id_vars = 'assembly_ID', var_name = "phage name", value_name = 'counts')
hits_molten['cluster'] = [cluster_annotations[assembly] for assembly in hits_molten['assembly_ID']]
hits_molten

#### Prophage types

In [None]:
# Mean count (with standard error) of every phage, split by cluster; phages listed alphabetically
fig, ax = plt.subplots(figsize = (6,25))
phage_order = sorted(hits_molten['phage name'].unique())
sns.barplot(data = hits_molten, x = "counts", y = "phage name", hue = "cluster",
            order = phage_order, orient = "h", estimator = "mean", errorbar = "se",
            width = 0.9, palette = custom_palette, ax = ax)
ax.set_xlabel('Avg. no. prophages')
ax.set_ylabel('Prophage')
fig.savefig(results_folder + "/" + "av_counts_prophage.svg")
plt.show()

#### Prophage infectivity

In [None]:
# Mean count of every phage within each cluster; a phage "infects" a cluster when that mean is above zero
infect_freq = hits_molten.groupby(['phage name', 'cluster'])['counts'].mean().rename('freq').reset_index()
infect_freq['infects'] = infect_freq['freq'].gt(0)
infect_freq

Get all infecting phages by rRNA cluster of the host

In [None]:
# Names of the phages found at least once in each host cluster.
# The selection is deliberately kept as a Series: squeezing it would turn a one-element
# result into a plain string, which cannot take a `.name` and would be split into
# single characters by set().
infectivities = []
for c in infect_freq['cluster'].unique():
    in_cluster = infect_freq['cluster'] == c
    to_add = infect_freq.loc[in_cluster & infect_freq['infects'], 'phage name'].copy()
    to_add.name = 'Cluster ' + c
    infectivities.append(to_add)

# One set of phage names per cluster, with the matching labels for the diagram
infectivities_sets = [set(i) for i in infectivities]
labels = [i.name for i in infectivities]

In [None]:
# supervenn diagram of the phage sets shared between the clusters
# NOTE(review): the axis labels below go through pyplot's current axes, i.e. whichever
# axes supervenn leaves active — presumably its main panel; confirm
fig, ax = plt.subplots(figsize = (7.5,4.5))
supervenn(infectivities_sets, labels, side_plots = 'right',
          chunks_ordering = 'minimize gaps', sets_ordering = 'minimize gaps', 
          widths_minmax_ratio = 0.15)
plt.xlabel('# infecting phages')
plt.ylabel('rRNA cluster')
plt.savefig(results_folder + "/" + "venn_infectivity.svg")
plt.show()

### intact

Adding rRNA cluster annotations

In [None]:
# Long format: one row per (assembly, phage name) pair with its intact count, plus the assembly's cluster
counts_intact_wide = counts_intact.reset_index()
hits_intact_molten = counts_intact_wide.melt(id_vars = 'assembly_ID', var_name = "phage name", value_name = 'counts')
hits_intact_molten['cluster'] = [cluster_annotations[assembly] for assembly in hits_intact_molten['assembly_ID']]
hits_intact_molten

#### Prophage types

In [None]:
# Mean intact count (with standard error) of every phage, split by cluster; phages listed alphabetically
fig, ax = plt.subplots(figsize = (6,15))
phage_order = sorted(hits_intact_molten['phage name'].unique())
sns.barplot(data = hits_intact_molten, x = "counts", y = "phage name", hue = "cluster",
            order = phage_order, orient = "h", estimator = "mean", errorbar = "se",
            width = 0.9, palette = custom_palette, ax = ax)
ax.set_xlabel('Avg. no. intact prophages')
ax.set_ylabel('Prophage')
fig.savefig(results_folder + "/" + "av_counts_prophage_intact.svg")
plt.show()

#### Prophage infectivity

In [None]:
# Mean intact count of every phage within each cluster; a phage "infects" a cluster when that mean is above zero
infect_intact_freq = hits_intact_molten.groupby(['phage name', 'cluster'])['counts'].mean().rename('freq').reset_index()
infect_intact_freq['infects'] = infect_intact_freq['freq'].gt(0)
infect_intact_freq

In [None]:
# Names of the intact phages found at least once in each host cluster.
# The selection is deliberately kept as a Series: squeezing it would turn a one-element
# result into a plain string, which cannot take a `.name` and would be split into
# single characters by set().
infectivities_intact = []
for c in infect_intact_freq['cluster'].unique():
    in_cluster = infect_intact_freq['cluster'] == c
    to_add = infect_intact_freq.loc[in_cluster & infect_intact_freq['infects'], 'phage name'].copy()
    to_add.name = 'Cluster ' + c
    infectivities_intact.append(to_add)

# One set of phage names per cluster, with the matching labels for the diagram
infectivities_intact_sets = [set(i) for i in infectivities_intact]
labels_intact = [i.name for i in infectivities_intact]

In [None]:
# supervenn diagram of the intact-phage sets shared between the clusters
# NOTE(review): the axis labels below go through pyplot's current axes, i.e. whichever
# axes supervenn leaves active — presumably its main panel; confirm
fig, ax = plt.subplots(figsize = (7.5,4.5))
supervenn(infectivities_intact_sets, labels_intact, side_plots = 'right',
          chunks_ordering = 'minimize gaps', sets_ordering = 'minimize gaps', 
          widths_minmax_ratio = 0.15)
plt.xlabel('# infecting phages')
plt.ylabel('rRNA cluster')
plt.savefig(results_folder + "/" + "venn_infectivity_intact.svg")
plt.show()

### decayed

Adding rRNA cluster annotations

In [None]:
# Long format: one row per (assembly, phage name) pair with its decayed count, plus the assembly's cluster
counts_decayed_wide = counts_decayed.reset_index()
hits_decayed_molten = counts_decayed_wide.melt(id_vars = 'assembly_ID', var_name = "phage name", value_name = 'counts')
hits_decayed_molten['cluster'] = [cluster_annotations[assembly] for assembly in hits_decayed_molten['assembly_ID']]
hits_decayed_molten

#### Prophage types

In [None]:
# Mean decayed count (with standard error) of every phage, split by cluster; phages listed alphabetically
fig, ax = plt.subplots(figsize = (6,15))
phage_order = sorted(hits_decayed_molten['phage name'].unique())
sns.barplot(data = hits_decayed_molten, x = "counts", y = "phage name", hue = "cluster",
            order = phage_order, orient = "h", estimator = "mean", errorbar = "se",
            width = 0.9, palette = custom_palette, ax = ax)
ax.set_xlabel('Avg. no. decayed prophages')
ax.set_ylabel('Prophage')
fig.savefig(results_folder + "/" + "av_counts_prophage_decayed.svg")
plt.show()

#### Prophage infectivity

In [None]:
# Mean decayed count of every phage within each cluster; a phage "infects" a cluster when that mean is above zero
infect_decayed_freq = hits_decayed_molten.groupby(['phage name', 'cluster'])['counts'].mean().rename('freq').reset_index()
infect_decayed_freq['infects'] = infect_decayed_freq['freq'].gt(0)
infect_decayed_freq

In [None]:
# Names of the decayed phages found at least once in each host cluster.
# The selection is deliberately kept as a Series: squeezing it would turn a one-element
# result into a plain string, which cannot take a `.name` and would be split into
# single characters by set().
infectivities_decayed = []
for c in infect_decayed_freq['cluster'].unique():
    in_cluster = infect_decayed_freq['cluster'] == c
    to_add = infect_decayed_freq.loc[in_cluster & infect_decayed_freq['infects'], 'phage name'].copy()
    to_add.name = 'Cluster ' + c
    infectivities_decayed.append(to_add)

# One set of phage names per cluster, with the matching labels for the diagram
infectivities_decayed_sets = [set(i) for i in infectivities_decayed]
labels_decayed = [i.name for i in infectivities_decayed]

In [None]:
# supervenn diagram of the decayed-phage sets shared between the clusters
# NOTE(review): the axis labels below go through pyplot's current axes, i.e. whichever
# axes supervenn leaves active — presumably its main panel; confirm
fig, ax = plt.subplots(figsize = (7.5,4.5))
supervenn(infectivities_decayed_sets, labels_decayed, side_plots = 'right',
          chunks_ordering = 'minimize gaps', sets_ordering = 'minimize gaps', 
          widths_minmax_ratio = 0.15)
plt.xlabel('# infecting phages')
plt.ylabel('rRNA cluster')
plt.savefig(results_folder + "/" + "venn_infectivity_decayed.svg")
plt.show()

## Prophage protein annotations

### Parsing PHASTEST prophage region protein annotations

The annotations of the proteins located in the prophage regions can be found in the `json_input` file.

In [None]:
# Parse the protein annotations of the prophage regions of every PHASTEST result directory.
# NOTE: this rebinds `hits`; from here on it holds the protein table, not the region table.
# Entries of the output folder whose name contains '.log' are skipped; the remaining
# ones are visited in sorted order so that the row order of the table is reproducible
# (os.listdir returns entries in arbitrary order).
result_dirs = sorted(entry for entry in os.listdir(output_folder) if '.log' not in entry)
hits = []
for assembly_dir in result_dirs:  # not named `dir`, which would shadow the builtin
    with open(os.path.join(output_folder, assembly_dir, "json_input"), "r") as handle:
        proteins = json.load(handle)
    for p in proteins:
        protein_tags = p['name']
        tags = protein_tags.split(';')
        # The annotation is the second ';'-separated tag when there is one, otherwise the only tag
        protein_annotation = tags[1] if len(tags) > 1 else tags[0]
        try:
            # The length is the leading integer of the 'sequence_length' text
            protein_length = int(p['sequence_length'].split(' ')[0])
        except ValueError: # skip non-delineated proteins
            continue
        hits.append({'strain': assembly_dir, 'type': p['type'], 'annotation': protein_annotation,
                     'size': protein_length, 'start': p['start'], 'stop': p['stop'],
                     'tags': protein_tags})
hits = pd.DataFrame(hits)
hits

In [None]:
# Save the prophage protein table as a tab-separated file (the positional row index is not written)
hits.to_csv(results_folder + '/prophage_proteins', sep = "\t", index = False)

### Querying the prophage protein annotation dataframe

**Auxiliary function**

In [None]:
## Queries the prophage protein annotation dataframe for a given search query and returns how many hits there are for each assembly
##
## PARAMS
## query      the search query, interpreted as a case-insensitive regular expression
## hits       the prophage protein annotation dataframe (uses the 'strain', 'annotation' and 'type' columns)
##
## OUTPUT
## a list with the number of hits for that search query for each genome assembly,
## ordered by sorted strain name; assemblies without any hit contribute 0
##
def query_hits(query, hits):
    strains = sorted(hits['strain'].unique())
    # na = False: a missing annotation counts as "no match" instead of breaking the boolean mask
    matches = hits['annotation'].str.contains(query, regex = True, case = False, na = False)
    per_strain = hits.loc[matches].groupby('strain')['type'].count()
    # Re-align on the full strain list so that strains without hits are reported as 0
    return per_strain.reindex(strains, fill_value = 0).astype(int).tolist()

#### Queries

In [None]:
# Total hits over all assemblies for secretion systems of any roman-numeral type.
# The character class is [IVX]: inside brackets '|' is a literal character, not an alternation.
sum(query_hits(r'Type [IVX]+ .*secretion|secretion.*Type [IVX]+', hits))

In [None]:
# Total hits over all assemblies for type II secretion systems.
# The trailing \b keeps the second alternative from also matching "Type III".
sum(query_hits(r'Type II .*secretion|secretion.*Type II\b', hits))

In [None]:
# Total hits over all assemblies for annotations matching 'Type IV' together with 'secretion' (either order)
sum(query_hits('Type IV .*secretion|secretion.*Type IV', hits))

In [None]:
# Total hits over all assemblies for type VI secretion systems.
# The trailing \b keeps the second alternative from also matching "Type VII".
sum(query_hits(r'Type VI .*secretion|secretion.*Type VI\b', hits))

In [None]:
# Total hits over all assemblies for type VII secretion systems.
# The trailing \b keeps the second alternative from also matching "Type VIII".
sum(query_hits(r'Type VII .*secretion|secretion.*Type VII\b', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'toxin';
# as a substring match this also counts e.g. 'antitoxin'
sum(query_hits('toxin', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'toxin-antitoxin'
sum(query_hits('toxin-antitoxin', hits))

In [None]:
# Total hits over all assemblies for type II toxin-antitoxin systems.
# The trailing \b keeps the second alternative from also matching "Type III".
sum(query_hits(r'Type II .*toxin-antitoxin|toxin-antitoxin.*Type II\b', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'hemolysin'
sum(query_hits('hemolysin', hits))

In [None]:
# Total hits over all assemblies for annotations where 'phage' is followed later by 'struct'
sum(query_hits('phage.*struct', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'head'
sum(query_hits('head', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'tail'
sum(query_hits('tail', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'capsid'
sum(query_hits('capsid', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'methyltransferase'
sum(query_hits('methyltransferase', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'acetyltransferase'
sum(query_hits('acetyltransferase', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'type II methyltransferase'
sum(query_hits('type II methyltransferase', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'restriction' (any type)
sum(query_hits('restriction', hits))

In [None]:
# Type I restriction systems, arabic-numeral spelling
sum(query_hits('type 1 restriction', hits))

In [None]:
# Type I restriction systems, roman-numeral spelling
sum(query_hits('type I restriction', hits))

In [None]:
# Type II restriction systems, arabic-numeral spelling
sum(query_hits('type 2 restriction', hits))

In [None]:
# Type II restriction systems, roman-numeral spelling
sum(query_hits('type II restriction', hits))

In [None]:
# Type III restriction systems, arabic-numeral spelling
sum(query_hits('type 3 restriction', hits))

In [None]:
# Type III restriction systems, roman-numeral spelling
sum(query_hits('type III restriction', hits))

In [None]:
# Type IV restriction systems, arabic-numeral spelling
sum(query_hits('type 4 restriction', hits))

In [None]:
# Type IV restriction systems, roman-numeral spelling
sum(query_hits('type IV restriction', hits))

In [None]:
# Type V restriction systems, arabic-numeral spelling
sum(query_hits('type 5 restriction', hits))

In [None]:
# Type V restriction systems, roman-numeral spelling
sum(query_hits('type V restriction', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'transporter' (any kind)
sum(query_hits('transporter', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'ABC transporter'
sum(query_hits('ABC transporter', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'beta-lactam';
# as a substring match this also counts e.g. 'beta-lactamase'
sum(query_hits('beta-lactam', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'aminoglycoside'
sum(query_hits('aminoglycoside', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'tetracycline'
sum(query_hits('tetracycline', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'erythromycin'
sum(query_hits('erythromycin', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'chloramphenicol'
sum(query_hits('chloramphenicol', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'resistance' (any kind)
sum(query_hits('resistance', hits))

In [None]:
# Total hits over all assemblies for annotations containing the stem 'arsen'
# (matches e.g. 'arsenate', 'arsenic', 'arsenite')
sum(query_hits('arsen', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'copper'
sum(query_hits('copper', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'cadmium'
sum(query_hits('cadmium', hits))

In [None]:
# Total hits over all assemblies for annotations where 'zinc ' is followed later by 'transport'
sum(query_hits('zinc .*transport', hits))

In [None]:
# Total hits over all assemblies for annotations where 'cobalt ' is followed later by 'transport'
sum(query_hits('cobalt .*transport', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'nitrate reductase'
sum(query_hits('nitrate reductase', hits))

In [None]:
# Total hits over all assemblies for annotations containing 'cytochrome'
sum(query_hits('cytochrome', hits))