This part of the pipeline processes the raw ISEscan output and statistically compares the normalised ISE counts by rRNA cluster.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Tab-separated genome metadata table; later cells read the assembly ID,
# rRNA cluster and genome size from it by column position.
metadata = './genomes_metadata'

#### Pipeline output folders

In [None]:
# Root folder of the ISE task; raw tool output and the processed
# tables/figures live in sub-folders below it.
task_root = "./10-MGEs/ISEs"
output_folder = f"{task_root}/output"
results_folder = f"{task_root}/processed_output"

#### Tool pointers and parameters

#### Libraries and other setups

In [None]:
import os
import pandas as pd
import scipy.stats as sts
import itertools as it
import seaborn as sns
import matplotlib.pyplot as plt
from statannotations.Annotator import Annotator

In [None]:
# Fixed colour per rRNA cluster label, shared by every figure below.
# (A palette derived from sns.husl_palette() was used here previously.)
_cluster_labels = ('1', '4', '14a', '14b')
_cluster_colours = ('#ee6677', '#4477aa', '#228833', '#ccbb44')
custom_palette = dict(zip(_cluster_labels, _cluster_colours))
custom_palette

In [None]:
# Create the folder for processed tables and figures (including missing
# parents); exist_ok=True makes re-running the notebook harmless.
os.makedirs(results_folder, exist_ok=True)

## Reading input files

### Reading cluster annotations of the assembly IDs

In [None]:
# Columns 1 and 2 (0-based) of the metadata table are taken as the assembly ID
# and its rRNA cluster; the original header names are replaced.
cluster_annotations_0 = pd.read_csv(metadata, sep="\t", usecols=[1, 2])
cluster_annotations_0.columns = ['assemblyID', 'cluster']

In [None]:
# Lookup table: assembly ID -> rRNA cluster label.
cluster_annotations = dict(
    zip(
        cluster_annotations_0['assemblyID'].tolist(),
        cluster_annotations_0['cluster'].tolist(),
    )
)
cluster_annotations

### Reading genome sizes

Necessary for normalising the ISE counts

In [None]:
# Columns 1 and 4 (0-based) of the metadata table are taken as the assembly ID
# and the genome size; the original header names are replaced.
genome_sizes_0 = pd.read_csv(metadata, sep="\t", usecols=[1, 4])
genome_sizes_0.columns = ['assemblyID', 'size']

In [None]:
# Lookup table: assembly ID -> genome size.
genome_sizes = dict(
    zip(
        genome_sizes_0['assemblyID'].tolist(),
        genome_sizes_0['size'].tolist(),
    )
)
genome_sizes

### Reading ISEscan output

In [None]:
# Collect the ISEscan hit table of every assembly, keyed by assembly ID.
tables = {}
fasta_suffix = ".fna"
# Sorted so that the row order of the concatenated table is reproducible;
# os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(output_folder)):
    # Results live under "<assemblyID>.fna/genomes/<assemblyID>.fna.tsv".
    # Any other entry (hidden files, notebook checkpoints, ...) is skipped
    # instead of being mis-parsed into a bogus assembly ID.
    if not file.endswith(fasta_suffix):
        continue
    acc = file[:-len(fasta_suffix)]
    # ISE hits can just be taken over from the result tsv file.
    # Only a positional subset of columns is kept; later cells rely on the
    # 'seqID' and 'family' columns being among them.
    table = pd.read_table(f"{output_folder}/{file}/genomes/{file}.tsv",
                          sep="\t", usecols=[0, 1, 5, 19, 21])
    table['assemblyID'] = acc
    # A KeyError here means the assembly is missing from the metadata table.
    table['cluster'] = cluster_annotations[acc]
    table['size'] = genome_sizes[acc]
    tables[acc] = table

### Concatenating output by assembly ID

In [None]:
# Stack the per-assembly hit tables into one long table with a fresh
# 0..n-1 index (each row already carries its assembly ID as a column).
out = pd.concat(list(tables.values()), ignore_index=True)

In [None]:
# Save the combined hit list as a tab-separated file, without the row index.
out.to_csv(f"{results_folder}/full_list.tsv", sep="\t", index=False)

In [None]:
# Display the combined hit table for a visual check.
out

## IS counting

### General count stats

#### Number of ISes by assembly

In [None]:
# Number of IS hits (non-null seqID entries) per assembly, as a Series
# indexed by assemblyID.
IScounts_assembly = out["seqID"].groupby(out["assemblyID"]).count()
IScounts_assembly

#### Total number of ISes by cluster

In [None]:
# Total number of IS hits (non-null seqID entries) per rRNA cluster.
IScounts_cluster = out["seqID"].groupby(out["cluster"]).count()
IScounts_cluster

#### Average number of ISes by cluster

In [None]:
# First count the hits of every assembly, then average those per-assembly
# counts within each rRNA cluster.
IScounts_cluster_assembly = (
    out.groupby(["cluster", "assemblyID"])["seqID"]
    .count()
    .rename("No. ISes")
    .reset_index()
)
IScounts_cluster_assembly.groupby("cluster")["No. ISes"].mean().reset_index()

#### Number of ISes by IS family

In [None]:
# Hits per IS family across all assemblies, shown as a single wide row
# (one column per family).
IScounts_family = out.groupby("family")["seqID"].count().to_frame().T
IScounts_family

### Counting by rRNA cluster

Add annotations and genome size columns

In [None]:
# Attach the rRNA cluster and the genome size to each assembly's IS count.
# Inner joins: an assembly missing from either metadata table is dropped.
IScounts_assembly_cluster = (
    IScounts_assembly.to_frame()
    .merge(cluster_annotations_0, how="inner", on="assemblyID")
    .merge(genome_sizes_0, how="inner", on="assemblyID")
    .rename(columns={"seqID": "No. ISes"})
)

Normalise by genome size

In [None]:
# IS count per genome size unit, scaled by one million
# (presumably 'size' is in bp, i.e. ISes per Mbp - TODO confirm).
IScounts_assembly_cluster['Norm. no. ISes'] = (
    IScounts_assembly_cluster['No. ISes']
    .div(IScounts_assembly_cluster['size'])
    .mul(1_000_000)
)
IScounts_assembly_cluster

In [None]:
# Save the per-assembly counts as a tab-separated file, without the row index.
# NOTE(review): unlike 'full_list.tsv', this file name has no extension -
# confirm whether that is intended before renaming it.
IScounts_assembly_cluster.to_csv(f"{results_folder}/counts_per_assembly_cluster", sep="\t", index=False)

#### Barplot

In [None]:
# Horizontal bars: mean normalised IS count per rRNA cluster, with the
# standard error as error bar.
fig, ax = plt.subplots(figsize=(5, 2))
bar_options = dict(
    data=IScounts_assembly_cluster,
    x="Norm. no. ISes",
    y="cluster",
    estimator="mean",
    errorbar="se",
    palette=custom_palette,
    width=0.9,
    orient="h",
)
ax = sns.barplot(ax=ax, **bar_options)
ax.set_xlabel('Avg. norm. no. ISes')
ax.set_ylabel('rRNA cluster')
ax.set_title('IS elements')
fig.savefig(f"{results_folder}/av_counts_cluster_bar.svg")
plt.show()

#### Violinplot

In [None]:
# Horizontal violins: distribution of the normalised IS count per rRNA
# cluster; cut = 0 keeps each violin within the observed data range.
fig, ax = plt.subplots(figsize = (5,3))
ax = sns.violinplot(ax = ax, data = IScounts_assembly_cluster, x = 'Norm. no. ISes', y = 'cluster',
                    palette = custom_palette, orient = 'h', cut = 0)
plt.xlabel('Norm. no. ISes')
plt.ylabel('rRNA cluster')
plt.title('IS elements')

# Add statistical significance markers for every pair of clusters.
pairs = list(it.combinations(IScounts_assembly_cluster['cluster'].unique(), 2))
# The Annotator rebuilds the plot geometry itself and assumes a boxplot unless
# told otherwise, so the plot type is passed explicitly together with the same
# violin parameters (orient, cut) that were used for drawing.
annotator = Annotator(ax = ax, pairs = pairs, plot = 'violinplot', data = IScounts_assembly_cluster,
                      x = 'Norm. no. ISes', y = 'cluster', orient = 'h', cut = 0)
# NOTE(review): no multiple-testing correction is configured for the pairwise
# Mann-Whitney tests - confirm this is intended.
annotator.configure(test = 'Mann-Whitney', text_format = 'star', loc = 'inside')
annotator.apply_and_annotate()

plt.savefig(results_folder + "/" + 'counts_cluster_violin.svg')
plt.show()

#### Exact stats

In [None]:
# Map each rRNA cluster to the list of normalised IS counts of its assemblies.
# Iterating a GroupBy yields (group key, sub-Series) pairs directly, so no
# intermediate Series of tuples or positional indexing is needed.
IScounts_assembly_cluster_grouped = {
    cluster: values.to_list()
    for cluster, values in IScounts_assembly_cluster.groupby('cluster')['Norm. no. ISes']
}
IScounts_assembly_cluster_grouped

In [None]:
# Print the Mann-Whitney U p-value (scipy defaults) for every pair of clusters.
tests = it.combinations(IScounts_assembly_cluster_grouped, 2)
for comb in tests:
    cluster_a, cluster_b = comb
    mwu_result = sts.mannwhitneyu(
        IScounts_assembly_cluster_grouped[cluster_a],
        IScounts_assembly_cluster_grouped[cluster_b],
    )
    print(f"{comb!s}: {mwu_result.pvalue!s}")

## ISE families per cluster

In [None]:
# Long table: number of hits for every (assembly, IS family) combination
# that occurs in the data.
IScounts_family_av = (
    out.groupby(['assemblyID', 'family'])['seqID']
    .count()
    .reset_index()
    .rename(columns={'seqID': 'No. ISes'})
)
IScounts_family_av

In [None]:
# Wide assembly x family matrix; a family absent from an assembly counts as 0
# so that it still contributes to the per-cluster averages.
_family_matrix = (
    IScounts_family_av
    .pivot(index="assemblyID", columns="family", values="No. ISes")
    .fillna(0)
    .astype(int)
)
# Label every assembly (row) with its rRNA cluster.
_family_matrix['cluster'] = [cluster_annotations[assembly] for assembly in _family_matrix.index]
# Back to long format: one row per (assembly, family) with its cluster label.
IScounts_family_av_pivot = (
    _family_matrix
    .melt(id_vars='cluster')
    .rename(columns={'value': 'No. ISes'})
)
IScounts_family_av_pivot

In [None]:
# Horizontal grouped bars: mean IS count per family, split by rRNA cluster,
# with the standard error as error bar; families are listed alphabetically.
fig, ax = plt.subplots(figsize=(8, 10))
family_order = sorted(IScounts_family_av['family'].unique())
ax = sns.barplot(
    ax=ax,
    data=IScounts_family_av_pivot,
    x="No. ISes",
    y="family",
    hue="cluster",
    order=family_order,
    estimator="mean",
    errorbar="se",
    palette=custom_palette,
    width=0.9,
    orient="h",
)
ax.set_xlabel("Avg. no. ISes")
ax.set_ylabel("IS family")
fig.savefig(f"{results_folder}/av_counts_IStype.svg")
plt.show()

**IS families with known passenger activity:**
- **IS1595**: originally identified in *Xanthomonas* species. Can transport passenger genes: transcription regulators or ncDNA.
- **IS481**: evidence it has played a fundamental role in IS amplification and genome decay in *Bordetella pertussis*. Can carry passenger genes (antibiotic resistance, transcriptional regulators).
- **IS6**: Can carry passenger genes (nylon degradation in *Arthrobacter*, ARGs in *S. aureus*). No specific insertion target.
- **IS66**: Common among *Firmicutes*. May carry passenger genes.
- **ISL3**: One-orf IS. Passenger IS has been observed in *Enterococcus faecium*, transporting Hg resistance.

**Other IS families with a substantially different count level:**
- **IS200/IS605**: *tnpA* is the essential transposase. An additional protein *tnpB* has an unknown function, maybe regulatory. No prominent passenger genes known, so no selection target.
- **IS110**: a diverse group of single Tpase IS. A diverse array of target sequences and no prominent passenger genes. No selection target.
- **IS1182**: diverse set of specificities, no passenger genes.
- **IS607**: the only IS to be found in eukaryotic genomes (protists that graze on bacteria) and large DNA viruses that infect those protists.
- **IS630**: A one-orf IS related to *mariner*. Specific insertion target (TA dinucleotide)
- **IS91**: Single-orf IS