# MGE detection

Uses the mobility score, which is based on phyletic distribution patterns, to detect mobile genetic elements (MGEs) in genomes.

## Preparation

In [145]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os

In [15]:
# Detection parameters; they are the default arguments of get_indices_lengths.
# A gene counts as "elevated" when its accessory_fraction exceeds this value.
score_threshold = 0.3
# A run of genes is only reported when it is longer than this many genes.
n_threshold = 10
# A non-elevated gene is tolerated while the number of such exceptions in the
# run stays below fraction_threshold * (current run length).
fraction_threshold = 0.1

In [146]:
# The project root is the parent of the current working directory.
project_path = Path().resolve().parent
# Per-contig mobility-score tables are read from here ...
input_path = project_path / "results" / "mobility_files"
# ... and the MGE predictions are written here.
output_path = project_path / "results" / "MGE_files"
# Create the output directory idempotently: os.mkdir raised FileExistsError
# when the cell was re-run and FileNotFoundError when "results" was missing.
output_path.mkdir(parents=True, exist_ok=True)

In [147]:
# Identifiers of the contigs to process; the matching table in input_path is
# the identifier plus a ".csv" suffix.
genome_contig = "GCA_000006865.1-AE005176.1"
# Fixed: this previously referenced the undefined name `genome`, which raises
# NameError on a fresh kernel.
genome_contig_file = genome_contig + ".csv"
genome2 = "GCA_000006785.2-AE004092.2"
genome2_file = genome2 + ".csv"

## Load mobility frame

In [18]:
# Load the mobility table of the second contig; the first CSV column becomes
# the row index.
frame = pd.read_csv(input_path.joinpath(genome2_file), index_col=0)

In [103]:
# Exploratory scan: find consecutive genes with a mobility score exceeding a
# threshold. The frame is walked from its last row to its first, and every run
# longer than n_threshold genes is printed.
i = 0    # number of genes in the current candidate run
exc = 0  # genes in the current run whose score did not exceed the threshold
start_genes = []
end_genes = []
indices = []
lengths = []
for index, row in frame[::-1].iterrows():
    consecutive = row.accessory_fraction > score_threshold
    if consecutive:
        i += 1
        continue

    exc += 1
    # row['count'] is used because row.count would resolve to the Series method.
    if row.accessory | (row['count'] > 1) | (exc < fraction_threshold * i):
        # Tolerated exception: the run keeps growing.
        i += 1
        continue

    # A non-tolerated gene closes the run; keep it only when long enough.
    if i > n_threshold:
        indices.append(index)
        lengths.append(i)
        print("Possible MGE: " + str(index+2) + "-" + str(index+2+i))
    i = 0
    exc = 0

Possible MGE: 1696-1718
Possible MGE: 1541-1555
Possible MGE: 1146-1203
Possible MGE: 946-964
Possible MGE: 858-875
Possible MGE: 841-852
Possible MGE: 736-804
Possible MGE: 500-554
Possible MGE: 410-440
Possible MGE: 102-119


In [100]:
def end_detection(mob_frame):
    """Return the end positions of the runs found when scanning in reverse.

    The frame is passed to ``get_indices_lengths`` in reversed row order. For
    each reported run, the end is the index of the row that closed the run
    plus the run length plus 2. The list is reversed before being returned,
    so it is in the opposite order to the reverse scan.

    NOTE(review): the rationale for the constant +2 offset is not visible in
    this file -- confirm against the gene numbering convention.
    """
    closing_indices, run_lengths = get_indices_lengths(mob_frame[::-1])
    shifted_ends = np.array(closing_indices) + np.array(run_lengths) + 2
    return list(shifted_ends)[::-1]

In [94]:
def start_detection(mob_frame):
    """Return the start positions of the runs found when scanning forward.

    For each run reported by ``get_indices_lengths``, the start is the index
    of the row that closed the run minus the run length.
    """
    closing_indices, run_lengths = get_indices_lengths(mob_frame)
    first_genes = np.array(closing_indices) - np.array(run_lengths)
    return list(first_genes)

In [95]:
def get_indices_lengths(frame, score_threshold=score_threshold, fraction_threshold=fraction_threshold, n_threshold=n_threshold):
    """Scan a mobility frame for runs of genes that may form an MGE.

    Rows are visited in the order produced by ``frame.iterrows()``. A row
    extends the current run when its ``accessory_fraction`` exceeds
    ``score_threshold``. Otherwise it counts as an exception, and is still
    tolerated when its ``accessory`` value is set (presumably a boolean
    column -- TODO confirm), its ``count`` is greater than 1, or the number of
    exceptions in the current run is below ``fraction_threshold`` times the
    run length so far. The first exception that is not tolerated closes the
    run, which is recorded only if it is longer than ``n_threshold`` rows.

    Returns:
        A tuple ``(indices, lengths)`` of two parallel lists: the index label
        of the row that closed each recorded run, and the number of rows in
        that run.

    NOTE(review): a run that is still open when the frame ends is never
    recorded, because recording only happens on a closing row -- confirm that
    this is intended.
    """
    closing_indices = []
    run_lengths = []
    run_length = 0    # rows accepted into the current candidate run
    n_exceptions = 0  # rows seen since the last reset that were not elevated

    for index, row in frame.iterrows():
        if row.accessory_fraction > score_threshold:
            # Elevated mobility score: the run simply grows.
            run_length += 1
            continue

        n_exceptions += 1
        # row['count'] is used because row.count would resolve to the Series method.
        is_multi_copy = row['count'] > 1
        within_budget = n_exceptions < fraction_threshold * run_length
        if row.accessory | is_multi_copy | within_budget:
            # Tolerated exception: it still counts towards the run length.
            run_length += 1
            continue

        # Not tolerated: the run ends here. Keep it only when long enough.
        if run_length > n_threshold:
            closing_indices.append(index)
            run_lengths.append(run_length)
        # Start looking for a new candidate run.
        run_length = 0
        n_exceptions = 0

    return closing_indices, run_lengths
                

In [148]:
def generate_output(genome_contig_file):
    """Predict MGEs for one contig and write them to a CSV file.

    The mobility table ``input_path / genome_contig_file`` is read, run starts
    and ends are obtained from ``start_detection`` and ``end_detection``, and
    one row per gene number in each start..end range (inclusive) is written to
    ``output_path / genome_contig_file`` with the columns ``contig``, ``MGE``
    (1-based number of the MGE) and ``gene_nr``.

    Args:
        genome_contig_file: File name of the form
            ``<genome>-<accession>.<version>.csv``; the contig name is taken
            from the part after the first ``-``.

    Raises:
        IndexError: If the file name does not have the expected form, or if
            fewer ends than starts were detected.
    """
    # "GCA_x.1-AE005176.1.csv" -> "AE005176.1.csv" -> "AE005176.1"
    contig_extension = genome_contig_file.split('-')[1]
    name_parts = contig_extension.split('.')
    contig = name_parts[0] + '.' + name_parts[1]

    mob_frame = pd.read_csv(input_path / genome_contig_file, index_col=0)
    start_genes = start_detection(mob_frame)
    end_genes = end_detection(mob_frame)

    # NOTE(review): starts come from a forward scan and ends from an
    # independent reverse scan; nothing here checks that the two lists have
    # the same length or describe the same MGEs. The positional pairing of the
    # original code is kept as is.
    records = []
    for mge_number, first_gene in enumerate(start_genes, start=1):
        last_gene = end_genes[mge_number - 1]
        for gene in range(first_gene, last_gene + 1):
            records.append({'contig': contig, 'MGE': mge_number, 'gene_nr': gene})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    MGE_frame = pd.DataFrame(records, columns=['contig', 'MGE', 'gene_nr'])
    MGE_frame.to_csv(output_path / genome_contig_file)

In [150]:
# Run the MGE prediction for the first contig and write its CSV to output_path.
generate_output(genome_contig_file)