# Simulate incomplete and contaminated bins using the EMP500 dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.stats import pmean
from collections import defaultdict
import seaborn as sns

Load data

In [2]:
def load_mapping(file):
    '''
    Read a contig-to-ORF mapping file into a dictionary.
    Each non-blank line is split on commas: the first field is taken as the
    contig ID and the second as the prokka gene ID; further fields are ignored.
    Blank lines are skipped.
    Parameters
    ----------
        file : str
            Path to the comma-separated mapping file.
    Returns
    -------
        contigs : dict
            Contig IDs (keys) and list of prokka gene IDs (values), in the
            order they appear in the file.
    Raises
    ------
        IndexError
            If a non-blank line does not contain at least two fields.
    '''
    contigs = defaultdict(list)
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. an empty line at the end of the file) carries
            # no mapping; skipping it avoids an IndexError on the second field
            if not line:
                continue
            fields = line.split(',')
            contig_id, prokka_id = fields[0], fields[1]
            contigs[contig_id].append(prokka_id)
    # Return a plain dict so that looking up an unknown contig raises KeyError
    return dict(contigs)

In [3]:
def save_sampled_genes(file, genes):
    '''
    Write gene identifiers to a text file, one per line.
    Parameters
    ----------
        file : str
            Path of the output file; it is created or overwritten.
        genes : iterable
            Gene identifiers to write, each followed by a newline.
    '''
    with open(file, 'w') as handle:
        handle.writelines(f'{gene}\n' for gene in genes)

### Iterative approach: Given a threshold for MAG incompleteness (the fraction of genes to remove), randomly select a contig and remove its ORFs one by one until the threshold is satisfied. If the contig runs out of ORFs before the threshold is reached, randomly select another contig and continue in the same way.

In [4]:
def sample_contigs_genes_v2(contigs, threshold, seed = 42):
    '''
    Pick genes to remove by drawing contigs at random, without replacement,
    and taking their genes in order until the removal quota is filled.
    The quota is a fraction of the total number of genes over all contigs.
    Parameters
    ----------
        contigs : dict
            Contig names (keys) and list of prokka genes (values)
        threshold : float
            Fraction of the total number of genes to remove; the resulting
            gene count is truncated to an integer
        seed : int
            Seed of the random number generator used to draw contigs
    Returns
    -------
        genes_to_remove : list
            Genes to be removed, in the order they were selected.
    '''
    # Seeded generator: the same inputs and seed always give the same sample
    rng = np.random.default_rng(seed)

    # Removal quota, derived from the gene count over all contigs
    total_genes = sum(map(len, contigs.values()))
    quota = int(threshold * total_genes)

    # Positions into `entries` of the contigs that have not been drawn yet
    entries = list(contigs.items())
    candidates = np.arange(len(entries))

    genes_to_remove = []
    while quota > 0 and candidates.size > 0:
        picked = rng.choice(candidates)
        _, genes = entries[picked]

        # The slice takes the whole contig when it fits within the quota,
        # and only its leading genes when it does not
        taken = genes[:quota]
        genes_to_remove.extend(taken)
        quota -= len(taken)

        # A contig is drawn at most once
        candidates = candidates[candidates != picked]

    return genes_to_remove

**For all MAGs and thresholds, 10 replicates:**

In [5]:
# Replicate indices 0..9
n_replicates = 10
replicates = np.arange(n_replicates)
# Values passed as `threshold` to the sampling function
thresholds = [
    0.05,
    0.1,
    0.2,
    0.4,
]

Completeness

In [6]:
# Load the bin (MAG) names as an array of strings.
# ndmin = 1 keeps the result one-dimensional even when the file holds a single
# entry; without it loadtxt returns a 0-d array, which cannot be iterated over.
bins = np.loadtxt('./input_data/mags.txt', dtype = 'str', ndmin = 1)
bins.shape

(811,)

In [7]:
%%time
for bi in bins:
    print(f'Bin: {bi}')
    # Mapping between contigs and ORFs for this bin, from GFF file of prokka output
    mapping_path = f'./input_data/mapping/{bi}.mapping'
    contigs = load_mapping(mapping_path)
    # Every replicate is combined with every threshold;
    # the replicate index is passed as the sampling seed
    for rep in replicates:
        for threshold in thresholds:
            sampled = sample_contigs_genes_v2(contigs, threshold, seed = rep)
            # One output file per bin, threshold and replicate
            out_path = f'./output_data/incompleteness/sampled_genes/{bi}_sample_contigs_genes_th_{threshold}_rep_{rep}.txt'
            save_sampled_genes(out_path, sampled)

Bin: Bioreactor.Metabat.Bin.1
Bin: Bioreactor.Metabat.Bin.10
Bin: Bioreactor.Metabat.Bin.11
Bin: Bioreactor.Metabat.Bin.14
Bin: Bioreactor.Metabat.Bin.16
Bin: Bioreactor.Metabat.Bin.19
Bin: Bioreactor.Metabat.Bin.2
Bin: Bioreactor.Metabat.Bin.20
Bin: Bioreactor.Metabat.Bin.22
Bin: Bioreactor.Metabat.Bin.23
Bin: Bioreactor.Metabat.Bin.24
Bin: Bioreactor.Metabat.Bin.28
Bin: Bioreactor.Metabat.Bin.31
Bin: Bioreactor.Metabat.Bin.32
Bin: Bioreactor.Metabat.Bin.33
Bin: Bioreactor.Metabat.Bin.34
Bin: Bioreactor.Metabat.Bin.35
Bin: Bioreactor.Metabat.Bin.39
Bin: Bioreactor.Metabat.Bin.4
Bin: Bioreactor.Metabat.Bin.40
Bin: Bioreactor.Metabat.Bin.43
Bin: Bioreactor.Metabat.Bin.44
Bin: Bioreactor.Metabat.Bin.46
Bin: Bioreactor.Metabat.Bin.47
Bin: Bioreactor.Metabat.Bin.48
Bin: Bioreactor.Metabat.Bin.49
Bin: Bioreactor.Metabat.Bin.5
Bin: Bioreactor.Metabat.Bin.51
Bin: Bioreactor.Metabat.Bin.56
Bin: Bioreactor.Metabat.Bin.58
Bin: Bioreactor.Metabat.Bin.62
Bin: Bioreactor.Metabat.Bin.63
Bin: Bioreac