In [None]:
def is_notebook() -> bool:
    """Tell whether the code is running inside a Jupyter notebook / qtconsole kernel.

    Returns:
        True only when ``get_ipython()`` exists and its shell class is named
        ``ZMQInteractiveShell``. A terminal IPython session, any other shell
        type, and a plain Python interpreter (where ``get_ipython`` is not
        defined) all yield False.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is only injected by IPython; plain Python lacks it.
        return False
    # Every shell other than the ZMQ-based one counts as "not a notebook".
    return shell_name == 'ZMQInteractiveShell'

In [None]:
# Resolve the input file names: the reference fasta, the reads file (bam)
# and the fasta with the contaminant genomes.
import argparse

if is_notebook():
    # Interactive session: there is no command line, use the hard-coded names.
    ref_fname, bam_fname, genomes_fname = 'refchrm.fa', 'iintest.bam', 'contaminants.fa'
else:
    # Script run: the three paths are required positional arguments.
    parser = argparse.ArgumentParser(description='Supply reference fasta and bam file')
    parser.add_argument('ref', help='reference fasta')
    parser.add_argument('bam', help='bam file')
    parser.add_argument('cont', help='list of contaminants fasta')

    args = parser.parse_args()
    ref_fname, bam_fname, genomes_fname = args.ref, args.bam, args.cont

In [None]:
import os
from collections import Counter
import pysam
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output
from scipy.special import binom
import scipy.stats as st
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from multiprocess import Pool
import matplotlib.pyplot as plt
import seaborn as sns
from preprocess import *
from functions import *
# Build the extension modules declared in setup.py in place before importing MN.
# NOTE(review): the exit status of the build is discarded, and 'python' is
# resolved through PATH, which may not be the interpreter running this kernel.
# Presumably MN is the compiled extension produced by this step — confirm.
os.system('python setup.py build_ext --inplace')
from MN import *

In [None]:
def do_mcmc(n_iterations = 50000, output_file='', n_threads=8, model=0, show_each=10):
    """Sample the genome proportion vector ``p`` and return the trace of ``p[0]``.

    The function reads module-level state prepared by other cells: ``MC``
    (a 2-D array whose shape is unpacked as ``(num_reads, num_genomes)``),
    ``base_err``, and the helpers ``get_Zi`` and ``get_eta``.

    Each iteration:
      1. calls ``get_Zi(MC, p, base_err, read_index)`` for every read and
         collects the results into ``Z``;
      2. calls ``get_eta(Z, num_genomes)`` (presumably per-genome assignment
         counts — confirm against its definition);
      3. redraws ``p`` according to ``model``.

    Parameters
    ----------
    n_iterations : int
        Number of sampling iterations.
    output_file : str
        When non-empty, path of a text file that receives one line per
        iteration: ``iteration <i>``, a tab, then ``p[0]``.
    n_threads : int
        Unused; kept so existing calls keep working (the multiprocessing
        pool is disabled).
    model : int
        ``0``: draw ``p[0]`` from a Beta distribution and the remaining
        components from a Dirichlet rescaled to sum to ``1 - p[0]``.
        Any other value: draw the whole vector from a single Dirichlet.
    show_each : int
        Print ``p`` every ``show_each`` iterations; ``0``/``None`` disables
        the periodic printing.

    Returns
    -------
    list
        The value of ``p[0]`` after each iteration.
    """
    num_reads, num_genomes = MC.shape
    print(MC.shape)
    p = np.random.dirichlet([1]*num_genomes)
    p_list = []

    # Open the log only after the inputs were validated, and make sure it is
    # closed even when an iteration raises.
    res = open(output_file, 'w') if output_file != '' else None
    try:
        for i in tqdm(range(n_iterations)):
            Z = np.array([get_Zi(MC, p, base_err, s) for s in range(num_reads)])
            eta = get_eta(Z, num_genomes)
            if model == 0:
                p0 = np.random.beta(1 + eta[0], 1 + num_reads - eta[0])
                p_other = np.random.dirichlet(1 + eta[1:])
                # Rescale the remaining components so the full vector sums to 1.
                p_other *= (1 - p0) / p_other.sum()

                p[0] = p0
                p[1:] = p_other
            else:
                p = np.random.dirichlet(1 + eta)
            p_list.append(p[0])

            if res is not None:
                # One record per line; previously the fields were written
                # back-to-back without any separator or newline.
                res.write(f'iteration {i}\t{p[0]!s}\n')
            if show_each and i % show_each == 0:
                print(p)
    finally:
        if res is not None:
            res.close()
    return p_list

In [None]:
# Run preprocess() on the three input paths; it returns the `bam` and `genomes`
# objects consumed by the following cells. Note the argument order here is
# (reference, contaminants, bam), which differs from the command-line order.
bam, genomes = preprocess(ref_fname, genomes_fname, bam_fname)

In [None]:
# Convert the loaded genomes into `genomes_arr`, the form passed to get_same()
# and get_MN() below. Presumably one row per genome — TODO confirm.
genomes_arr = make_genomes_arr(genomes)

In [None]:
# Precompute `same` from the genome array; it is passed on to get_MN().
# NOTE(review): presumably flags positions where the genomes agree — confirm
# against the definition of get_same().
same = get_same(genomes_arr)

In [None]:
# Take the first row of the genome array (cast to str), join its elements into
# one string and upper-case it. Presumably this is the reference genome's
# aligned sequence — TODO confirm.
genomes0 = (''.join( np.array(genomes_arr, dtype = str)[0])).upper()

In [None]:
# Derive alignment coordinates from the first genome's sequence string; the
# result is passed to get_MN().
aln_coords = get_aln_pos(genomes0)

In [None]:
# Compute M, N and the base error estimate from the genomes, the reads, the
# alignment coordinates and `same`. `base_err` is later read as a global by
# do_mcmc().
M, N, base_err = get_MN(genomes_arr, bam, aln_coords, same)

In [None]:
# Report the base error returned by get_MN(); the leading '#' is part of the
# printed text.
print(f'#base error is {base_err}')

In [None]:
# Build MC from M, N and base_err. do_mcmc() reads MC as a global and unpacks
# its shape as (num_reads, num_genomes).
MC = get_mc(M, N, base_err)

In [None]:
# Keep only the rows of MC that contain at least one value different from the
# row's first entry; rows whose entries are all equal are dropped.
idx = [row_no for row_no, row in enumerate(MC) if np.any(row != row[0])]
MC = MC[idx]

In [None]:
# Run 10000 sampler iterations with the single-Dirichlet update (model != 0),
# printing p every 100 iterations. P is the list of p[0] values, one per
# iteration. n_threads has no effect: the pool code in do_mcmc is disabled.
P = do_mcmc(10000, n_threads=1, model=1, show_each=100)