In [1]:
# Names of the input files: the reference FASTA, the BAM file with the reads,
# and the FASTA file with the candidate contaminant genomes.
# Command-line parsing is disabled below; the paths are hard-coded instead.
# import argparse

# parser = argparse.ArgumentParser(description='Supply reference fasta and bam file')
# parser.add_argument('ref',
#                     help='reference fasta')
# parser.add_argument('bam',
#                     help='bam file')


# args = parser.parse_args()
# ref_fname = args.ref
# bam_fname = args.bam

ref_fname = 'refchrm.fa'
bam_fname = 'simulated_data.bam'
genomes_fname ='contaminants.fa'

In [2]:
# Import all required libraries
import os
from collections import Counter
import pysam
import numpy as np
from tqdm import tqdm
from IPython.display import clear_output
from scipy.special import binom
import scipy.stats as st
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from multiprocess import Pool
%load_ext cython

In [3]:
#Cython part

In [4]:
%%cython -a --compile-args=-O3

from tqdm import tqdm
import pysam
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
from IPython.display import clear_output
cimport cython
import numpy as np
from cython.parallel import prange
from libc.math cimport pow
from scipy.special import binom


def get_num_reads(str bam_fname):
    '''
    Count the mapped reads aligned to chrM in a BAM file.

    Parameters:
        bam_fname : str
        path to a bam file (fetching a region requires its index)

    Returns:
        int, the number of reads fetched from 'chrM' whose is_mapped flag is set
    '''
    num_reads = 0
    # The context manager closes the file even when fetch() raises,
    # e.g. because the index or the chrM contig is missing.
    with pysam.AlignmentFile(bam_fname, "rb") as samfile:
        for read in samfile.fetch('chrM'):
            if read.is_mapped:
                num_reads += 1
    return num_reads


def bam2consensus(bam_fname, double ac_threshold=0, double af_threshold=0):
    '''
    Build a majority-vote consensus sequence from the pileup of a bam file.

    For every pileup column the most frequent allele is selected ('-' stands
    for a deleted base). The allele is kept when it passes both thresholds,
    otherwise 'N' is emitted. Columns whose winning allele is a deletion
    contribute nothing to the returned sequence.

    Parameters:
        bam_fname : str
        path to a bam file; every pileup column must belong to 'chrM'
        ac_threshold : float
        minimal count of the most frequent allele
        af_threshold : float
        minimal fraction of the column covered by the most frequent allele

    Returns:
        str, the consensus sequence without deletion markers
    '''
    cdef int max_count, total_count
    cdef str allele
    cdef str max_allele
    # Collect the calls in a list and join once instead of growing a string.
    cdef list called = []

    with pysam.AlignmentFile(bam_fname, "rb") as bam:
        allele_counter = Counter()
        # total=16569 only sizes the progress bar (presumably the chrM length).
        for pileup_column in tqdm(bam.pileup(), total=16569, desc = 'consensus dna'):
            assert pileup_column.reference_name == 'chrM'

            allele_counter.clear()
            for pileup_read in pileup_column.pileups:
                if pileup_read.is_del:
                    allele = "-"
                elif pileup_read.is_refskip:
                    # query_position is None for reference skips, so there is
                    # no base to look up; indexing with None would raise.
                    continue
                else:
                    allele = pileup_read.alignment.query_sequence[
                        pileup_read.query_position]
                allele_counter[allele] += 1

            max_allele = "N"
            max_count, total_count = 0, 0
            for allele, count in allele_counter.items():
                if count > max_count:
                    max_count = count
                    max_allele = allele
                total_count += count

            assert max_allele in "ACGTN-"
            # total_count is 0 when the column holds only reference skips;
            # the explicit cast keeps the ratio a floating-point division.
            if (total_count > 0 and
                    max_count >= ac_threshold and
                    <double>max_count / total_count >= af_threshold):
                if max_allele != '-':
                    called.append(max_allele)
            else:
                called.append('N')

    return ''.join(called)


@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(True)
def get_MN(char[:, :] genomes,str bam_fname, long[:] aln_coords, same, trunc = 0, verbosity = False):
    '''
    Build the per-read match/mismatch tables and estimate the base error rate.

    Every mapped read fetched from 'chrM' is compared base by base with every
    row of `genomes`. Each compared base adds a quality-derived weight P_cor
    to the "correct" total of that (read, genome) pair and 1 - P_cor to its
    "incorrect" total.

    Parameters:
        genomes : char[:, :]
        2-D array of single characters; row j holds genome j. Row 0 is
        treated specially: reads with indels are flagged with -1 for it and
        its gap characters ('-') steer the offset handling.

        bam_fname : str
        path to the bam file

        aln_coords : long[:]
        lookup table indexed by the read's reference_start; the value found
        there is used as a column index into `genomes`

        same : container supporting `in`
        columns of `genomes` where all genomes carry the same base (the
        caller in this notebook passes the set returned by get_same); only
        these columns contribute to the base error estimate

        trunc : int
        number of bases skipped at both ends of every read

        verbosity : bool
        when true, prints a diagnostic block for every (read, genome) pair
        whose correct total is below its incorrect total (and pos != 0)

    Output:
        M : np.ndarray (num_reads x num_genomes, float64)
            M[i, j] is the accumulated "correct" weight of read i against
            genome j, or -1 when the pair was flagged as unusable
        N : np.ndarray (num_reads x num_genomes, float64)
            N[i, j] is the accumulated "incorrect" weight of read i against
            genome j, or -1 when the pair was flagged as unusable
        base_err : float
            base_incorr / base_total, i.e. the share of mismatching bases
            among the compared bases that fall on columns listed in `same`
    '''
    # NOTE(review): the file is closed only at the end of the function; an
    # exception raised inside the loop leaves it open.
    bam = pysam.AlignmentFile(bam_fname, "rb")
    cdef double[:, :] M, N
    cdef int k, i, j, pos, offset
    offset = 0
    cdef double correct, incorrect, P_cor
    cdef str seq
    # The tables get one row per mapped read, as counted by get_num_reads.
    cdef long num_reads = get_num_reads(bam_fname)
    cdef int num_genomes = genomes.shape[0]
    cdef double base_incorr = 0
    cdef double base_total = 0
    N = np.zeros((num_reads, num_genomes))
    M = np.zeros((num_reads, num_genomes))
    i = 0
    j = 0
    for read in tqdm(bam.fetch('chrM'), total = bam.count(), desc = 'MN tables'):
        
        # Unmapped reads are skipped without consuming a table row.
        if not read.is_mapped:
            continue
            
        seq = read.query_sequence
        pos = read.reference_start
        
        if read.cigartuples[0][0] == 4: #read is soft clipped
            left_trim = read.cigartuples[0][1]
            seq = seq[left_trim:]
            
        # if read.cigartuples[-1][0] == 4: #read is soft clipped
        #     right_trim = read.cigartuples[-1][1]
        #     seq = seq[:-right_trim]
        
        
        oldest_pos = pos
        
        # NOTE(review): seq is trimmed above but qual is not, and both are
        # later indexed with the same k - confirm this is intended for
        # left-soft-clipped reads.
        qual = read.query_qualities
        
        ins_pos = 0
        
        for j in range(num_genomes):
            if "I" in read.cigarstring: # Check for indels
                if j == 0:
                    # print('INDEL')
                    M[i, j] = -1
                    N[i, j] = -1
                else:
                    cigar = read.cigartuples

                    # Sum the lengths of the CIGAR operations that precede
                    # the first operation with code 1. The result is not used
                    # in this branch (the check below is commented out), and
                    # ins_pos is not reset between genomes here.
                    for m in range(len(cigar)):
                        if cigar[m][0] == 1:
                            for s in range(m):
                                ins_pos += cigar[s][1]
                            break
                            
                    # if pos + ins_pos < genomes.shape[1] and chr(genomes[s, aln_coords[pos] + ins_pos]) != seq[aln_coords[pos]+ins_pos]:
                    #     M[i, j] = -1
                    #     N[i, j] = -1
            
            
            
            if "D" in read.cigarstring: # Check for indels
                if j == 0:
                    M[i, j] = -1
                    N[i, j] = -1
                else: # still needs work, but let's see how it goes
                    cigar = read.cigartuples
                    ins_pos = 0
                    # NOTE(review): this tests op code 1, the same code the
                    # "I" branch above tests, although this branch handles
                    # "D" - confirm whether another code was intended.
                    for m in range(len(cigar)):
                        if cigar[m][0] == 1:
                            for s in range(m):
                                ins_pos += cigar[s][1]
                            # break
                    # NOTE(review): the row index here is `s`, the inner loop
                    # variable, not `j`; `s` is only assigned if one of the
                    # inner loops has run - confirm the intended row.
                    if chr(genomes[s, aln_coords[pos+ins_pos]]) != '-':
                        M[i, j] = -1
                        N[i, j] = -1
                        

                        
            correct = 0
            incorrect = 0
            offset = 0
            debug_str = ''

            for k in range(trunc,len(seq)-trunc):
                
                # NOTE(review): this bound uses k + pos + offset while the
                # indexing below uses aln_coords[pos] + k + offset.
                if k + pos + offset >= genomes.shape[1]:
                    break
                # if k + pos + offset >= len(aln_coords):
                    # break;
                    
                # Step over alignment gaps in genome j and/or genome 0.
                while chr(genomes[j, aln_coords[pos] + k +offset]).upper() == '-' or chr(genomes[0, aln_coords[pos] + k + offset]).upper() == '-':
                    # Gap in both rows: move to the next column.
                    if chr(genomes[j, aln_coords[pos] + k + offset]).upper() == '-' and chr(genomes[0, aln_coords[pos] + k + offset]).upper() == '-':
                        offset += 1
                    
                    # Gap only in genome j: flag the pair unless the read has
                    # a deletion, in which case it counts as one correct base.
                    elif chr(genomes[j, aln_coords[pos] + k + offset]).upper() == '-':
                        if 'D' not in read.cigarstring:
                            M[i, j] = -1
                            N[i, j] = -1
                            break
                        else:
                            correct += 1
                            offset += 1 
                        
                    # Gap only in genome 0: flag the pair unless the read has
                    # an insertion; the while loop is left in both cases.
                    elif chr(genomes[0, aln_coords[pos] + k + offset]).upper() == '-':
                        if 'I' not in read.cigarstring:
                            M[i, j] = -1
                            N[i, j] = -1
                            break
                        else:
                            break
                    if k + pos + offset >= genomes.shape[1]:
                        break
                            
                    # print(pos + k + offset)
                
                # An 'N' on either side contributes nothing.
                if chr(genomes[j, aln_coords[pos] + k + offset]).upper() == 'N' or seq[k] == 'N':
                    # correct += 0.25
                    # incorrect += 0.75
                    # correct +=1
                    pass
                                
                elif seq[k] == chr(genomes[j, aln_coords[pos] + k + offset]).upper(): #means that read has same base with j genome
                    # Presumably qual[k] is a Phred score, making P_cor the
                    # probability that the base call is right - TODO confirm.
                    P_cor = 1 - 10**(- qual[k]/10)
                    correct += P_cor
                    incorrect += 1 - P_cor
                    if verbosity:
                        debug_str += f'{seq[k]}, {chr(genomes[j][aln_coords[pos] + k + offset ]).upper()} => +1\n'
                    # correct += 1 #old version for debugging
                    if aln_coords[pos] + k +offset in same:
                        base_total += 1
                        
                else:
                    # print(seq[k], chr(genomes[j][k+pos]).upper()) #means that read has same difference with j genome
                    # Mismatch: one third of 10**(-qual/10) goes to "correct",
                    # the rest to "incorrect".
                    P_cor = (10**(- qual[k]/10))/3
                    incorrect += 1 - P_cor
                    correct += P_cor
                    if verbosity:
                        debug_str += f'{seq[k]}, {chr(genomes[j][aln_coords[pos] + k + offset]).upper()} => -1\n'
                    # incorrect += 1
                    if aln_coords[pos] + k + offset in same:
                        # print("plus incorrect")
                        base_total += 1
                        base_incorr += 1
                        

            # Pairs flagged with -1 keep the sentinel; all others get totals.
            if M[i, j] != -1:
                M[i, j] = correct
                N[i, j] = incorrect

                
                
            # print("total:", M[i,j],N[i,j], sep='\n')
            if verbosity:
                if M[i, j] < N[i, j] and pos != 0:
                    print('ERROR',i, j, seq, oldest_pos, pos, debug_str, M[i, j], N[i, j], sep = '\n')
                
        
        i += 1
        
    bam.close()
    # cdivision(True) is active, so a zero base_total does not raise
    # ZeroDivisionError here (C division semantics apply).
    cdef double base_err = base_incorr / base_total
    return np.array(M, dtype=np.float64), np.array(N, dtype=np.float64), base_err

@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def get_mc(double[:, ::1] m, double[:, ::1] n, double eps):
    '''
    Convert M, N and the base error into the matrix mc that the rest of the
    computation works with.

    Parameters:
        m : double[:, ::1]
        C-contiguous matrix of "correct" totals; -1 marks an unusable pair
        n : double[:, ::1]
        C-contiguous matrix of "incorrect" totals, same shape as m
        eps : float
        base error rate

    Returns:
        np.ndarray with the shape of m where
        mc[i, j] = binom(m + n, m) * (1 - eps)**m * eps**n for the entry
        (i, j), and 0 wherever m[i, j] == -1

    Raises:
        ValueError if m and n differ in shape
    '''
    cdef double[:,::1] mc
    cdef Py_ssize_t num_reads = m.shape[0]
    cdef Py_ssize_t num_genomes = m.shape[1]
    cdef Py_ssize_t i, j
    # Bounds checking is disabled for this function, so a smaller n would be
    # read past its end without any error; validate the shapes up front.
    if n.shape[0] != num_reads or n.shape[1] != num_genomes:
        raise ValueError('m and n must have the same shape')
    mc = np.zeros((num_reads, num_genomes))
    for i in range(num_reads):
        for j in range(num_genomes):
            if m[i, j] == -1:
                mc[i, j] = 0
            else:
                mc[i, j] = binom(m[i, j] + n[i, j], m[i, j]) * (1 - eps)**(m[i, j]) * eps**n[i, j]
    return np.asarray(mc)
    




@cython.cdivision(True)
# @cython.wraparound(False)
# @cython.boundscheck(False)
def get_Zi(double[:,::1] mc, double[::1] p,double eps, long i):
    '''
    Draw the genome label of read i.

    The label is sampled with random_choice from the weights mc[i, j] * p[j]
    normalised over j.

    Parameters:
        mc : double[:, ::1]
        matrix of per-read, per-genome weights
        p : double[::1]
        current genome proportions
        eps : float
        accepted for call compatibility; not used in the computation
        i : int
        row of mc to sample a label for

    Returns:
        int, the sampled genome index

    Raises:
        ValueError if the weights of read i do not add up to a positive number
    '''
    cdef long num_genomes = mc.shape[1]
    cdef long j
    cdef long Z
    cdef double[:] probs
    cdef double s = 0
    probs = np.zeros(num_genomes, dtype = float)

    for j in range(num_genomes):
        probs[j] = mc[i, j] * p[j]
        s += probs[j]

    # cdivision(True) would silently turn a zero sum into NaN probabilities;
    # `not s > 0` also catches a NaN sum.
    if not s > 0:
        raise ValueError(f'read {i} has no positive weight for any genome')

    for j in range(num_genomes):
        probs[j] = probs[j] / s

    Z = random_choice(probs)

    return Z


@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def get_eta(long[:] z,int num_genomes):
    '''
    Create vector eta.

    eta[j] is the number of reads that were predicted to be from the j-th
    genome, i.e. the number of entries of z equal to j.

    Parameters:
        z : long[:]
        predicted genome index of every read
        num_genomes : int
        length of the returned vector

    Returns:
        np.ndarray of integer counts, one per genome

    Raises:
        ValueError if z holds a label outside 0 .. num_genomes - 1
    '''
    cdef long[:] eta
    cdef Py_ssize_t num_reads = z.shape[0]
    cdef Py_ssize_t i
    cdef long label
    eta = np.zeros(num_genomes, dtype = int)

    for i in range(num_reads):
        label = z[i]
        # Bounds checking and wraparound are disabled, so an invalid label
        # would write outside eta; reject it explicitly instead.
        if label < 0 or label >= num_genomes:
            raise ValueError(f'label {label} of read {i} is outside 0..{num_genomes - 1}')
        eta[label] += 1

    return np.array(eta)


# C standard library uniform generator: drand48() returns a double in [0, 1).
cdef extern from "stdlib.h":
    double drand48()
    void srand48(long int seedval)

    
@cython.cdivision(True)
@cython.wraparound(False)
@cython.boundscheck(False)
def random_choice(double[:] probs):
    '''
    Return a random index from 0 to n-1 drawn according to probs.

    probs is expected to sum up to 1 (this is not checked). One uniform
    number is drawn with drand48() and the first index whose cumulative
    probability exceeds it is returned. Entries that are not positive are
    never selected. If rounding leaves the cumulative sum below the drawn
    number, the last index with a positive probability is returned.

    Raises:
        ValueError if probs has no positive entry
    '''
    cdef Py_ssize_t count = probs.shape[0]
    cdef Py_ssize_t n
    cdef Py_ssize_t last_positive = -1
    cdef double x = drand48()
    cdef double cum_probs = 0

    # The loop is bounded by the length of probs: bounds checking is off, so
    # an open-ended scan could read past the end of the buffer.
    for n in range(count):
        if probs[n] > 0:
            cum_probs += probs[n]
            last_positive = n
            if x < cum_probs:
                return n

    if last_positive < 0:
        raise ValueError('probs must contain at least one positive entry')
    return last_positive

In [5]:
def get_base_err(bam_fname, ref, aln_pos, same_set):
    """
    Estimate the base error rate from indel-free reads.

    Every mapped read on 'chrM' without 'I' or 'D' in its CIGAR string is
    compared with `ref`, but only at alignment columns contained in
    `same_set`.

    NOTE: a later cell of this notebook defines another function named
    get_base_err(bam_fname, same_dict); once that cell runs it replaces
    this definition.

    Parameters:
        bam_fname : str
        path to the bam file
        ref : sequence of characters
        reference indexed by alignment column; compared with upper-cased
        read bases
        aln_pos : sequence of int
        lookup table from reference position to alignment column
        same_set : container supporting `in`
        alignment columns that take part in the estimate

    Returns:
        (correct, incorrect, error_rate) where error_rate is
        incorrect / (correct + incorrect), or NaN if nothing was compared
    """
    correct = 0
    incorrect = 0
    # The context manager closes the file even if iteration fails.
    with pysam.AlignmentFile(bam_fname, "rb") as bam:
        for read in bam.fetch('chrM'):
            if not read.is_mapped or 'D' in read.cigarstring or 'I' in read.cigarstring:
                continue

            seq = read.query_sequence
            pos = read.reference_start

            if read.cigartuples[0][0] == 4:  # read is soft clipped on the left
                seq = seq[read.cigartuples[0][1]:]

            for k, base in enumerate(seq):
                # Reads reaching past the end of the lookup table have no
                # column to compare with.
                if pos + k >= len(aln_pos):
                    break
                column = aln_pos[pos + k]
                if column in same_set:
                    if base.upper() == ref[column]:
                        correct += 1
                    else:
                        incorrect += 1

    compared = correct + incorrect
    error_rate = incorrect / compared if compared else float('nan')
    return correct, incorrect, error_rate
                
    

In [6]:
# def consensus_caller(ref_fname, bam_fname):
#     base = bam_fname[:-4]
#     os.system(f"samtools view {bam_fname} chrM -o {base+'_mt.bam'}")
#     base = base + '_mt'
#     os.system(f'samtools consensus -o {base}_st.fa {bam_fname}') #st means samtools
#     os.system(f'bwa index -a bwtsw {base}_st.fa') #indexing consensus
#     os.system(f'samtools faidx {base}_st.fa')
#     os.system(f'rm {base}.dict')
#     os.system(f'picard CreateSequenceDictionary R={base}.fa O={base}.dict')
#     os.system(f'samtools fastq {bam_fname} > {base}.fq')
#     os.system(f'bwa aln -l 1000 -t 10 {base}_st.fa {base}.fq > {base}_ra.sai')
#     os.system(f"bwa samse -r '@RG\\tID:{base}\\tLB:{base}_L1\\tPL:ILLUMINA\\tSM:{base}' {base}_st.fa {base}_ra.sai {base}.fq |  samtools sort -O BAM -o {base}_ra.sort.bam")
#     os.system(f'samtools index {base}_ra.sort.bam')
#     consensus = bam2consensus(f'{base}_st.fa', f'{base}_ra.sort.bam')
#     consensus_fa = '>chrM\n'+''.join(consensus) +'\n'
#     with open(f'{base}.fa', 'w') as new_genomes:
#         new_genomes.write(consensus_fa)
#     return consensus
    
    

In [7]:
def _run_shell(command):
    """Run `command` through the shell; warn instead of raising on a non-zero exit status."""
    import warnings

    status = os.system(command)
    if status != 0:
        warnings.warn(f'command exited with status {status}: {command}', RuntimeWarning)
    return status


def preprocess(ref_fname, genomes_fname, bam_fname):
    """
    Prepare the inputs of the contamination estimate with external tools.

    Steps, all run through the shell:
      1. extract the chrM reads of `bam_fname` into <base>_mt.bam;
      2. call a consensus (bcftools mpileup/call + CnsMaj3_1.pl);
      3. concatenate the consensus with `genomes_fname` and align with mafft;
      4. index the consensus and re-align the reads to it (bwa aln/samse);
      5. sort, run samtools calmd and index the final bam.

    Other consensus callers were tried here before (bam2consensus,
    `samtools consensus`, a GATK script); the bcftools route is the one kept.

    Parameters:
        ref_fname : str
        path to the reference fasta
        genomes_fname : str
        path to the fasta with possible contaminant genomes
        bam_fname : str
        path to the input bam; the file extension is stripped to build the
        names of all intermediate files

    Returns:
        (bam_final, aligned_genomes): paths of the final bam file and of the
        mafft alignment

    A failing external command produces a RuntimeWarning and the pipeline
    continues, as before; paths are shell-quoted so that names with spaces
    or shell metacharacters do not break the commands.
    """
    from shlex import quote

    # splitext instead of slicing off four characters, so extensions of any
    # length are handled; identical for '.bam'.
    stem = os.path.splitext(bam_fname)[0]
    base = stem + '_mt'

    q_input_bam = quote(bam_fname)
    q_ref = quote(ref_fname)
    q_contaminants = quote(genomes_fname)
    q_base = quote(base)
    q_mt_bam = quote(f'{base}.bam')
    q_vcf = quote(f'{base}.vcf')
    q_consensus = quote(f'{base}.fa')
    q_cns_log = quote(f'{base}.cns')
    q_all_genomes = quote(f'{base}_genomes.fa')
    q_dict = quote(f'{base}.dict')
    q_fastq = quote(f'{base}.fq')
    q_sai = quote(f'{base}_ra.sai')
    q_sorted = quote(f'{base}_ra.sort.bam')
    q_rmdup = quote(f'{base}_ra.sort.rmdup.bam')

    aligned_genomes = f'{base}_aligned.fa'
    bam_final = f'{base}_ra.final.bam'

    pysam.index(bam_fname)

    _run_shell(f'samtools view {q_input_bam} chrM -o {q_mt_bam}')

    print('#EXTRACTING MTDNA OK')

    _run_shell(f'bcftools mpileup  -d 2000 -m 3 -C50 -q 30 -EQ 20 -f {q_ref} {q_mt_bam} | bcftools call -m --ploidy 1 > {q_vcf}')
    _run_shell(f'perl CnsMaj3_1.pl -i {q_vcf} -o {q_consensus} -l 16569 -cov 1 -diff 0.5 -idiff 0.5 -h {q_base} -callindels no > {q_cns_log}')

    print('#CONSENSUS IS READY')

    # gather consensus and possible contaminants together
    _run_shell(f'cat {q_consensus} {q_contaminants} > {q_all_genomes}')
    # do multiple alignment
    _run_shell(f'mafft {q_all_genomes} >  {quote(aligned_genomes)}')
    print("#ALL GENOMES ARE READY")

    # indexing consensus
    _run_shell(f'bwa index -a bwtsw {q_consensus}')
    _run_shell(f'samtools faidx {q_consensus}')
    # Remove a stale dictionary if there is one (previously an unconditional rm).
    if os.path.exists(f'{base}.dict'):
        os.remove(f'{base}.dict')
    _run_shell(f'picard CreateSequenceDictionary R={q_consensus} O={q_dict}')
    _run_shell(f'samtools fastq {q_input_bam} > {q_fastq}')
    _run_shell(f'bwa aln -l 1000 -t 10 {q_consensus} {q_fastq} > {q_sai}')
    # The \t sequences must reach bwa literally, hence the doubled backslashes.
    read_group = f'@RG\\tID:{base}\\tLB:{base}_L1\\tPL:ILLUMINA\\tSM:{base}'
    _run_shell(f'bwa samse -r {quote(read_group)} {q_consensus} {q_sai} {q_fastq} | samtools sort -O BAM -o {q_sorted}')

    # No duplicate removal or subsampling is applied at the moment; the reads
    # are only re-sorted into the file the following steps expect.
    _run_shell(f'samtools view  -O BAM {q_sorted} | samtools sort > {q_rmdup}')

    pysam.index(f'{base}_ra.sort.rmdup.bam')
    _run_shell(f'samtools calmd -Erb {q_rmdup} {q_consensus} > {quote(bam_final)} 2>/dev/null')
    _run_shell(f'samtools index {quote(bam_final)}')
    if os.path.exists(f'{base}_ra.sai'):
        os.remove(f'{base}_ra.sai')
    print("#BAM FILE IS READY")
    return bam_final, aligned_genomes

In [8]:
def make_genomes_arr(genomes_fname):
    """
    Load all records of a fasta file into a character matrix.

    Returns an array of dtype 'S1' with one row per record and one column
    per sequence character.
    """
    rows = [list(str(record.seq)) for record in SeqIO.parse(genomes_fname, "fasta")]
    return np.array(rows, dtype = 'S1')

In [9]:
def get_same(genomes_arr):
    """
    Return the set of column indices at which every row of genomes_arr holds
    the same value.
    """
    num_columns = genomes_arr.shape[1]
    if genomes_arr.shape[0] == 0:
        # Without rows no column has exactly one distinct value.
        return set()
    # A column is uniform exactly when every row equals the first row there.
    uniform = (genomes_arr == genomes_arr[0]).all(axis=0)
    return {column for column in range(num_columns) if uniform[column]}

In [10]:
def get_base_err(bam_fname, same_dict):
    """
    Estimate the base error rate from the pileup of conserved positions.

    Parameters:
        bam_fname : str
        path to the bam file
        same_dict : dict
        maps a reference position to the base shared by all genomes at that
        position; the values are ASCII bytes objects (they are decoded and
        upper-cased before the comparison)

    Returns:
        float, 1 - correct / total over all read bases piled up on the
        positions of same_dict, or NaN if no base was compared
    """
    correct = 0
    total = 0
    # One handle only, closed by the context manager even on errors.
    with pysam.AlignmentFile(bam_fname, "rb") as samfile:
        for pileupcolumn in tqdm(samfile.pileup("chrM")):
            pos = pileupcolumn.reference_pos
            # Dictionary membership is O(1); a list of keys was scanned before.
            if pos not in same_dict:
                continue
            expected = same_dict[pos].decode('ascii').upper()

            for pileupread in pileupcolumn.pileups:
                # query position is None if is_del or is_refskip is set.
                if pileupread.is_del or pileupread.is_refskip:
                    continue
                total += 1
                nbase = pileupread.alignment.query_sequence[pileupread.query_position]
                if nbase == expected:
                    correct += 1
    if total == 0:
        return float('nan')
    return 1 - correct/total

In [11]:
def get_aln_pos(reference):
    """
    Return the indices of all characters of `reference` that are not '-'.

    The result is a numpy array in increasing order.
    """
    ungapped = [index for index, symbol in enumerate(reference) if symbol != '-']
    return np.asarray(ungapped)

In [12]:
def do_mcmc(n_iterations = 50000, output_file='', n_threads=8, model=0, show_each=10, mc=None, eps=None):
    """
    Sample the genome proportions with a Gibbs sampler.

    Each iteration draws a genome label for every read (get_Zi), counts the
    labels per genome (get_eta) and redraws the proportions p.

    Parameters:
        n_iterations : int
        number of iterations
        output_file : str
        if not empty, one line "iteration <i><TAB><p[0]>" is written to this
        file per iteration
        n_threads : int
        currently unused (the parallel version is disabled)
        model : int
        0 - p[0] is drawn from a Beta distribution and the remaining
        proportions from a Dirichlet rescaled to 1 - p[0];
        anything else - all proportions come from a single Dirichlet draw
        show_each : int
        print p every `show_each` iterations; 0 disables the printing
        mc : np.ndarray, optional
        matrix of per-read, per-genome weights; defaults to the
        notebook-level variable MC
        eps : float, optional
        base error rate passed to get_Zi; defaults to the notebook-level
        variable base_err

    Returns:
        np.ndarray, the proportions after the last iteration
    """
    # Fall back to the notebook globals so existing calls keep working.
    if mc is None:
        mc = MC
    if eps is None:
        eps = base_err

    num_reads, num_genomes  = mc.shape
    print(mc.shape)
    p = np.random.dirichlet([1]*num_genomes)

    res = open(output_file, 'w') if output_file != '' else None
    try:
        for i in tqdm(range(n_iterations)):
            # dtype=int keeps the label array integral even with zero reads.
            Z = np.array([get_Zi(mc, p, eps, s) for s in range(num_reads)], dtype=int)
            eta = get_eta(Z, num_genomes)
            if model == 0:
                p0 = np.random.beta(1 + eta[0], 1 + num_reads - eta[0])
                p_other = np.random.dirichlet(1 + eta[1:])
                p_other *= (1 - p0) / p_other.sum()

                p[0] = p0
                p[1:] = p_other
            else:
                p = np.random.dirichlet(1 + eta)
            if res is not None:
                # One record per line; previously the fields ran together.
                res.write(f'iteration {i}\t{p[0]}\n')
            if show_each and i % show_each == 0:
                print(p)
    finally:
        # Close the output even if an iteration fails.
        if res is not None:
            res.close()
    return p

In [13]:
# Run the external preprocessing pipeline. `bam` is the path of the final
# re-aligned bam file, `genomes` the path of the mafft-aligned fasta.
bam, genomes = preprocess(ref_fname, genomes_fname, bam_fname)

#EXTRACTING MTDNA OK


[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 2000


#CONSENSUS IS READY
#ALL GENOMES ARE READY


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8176 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 19 ambiguous characters.
    1 / 2
done.

Constructing a UPGMA tree (efffree=1) ... 
    0 / 2
done.

Progressive alignment 1/1... 
STEP     1 / 1 
done.

disttbfast (nuc) Version 7.490
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-1 (Very fast but very rough)
 Progressive method (rough guide tree was used.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT 

#BAM FILE IS READY


In [14]:
# bam = 'simulated_data.bam'

In [15]:
# Character matrix of the aligned genomes: one row per fasta record.
genomes_arr = make_genomes_arr(genomes)

In [16]:
np.sum(genomes_arr[0] != genomes_arr[1])

19

In [18]:
genomes_arr.shape

(2, 16569)

In [24]:
genomes_arr

array([[b'g', b'a', b't', ..., b'a', b't', b'g'],
       [b'g', b'a', b't', ..., b'a', b't', b'g']], dtype='|S1')

In [181]:
genomes

'simulated_data_mt_aligned.fa'

In [183]:
genomes_arr

array([[b'g', b'a', b't', ..., b'a', b't', b'g'],
       [b'g', b'a', b't', ..., b'a', b't', b'g'],
       [b'g', b'a', b't', ..., b'a', b't', b'g']], dtype='|S1')

In [184]:
# Alignment columns at which all genomes carry the same base.
same = get_same(genomes_arr)

In [185]:
# First row of the alignment as one upper-case string.
genomes0 = (''.join( np.array(genomes_arr, dtype = str)[0])).upper()

In [186]:
pysam.view('-c', bam)

'100000\n'

In [187]:
print(pysam.stats(bam))

# This file was produced by samtools stats (1.15.1 (pysam)+htslib-1.15.1 (pysam)) and can be plotted using plot-bamstats
# This file contains statistics for all reads.
# The command line was:  stats simulated_data_mt_ra.final.bam
# CHK, Checksum	[2]Read Names	[3]Sequences	[4]Qualities
# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow)
CHK	21db2fc0	553baf33	2396f1e2
# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.
SN	raw total sequences:	100000	# excluding supplementary and secondary reads
SN	filtered sequences:	0
SN	sequences:	100000
SN	is sorted:	1
SN	1st fragments:	100000
SN	last fragments:	0
SN	reads mapped:	100000
SN	reads mapped and paired:	0	# paired-end technology bit set + both mates mapped
SN	reads unmapped:	0
SN	reads properly paired:	0	# proper-pair bit set
SN	reads paired:	0	# paired-end technology bit set
SN	reads duplicated:	0	# PCR or optical duplicate bit set
SN	reads MQ0:	0	# mapped and MQ=0
SN	reads QC failed:	0
SN	no

In [188]:
# Alignment columns of the non-gap characters of the first genome.
aln_coords = get_aln_pos(genomes0)

In [189]:
get_num_reads(bam)

100000

In [190]:
# Per-read, per-genome "correct" (M) and "incorrect" (N) totals, plus the
# base error rate estimated on the columns in `same`.
M, N, base_err = get_MN(genomes_arr, bam, aln_coords, same)

MN tables: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:08<00:00, 11652.64it/s]


In [191]:
base_err*100

0.016130889730923116

In [192]:
M.shape

(100000, 3)

In [193]:
(M[:,1] == np.max(M,axis=1)).mean()

0.9882

In [194]:
(M[:,1] < N[:,1]).sum()

0

In [195]:
base_err

0.00016130889730923116

In [196]:
((M[:, 0] < N[:, 0])).sum() + (M[:,0] == -1).sum()

0

In [197]:
# Inspect a single read: its table rows, CIGAR string, alignment column and
# sequence, next to the matching stretch of genome 1.
it = 4147
# Take the `it`-th read straight from the iterator instead of loading every
# read into a list, and close the file when done.
with pysam.AlignmentFile(bam, "rb") as samfile:
    read = next(r for n, r in enumerate(samfile.fetch('chrM')) if n == it)
print(M[it], N[it])
print(read.cigarstring)
# reference_start / query_sequence replace the deprecated pos / seq aliases.
aln_pos = aln_coords[read.reference_start]
print(aln_pos)
print(read.query_sequence)
genome = (''.join( np.array(genomes_arr, dtype = str)[1])).upper()
print(genome[aln_pos: aln_pos + 120])

[99.99972794 99.99972794 98.99973005] [2.72061001e-04 2.72061001e-04 1.00026995e+00]
100M
685
AGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGGACAAGCATCAAGCACGCAGCAATGCAGCTCAAAAC
AGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGGACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCC


In [198]:
M

array([[99.99999953, 99.99999953, 99.99999953],
       [99.99919211, 99.99919211, 99.99919211],
       [99.97345027, 99.97345027, 99.97345027],
       ...,
       [99.99999941, 99.99999941, 99.99999941],
       [99.99999961, 99.99999961, 99.99999961],
       [99.99857819, 99.99857819, 99.99857819]])

In [199]:
genomes0.count('-')

0

In [200]:
# With a high error rate many reads fail to map, which lowers the accuracy of the base_error estimate

In [201]:
# Flag every (read, genome) pair whose "correct" total is below its
# "incorrect" total with the -1 sentinel in both tables.
unreliable = M < N
M[unreliable] = -1
N[unreliable] = -1

In [202]:
# (M[:,0] > M[:,1]).sum()/(M[:,0] != M[:,1]).sum()

In [203]:
print(f'#base error is {base_err}')

#base error is 0.00016130889730923116


In [204]:
MC = get_mc(M, N, base_err)

In [205]:
MC

array([[0.98399561, 0.98399561, 0.98399561],
       [0.98118289, 0.98118289, 0.98118289],
       [0.89510202, 0.89510202, 0.89510202],
       ...,
       [0.98399519, 0.98399519, 0.98399519],
       [0.98399586, 0.98399586, 0.98399586],
       [0.97904891, 0.97904891, 0.97904891]])

In [206]:
# (MC[:,0] != MC[:,1]) * (MC[:,0] != -1)

In [207]:
idx = np.where((MC[:,0]!=MC[:,1]))[0]

In [208]:
# vec = np.array([2,2,2,2])

In [209]:
# vec[0] == vec[1:]

In [210]:
# idx

In [211]:
 # (MC[:,0]!=0) *  (MC[:,1]!=0) * (M[:,1]>N[:,1])

In [212]:
# Keep only the rows selected by idx (rows whose first two columns differ).
# NOTE(review): this rebinds MC to the filtered array, so re-running this cell
# applies the old idx to the already-filtered MC — recompute MC and idx first.
MC = MC[idx]

In [213]:
MC

array([[8.07029864e-01, 1.24520046e-02, 5.12722869e-07],
       [9.82765452e-01, 1.58498064e-02, 6.66901140e-07],
       [9.83996669e-01, 1.58753002e-02, 6.68172636e-07],
       ...,
       [1.58753082e-02, 9.83997061e-01, 1.26781457e-04],
       [1.32680338e-02, 8.53393898e-01, 1.04157882e-04],
       [1.58753082e-02, 9.83997061e-01, 1.26781457e-04]])

In [214]:
(MC[:,0]>MC[:,1]).mean()

0.6976331360946746

In [215]:
genomes_arr.shape

(3, 16569)

In [216]:
# np.sum((M[:,1]>M[:,0] + 4))

In [217]:
P = do_mcmc(5000, n_threads=1, model=1, show_each=10)

(1690, 3)


  2%|███▌                                                                                                                                                                                                  | 91/5000 [00:00<00:10, 468.13it/s]

[0.69839326 0.30058909 0.00101764]
[0.68058445 0.31761229 0.00180326]
[0.69050791 0.30828678 0.00120531]
[0.70187291 0.29678281 0.00134427]
[7.19031987e-01 2.80806029e-01 1.61983984e-04]
[6.86338747e-01 3.13360230e-01 3.01023176e-04]
[0.69101714 0.30769683 0.00128603]
[6.90809999e-01 3.08593448e-01 5.96553698e-04]
[0.69856983 0.30068134 0.00074883]
[7.12628321e-01 2.86923291e-01 4.48388404e-04]


  3%|█████▋                                                                                                                                                                                               | 145/5000 [00:00<00:09, 497.92it/s]

[7.14512606e-01 2.85459933e-01 2.74610108e-05]
[0.69367046 0.30537569 0.00095385]
[0.6919461  0.30703086 0.00102304]
[7.04543992e-01 2.95058200e-01 3.97807183e-04]
[6.91656782e-01 3.08255016e-01 8.82024600e-05]


  5%|█████████▉                                                                                                                                                                                           | 253/5000 [00:00<00:09, 520.41it/s]

[7.14746523e-01 2.85064820e-01 1.88657357e-04]
[7.16366524e-01 2.83620238e-01 1.32373732e-05]
[7.27337109e-01 2.72661188e-01 1.70278019e-06]
[0.7026977  0.29611297 0.00118933]
[0.69971292 0.29920148 0.00108561]
[0.71423545 0.28431454 0.00145001]
[0.71407381 0.2849275  0.00099869]
[0.70323364 0.2957154  0.00105097]
[0.71790523 0.28038426 0.0017105 ]
[6.80021040e-01 3.19309995e-01 6.68964576e-04]
[0.69498472 0.30387042 0.00114487]


  6%|████████████                                                                                                                                                                                         | 307/5000 [00:00<00:08, 524.61it/s]

[0.68793001 0.31047278 0.00159721]
[0.71300138 0.28475117 0.00224745]
[0.7039813  0.29377148 0.00224722]
[7.02986720e-01 2.96316218e-01 6.97061276e-04]
[6.91294381e-01 3.08445875e-01 2.59744338e-04]


  8%|████████████████▍                                                                                                                                                                                    | 416/5000 [00:00<00:08, 530.54it/s]

[6.93393463e-01 3.06595664e-01 1.08732722e-05]
[0.72343945 0.27579922 0.00076133]
[0.71321846 0.28586288 0.00091867]
[7.11654745e-01 2.88270187e-01 7.50683390e-05]
[0.71434649 0.28326159 0.00239192]
[0.69873578 0.29896871 0.00229552]
[0.70007925 0.29902173 0.00089902]
[0.70766201 0.29100102 0.00133697]
[0.67823485 0.32090702 0.00085812]
[6.90061870e-01 3.09909599e-01 2.85311428e-05]
[0.6977812  0.30110144 0.00111736]


  9%|██████████████████▌                                                                                                                                                                                  | 470/5000 [00:00<00:08, 530.13it/s]

[7.10830601e-01 2.88968429e-01 2.00969986e-04]
[7.10163599e-01 2.89747275e-01 8.91259059e-05]
[0.70084673 0.29781333 0.00133995]
[0.70994244 0.28914074 0.00091682]
[7.19314144e-01 2.80490768e-01 1.95088203e-04]


 12%|██████████████████████▋                                                                                                                                                                              | 577/5000 [00:01<00:08, 527.43it/s]

[7.20228207e-01 2.79164056e-01 6.07736840e-04]
[0.71640369 0.28245297 0.00114334]
[0.70643191 0.29209453 0.00147357]
[0.71967597 0.27837293 0.0019511 ]
[0.69710915 0.30202052 0.00087033]
[0.69892821 0.30000464 0.00106715]
[0.69290021 0.30638017 0.00071962]
[0.70394023 0.29533938 0.00072039]
[6.93606745e-01 3.06277512e-01 1.15742983e-04]
[7.11498453e-01 2.88104863e-01 3.96684029e-04]
[0.70551707 0.29337645 0.00110648]


 13%|████████████████████████▊                                                                                                                                                                            | 630/5000 [00:01<00:08, 526.49it/s]

[0.70959272 0.28869573 0.00171155]
[7.15548586e-01 2.84437700e-01 1.37136589e-05]
[0.71391937 0.28163127 0.00444936]
[7.01819542e-01 2.98108164e-01 7.22940880e-05]
[6.89219741e-01 3.10335161e-01 4.45098568e-04]


 15%|█████████████████████████████                                                                                                                                                                        | 737/5000 [00:01<00:08, 526.47it/s]

[0.70738523 0.29043958 0.0021752 ]
[7.01305568e-01 2.98275360e-01 4.19072059e-04]
[0.68743985 0.31172428 0.00083587]
[0.70716008 0.29109959 0.00174033]
[7.16453104e-01 2.83222205e-01 3.24691152e-04]
[7.07024388e-01 2.92668831e-01 3.06781181e-04]
[7.13904393e-01 2.86044277e-01 5.13293880e-05]
[0.71876977 0.28019518 0.00103505]
[6.87444403e-01 3.12274045e-01 2.81551924e-04]
[7.05128085e-01 2.94598051e-01 2.73864034e-04]
[0.70645741 0.29272669 0.0008159 ]


 16%|███████████████████████████████▏                                                                                                                                                                     | 791/5000 [00:01<00:07, 529.41it/s]

[7.12144462e-01 2.87675192e-01 1.80345448e-04]
[0.7056281  0.29331019 0.00106171]
[0.6919549  0.30733254 0.00071257]
[7.04623117e-01 2.95372413e-01 4.47059691e-06]
[6.98279496e-01 3.01685283e-01 3.52211279e-05]
[0.72018964 0.27832195 0.00148841]


 18%|███████████████████████████████████▍                                                                                                                                                                 | 899/5000 [00:01<00:07, 531.90it/s]

[7.22676765e-01 2.77130234e-01 1.93000509e-04]
[7.15457557e-01 2.84325641e-01 2.16801929e-04]
[0.71594855 0.28211216 0.00193929]
[0.70529366 0.2933082  0.00139814]
[0.69712777 0.30186579 0.00100644]
[6.88381715e-01 3.11517779e-01 1.00506917e-04]
[0.69113002 0.30751738 0.0013526 ]
[6.94980259e-01 3.05019476e-01 2.65087155e-07]
[0.71474115 0.28282087 0.00243798]
[0.69278648 0.30624579 0.00096774]


 19%|█████████████████████████████████████▌                                                                                                                                                               | 954/5000 [00:01<00:07, 534.43it/s]

[0.70135295 0.29597067 0.00267637]
[7.04223484e-01 2.95747548e-01 2.89681216e-05]
[6.88918957e-01 3.10561471e-01 5.19572371e-04]
[0.70804551 0.29003608 0.00191841]
[6.95741872e-01 3.03741600e-01 5.16528305e-04]
[0.7000327  0.2989577  0.00100961]


 20%|███████████████████████████████████████▌                                                                                                                                                            | 1008/5000 [00:01<00:07, 532.63it/s]

[7.11628953e-01 2.88150869e-01 2.20178622e-04]
[0.6990415  0.30021685 0.00074165]
[7.01895520e-01 2.97513854e-01 5.90626732e-04]
[6.94082797e-01 3.05484865e-01 4.32337942e-04]
[0.7069934  0.29197397 0.00103263]
[6.98398485e-01 3.01407185e-01 1.94330330e-04]
[6.86874131e-01 3.12816086e-01 3.09783539e-04]
[7.19863405e-01 2.79663001e-01 4.73594088e-04]
[0.70226868 0.29698615 0.00074517]
[6.96472137e-01 3.03126674e-01 4.01188920e-04]
[7.10733467e-01 2.88809991e-01 4.56542865e-04]

 22%|███████████████████████████████████████████▋                                                                                                                                                        | 1116/5000 [00:02<00:07, 530.36it/s]


[6.89869425e-01 3.09449043e-01 6.81531771e-04]
[6.78303214e-01 3.21106689e-01 5.90096705e-04]
[0.69322579 0.30173051 0.0050437 ]
[7.09297522e-01 2.90352246e-01 3.50232290e-04]
[0.6851015  0.31226006 0.00263844]


 23%|█████████████████████████████████████████████▉                                                                                                                                                      | 1171/5000 [00:02<00:07, 533.40it/s]

[0.69514193 0.30392541 0.00093266]
[7.41273172e-01 2.58708909e-01 1.79190027e-05]
[0.6917688  0.30480301 0.00342819]
[0.72109623 0.27800705 0.00089672]
[7.08342903e-01 2.91654472e-01 2.62522341e-06]
[7.05227665e-01 2.94676642e-01 9.56936184e-05]


 26%|██████████████████████████████████████████████████▏                                                                                                                                                 | 1279/5000 [00:02<00:06, 535.10it/s]

[0.70577461 0.29351846 0.00070693]
[7.09486497e-01 2.90294009e-01 2.19494918e-04]
[0.68515413 0.31320363 0.00164225]
[0.69871354 0.30040044 0.00088602]
[0.69540369 0.30356608 0.00103023]
[0.70984657 0.28872856 0.00142487]
[7.30801829e-01 2.68971924e-01 2.26246906e-04]
[6.99499819e-01 3.00219052e-01 2.81129311e-04]
[0.69465667 0.30449536 0.00084797]
[7.28975916e-01 2.70732845e-01 2.91239176e-04]


 27%|████████████████████████████████████████████████████▎                                                                                                                                               | 1334/5000 [00:02<00:06, 537.66it/s]

[6.98867374e-01 3.00928937e-01 2.03689024e-04]
[7.02450703e-01 2.97090827e-01 4.58470948e-04]
[0.70823806 0.29072948 0.00103245]
[6.96261207e-01 3.03638421e-01 1.00371904e-04]
[6.90837094e-01 3.09033578e-01 1.29328030e-04]
[7.01577547e-01 2.98151289e-01 2.71163535e-04]


 29%|████████████████████████████████████████████████████████▌                                                                                                                                           | 1443/5000 [00:02<00:06, 537.93it/s]

[0.69135531 0.3076907  0.00095399]
[6.98862726e-01 3.01116647e-01 2.06269660e-05]
[0.69507735 0.30412763 0.00079502]
[6.90902467e-01 3.08411703e-01 6.85830124e-04]
[7.12958147e-01 2.86971951e-01 6.99019962e-05]
[0.69446666 0.30419452 0.00133882]
[7.07207684e-01 2.92481896e-01 3.10420241e-04]
[0.6844151 0.3140776 0.0015073]
[0.71985124 0.27817502 0.00197374]
[7.08082645e-01 2.91688635e-01 2.28720308e-04]
[0.70767563 0.29153695 0.00078742]


 30%|██████████████████████████████████████████████████████████▋                                                                                                                                         | 1497/5000 [00:02<00:06, 535.21it/s]

[7.24928468e-01 2.74764903e-01 3.06629418e-04]
[0.70151321 0.29664024 0.00184655]
[6.86488959e-01 3.13069527e-01 4.41514179e-04]
[0.68029698 0.31876481 0.00093821]
[0.72401544 0.27378874 0.00219582]


 31%|████████████████████████████████████████████████████████████▊                                                                                                                                       | 1551/5000 [00:02<00:06, 528.62it/s]

[0.70976268 0.28822817 0.00200915]
[7.10467788e-01 2.89054516e-01 4.77695605e-04]
[0.7018504  0.29554597 0.00260363]
[0.70652339 0.29276694 0.00070968]
[7.13121473e-01 2.86222698e-01 6.55829334e-04]
[0.70210381 0.29658515 0.00131104]
[0.70023341 0.29539071 0.00437588]
[0.70385647 0.2949242  0.00121933]
[6.85284671e-01 3.14277814e-01 4.37514616e-04]
[0.68587347 0.31299635 0.00113018]
[0.7112123  0.28657021 0.00221748]


 32%|██████████████████████████████████████████████████████████████▉                                                                                                                                     | 1604/5000 [00:03<00:06, 528.27it/s]

[7.04579917e-01 2.95091391e-01 3.28691789e-04]
[6.99041528e-01 3.00761259e-01 1.97212957e-04]
[0.70232112 0.29643427 0.00124461]
[7.15271898e-01 2.84480543e-01 2.47558275e-04]
[6.97330977e-01 3.02053116e-01 6.15906891e-04]


 35%|█████████████████████████████████████████████████████████████████████▏                                                                                                                              | 1765/5000 [00:03<00:06, 529.50it/s]

[0.6958631  0.30035698 0.00377992]
[0.71540336 0.283227   0.00136964]
[7.08064205e-01 2.91868717e-01 6.70782619e-05]
[6.96182594e-01 3.03540735e-01 2.76671199e-04]
[0.69890677 0.29941314 0.00168009]
[7.30301383e-01 2.69675754e-01 2.28635136e-05]
[7.11669666e-01 2.88162625e-01 1.67708515e-04]
[7.24588358e-01 2.75105777e-01 3.05865171e-04]
[0.71796075 0.27901583 0.00302342]
[0.71085113 0.28662757 0.0025213 ]
[7.06279632e-01 2.93188339e-01 5.32028879e-04]


 36%|███████████████████████████████████████████████████████████████████████▎                                                                                                                            | 1819/5000 [00:03<00:05, 531.50it/s]

[0.69636129 0.29977346 0.00386525]
[6.86842922e-01 3.12828801e-01 3.28276140e-04]
[7.21467899e-01 2.78376586e-01 1.55514878e-04]
[0.7114727  0.28593595 0.00259135]
[0.69807172 0.30093829 0.00098999]


 39%|███████████████████████████████████████████████████████████████████████████▌                                                                                                                        | 1927/5000 [00:03<00:05, 532.41it/s]

[0.69557545 0.30258723 0.00183732]
[7.04913543e-01 2.94929163e-01 1.57294439e-04]
[7.09619226e-01 2.90256918e-01 1.23855458e-04]
[6.99168293e-01 3.00546761e-01 2.84945632e-04]
[7.13019952e-01 2.86303331e-01 6.76716811e-04]
[0.67084376 0.32827236 0.00088389]
[7.12498736e-01 2.86948234e-01 5.53029312e-04]
[0.71979789 0.27854322 0.00165889]
[7.1505028e-01 2.8437397e-01 5.7574970e-04]
[7.04701689e-01 2.95159911e-01 1.38400596e-04]
[7.00008299e-01 2.99697941e-01 2.93759985e-04]


 40%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                                      | 1981/5000 [00:03<00:05, 533.11it/s]

[0.68158718 0.31716756 0.00124527]
[7.07157723e-01 2.92151248e-01 6.91029531e-04]
[7.16235708e-01 2.83211446e-01 5.52845339e-04]
[0.68518223 0.31398923 0.00082854]
[6.91274879e-01 3.08515577e-01 2.09543808e-04]
[0.69767488 0.30037371 0.00195141]


 42%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                                  | 2089/5000 [00:03<00:05, 531.74it/s]

[7.17733688e-01 2.81849758e-01 4.16554503e-04]
[0.70650329 0.29144142 0.00205528]
[0.72675841 0.26798748 0.00525411]
[0.71572474 0.28228536 0.0019899 ]
[7.14788555e-01 2.85198793e-01 1.26519650e-05]
[0.71544308 0.28321671 0.00134021]
[6.95606413e-01 3.04190047e-01 2.03540052e-04]
[6.94909607e-01 3.04673610e-01 4.16782600e-04]
[6.90606202e-01 3.09300316e-01 9.34812899e-05]
[0.7094543  0.28887884 0.00166686]


 43%|████████████████████████████████████████████████████████████████████████████████████                                                                                                                | 2143/5000 [00:04<00:05, 533.76it/s]

[6.88556173e-01 3.10937646e-01 5.06180793e-04]
[7.08874501e-01 2.90967861e-01 1.57638449e-04]
[6.98495905e-01 3.01163341e-01 3.40754048e-04]
[6.87027459e-01 3.12485418e-01 4.87123097e-04]
[0.70618235 0.29263273 0.00118492]
[6.98288358e-01 3.01427652e-01 2.83990649e-04]


 45%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 2253/5000 [00:04<00:05, 538.27it/s]

[0.68847225 0.31062102 0.00090672]
[7.04064386e-01 2.95897941e-01 3.76737408e-05]
[6.75334455e-01 3.24484390e-01 1.81154642e-04]
[0.69849597 0.3002392  0.00126483]
[7.13406168e-01 2.86428221e-01 1.65611410e-04]
[0.6973052 0.3005737 0.0021211]
[6.96371981e-01 3.03192074e-01 4.35944659e-04]
[7.16330906e-01 2.83448036e-01 2.21057762e-04]
[6.96254002e-01 3.03340224e-01 4.05773849e-04]
[0.70688736 0.29125    0.00186264]
[6.88369440e-01 3.11549235e-01 8.13247307e-05]


 46%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 2307/5000 [00:04<00:05, 536.12it/s]

[7.06732978e-01 2.93212710e-01 5.43117687e-05]
[0.69164442 0.3061074  0.00224817]
[0.7240443  0.27431376 0.00164194]
[6.98335078e-01 3.01259716e-01 4.05206261e-04]
[0.69932266 0.29986704 0.0008103 ]


 48%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 2415/5000 [00:04<00:04, 534.43it/s]

[6.99558427e-01 3.00332555e-01 1.09017837e-04]
[0.70460534 0.29434594 0.00104872]
[6.91046330e-01 3.08938439e-01 1.52315218e-05]
[7.08964787e-01 2.90876221e-01 1.58992211e-04]
[7.05212590e-01 2.94326825e-01 4.60584963e-04]
[6.98273149e-01 3.01089020e-01 6.37831257e-04]
[0.69371395 0.30503074 0.00125531]
[0.69006552 0.30871394 0.00122054]
[0.70206768 0.29632123 0.00161109]
[6.99879194e-01 2.99794169e-01 3.26636793e-04]
[6.97596223e-01 3.01897847e-01 5.05930457e-04]


 49%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 2469/5000 [00:04<00:04, 534.87it/s]

[0.69444994 0.30436751 0.00118255]
[7.04734922e-01 2.94812384e-01 4.52693891e-04]
[0.69676419 0.30186228 0.00137353]
[7.01135154e-01 2.98504317e-01 3.60528484e-04]
[7.00547629e-01 2.98928710e-01 5.23660995e-04]


 50%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 2523/5000 [00:04<00:04, 534.41it/s]

[7.06824076e-01 2.92686146e-01 4.89778033e-04]
[6.98658034e-01 3.01217473e-01 1.24493131e-04]
[0.68031926 0.31838253 0.00129821]
[6.85188638e-01 3.14548838e-01 2.62523915e-04]
[6.91455339e-01 3.08467009e-01 7.76520631e-05]
[6.91005150e-01 3.08961967e-01 3.28824561e-05]
[0.69231351 0.30678055 0.00090595]
[0.70577842 0.29315336 0.00106822]
[0.6950205  0.30284577 0.00213372]
[7.30853658e-01 2.68773228e-01 3.73114446e-04]
[7.29445803e-01 2.70395409e-01 1.58788193e-04]


 52%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 2577/5000 [00:04<00:04, 534.91it/s]

[6.94795122e-01 3.05002505e-01 2.02372527e-04]
[7.12995989e-01 2.86588434e-01 4.15576969e-04]
[0.70760721 0.29059079 0.001802  ]
[7.16578976e-01 2.83197034e-01 2.23989384e-04]
[7.07407635e-01 2.92236491e-01 3.55873727e-04]
[7.09929447e-01 2.89886716e-01 1.83837412e-04]

 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 2739/5000 [00:05<00:04, 534.91it/s]


[7.07728085e-01 2.91896769e-01 3.75146114e-04]
[0.70960576 0.28879576 0.00159848]
[6.97486582e-01 3.02167111e-01 3.46306822e-04]
[0.70794548 0.29129082 0.0007637 ]
[0.69800177 0.30129874 0.00069949]
[6.82272102e-01 3.17222261e-01 5.05637036e-04]
[7.04576114e-01 2.95407168e-01 1.67176891e-05]
[6.99025940e-01 3.00953104e-01 2.09557409e-05]
[7.01051023e-01 2.98759276e-01 1.89700833e-04]
[7.08982945e-01 2.90964042e-01 5.30136514e-05]


 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 2901/5000 [00:05<00:03, 532.45it/s]

[0.70865869 0.28693782 0.00440349]
[7.04039731e-01 2.95929987e-01 3.02816234e-05]
[0.69271121 0.30657365 0.00071514]
[6.98031622e-01 3.01553446e-01 4.14932725e-04]
[7.13425406e-01 2.86424191e-01 1.50402855e-04]
[0.70634602 0.2923702  0.00128378]
[7.06486525e-01 2.93472579e-01 4.08958011e-05]
[0.69828034 0.29787665 0.00384301]
[0.69158767 0.30576764 0.00264469]
[7.02550140e-01 2.97139016e-01 3.10844016e-04]
[7.13492863e-01 2.86148101e-01 3.59035521e-04]
[0.7130262  0.28553869 0.00143511]
[0.69565892 0.30298058 0.0013605 ]
[6.90252109e-01 3.09445916e-01 3.01974850e-04]
[7.23773296e-01 2.75574227e-01 6.52477260e-04]
[6.95266666e-01 3.04521187e-01 2.12146813e-04]
[6.99351066e-01 3.00160570e-01 4.88364030e-04]


 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 3064/5000 [00:05<00:03, 534.83it/s]

[0.69676519 0.3024386  0.00079622]
[7.28939217e-01 2.70983168e-01 7.76149675e-05]
[0.71282934 0.28607519 0.00109547]
[6.86801204e-01 3.12801385e-01 3.97411465e-04]
[7.10334585e-01 2.89244677e-01 4.20738271e-04]
[0.70596199 0.29070463 0.00333338]
[6.99228432e-01 3.00555207e-01 2.16361540e-04]
[7.17023350e-01 2.82798559e-01 1.78091220e-04]
[6.91362045e-01 3.08369821e-01 2.68134752e-04]
[0.70094894 0.29801282 0.00103824]
[0.71870325 0.27884996 0.00244679]
[6.84793232e-01 3.14846626e-01 3.60141989e-04]
[0.70454122 0.29470208 0.00075669]
[0.7052642  0.29339702 0.00133879]
[7.12060785e-01 2.87751359e-01 1.87855814e-04]
[7.11523538e-01 2.88465703e-01 1.07592513e-05]


 65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 3227/5000 [00:06<00:03, 536.77it/s]

[6.88545697e-01 3.11391232e-01 6.30703961e-05]
[7.13554833e-01 2.86324103e-01 1.21064059e-04]
[0.71297739 0.28513376 0.00188886]
[0.68457901 0.31368973 0.00173126]
[6.97775285e-01 3.01744089e-01 4.80625636e-04]
[6.98960548e-01 3.00643069e-01 3.96382858e-04]
[0.70869181 0.28883751 0.00247068]
[7.07196459e-01 2.92207053e-01 5.96487727e-04]
[6.90669057e-01 3.09252824e-01 7.81187763e-05]
[7.11203700e-01 2.88712455e-01 8.38449175e-05]
[0.70876022 0.29023856 0.00100122]
[0.7005994 0.2979192 0.0014814]
[7.07874093e-01 2.91724149e-01 4.01758873e-04]
[7.00607681e-01 2.98806536e-01 5.85782673e-04]
[0.70393204 0.29454506 0.0015229 ]
[0.6948267  0.30422844 0.00094486]


 68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                               | 3389/5000 [00:06<00:03, 531.79it/s]

[6.89384327e-01 3.10231848e-01 3.83825320e-04]
[0.7108123  0.28809769 0.00109001]
[6.92254932e-01 3.07591135e-01 1.53933548e-04]
[6.90718767e-01 3.09187115e-01 9.41173094e-05]
[0.7202645  0.27901364 0.00072187]
[0.70717991 0.29193634 0.00088375]
[0.68709561 0.311014   0.0018904 ]
[0.71037771 0.28748552 0.00213677]
[0.71319259 0.28608384 0.00072357]
[0.70569114 0.29295315 0.00135571]
[6.97768786e-01 3.01935347e-01 2.95866747e-04]
[6.88644395e-01 3.11121732e-01 2.33873215e-04]
[6.83017982e-01 3.16919284e-01 6.27342753e-05]
[6.96400715e-01 3.02953142e-01 6.46142925e-04]
[0.69971295 0.29949235 0.0007947 ]
[0.70108619 0.29362462 0.00528919]


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                           | 3497/5000 [00:06<00:02, 531.61it/s]

[7.12094681e-01 2.87370600e-01 5.34719448e-04]
[6.97068813e-01 3.02830570e-01 1.00617449e-04]
[0.68106186 0.3175811  0.00135705]
[0.67677548 0.32189189 0.00133264]
[7.10184687e-01 2.89804167e-01 1.11456496e-05]
[6.92531723e-01 3.07424602e-01 4.36752547e-05]
[0.69330558 0.30518971 0.00150471]
[0.71324289 0.28567727 0.00107985]
[7.14965165e-01 2.84421670e-01 6.13164563e-04]
[0.70987036 0.28788516 0.00224447]
[0.72280594 0.27604326 0.00115081]
[0.6715989  0.32697611 0.00142499]
[7.02390930e-01 2.97446436e-01 1.62634364e-04]
[7.13386263e-01 2.86373259e-01 2.40477399e-04]
[0.70392844 0.29495575 0.0011158 ]
[7.06453524e-01 2.93483401e-01 6.30753304e-05]

 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 3660/5000 [00:06<00:02, 535.53it/s]


[0.7025932  0.29602357 0.00138323]
[7.02721870e-01 2.96846842e-01 4.31287647e-04]
[0.69753171 0.30158001 0.00088828]
[0.70403235 0.29263534 0.00333231]
[0.71461187 0.28275102 0.00263711]
[7.02546272e-01 2.97389823e-01 6.39050893e-05]
[0.69456966 0.30467433 0.00075601]
[0.70469855 0.29359091 0.00171054]
[0.70930722 0.28983555 0.00085723]
[7.03408318e-01 2.96572294e-01 1.93889782e-05]
[0.70013901 0.29909537 0.00076563]


 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                | 3769/5000 [00:07<00:02, 534.04it/s]

[0.70725409 0.29048918 0.00225672]
[7.23735009e-01 2.75725396e-01 5.39594519e-04]
[7.14561724e-01 2.84822504e-01 6.15771988e-04]
[7.08169416e-01 2.91375347e-01 4.55237327e-04]
[7.11621186e-01 2.88122390e-01 2.56423680e-04]
[0.719009   0.2802237  0.00076729]
[7.05733652e-01 2.93960572e-01 3.05776205e-04]
[0.71640237 0.28259803 0.0009996 ]
[0.72893196 0.27013154 0.0009365 ]
[0.69768156 0.29729556 0.00502288]
[0.66463576 0.33319309 0.00217115]
[0.68661713 0.31267681 0.00070606]
[0.71720405 0.2807975  0.00199845]
[6.86561812e-01 3.12820671e-01 6.17516955e-04]
[0.69751081 0.30143069 0.0010585 ]
[0.70882539 0.28929937 0.00187523]
[0.71654329 0.28226939 0.00118732]

 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                            | 3877/5000 [00:07<00:02, 530.36it/s]


[6.88455065e-01 3.11482669e-01 6.22669201e-05]
[0.70429221 0.29474522 0.00096257]
[6.89055645e-01 3.10414498e-01 5.29857235e-04]
[7.03697264e-01 2.95990975e-01 3.11761151e-04]
[0.69427994 0.30314379 0.00257626]
[7.13388831e-01 2.86341707e-01 2.69461988e-04]
[6.90916486e-01 3.08656207e-01 4.27306617e-04]
[6.84654204e-01 3.14967457e-01 3.78339847e-04]
[0.69954601 0.29944547 0.00100852]
[7.08018865e-01 2.91587514e-01 3.93621023e-04]
[6.98826469e-01 3.01079541e-01 9.39900348e-05]

 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 3984/5000 [00:07<00:01, 525.79it/s]


[7.20051126e-01 2.79567235e-01 3.81638985e-04]
[0.70397233 0.2952753  0.00075237]
[0.70701998 0.29039639 0.00258363]
[0.70963577 0.28932158 0.00104265]
[0.71250076 0.28517142 0.00232782]
[0.68853316 0.3073199  0.00414694]
[6.97642649e-01 3.02105687e-01 2.51663298e-04]
[0.7044449  0.292612   0.00294309]
[0.69593581 0.30311008 0.00095411]
[7.01426074e-01 2.98024136e-01 5.49789542e-04]


 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                 | 4145/5000 [00:07<00:01, 527.40it/s]

[6.97029010e-01 3.02897317e-01 7.36731285e-05]
[0.69421075 0.30487408 0.00091517]
[6.71343807e-01 3.28556687e-01 9.95051303e-05]
[7.11154094e-01 2.88792276e-01 5.36302896e-05]
[0.69453882 0.30459355 0.00086764]
[0.70190735 0.29652223 0.00157042]
[7.16827180e-01 2.82701086e-01 4.71734238e-04]
[7.01707317e-01 2.98193639e-01 9.90435817e-05]
[7.17132843e-01 2.82597001e-01 2.70156170e-04]
[6.83658614e-01 3.16248341e-01 9.30449952e-05]
[7.05307507e-01 2.94410188e-01 2.82305100e-04]
[7.11194260e-01 2.88246938e-01 5.58801967e-04]
[0.68160275 0.31703251 0.00136473]

 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 4305/5000 [00:08<00:01, 527.53it/s]


[0.71472652 0.28379492 0.00147857]
[6.97440603e-01 3.01897765e-01 6.61632036e-04]
[6.99625810e-01 2.99940439e-01 4.33750920e-04]
[0.68926369 0.310015   0.00072131]
[0.71466311 0.2843902  0.00094669]
[7.11098820e-01 2.88405216e-01 4.95963994e-04]
[7.09700964e-01 2.89926190e-01 3.72845746e-04]
[0.69410004 0.30484218 0.00105778]
[7.23619434e-01 2.75866133e-01 5.14432770e-04]
[6.88040130e-01 3.11411296e-01 5.48573736e-04]
[0.69531837 0.30387074 0.00081089]
[0.67672591 0.32129544 0.00197865]
[0.69875347 0.30044645 0.00080009]
[0.69853093 0.30030311 0.00116595]
[0.68774143 0.31089039 0.00136818]

 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 4464/5000 [00:08<00:01, 525.62it/s]


[6.92585751e-01 3.07219222e-01 1.95027005e-04]
[7.04387628e-01 2.95089308e-01 5.23063778e-04]
[0.71237331 0.28590644 0.00172025]
[0.69301418 0.30555331 0.00143251]
[6.72381773e-01 3.27451205e-01 1.67021790e-04]
[0.70673329 0.29191923 0.00134748]
[0.689823   0.30907191 0.00110509]
[7.07243660e-01 2.92180614e-01 5.75725929e-04]
[6.94348825e-01 3.05527100e-01 1.24074988e-04]
[0.72093769 0.27808282 0.0009795 ]
[6.93828617e-01 3.05726333e-01 4.45049759e-04]
[7.22696141e-01 2.76753917e-01 5.49941785e-04]
[6.96938931e-01 3.02427765e-01 6.33303765e-04]
[6.93211203e-01 3.06386297e-01 4.02500464e-04]
[0.71637148 0.28055465 0.00307387]
[7.16148511e-01 2.83662507e-01 1.88982629e-04]

 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 4626/5000 [00:08<00:00, 532.77it/s]


[7.04630623e-01 2.95272796e-01 9.65812462e-05]
[6.92804589e-01 3.06571904e-01 6.23507753e-04]
[6.90096644e-01 3.09837215e-01 6.61413344e-05]
[6.87919588e-01 3.11557781e-01 5.22630383e-04]
[6.98962993e-01 3.00904908e-01 1.32098644e-04]
[6.95125463e-01 3.04721581e-01 1.52955841e-04]
[0.69943829 0.29922062 0.00134109]
[7.01953396e-01 2.97616629e-01 4.29975402e-04]
[7.23770418e-01 2.76115607e-01 1.13974889e-04]
[6.90111263e-01 3.09220960e-01 6.67776960e-04]
[7.01571977e-01 2.98342246e-01 8.57769984e-05]
[6.81501388e-01 3.18304385e-01 1.94226738e-04]
[0.6977128  0.3014229  0.00086431]
[7.21674015e-01 2.77679523e-01 6.46461711e-04]
[7.05879811e-01 2.93914232e-01 2.05956567e-04]
[7.03590045e-01 2.96200126e-01 2.09828746e-04]

 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 4734/5000 [00:08<00:00, 528.28it/s]


[7.20967193e-01 2.78985949e-01 4.68582935e-05]
[0.70826339 0.2907416  0.00099501]
[0.7017595  0.29664097 0.00159953]
[7.04055596e-01 2.95599722e-01 3.44682366e-04]
[7.12994497e-01 2.86679121e-01 3.26382767e-04]
[6.99561946e-01 2.99895488e-01 5.42566208e-04]
[0.71325816 0.28443264 0.0023092 ]
[0.701025   0.29777055 0.00120445]
[7.02449866e-01 2.97108902e-01 4.41232164e-04]
[0.7180687  0.28096465 0.00096665]
[6.97950383e-01 3.01716411e-01 3.33205995e-04]
[0.70203567 0.29707926 0.00088507]
[7.09504356e-01 2.90199013e-01 2.96631042e-04]
[0.69415704 0.30418061 0.00166236]
[0.70913751 0.28708802 0.00377447]


 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 4947/5000 [00:09<00:00, 523.58it/s]

[0.70187871 0.29681237 0.00130893]
[7.05364677e-01 2.94512020e-01 1.23303468e-04]
[6.91449179e-01 3.08523488e-01 2.73331524e-05]
[6.92751028e-01 3.07193681e-01 5.52909475e-05]
[7.14623359e-01 2.85240604e-01 1.36036853e-04]
[0.70566198 0.29352421 0.00081382]
[0.69414533 0.30079196 0.00506271]
[0.7060212  0.29283845 0.00114035]
[6.99994500e-01 2.99597069e-01 4.08431395e-04]
[6.86754405e-01 3.12684877e-01 5.60717064e-04]
[0.69657642 0.30247355 0.00095003]
[0.7177972  0.28059933 0.00160347]
[7.04347829e-01 2.95648555e-01 3.61680574e-06]
[6.94369227e-01 3.05523327e-01 1.07446241e-04]
[0.71181773 0.28652436 0.00165791]
[0.70305943 0.2961915  0.00074908]
[7.05527721e-01 2.93877340e-01 5.94939141e-04]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:09<00:00, 529.74it/s]


[7.17692595e-01 2.82015907e-01 2.91497606e-04]
[7.03590069e-01 2.96402813e-01 7.11807512e-06]
[6.96934999e-01 3.03024954e-01 4.00476506e-05]
[7.05351423e-01 2.94378048e-01 2.70529361e-04]





In [None]:
(M[:,1]==-1).sum()

In [None]:
num_reads, num_genomes = MC.shape

In [None]:
# Proportions vector: 0.5 for the first genome and the remaining 0.5 split
# evenly over the other genomes, so that p sums to 1 (same scheme as p4/p5).
p = np.array([0.5] + [0.5/(num_genomes-1)]*(num_genomes-1))

In [None]:
%timeit get_Zi(MC, p, num_genomes, 2)

In [None]:
# Positions where the two sequences disagree, compared over their common
# length instead of a hard-coded 16569.
dif = [i for i, (ref_base, cons_base) in enumerate(zip(contamix_ref, cons)) if ref_base != cons_base]

In [None]:
dif

In [None]:
contamix_ref[514]

In [None]:
cons[16179]

In [None]:
def get_num_indels(bam_fname, trunc = 7):
    '''
    Count mapped chrM reads whose CIGAR string contains an insertion

    Parameters:
        bam_fname : str
        path to a bam file
        trunc : int
        reads whose alignment starts before this reference position
        are skipped

    Returns:
        int, number of reads with at least one "I" operation in the CIGAR
        string. Deletions ("D") are not counted, despite the function name.
    '''
    num_reads = 0
    # The context manager closes the file even if iteration raises.
    with pysam.AlignmentFile(bam_fname, "rb") as samfile:
        for read in samfile.fetch('chrM'):
            if not read.is_mapped or read.reference_start < trunc:
                continue
            # cigarstring is None when the record carries no CIGAR; skip it
            # instead of failing on the membership test.
            cigar = read.cigarstring
            if cigar and "I" in cigar:
                num_reads += 1
    return num_reads

In [None]:
get_num_indels(bam)

In [None]:
def get_cigar_string(bam_fname):
    '''
    Print the CIGAR tuples of every read fetched from chrM

    Parameters:
        bam_fname : str
        path to a bam file
    '''
    # The context manager closes the file even if iteration raises.
    with pysam.AlignmentFile(bam_fname, "rb") as samfile:
        for read in samfile.fetch('chrM'):
            print(read.cigartuples)

In [None]:
# Inspect one chrM read (index 432 in fetch order) next to a slice of row 1 of
# genomes_arr with the '-' characters removed.
with pysam.AlignmentFile(bam, "rb") as samfile:
    # Materialise the reads inside the block: the file is closed on exit
    # (the previous version left the handle open).
    read = list(samfile.fetch('chrM'))[432]
print(read.cigartuples)
print(aln_coords[read.pos])
print(read.seq)
genome = (''.join( np.array(genomes_arr, dtype = str)[1])).upper()
print(genome[213: 326].replace('-',''))

In [None]:
j = 0  # NOTE(review): never read in this cell — looks like a leftover
# Print the index of every row of MC whose entries sum to zero.
for i in range(num_reads):
    if (MC[i,:].sum()) == 0:
        print(i)

In [None]:
get_num_reads(bam)

In [None]:
def calculate_likelihood(probs, mc):
    '''
    Log-likelihood of the reads under a mixture of genomes

    Parameters:
        probs : array-like, shape (num_genomes,)
        weight of every genome in the mixture
        mc : array-like, shape (num_reads, num_genomes)
        mc[i, j] is multiplied by probs[j]; presumably the likelihood of
        read i under genome j (TODO confirm)

    Returns:
        float, sum over reads i of log(sum_j probs[j] * mc[i, j])
    '''
    probs = np.asarray(probs, dtype=float)
    # Use the mc argument: the previous version read the notebook-level MC
    # and silently ignored whatever the caller passed in.
    mc = np.asarray(mc, dtype=float)
    # One matrix-vector product replaces the per-read Python loop.
    return np.log(mc.dot(probs)).sum()

            

In [None]:
calculate_likelihood([0.8, 0.2, 0], MC)

In [None]:
calculate_likelihood([0.7, 0.1, 0.2], MC)

In [None]:
calculate_likelihood([0.99, 0.01, 0], MC)

In [None]:
# 0.9 for the first genome; the remaining 0.1 is split evenly over the other
# genomes so the weights sum to 1 for any num_genomes (as in p4 and p5).
p3 = np.asarray([0.9] + [0.1/(num_genomes-1)]*(num_genomes-1))

In [None]:
np.where(M[:,0]>M[:,1]+5)

In [None]:
calculate_likelihood(p3, MC)

In [None]:
p4 = np.asarray([0.7] + [0.3/(num_genomes-1)]*(num_genomes-1))

In [None]:
calculate_likelihood([0.8, 0.2], MC)

In [None]:
p5 = np.asarray([0.6] + [0.4/(num_genomes-1)]*(num_genomes-1))

In [None]:
calculate_likelihood(p5, MC)

In [None]:
seq1==seq2

In [None]:
genomes_arr

In [None]:
def get_probs(mc, p):
    '''
    Weight every column of mc by p and normalise each row to sum to 1

    Parameters:
        mc : array-like, shape (num_reads, num_genomes)
        p : array-like with at least num_genomes entries
        only the first num_genomes entries are used

    Returns:
        float ndarray of the same shape as mc with
        probs[i, j] = mc[i, j] * p[j] / sum_k(mc[i, k] * p[k]).
        A row whose weighted sum is zero comes out as NaN.

    Raises:
        IndexError if p has fewer entries than mc has columns
    '''
    # Always compute in floating point: the previous np.zeros_like(mc) buffer
    # inherited mc's dtype, so an integer mc truncated every probability.
    mc = np.asarray(mc, dtype=float)
    p = np.asarray(p, dtype=float)
    _, num_genomes = mc.shape
    # Explicit check so a too-short p fails instead of being broadcast.
    if p.shape[0] < num_genomes:
        raise IndexError('p has fewer entries than mc has columns')
    weighted = mc * p[:num_genomes]
    # keepdims keeps the row sums as a column so the division broadcasts
    # across every row.
    totals = weighted.sum(axis=1, keepdims=True)
    return weighted / totals

In [None]:
f = get_probs(MC,[0.5, 0.5])

In [None]:
(f[:,0]>f[:,1]).mean()

In [None]:
p = np.array([0.5, 0.5])

In [None]:
np.where(np.bitwise_and((-1<M[:,0]),  M[:,0]+1<M[:,1]))[0].shape

In [None]:
np.where(M[:,0]<M[:,1])[0].shape

In [None]:
M[1590]
# N[1590]

In [None]:
genome[801:801+100]

In [None]:
read.pos

In [None]:
genome1 = (''.join( np.array(genomes_arr, dtype = str)[0])).upper()
genome2 = (''.join( np.array(genomes_arr, dtype = str)[1])).upper()

In [None]:
s = 100
print(genome1[s: s+100])
print(genome2[s: s+100])

In [None]:
M[19245, 1]

In [None]:
import sys

In [None]:
np.set_printoptions(threshold=300)

In [None]:
np.where(-1 < M[:,0])[0][0:]

In [None]:
M[598]

In [None]:
np.where(np.bitwise_and(M[:,1]<N[:,1], M[:,0] != -1))

In [None]:
p

In [None]:
# NOTE(review): the get_probs(mc, p) defined in this notebook takes two
# arguments, so this three-argument call raises TypeError against it — confirm
# which get_probs version this cell was written for.
get_probs(MC, p, 158)

In [None]:
np.where(np.bitwise_and(M[:,1]>M[:,0]+1, M[:,0]>0))[0].shape

In [None]:
np.where(M[:,0]<M[:,1])[0]

In [None]:
M[101]

In [None]:
(M[:,0]>M[:,1]).sum()

In [None]:
N[157]

In [None]:
M.shape

In [None]:
genomes0.count('-')

In [None]:
genome1 = (''.join( np.array(genomes_arr, dtype = str)[1])).upper()

In [None]:
in10_1 =  genome.replace('-', '')

In [None]:
# Load the in10 and in1 sequences as single strings: drop the first 5
# characters (presumably the ">name" part of the FASTA header — TODO confirm)
# and remove the line breaks. The with-blocks close both files, which the
# previous bare open() calls never did.
with open('data/bam/in10.fa') as f:
    in10 = f.read()[5:].replace('\n', '')
with open('data/bam/in1.fa') as f:
    in1 = f.read()[5:].replace('\n', '')

In [None]:
dif = [i for i in range(16569) if consensus[i] != in10[i]]

In [None]:
dif

In [None]:
cons = bam2consensus('data/bam/output40_in1_60_in10.bam', 1, 0.5)

In [None]:
cons1 = pysam.consensus('data/bam/output40_in1_60_in10.bam')[5:].replace('\n','')

In [None]:
len(cons)

In [None]:
len(cons1)

In [None]:
dif = [i for i in range(len(cons1)) if cons[i] !=cons1[i]]

In [None]:
A = 300
B = 30
print(cons[A: A + B])
print(cons1[A: A + B])

In [None]:
cons.count('N')