In [358]:
from anytree import NodeMixin, RenderTree
import numpy as np
import pysam
import allel
from pysam import VariantFile, FastaFile

In [324]:
class Node(NodeMixin):
    """Tree node carrying a label, its SNP list and a likelihood slot.

    Tree linkage (``parent`` / ``children``) comes from ``NodeMixin``.
    """

    def __init__(self, name, snps, parent=None, children=None):
        """Store the payload and attach the node to ``parent``.

        ``children`` is assigned only when it is truthy, so ``None`` and an
        empty collection both leave the node without explicit children.
        """
        self.name = name
        self.snps = snps
        self.parent = parent
        self.lh = 0  # placeholder; assigned later by code outside the class
        if children:
            self.children = children

    def __str__(self):
        """Return just the node's name."""
        return self.name

    def __repr__(self):
        """Return the name and the current ``lh`` separated by a space."""
        return " ".join((self.name, str(self.lh)))

In [323]:
def get_snp(node):
    """Extract simple substitutions from one row of the parsed tree table.

    Elements from index 2 onward are scanned. An element is accepted when
    its first and last characters are nucleotide letters (either case) and
    everything in between parses as an integer, e.g. ``'G263A'``.

    Parameters
    ----------
    node : sequence
        One table row. Elements 0 and 1 are not inspected here.

    Returns
    -------
    list of [str, str, int]
        ``[first_letter, last_letter, position]`` for every accepted
        element, in row order. Letter case is kept as written.

    Notes
    -----
    Empty elements, and elements whose middle part is not an integer, are
    skipped rather than raising, like any other element that does not fit
    the pattern.
    """
    bases = frozenset('ATGCatgc')
    snps = []
    for token in node[2:]:
        if not token:
            # Nothing to index into; previously an IndexError.
            continue
        old, new = token[0], token[-1]
        if old not in bases or new not in bases:
            continue
        try:
            position = int(token[1:-1])
        except ValueError:
            # e.g. 'A' (empty middle) or 'AxG' (non-numeric middle).
            continue
        snps.append([old, new, position])
    return snps

In [135]:
def make_tree(tree, node, pos=0):
    """Attach to ``node`` every descendant listed after row ``pos``.

    ``tree`` is a flat list of rows; element 0 of a row is compared as a
    nesting depth and element 1 is used as the new node's name. Rows after
    ``pos`` are walked until one is no deeper than ``tree[pos]``. Each row
    exactly one level deeper becomes a child ``Node`` of ``node`` and is
    expanded recursively from its own index.

    Returns ``None``; the result is the tree built through ``parent`` links.
    """
    first = pos + 1
    if first >= len(tree):
        return
    child_depth = tree[pos][0] + 1
    for idx in range(first, len(tree)):
        row = tree[idx]
        if row[0] < child_depth:
            # Left the subtree rooted at `pos`.
            break
        if row[0] == child_depth:
            snps = get_snp(row)
            child = Node(row[1], snps, parent=node)
            make_tree(tree, child, idx)

In [133]:
import json

# Read the flattened tree table; `d` is a plain Python list.
with open('PhyloTree.org-parser/array.json') as f:
    d = json.loads(f.read())

# Raise column 0 of every row by one, then set the first row's back to 0.
# make_tree compares this column as a nesting depth; presumably this keeps
# the first row above all the others -- TODO confirm against the parser output.
for row in d:
    row[0] = row[0] + 1
d[0][0] = 0

# print(d)

In [238]:
def get_log_monozygous(bcf: "VariantFile", n_sites: int = 16569):
    '''
    Per-site nucleotide log-likelihoods read from a VCF/BCF file.

    For every record, the first sample's ``PL`` entry at indices
    0, 2, 5, 9, ... (one per allele, in allele order) is stored under that
    allele's column, and the table is returned as ``-PL / 10``.

    Parameters
    ----------
    bcf : VariantFile
        Open variant file. Only ``fetch()`` and, per record, ``pos``,
        ``alleles`` and ``samples`` are used.
    n_sites : int, optional
        Number of rows in the table. Defaults to 16569, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_sites, 4)`` with columns ordered
        A, T, G, C. Row ``pos - 1`` belongs to the record at ``pos``.

    Notes
    -----
    * Indices 0, 2, 5, 9 are where the homozygous genotypes sit in the
      VCF diploid ``PL`` ordering -- assumes the file carries diploid
      PLs; TODO confirm for the caller's VCF.
    * Single-base alleles are matched case-insensitively; alleles that are
      neither A/T/G/C nor ``<*>`` are ignored.
    * ``<*>`` fills every column of the row that is still unset.
    * A missing (``None``) PL value is skipped instead of being written.
    * NOTE(review): cells that are never set keep the internal ``-1`` and
      therefore come back as ``+0.1``; confirm this is the intended value
      for unobserved sites.
    '''
    column_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    gls = np.full((n_sites, 4), -1)
    for rec in bcf.fetch():
        pls = rec.samples.values()[0]['PL']
        if pls is None:
            continue
        row = rec.pos - 1
        pl_index = 0  # index of the current allele's homozygous genotype
        stride = 2    # gap to the next one: 0 -> 2 -> 5 -> 9 ...
        for allele in rec.alleles:
            allele = allele.upper()
            if allele == '<*>' or allele in column_of:
                value = pls[pl_index]
                if value is not None:
                    if allele == '<*>':
                        unset = gls[row] == -1
                        gls[row, unset] = value
                    else:
                        gls[row, column_of[allele]] = value
            pl_index += stride
            stride += 1
    return -gls / 10

In [325]:
# Root node named after the first table row, with no SNPs of its own;
# make_tree then hangs the remaining rows underneath it.
a = Node(d[0][1], snps=[])
make_tree(d, a, pos=0)

In [326]:
a.children[0].snps

[['G', 'A', 263],
 ['C', 'T', 1048],
 ['C', 'a', 3516],
 ['T', 'C', 5442],
 ['T', 'C', 6185],
 ['C', 'T', 9042],
 ['A', 'G', 9347],
 ['G', 'A', 10589],
 ['G', 'A', 12007],
 ['A', 'G', 12720]]

In [314]:
ref.fetch('chrM')[25]

'C'

In [310]:
print(RenderTree(a))

mt-MRCA (RSRS) -2206.2
├── L0 -2376.1
│   ├── L0a'b'f'g'k -2419.7
│   │   ├── L0a'b'f'g -2583.4999999999995
│   │   │   ├── L0a'b'g -2653.3999999999996
│   │   │   │   ├── L0a'g -2678.8999999999996
│   │   │   │   │   ├── L0a -2704.3999999999996
│   │   │   │   │   │   ├── L0a1'4 -2729.8999999999996
│   │   │   │   │   │   │   ├── L0a1 -2729.8999999999996
│   │   │   │   │   │   │   │   ├── L0a1a -2729.8999999999996
│   │   │   │   │   │   │   │   │   └── - -2748.7
│   │   │   │   │   │   │   │   │       ├── L0a1a1 -2774.2
│   │   │   │   │   │   │   │   │       ├── L0a1a2 -2792.3999999999996
│   │   │   │   │   │   │   │   │       └── L0a1a3 -2774.2
│   │   │   │   │   │   │   │   ├── - -2755.3999999999996
│   │   │   │   │   │   │   │   │   ├── L0a1b -2806.3999999999996
│   │   │   │   │   │   │   │   │   │   ├── L0a1b1 -2850.0999999999995
│   │   │   │   │   │   │   │   │   │   │   └── L0a1b1a -2875.5999999999995
│   │   │   │   │   │   │   │   │   │   │       └── L0a1b1a1 -2875.599

In [126]:
# Variant file handle; passed to calculate_likelihood and get_log_monozygous in other cells.
bcf_in = VariantFile("PhyloTree.org-parser/out4.vcf") 

In [144]:
# Reference FASTA handle; other cells read the 'chrM' sequence from it via ref.fetch('chrM').
ref = FastaFile('ref.fa')

In [141]:
# Same VCF read through scikit-allel, keeping only the PL call data.
callset = allel.read_vcf('PhyloTree.org-parser/out4.vcf', fields=['calldata/PL'])
# Kept under its own name: assigning this array to `a` would overwrite the
# tree root that the prunung / PreOrderIter cells operate on.
pl_array = np.asarray(callset['calldata/PL'])



In [329]:
def calculate_likelihood(vcf, ref):
    """Sum the per-site table values along the 'chrM' reference sequence.

    The table is built from ``vcf`` with ``get_log_monozygous``. For each
    index of the sequence fetched from ``ref``, the base (case-insensitive)
    selects column 0/1/2/3 for A/T/G/C and that entry is added to the
    total. Any other character contributes nothing.
    """
    sequence = ref.fetch('chrM')
    gls = get_log_monozygous(vcf)
    column_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    total = 0
    for site, base in enumerate(sequence):
        col = column_of.get(base.capitalize())
        if col is not None:
            total += gls[site, col]
    return total

In [246]:
# Total for the reference sequence; prunung reads this global as the value for the parentless node.
ref_lh = calculate_likelihood(bcf_in,ref)
ref_lh

-2206.2

In [330]:
def call_likelihood(gls, node, ref, lh=0):
    """Return ``lh`` adjusted for every SNP stored on ``node``.

    Each entry of ``node.snps`` is read as ``[old, new, pos]``. When ``new``
    is A/T/G/C (any case), the value ``calculate_pl`` returns for that site
    is subtracted and the ``gls`` entry for ``new`` at the same site is
    added. Entries with any other ``new`` letter leave ``lh`` untouched.

    NOTE(review): the subtracted term is looked up from the reference base
    at the site, not from the SNP's ``old`` letter -- confirm this is the
    intended update.
    """
    column_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    for snp in node.snps:
        site = snp[2] - 1  # shift to the index used for gls and the sequence
        col = column_of.get(snp[1].capitalize())
        if col is None:
            continue
        lh = lh - calculate_pl(gls, ref, site) + gls[site, col]
    return lh
        
    
    

In [277]:
def calculate_pl(gls, ref, pos):
    """Return the ``gls`` entry for the reference base at index ``pos``.

    The 'chrM' sequence is fetched from ``ref``; the base at ``pos``
    (case-insensitive) selects column 0/1/2/3 for A/T/G/C. Any other
    character yields 0.
    """
    base = ref.fetch('chrM')[pos].capitalize()
    col = {'A': 0, 'T': 1, 'G': 2, 'C': 3}.get(base)
    if col is None:
        return 0
    return 0 + gls[pos, col]

In [331]:
def prunung(node, ref, gls):
    """Fill ``lh`` on ``node`` and, recursively, on all of its descendants.

    A node without a parent receives the module-level ``ref_lh``. Any other
    node gets ``call_likelihood`` applied on top of its parent's ``lh``, so
    parents are always filled in before their children.
    """
    parent = node.parent
    if parent is None:
        node.lh = ref_lh
    else:
        node.lh = call_likelihood(gls, node, ref, parent.lh)
    for child in node.children:
        prunung(child, ref, gls)

In [332]:
# Build the per-site table once, then fill `lh` on every node starting from the root `a`.
gls = get_log_monozygous(bcf_in)
prunung(a,ref,gls)

In [333]:
ref.fetch('chrM')[0]

'G'

In [334]:
print(RenderTree(a))

mt-MRCA (RSRS) -2206.2
├── L0 -2435.7
│   ├── L0a'b'f'g'k -2486.7
│   │   ├── L0a'b'f'g -2614.2
│   │   │   ├── L0a'b'g -2767.2
│   │   │   │   ├── L0a'g -2818.2
│   │   │   │   │   ├── L0a -2920.2
│   │   │   │   │   │   ├── L0a1'4 -2945.7
│   │   │   │   │   │   │   ├── L0a1 -2971.2
│   │   │   │   │   │   │   │   ├── L0a1a -2996.7
│   │   │   │   │   │   │   │   │   └── - -3022.2
│   │   │   │   │   │   │   │   │       ├── L0a1a1 -3047.7
│   │   │   │   │   │   │   │   │       ├── L0a1a2 -3047.7
│   │   │   │   │   │   │   │   │       └── L0a1a3 -3047.7
│   │   │   │   │   │   │   │   ├── - -2945.7
│   │   │   │   │   │   │   │   │   ├── L0a1b -2996.7
│   │   │   │   │   │   │   │   │   │   ├── L0a1b1 -3022.2
│   │   │   │   │   │   │   │   │   │   │   └── L0a1b1a -3047.7
│   │   │   │   │   │   │   │   │   │   │       └── L0a1b1a1 -3073.2
│   │   │   │   │   │   │   │   │   │   │           └── L0a1b1a1a -3124.2
│   │   │   │   │   │   │   │   │   │   └── L0a1b2 -3073.2
│   │   │   

In [185]:
a.parent == None

True

In [281]:
from anytree import find_by_attr, PreOrderIter

In [335]:
# Highest `lh` anywhere in the tree (the name `minx` is kept for later cells).
# Taken with max() instead of a running comparison seeded with -1000000,
# which matched nothing when every node scored below that seed.
minx = max(node.lh for node in PreOrderIter(a))
# Print every node that ties for the best score.
for i in PreOrderIter(a):
    if i.lh == minx:
        print(i.name)

L1b2


In [327]:
# Fetch the node reported by the previous cell ('L1b2') to inspect it.
s = find_by_attr(a,'L1b2')

In [328]:
s.snps

[['A', 'G', 189],
 ['C', 'T', 12891],
 ['A', 'G', 13893],
 ['G', 'A', 14323],
 ['C', 'T', 16239]]

In [322]:
ref.fetch('chrM')[18]

'C'

In [344]:
PreOrderIter(a)

mt-MRCA (RSRS) -2206.2

In [352]:
def key(x):
    """Sort key: the object's ``lh`` attribute."""
    return x.lh

In [353]:
# Every node of the tree rooted at `a`, in the order PreOrderIter yields them.
S = list(PreOrderIter(a))

In [356]:
# Sort in place by `lh`, largest first.
S.sort(key = key,reverse=True)

In [357]:
S

[L1b2 -1390.1999999999998,
 L1b2a -1441.1999999999998,
 L1b -1517.6999999999998,
 L1b2'3 -1517.6999999999998,
 L1b3 -1619.6999999999998,
 L1b1 -1645.1999999999998,
 - -1645.1999999999998,
 L1b1a -1670.6999999999998,
 L1b1a1'4 -1670.6999999999998,
 L1b1a3 -1670.6999999999998,
 L1b1a9 -1670.6999999999998,
 L1b1a17 -1670.6999999999998,
 L1b1a4 -1696.1999999999998,
 L1b1a2 -1696.1999999999998,
 L1b1a3a -1696.1999999999998,
 L1b1a15 -1696.1999999999998,
 L1b1a18 -1696.1999999999998,
 L1b1a5 -1696.1999999999998,
 L1b1a7 -1696.1999999999998,
 L1b1a8 -1696.1999999999998,
 L1b1a10 -1696.1999999999998,
 L1b1a13 -1696.1999999999998,
 L1b1a14 -1696.1999999999998,
 L1b1a4a -1721.6999999999998,
 L1b1a2a -1721.6999999999998,
 L1b1a3a1 -1721.6999999999998,
 L1b1a3b -1721.6999999999998,
 L1b1a15a -1721.6999999999998,
 L1b1a6 -1721.6999999999998,
 L1b1a10a -1721.6999999999998,
 L1b1a10b -1721.6999999999998,
 L1b1a12 -1721.6999999999998,
 L1b1a16 -1721.6999999999998,
 L1b1a1 -1747.1999999999998,
 L1b1a7a