In [57]:
from collections import Counter
import sys
import numpy as np
from scipy import stats
from Bio import SeqIO
import Bio

import random

In [2]:
class AaCount():
    """Running tally of the 20 standard amino acids, comparable via KL divergence.

    Attributes:
        aa_lst: one-letter codes of the 20 standard amino acids; this order also
            fixes the order of the count vectors built in kl().
        aaCount_dct: maps every code in aa_lst to its accumulated count.
    """

    def __init__(self):
        self.aa_lst=["A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","V"]
        self.aaCount_dct={aa:0 for aa in self.aa_lst}

    def update_count(self,aaSeq):
        """Add the residue counts found in aaSeq to the running tally.

        The symbols "*" and "X" are accepted but not counted.

        Args:
            aaSeq: iterable of one-letter residue symbols (typically a str).

        Raises:
            ValueError: if aaSeq contains a symbol that is neither in aa_lst
                nor one of "*" / "X". No count is modified in that case.
        """
        counter=Counter(aaSeq)
        # Validate before touching any count so a bad sequence cannot leave
        # the tally half-updated.
        unexpected=[k for k in counter if k not in self.aa_lst and k not in ("*","X")]
        if unexpected:
            raise ValueError("unexpected residue symbol(s): {0!r}".format(unexpected))
        for k,v in counter.items():
            if k in self.aa_lst:
                self.aaCount_dct[k]+=v

    def kl(self, other):
        """Return the KL divergence of this tally's distribution from other's.

        Every count is incremented by 1 first so that no category is zero
        (avoids division by zero). stats.entropy normalises both vectors
        before computing sum(p * log(p / q)).

        Args:
            other: another AaCount (anything exposing a compatible aaCount_dct).

        Returns:
            The divergence as a float; 0.0 when both tallies are identical.
        """
        a1=np.array([self.aaCount_dct[aa]+1 for aa in self.aa_lst],dtype=float)
        a2=np.array([other.aaCount_dct[aa]+1 for aa in self.aa_lst],dtype=float)
        return stats.entropy(a1,a2)

    def _debug_show(self):
        """Print one "<code> <count>" line per amino acid, in aa_lst order."""
        for k in self.aa_lst:
            print(k, self.aaCount_dct[k])

In [4]:
def is_typical(seq_record):
    """Return True when the record is at least 6 long and its length is a multiple of 3."""
    length=len(seq_record)
    return length>=6 and length%3==0

In [74]:
# Tally amino-acid usage separately for each of the six reading frames of every typical CDS.
filepath="/data/mitsuki/data/refseq/cds_from_genomic/GCF_000010665.1_ASM1066v1_cds_from_genomic.fna"
ac_lst=[AaCount() for _ in range(6)]  # ac_lst[i] accumulates frame i+1
for seq_record in SeqIO.parse(filepath, "fasta"):
    if not is_typical(seq_record):
        continue

    # Optional nucleotide shuffle (disabled):
    #seq_record.seq=Bio.Seq.Seq(''.join(random.sample(str(seq_record.seq),len(seq_record))))

    seqLen=len(seq_record)
    # (start, end) slices for offsets 0, 1 and 2. Frames 1-3 translate these
    # slices directly; frames 4-6 translate their reverse complements.
    windows=[(0,seqLen),(1,seqLen-2),(2,seqLen-1)]
    frameNum=0
    for revComp in (False,True):
        for start,end in windows:
            frameNum+=1
            seq=seq_record.seq[start:end]
            if revComp:
                seq=seq.reverse_complement()
            # Stop codons ('*') are rewritten as 'X' before counting.
            aaSeq=str(seq.translate(table=11)).replace('*','X')
            ac_lst[frameNum-1].update_count(aaSeq)

In [75]:
# Report the KL divergence of frame 1's amino-acid distribution from each frame's.
for frameNum in range(1,7):
    divergence=ac_lst[0].kl(ac_lst[frameNum-1])
    print("Frame{0}: {1}".format(frameNum, divergence))

Frame1: 0.0
Frame2: 0.4936553789333777
Frame3: 0.31546072982491935
Frame4: 0.14404278226542183
Frame5: 0.42528162213507475
Frame6: 0.19723081858265545


In [70]:
# Print the per-residue counts for frame 4 (index 3: reverse complement of the full sequence).
ac_lst[3]._debug_show()

A 154799
R 200312
N 25080
D 42642
C 43018
E 45755
Q 41573
G 172590
H 39611
I 35281
L 108322
K 26251
M 15596
F 26575
P 141491
S 123494
T 81390
W 29297
Y 25476
V 88618


In [73]:
# Print the per-residue counts for frame 1 (index 0: forward strand, offset 0).
ac_lst[0]._debug_show()

A 154026
R 194440
N 26875
D 41537
C 42135
E 39632
Q 43405
G 140925
H 45161
I 36374
L 113400
K 26555
M 14767
F 26081
P 172150
S 130206
T 88599
W 25039
Y 25922
V 81668


In [None]:
# Toy example: tally the residues of a single hand-written peptide into ac1.
ac1=AaCount()
seq="LLGRALAREKRLANDHPLVEQFWDIYEYIT"
ac1.update_count(seq)

In [None]:
# Toy example: tally a second hand-written peptide into ac2 (note: rebinds `seq`).
ac2=AaCount()
seq="ILARAETREQRLAHDHPIVEQFWESYHYLN"
ac2.update_count(seq)

In [None]:
# KL divergence of ac1's residue distribution from ac2's (equivalent to ac1.kl(ac2)).
AaCount.kl(ac1,ac2)

In [None]:
# Scratch check: how Counter tallies the characters of a string.
s="ATGCA"
counter=Counter(s)
for k in counter:
    v=counter[k]
    print(k,v)

In [None]:
# Display the (symbol, count) pairs view of the Counter built in the previous cell.
counter.items()