In [1]:
from collections import Counter
import sys
import numpy as np
from scipy import stats
import Bio
from Bio import SeqIO
import pandas as pd
import random

In [2]:
class AaBias():
    """Accumulate amino acid usage and sample random sequences from it.

    Typical use: call update_count() once per translated sequence, then
    calc_freq() once, then generate_random() as often as needed.

    Attributes:
        self.aa_lst:      list of 1 char representation of the 20 amino acids
        self.all_lst:     aa_lst followed by "X", the catch-all symbol
        self.aaCount_dct: dictionary of amino acid (+ X) count
        self.aaFreq_arr:  numpy array of cumulative frequency of each symbol,
                          in all_lst order (all zeros until calc_freq() runs)
    """

    def __init__(self):
        self.aa_lst=["A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","V"]
        self.all_lst=self.aa_lst+["X"]
        self.aaCount_dct={aa:0 for aa in self.all_lst}
        self.aaFreq_arr=np.zeros(len(self.all_lst))

    def update_count(self,aaSeq):
        """Add the symbols of one amino acid sequence to the count dictionary.

        Args:
            aaSeq: iterable of symbols (usually a str of 1 char residues).

        Any symbol that is not one of the 20 residues in aa_lst, including
        "X" itself, is counted under "X".
        """

        for symbol,num in Counter(aaSeq).items():
            key=symbol if symbol in self.aa_lst else "X"
            self.aaCount_dct[key]+=num

    def calc_freq(self):
        """Recompute the cumulative frequency array from the count dictionary.

        Raises:
            ZeroDivisionError: if nothing has been counted yet.
        """

        totalCount=sum(self.aaCount_dct.values())
        if totalCount==0:
            # same exception type as the bare division would give, but with a usable message
            raise ZeroDivisionError("no amino acids counted; call update_count() before calc_freq()")

        count_arr=np.array([self.aaCount_dct[aa] for aa in self.all_lst],dtype=float)
        # write in place so that existing references to aaFreq_arr stay valid
        self.aaFreq_arr[:]=np.cumsum(count_arr/totalCount)
        self.aaFreq_arr[-1]=1 #correct rounding error

    def generate_random(self,seqLen):
        """Generate a random sequence of length seqLen according to aaFreq_arr.

        Args:
            seqLen: number of symbols to draw.

        Returns:
            str of length seqLen made of symbols from all_lst.

        Draws seqLen uniform numbers with np.random.rand, so results follow
        numpy's global random state. If calc_freq() has not been called,
        aaFreq_arr is all zeros and nearly every draw falls through to "X".
        """

        rand_arr=np.random.rand(seqLen)
        # first position whose cumulative frequency is >= the drawn number
        idx_arr=np.searchsorted(self.aaFreq_arr,rand_arr,side="left")
        # a draw above every cumulative value maps to the last symbol
        idx_arr=np.minimum(idx_arr,len(self.all_lst)-1)
        return "".join(self.all_lst[idx] for idx in idx_arr)

    def kl(self, other):
        """Calculate the Kullback-Leibler divergence from self to other.

        Args:
            other: another AaBias instance with the same aa_lst.

        Returns:
            stats.entropy(self_counts, other_counts) over the 20 residues of
            aa_lst; "X" is left out. The value is not symmetric:
            a.kl(b) generally differs from b.kl(a).
        """

        assert(self.aa_lst==other.aa_lst)

        # +1 pseudocount on both sides to avoid 0 division
        a1=np.array([self.aaCount_dct[aa]+1 for aa in self.aa_lst],dtype=float)
        a2=np.array([other.aaCount_dct[aa]+1 for aa in self.aa_lst],dtype=float)
        return stats.entropy(a1,a2)

    def _show(self):
        """[DEBUG] print symbol, count and cumulative frequency, one per line"""
        for i,aa in enumerate(self.all_lst):
            print(aa, self.aaCount_dct[aa], self.aaFreq_arr[i])

In [3]:
def is_typical(seq_record):
    """Return True if the record is at least 6 long and its length is a multiple of 3."""
    seqLen=len(seq_record)
    return seqLen%3==0 and seqLen>=6

In [10]:
# one tally per reading frame: ab_lst[k] collects frame k+1
ab_lst=[AaBias() for _ in range(6)]

filepath="/data/mitsuki/data/refseq/cds_from_genomic/GCF_000010665.1_ASM1066v1_cds_from_genomic.fna"
for seq_record in SeqIO.parse(filepath, "fasta"):
    if not is_typical(seq_record):
        continue

    cdsLen=len(seq_record)

    #Frame Number, (start, end), is reverse complement
    #NOTE(review): frame 4 is the only frame spanning the full length, so it
    #includes the reverse complement of the stop codon -- confirm this is intended
    target=[(1,(0,cdsLen-3),False),# -3 to drop stop codon
            (2,(1,cdsLen-2),False),
            (3,(2,cdsLen-1),False),
            (4,(0,cdsLen),  True),
            (5,(1,cdsLen-2),True),
            (6,(2,cdsLen-1),True)]

    #disabled: shuffle the nucleotides of the record before translating
    #seq_record.seq=Bio.Seq.Seq(''.join(random.sample(str(seq_record.seq),len(seq_record))))

    for frameNum, (start,end), revComp in target:
        seq=seq_record.seq[start:end]
        seq=seq.reverse_complement() if revComp else seq
        # translation table 11; stop codons ('*') are counted as 'X'
        aaSeq=str(seq.translate(table=11)).replace('*','X')
        ab_lst[frameNum-1].update_count(aaSeq)

# turn the finished counts into cumulative frequencies
for i,ab in enumerate(ab_lst):
    ab.calc_freq()

In [13]:
# Draw a 100-symbol random sequence from the frame 2 tally (ab_lst[frameNum-1], so index 1).
# No random seed is set in the visible cells, so the output differs between runs.
ab_lst[1].generate_random(100)

'GAKGAGPTRKIPNSAACMWTRRCASSIWKGCPTITPRGIRAQVYIXTTRRILLRSPRSRSITXAXPPVPGWSPDPVSPGGSPTWCPWTWNGTSPITPAEP'

In [None]:
import sys
import pandas as pd
import random
from Bio import SeqIO