In [1]:
import sys
from Bio import SeqIO

def is_typical(seq_record):
    """Tell whether a record is long enough and codon-aligned.

    Returns True when ``len(seq_record)`` is at least 6 and is a multiple
    of 3; otherwise returns False.
    """
    n = len(seq_record)
    return n >= 6 and n % 3 == 0

In [2]:
def main(inFilepath, outFilepath, minLength=10, table=11):
    """Translate the alternative reading frames of each CDS and save long fragments.

    Every FASTA record in ``inFilepath`` that passes ``is_typical`` is
    translated in five frames: frames 2 and 3 (forward strand, offsets 1
    and 2) and frames 4, 5 and 6 (reverse complement, offsets 0, 1 and 2).
    Frame 1 is not translated (presumably because it is the annotated CDS
    frame itself -- confirm). Each translation is split at stop codons
    ('*') and every fragment with at least ``minLength`` residues is written
    to ``outFilepath`` as a two-line FASTA entry whose header is
    ``>frameN|<record id>.<fragment index>``.

    Args:
        inFilepath: Path of the input FASTA file.
        outFilepath: Path of the output file; it is opened in 'w' mode and
            therefore overwritten.
        minLength: Minimum fragment length, in residues, for a fragment to
            be written. Defaults to 10.
        table: Genetic code table handed to ``translate``. Defaults to 11.

    Returns:
        None. A summary line with processed/total record counts is printed.
    """
    totalCDS = 0
    processedCDS = 0
    with open(outFilepath, 'w') as f:
        for seq_record in SeqIO.parse(inFilepath, "fasta"):
            totalCDS += 1
            if not is_typical(seq_record):
                continue
            processedCDS += 1

            seqLen = len(seq_record)
            # (frame name, (start, end) slice, reverse-complement first?).
            # Shifted frames drop three bases in total so that every slice
            # keeps a length that is a multiple of three.
            target = [("frame2", (1, seqLen - 2), False),
                      ("frame3", (2, seqLen - 1), False),
                      ("frame4", (0, seqLen),     True),
                      ("frame5", (1, seqLen - 2), True),
                      ("frame6", (2, seqLen - 1), True)]

            for name, (start, end), revComp in target:
                subSeq = seq_record.seq[start:end]
                if revComp:
                    subSeq = subSeq.reverse_complement()
                product = subSeq.translate(table=table)
                # The fragment index counts every stop-delimited piece,
                # including the ones that are too short to be written.
                for i, subPro in enumerate(product.split('*')):
                    if len(subPro) >= minLength:
                        header = ">{0}|{1}.{2}".format(name, seq_record.id, i)
                        f.write(header + '\n')
                        f.write(str(subPro) + '\n')
    print("\tDONE: extraction from {0}/{1} CDSs".format(processedCDS, totalCDS))

In [3]:
# Input: FASTA file that main() parses record by record.
inFilepath="GCF_000265365.1_ASM26536v1_cds_from_genomic.fna"
# Output: receives the translated fragments written by main(); the file is
# overwritten on every run. NOTE(review): the content is protein sequence
# although the name ends in .fna -- confirm the extension is intended.
outFilepath="query.fna"
# Both path variables stay in the notebook namespace and are read again by later cells.
main(inFilepath,outFilepath)

	DONE: extraction from 3164/3203 CDSs


In [None]:
# Collect, for each of the five alternative reading frames, the length of
# every stop-delimited translation fragment (no minimum-length filter).
# lengthDist[k] holds the lengths for frame k+2, i.e. frames 2..6.
#
# This cell only reads inFilepath. It must not open outFilepath: opening it
# in 'w' mode without writing anything would truncate the file produced by
# main().
lengthDist=[[] for _ in range(5)]

totalCDS=0
processedCDS=0
for seq_record in SeqIO.parse(inFilepath, "fasta"):
    totalCDS+=1
    if not is_typical(seq_record):
        continue
    processedCDS+=1

    seqLen=len(seq_record)
    # (frame number, (start, end) slice, reverse-complement first?)
    target=[(2,(1,seqLen-2),False),
            (3,(2,seqLen-1),False),
            (4,(0,seqLen),  True),
            (5,(1,seqLen-2),True),
            (6,(2,seqLen-1),True)]

    for frameNum, (start,end), revComp in target:
        subSeq=seq_record.seq[start:end]
        if revComp:
            subSeq=subSeq.reverse_complement()
        product=subSeq.translate(table=11)
        # Frame numbers start at 2, list indices at 0.
        lengthDist[frameNum-2].extend(len(subPro) for subPro in product.split('*'))

print("\tDONE: extraction from {0}/{1} CDSs".format(processedCDS, totalCDS))

In [None]:
# Print how many fragment lengths were collected for each frame, one per line.
for fragmentCount in map(len, lengthDist):
    print(fragmentCount)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# One histogram per reading frame; lengthDist[k] belongs to frame k+2.
for frameNum, dist in enumerate(lengthDist, start=2):
    fig, ax = plt.subplots()
    # sns.distplot is deprecated; histplot is its replacement for a plain
    # count histogram (no KDE is drawn by default).
    sns.histplot(dist, bins=50, ax=ax)
    ax.set(title="Frame {0}".format(frameNum),
           xlabel="fragment length (residues)",
           ylabel="count")
    plt.show()

In [None]:
# NOTE(review): appends the value 2 to the first frame's list of collected
# lengths, so the data no longer matches what was computed from the input,
# and each re-run of this cell adds one more entry. Looks like a leftover
# scratch experiment -- confirm and remove if it is not needed.
lengthDist[0].append(2)

In [None]:
# Show only the first ten lengths of each frame; echoing the complete nested
# list would flood the cell output with every collected value.
[frameLengths[:10] for frameLengths in lengthDist]