In [1]:
import sys
import pandas as pd
import random
from Bio import SeqIO

In [12]:
def create_query(inFilepath, outFilepath, addDecoy=False, seed=None):
    """Translate the alternative reading frames of each CDS into a query file.

    Every record of the FASTA file ``inFilepath`` whose length is at least 6
    and a multiple of 3 is translated with translation table 11 in five
    frames, numbered 2 to 6 (frame 1, the unshifted forward frame, is not
    emitted). Stop symbols ('*') are written as 'X'. Records that fail the
    length test are counted but skipped.

    The output file holds two lines per entry (header, then sequence):

    * ``>F<frame>|<record description>`` for every translated frame;
    * when ``addDecoy`` is true, after all query entries:
      ``>DRF<frame>|<record id>`` with the query sequence reversed, followed
      by ``>DSF<frame>|<record id>`` with a random permutation of the query
      sequence.

    Parameters
    ----------
    inFilepath : str
        Path of the input FASTA file, read with ``SeqIO.parse``.
    outFilepath : str
        Path of the output file; it is opened in 'w' mode, so an existing
        file is overwritten.
    addDecoy : bool, optional
        Whether to append the reversed and shuffled decoy entries.
    seed : optional
        When not None, the shuffled decoys are drawn from a private
        ``random.Random(seed)`` so that the output is reproducible. When None
        (the default) the module-level ``random`` generator is used.

    Returns
    -------
    None
        The function prints the two paths and the number of processed records
        out of the total number of records.
    """
    def is_typical(seq_record):
        # At least one codon must remain after trimming a shifted frame, and
        # the record must consist of whole codons.
        return len(seq_record) >= 6 and len(seq_record) % 3 == 0

    # A dedicated generator keeps seeded runs reproducible without touching
    # the global random state; without a seed the global generator is used.
    rng = random if seed is None else random.Random(seed)

    print("\tIN  :{}".format(inFilepath))
    print("\tOUT :{}".format(outFilepath))

    query_lst = []
    decoy_lst = []

    totalCDS = 0
    processedCDS = 0
    for seq_record in SeqIO.parse(inFilepath, "fasta"):
        totalCDS += 1
        if not is_typical(seq_record):
            continue
        processedCDS += 1

        seqLen = len(seq_record)
        # Frame Number, (start, end), is reverse complement.
        # Every slice keeps a multiple of three bases. The reverse frames are
        # sliced on the forward strand and then reverse-complemented, so
        # frame 5 equals revcomp(record)[2:seqLen-1] and frame 6 equals
        # revcomp(record)[1:seqLen-2].
        target = [(2, (1, seqLen - 2), False),
                  (3, (2, seqLen - 1), False),
                  (4, (0, seqLen), True),
                  (5, (1, seqLen - 2), True),
                  (6, (2, seqLen - 1), True)]

        description = seq_record.description.strip()
        recordId = seq_record.id.strip()

        for frameNum, (start, end), revComp in target:
            seq = seq_record.seq[start:end]
            if revComp:
                seq = seq.reverse_complement()

            queryHeader = ">F{0}|{1}".format(frameNum, description)
            querySeq = str(seq.translate(table=11)).replace('*', 'X')
            query_lst.append((queryHeader, querySeq))

            if addDecoy:
                # decoy reverse
                drHeader = ">DRF{0}|{1}".format(frameNum, recordId)
                decoy_lst.append((drHeader, querySeq[::-1]))

                # decoy shuffle: a permutation of the same residues
                dsHeader = ">DSF{0}|{1}".format(frameNum, recordId)
                dsSeq = ''.join(rng.sample(querySeq, len(querySeq)))
                decoy_lst.append((dsHeader, dsSeq))

    # output: all query entries first, then the decoy entries
    # (decoy_lst stays empty unless addDecoy is true)
    with open(outFilepath, 'w') as f:
        for entries in (query_lst, decoy_lst):
            for header, seq in entries:
                f.write(header + '\n')
                f.write(seq + '\n')

    print("\tDONE:extraction from {0}/{1} CDSs".format(processedCDS, totalCDS))

In [4]:
# Load the table of target genomes; its "ftp_basename" column is what the
# later cell uses to build the input/output file paths.
targetFilepath="../out/target_genus.list"
target_df=pd.read_csv(targetFilepath)
# Show the (rows, columns) shape, then the first rows as the cell output.
print(target_df.shape)
target_df.head()

(10, 15)


Unnamed: 0,taxid,kingdom,phylum,class,order,family,genus,species,count_real,count_sim,diff,ftp_basename,organism_name,genetic_code,G+C
0,525146,-1,1224,28221,213115,194924,872,876,0.851789,0.77774,0.074048,GCF_000022125.1_ASM2212v1,Desulfovibrio desulfuricans subsp. desulfurica...,11,0.580722
1,883,-1,1224,28221,213115,194924,872,881,1.619893,1.407806,0.212088,GCF_000021385.1_ASM2138v1,Desulfovibrio vulgaris str. 'Miyazaki F',11,0.67109
2,901,-1,1224,28221,213115,194924,872,901,1.387866,1.183206,0.204661,GCF_900116045.1_DESPIGER,Desulfovibrio piger,11,0.641799
3,526222,-1,1224,28221,213115,194924,872,880,0.594646,0.536652,0.057994,GCF_000023445.1_ASM2344v1,Desulfovibrio salexigens DSM 2638,11,0.470928
4,641491,-1,1224,28221,213115,194924,872,876,1.622397,1.369687,0.252709,GCF_000189295.2_ASM18929v2,Desulfovibrio desulfuricans ND132,11,0.652094


In [13]:
# Build a query file (with decoy entries) for the genomes listed in target_df.
for basename in target_df["ftp_basename"]:
    inFilepath="/data/mitsuki/data/refseq/cds_from_genomic/{}_cds_from_genomic.fna".format(basename)
    outFilepath="/data/mitsuki/out/altorf/evolve/query/{}.query".format(basename)
    create_query(inFilepath, outFilepath, addDecoy=True)
    # NOTE(review): this break ends the loop after the first basename, so only
    # one genome is processed — presumably a trial run; confirm before
    # removing it to process every row.
    break

	IN  :/data/mitsuki/data/refseq/cds_from_genomic/GCF_000022125.1_ASM2212v1_cds_from_genomic.fna
	OUT :/data/mitsuki/out/altorf/evolve/query/GCF_000022125.1_ASM2212v1.query
	DONE:extraction from 2429/2444 CDSs


In [7]:
# Print the first record of one input file to inspect the parsed fields.
inFilepath="/data/mitsuki/data/refseq/cds_from_genomic/GCF_000022125.1_ASM2212v1_cds_from_genomic.fna"
for seq_record in SeqIO.parse(inFilepath, "fasta"):
    print(seq_record)
    # Stop after the first record; seq_record stays bound after the loop.
    break

ID: lcl|NC_011883.1_cds_WP_012623653.1_1
Name: lcl|NC_011883.1_cds_WP_012623653.1_1
Description: lcl|NC_011883.1_cds_WP_012623653.1_1 [locus_tag=DDES_RS00005] [protein=hypothetical protein] [protein_id=WP_012623653.1] [location=complement(394..618)] [gbkey=CDS]
Number of features: 0
Seq('ATGATGTACTTTCAGCCACTTTCGGGCATTGCTCTGGTCGCCCAGAATACTCCG...TAA', SingleLetterAlphabet())


In [10]:
# Show the description of the record left bound by the loop in the cell above
# (this cell fails on a fresh kernel unless that cell has been run first).
seq_record.description

'lcl|NC_011883.1_cds_WP_012623653.1_1 [locus_tag=DDES_RS00005] [protein=hypothetical protein] [protein_id=WP_012623653.1] [location=complement(394..618)] [gbkey=CDS]'