# Extract DNA sequences of CDSs and their translations

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm.notebook import tqdm
from pyscripts.config import path2
from pyscripts.datasets import Metadata, DatasetLoader
# Shared project helpers used by the rest of this notebook:
#   metadata - its `acc['refseq']` entry is iterated below to get genome accessions
#   dloader  - its `load_genome(gcf)` method yields the annotated records of a genome
metadata = Metadata()
dloader  = DatasetLoader()

In [2]:
def extract_cds(gcf):
    """Collect the CDS nucleotide sequences of a genome and their translations.

    Every record returned by ``dloader.load_genome(gcf)`` is scanned for
    features of type ``'CDS'``. A CDS is kept only if it carries a
    ``translation`` qualifier; CDSs without one are skipped.

    Args:
        gcf: Genome identifier handed to ``dloader.load_genome`` (the caller
            passes entries of ``metadata.acc['refseq']``).

    Returns:
        A ``(nucl, prot)`` tuple of two lists of ``SeqRecord`` in matching
        order. ``nucl[i]`` holds the DNA extracted from the parent record at
        the CDS location; ``prot[i]`` holds the first ``translation`` value of
        the same CDS. Both use the CDS's first ``locus_tag`` as id, name and
        description.

    Raises:
        KeyError: If a translated CDS has no ``locus_tag`` qualifier.
    """
    nucl_records = []
    prot_records = []
    for record in dloader.load_genome(gcf):
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            translation = feature.qualifiers.get('translation')
            if translation is None:
                continue
            # Qualifier values are indexed with [0], i.e. treated as lists.
            locus_tag = feature.qualifiers['locus_tag'][0]
            # NOTE(review): description is presumably set to the tag so the
            # FASTA header does not get Biopython's default placeholder
            # description -- confirm.
            dna = feature.location.extract(record.seq)
            nucl_records.append(
                SeqRecord(dna, id=locus_tag, name=locus_tag, description=locus_tag)
            )
            protein = Seq(translation[0])
            prot_records.append(
                SeqRecord(protein, id=locus_tag, name=locus_tag, description=locus_tag)
            )
    return nucl_records, prot_records

# For every accession in metadata.acc['refseq'], write two FASTA files:
# the CDS nucleotide sequences (.fna) and their translations (.faa).
# The 'cds_nucl' and 'cds_prot' directories under path2.data are not
# created here, so they are assumed to exist already.
for gcf in tqdm(metadata.acc['refseq']):
    nucl, prot = extract_cds(gcf)

    nucl_fasta = path2.data/'cds_nucl'/f'{gcf}.fna'
    SeqIO.write(nucl, nucl_fasta, 'fasta')

    prot_fasta = path2.data/'cds_prot'/f'{gcf}.faa'
    SeqIO.write(prot, prot_fasta, 'fasta')
    

  0%|          | 0/2624 [00:00<?, ?it/s]