# Extract DNA sequences of 16S rRNAs

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import ExactPosition
from Bio.SeqRecord import SeqRecord
from tqdm.notebook import tqdm
from pyscripts.config import path2
from pyscripts.datasets import Metadata, DatasetLoader
# Project helpers shared by the cells below:
#   * `metadata.acc['refseq']` is iterated to obtain the genome identifiers
#     to process;
#   * `dloader.load_genome(...)` is iterated to obtain the records (with
#     `.features` and `.seq`) of a single genome.
metadata = Metadata()
dloader  = DatasetLoader()

In [2]:
def extract_16SrRNA(gcf):
    """Collect the complete 16S rRNA sequences annotated in one genome.

    Every record yielded by ``dloader.load_genome(gcf)`` is scanned for
    features of type ``'rRNA'`` whose first ``product`` qualifier contains
    ``'16S'``. Features whose start or end is not an ``ExactPosition``
    (partial annotations such as ``[<0:1434](-)``) are printed to stdout and
    left out of the result.

    Args:
        gcf: Genome identifier handed unchanged to ``dloader.load_genome``
            (presumably a RefSeq assembly accession -- confirm against
            ``Metadata``).

    Returns:
        A list of ``SeqRecord`` objects, one per complete 16S rRNA feature,
        in annotation order. ``id``, ``name`` and ``description`` are all set
        to the feature's locus tag; when a feature carries no ``locus_tag``
        qualifier, ``<record id>_<start>_<end>`` is used instead. The list is
        empty when no matching feature exists.
    """
    rrs = []
    for rec in dloader.load_genome(gcf):
        for feat in rec.features:
            if feat.type != 'rRNA':
                continue

            # There are some orthographic variants in the 'product' section to describe 16S rRNAs
            # e.g. '16S Ribosomal RNA', '16S ribosomal RNA', 'ribosomal RNA-16S',
            # 'Small Subunit Ribosomal RNA; ssuRNA; 16S ribosomal RNA'
            # A feature without any 'product' qualifier cannot be identified
            # as 16S, so it is skipped instead of raising KeyError.
            products = feat.qualifiers.get('product') or ['']
            if '16S' not in products[0]:
                continue

            loc = feat.location
            # Exact type comparison is intentional: isinstance() would also
            # accept subclasses of ExactPosition and thereby change which
            # features are kept.
            if type(loc.start) is not ExactPosition or type(loc.end) is not ExactPosition:
                print('the following partial record was removed')
                print(feat)
                continue

            # Fall back to a coordinate-based identifier so that one feature
            # lacking a locus tag does not abort the whole run.
            locus_tags = feat.qualifiers.get('locus_tag')
            if locus_tags:
                tag = locus_tags[0]
            else:
                tag = f'{rec.id}_{int(loc.start)}_{int(loc.end)}'

            rrs.append(SeqRecord(loc.extract(rec.seq), id=tag, name=tag, description=tag))
    return rrs

# Write one FASTA file per genome, named '<gcf>.fna', into the '16S'
# sub-directory of path2.data. The list returned by extract_16SrRNA is passed
# to SeqIO.write as-is, including when it is empty.
out_dir = path2.data/'16S'
for gcf in tqdm(metadata.acc['refseq']):
    rrs = extract_16SrRNA(gcf)
    fasta_path = out_dir/f'{gcf}.fna'
    SeqIO.write(rrs, fasta_path, 'fasta')
    

  0%|          | 0/2624 [00:00<?, ?it/s]

the following partial record was removed
type: rRNA
location: [<0:1434](-)
qualifiers:
    Key: db_xref, Value: ['RFAM:RF00177']
    Key: inference, Value: ['COORDINATES: nucleotide motif:Rfam:12.0:RF00177', 'COORDINATES: profile:INFERNAL:1.1.1']
    Key: locus_tag, Value: ['BQ4451_RS00005']
    Key: note, Value: ['Derived by automated computational analysis using gene prediction method: cmsearch.']
    Key: product, Value: ['16S ribosomal RNA']

