In [1]:
# https://biopython.org/DIST/docs/api/Bio.Entrez-module.html
# https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc109
from Bio import Entrez
from Bio import SeqIO

import os

In [2]:
# NCBI asks every E-utilities client to identify itself with an e-mail address.
# The address can be overridden through the NCBI_EMAIL environment variable.
Entrez.email = os.environ.get('NCBI_EMAIL', 'Olga_Petrova2@epam.com')
# The API key is a secret and must not be stored in the notebook: it is read
# from the NCBI_API_KEY environment variable. When the variable is unset the
# key stays None and requests are sent without a key.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and regenerated in the NCBI account settings.
Entrez.api_key = os.environ.get('NCBI_API_KEY')

In [3]:
# Root output directory (relative to the notebook); every path written or read
# below is built by string concatenation on top of it, so it must end with '/'.
folder = '../data/ncbi_16S_genes/'

## Getting data from NCBI

## First 4 taxa

In [4]:
# Names of the four taxa; each one becomes '<folder><name>.fasta' and a
# '<folder><name>/' directory further down.
taxon_names = [
    'Holospora',
    'Gortzia',
    'Finniella',
    'Paracaedibacter',
]

# Organisms searched for each taxon, in the same order as `taxon_names`.
subtaxs = [
    [
        'Candidatus Holospora parva',
        'Holospora acuminata',
        'Holospora caryophila',
        'Holospora curviuscula',
        'Holospora elegans',
        'Holospora obtusa',
    ],
    [
        'Candidatus Gortzia shahrazadis',
        'Candidatus Gortzia infectiva',
    ],
    [
        'Candidatus Finniella inopinata',
        'Candidatus Finniella lucida',
    ],
    [
        'Candidatus Paracaedibacter acanthamoebae',
        'Candidatus Paracaedibacter symbiosus',
    ],
]

# List of (taxon name, list of organisms) pairs.
taxons = list(zip(taxon_names, subtaxs))

In [5]:
# For every taxon: look up each organism in the NCBI taxonomy database, collect
# the nucleotide records whose title mentions 16S, and save all of them as one
# FASTA file '<folder><taxon name>.fasta'.
for taxon_name, organisms in taxons:
    ids = []

    for org in organisms:
        # Resolve the organism name to a taxonomy id.
        handle = Entrez.esearch(db='taxonomy', term=org, idtype='acc', retmax=100)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails.
            handle.close()

        if not record['IdList']:
            # Nothing to build a 'txid...' query from; indexing [0] would raise.
            print('Warning: no taxonomy entry found for ' + repr(org) + ', skipping')
            continue

        item = 'txid' + record['IdList'][0] + '[Organism]' + ' 16S[Title]'
        handle = Entrez.esearch(db='nucleotide', term=item, idtype='acc', retmax=10000)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        # Keep a flat list of ids: joining per-organism strings would leave an
        # empty entry in the final id list whenever an organism has no hits.
        ids.extend(record['IdList'])

    title = folder + taxon_name + '.fasta'
    with open(title, 'w') as file:
        if ids:
            handle = Entrez.efetch(db='nucleotide', id=','.join(ids),
                                   rettype='fasta', retmode='text')
            try:
                file.write(handle.read())
            finally:
                handle.close()
        else:
            # The (empty) file is still created so that the splitting cell,
            # which opens it for every taxon, keeps working.
            print('Warning: no 16S records found for ' + taxon_name)

In [6]:
# Create one output directory per taxon. exist_ok=True keeps the cell
# re-runnable: a plain os.mkdir raises FileExistsError on the second run.
for taxon_name in taxon_names:
    os.makedirs(folder + taxon_name, exist_ok=True)

In [7]:
# Split every '<folder><taxon>.fasta' into one FASTA file per record, written
# to '<folder><taxon>/<record id>.fasta'. Records whose description contains
# one of the markers below (case-insensitive) are left out.
excluded_markers = ('uncultured', ' sp.', 'environmental')

for taxon_name in taxon_names:
    combined_fasta = folder + taxon_name + '.fasta'

    for gene_record in SeqIO.parse(combined_fasta, 'fasta'):
        description = gene_record.description.lower()
        if any(marker in description for marker in excluded_markers):
            continue

        new_path = folder + taxon_name + '/' + gene_record.id + '.fasta'
        with open(new_path, 'w') as out_file:
            SeqIO.write(gene_record, out_file, 'fasta')

## Last 2 taxa

In [8]:
def retrieve_complex_taxon(taxon, subtaxons):
    """Download the 16S nucleotide records of every sub-taxon of `taxon`.

    For each name in `subtaxons` the NCBI taxonomy database is searched, the
    first hit is used to build a 'txid<ID>[Organism] 16S[Title]' query against
    the nucleotide database, and the matching records are saved in FASTA
    format to '<folder><taxon>/<sub_taxon>/<sub_taxon>.fasta', where spaces in
    the sub-taxon name are replaced by underscores.

    Parameters
    ----------
    taxon : str
        Name of the parent taxon; used as the name of the output directory.
    subtaxons : iterable of str
        Names of the sub-taxa to search for.

    Notes
    -----
    Directories are created with exist_ok=True, so the function can be re-run.
    A sub-taxon with no taxonomy hit or no nucleotide hits produces a warning
    and an empty FASTA file instead of an exception, so a later splitting step
    that opens one file per sub-taxon still finds it.
    """
    # The E-utilities documentation caps a single ESearch/EFetch request at
    # 10,000 records, so the search is paginated and the fetch is batched.
    search_page_size = 10000
    fetch_batch_size = 500

    os.makedirs(folder + taxon, exist_ok=True)

    for subtax in subtaxons:
        handle = Entrez.esearch(db='taxonomy', term=subtax, idtype='acc', retmax=100)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails.
            handle.close()

        ids = []
        if record['IdList']:
            item = 'txid' + record['IdList'][0] + '[Organism]' + ' 16S[Title]'

            # Page through the result set until every matching id is collected.
            while True:
                handle = Entrez.esearch(db='nucleotide', term=item, idtype='acc',
                                        retstart=len(ids), retmax=search_page_size)
                try:
                    record = Entrez.read(handle)
                finally:
                    handle.close()

                ids.extend(record['IdList'])
                if not record['IdList'] or len(ids) >= int(record['Count']):
                    break
        else:
            print('Warning: no taxonomy entry found for ' + repr(subtax))

        if not ids:
            print('Warning: no 16S records found for ' + repr(subtax))

        subtax = subtax.replace(' ', '_')

        subfolder = folder + taxon + '/' + subtax
        os.makedirs(subfolder, exist_ok=True)
        title = subfolder + '/' + subtax + '.fasta'

        with open(title, 'w') as file:
            for start in range(0, len(ids), fetch_batch_size):
                batch = ','.join(ids[start:start + fetch_batch_size])
                handle = Entrez.efetch(db='nucleotide', id=batch,
                                       rettype='fasta', retmode='text')
                try:
                    chunk = handle.read()
                finally:
                    handle.close()

                file.write(chunk)
                # Keep the next batch's first header on its own line.
                if chunk and not chunk.endswith('\n'):
                    file.write('\n')

In [9]:
def split_genes_16S(taxon, subtaxons):
    """Split the combined FASTA file of each sub-taxon into per-record files.

    For every name in `subtaxons` (spaces replaced by underscores) the file
    '<folder><taxon>/<sub_taxon>/<sub_taxon>.fasta' is parsed and each kept
    record is written to '<folder><taxon>/<sub_taxon>/<record id>.fasta'.
    Records whose description contains 'uncultured', ' sp.' or 'environmental'
    (case-insensitive) are skipped.
    """
    excluded_markers = ('uncultured', ' sp.', 'environmental')

    for name in subtaxons:
        dir_name = name.replace(' ', '_')
        subfolder = folder + taxon + '/' + dir_name + '/'

        for gene_record in SeqIO.parse(subfolder + dir_name + '.fasta', 'fasta'):
            description = gene_record.description.lower()
            if any(marker in description for marker in excluded_markers):
                continue

            with open(subfolder + gene_record.id + '.fasta', 'w') as out_file:
                SeqIO.write(gene_record, out_file, 'fasta')

### taxon Rickettsiales

In [10]:
# Sub-taxa searched for the taxon 'Rickettsiales'.
taxons_rick = [
    'Anaplasmataceae',
    'Candidatus Midichloriaceae',
    'Rickettsiaceae',
    'unclassified Rickettsiales',
    'Rickettsiales genera incertae sedis',
]

In [11]:
# Download one combined FASTA file per Rickettsiales sub-taxon, then split each
# of them into per-record files. The order matters: the split step reads the
# files written by the download step.
retrieve_complex_taxon('Rickettsiales', taxons_rick)
split_genes_16S('Rickettsiales', taxons_rick)

### taxon Rhizobiales

In [12]:
# Sub-taxa searched for the taxon 'Rhizobiales'.
taxons_rhiz = [
    'Ancalomicrobiaceae',
    'Aurantimonadaceae',
    'Bartonellaceae',
    'Beijerinckiaceae',
    'Bradyrhizobiaceae',
    'Brucellaceae',
    'Chelatococcaceae',
    'Cohaesibacteraceae',
    'Hyphomicrobiaceae',
    'Mabikibacteraceae',
    'Methylobacteriaceae',
    'Methylocystaceae',
    'Notoacmeibacteraceae',
    'Phyllobacteriaceae',
    'Rhizobiaceae',
    'Rhodobiaceae',
    'Roseiarcaceae',
    'Salinarimonadaceae',
    'Xanthobacteraceae',
    'unclassified Rhizobiales',
]

In [None]:
# Download one combined FASTA file per Rhizobiales sub-taxon, then split each
# of them into per-record files. The order matters: the split step reads the
# files written by the download step.
retrieve_complex_taxon('Rhizobiales', taxons_rhiz)
split_genes_16S('Rhizobiales', taxons_rhiz)