In [1]:
import os
import urllib
import urllib.request
from io import StringIO
from os import path
from os.path import isfile, join, dirname, isdir

from Bio import Entrez

In [2]:
#helper function for preparing directories. do not change this.

def make_dir(*argv):
    """Join the given path components and make sure that directory exists.

    Args:
        *argv: path components, passed straight to ``os.path.join``.

    Returns:
        The joined path as a string.

    Missing parent directories are created as well, so a nested target such
    as ``make_dir('genomes', genus)`` works on a fresh checkout where
    ``genomes`` does not exist yet. If the path already exists, nothing is
    created and the path is returned unchanged.
    """
    mydir = path.join(*argv)
    if not path.exists(mydir):
        # makedirs (unlike mkdir) also creates intermediate directories;
        # exist_ok guards against the directory appearing between the
        # existence check and the creation.
        os.makedirs(mydir, exist_ok=True)
    return mydir


In [11]:
# Query sent to the NCBI assembly database -- edit it to target another organism.
ncbi_search_term = "magnaporthe oryzae[orgn]"  # change this

# Everything before the first space of the query, e.g. 'magnaporthe'.
genus = ncbi_search_term.partition(' ')[0]

In [12]:
#search for assemblies
# NCBI asks every Entrez client to identify itself with a contact e-mail.
Entrez.email = "boxu@berkeley.edu"
# Ask for at most 300 assembly UIDs matching the search term, ordered by significance.
handle = Entrez.esearch(db="assembly", retmax=300, term=ncbi_search_term, sort='significance', report='full')
try:
    record = Entrez.read(handle)
finally:
    # Release the underlying network connection even if parsing fails.
    handle.close()

In [13]:
# Show the assembly UIDs returned by the search above.
print(record['IdList'])

['388878', '7050461', '2344491', '2561261', '2958', '280088', '8864911', '8631421', '8864891', '8631411', '6469541', '8864981', '8631401', '8864941', '2193431', '1795381', '8631441', '8864971', '8631371', '8865001', '8631381', '8864901', '8631391', '6469551', '5481461', '6902891', '6469561', '6469531', '5481451', '2193401', '1814071', '5481471', '2193451', '1795361', '8864921', '8631431', '2193391', '1789051', '2193441', '1795371', '1232641', '8864931', '8631451', '2193411', '1789091', '1232651', '2193421', '1789081', '5481441', '2193461', '1789061', '6902881', '1152771', '67731', '1649761', '81051', '1649751', '1649781', '1152841', '1152821', '1152801', '1649771', '1650321', '1650601', '1649801', '1650181', '1650591', '1650641', '1650621', '1558631', '1649811', '1650211', '1650191', '1650371', '1650711', '1650681', '236561', '1650381', '1650391', '1650341', '1650091', '1650331', '1650201', '1152831', '1650171', '1650761', '1232751', '1650361', '1650251', '1650561', '1650281', '1649831

In [None]:
#loop to download genomes. might take awhile

downloaded = []
yet_to_download = list(record['IdList'])

genomes_dir = make_dir('genomes', genus)

for uid in list(yet_to_download):
    handle = Entrez.esummary(db="assembly", id=uid, report='full')
    summary = Entrez.read(handle)
    url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_GenBank']
    if url =='':
        print('no Genbank ftp for uid ', uid)
        url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']
        if url =='':
            print('no refseq ftp for uid', uid, "... skipping")
            print('skipping')
            continue        
    label = os.path.basename(url)
    link = join(url,label+'_genomic.fna.gz')
    urllib.request.urlretrieve(link, join(genomes_dir, f'{label}.fna.gz'))
    downloaded.append(uid)
    yet_to_download.remove(uid)

#unzip all downloaded genomes
!gunzip $genomes_dir/*

In [71]:
##get reference proteome

proteomes_dir = make_dir('proteomes/', genus)

Entrez.email = "boxu@berkeley.edu"
handle = Entrez.esearch(db="genome", retmax=10, term=ncbi_search_term, sort='significance', report='full')
record = Entrez.read(handle)

handle = Entrez.esummary(db="genome", id=record['IdList'][0], report='full')
summary = Entrez.read(handle)

handle = Entrez.esummary(db="assembly", id=summary[0]['AssemblyID'], report='full')
summary = Entrez.read(handle)

url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']

label = os.path.basename(url)
link = join(url,label+'_protein.faa.gz')
urllib.request.urlretrieve(link, join(proteomes_dir, f'{label}_protein.faa.gz'))

!gunzip $proteomes_dir/*