In [4]:
import os
import urllib
import urllib.request
from io import StringIO
from os import path
from os.path import isfile, join, dirname, isdir

from Bio import Entrez

In [1]:
#helper function for preparing directories. do not change this.

def make_dir(*argv):
    """Join the given path components and make sure that directory exists.

    Parameters
    ----------
    *argv : str
        Path components, combined with ``os.path.join``.

    Returns
    -------
    str
        The joined path. Nothing is created if the path already exists.
    """
    mydir = path.join(*argv)
    if not path.exists(mydir):
        # makedirs also creates missing parent directories, so a call such as
        # make_dir('genomes', genus) works even when 'genomes' is not there
        # yet (os.mkdir would raise FileNotFoundError in that case).
        os.makedirs(mydir, exist_ok=True)
    return mydir


In [2]:
# Query string for the NCBI assembly database -- edit it to target
# a different organism.

ncbi_search_term = "magnaporthe oryzae[orgn]" #change this
# Everything before the first space of the query, e.g. 'magnaporthe'.
genus = ncbi_search_term.partition(' ')[0]

In [5]:
#search for assemblies
# Entrez.email identifies this client to NCBI's E-utilities.
Entrez.email = "boxu@berkeley.edu"
handle = Entrez.esearch(db="assembly", retmax=300, term=ncbi_search_term, sort='significance', report='full')
try:
    record = Entrez.read(handle)
finally:
    # Release the underlying HTTP connection even if parsing fails.
    handle.close()

# esearch returns at most `retmax` ids; make silent truncation visible.
# NOTE(review): relies on the esearch result exposing a 'Count' field -- confirm.
if int(record.get('Count', 0)) > len(record['IdList']):
    print('warning: search matched', record['Count'], 'assemblies but only',
          len(record['IdList']), 'ids were returned; raise retmax to get all of them')

In [8]:
# Number of assembly ids held in the search result.
len(record['IdList'])

253

In [8]:
#loop to download genomes. might take awhile

downloaded = []
yet_to_download = list(record['IdList'])

genomes_dir = make_dir('genomes', genus)

for uid in list(yet_to_download):
    handle = Entrez.esummary(db="assembly", id=uid, report='full')
    summary = Entrez.read(handle)
    url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_GenBank']
    if url =='':
        print('no Genbank ftp for uid ', uid)
        url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']
        if url =='':
            print('no refseq ftp for uid', uid, "... skipping")
            print('skipping')
            continue        
    label = os.path.basename(url)
    link = join(url,label+'_genomic.fna.gz')
    urllib.request.urlretrieve(link, join(genomes_dir, f'{label}.fna.gz'))
    downloaded.append(uid)
    yet_to_download.remove(uid)

#unzip all downloaded genomes
!gunzip $genomes_dir/*

no Genbank ftp for uid  2958
no refseq ftp for uid 2958 ... skipping
skipping
no Genbank ftp for uid  280088
no refseq ftp for uid 280088 ... skipping
skipping


KeyboardInterrupt: 

In [71]:
##get reference proteome

proteomes_dir = make_dir('proteomes/', genus)

Entrez.email = "boxu@berkeley.edu"
handle = Entrez.esearch(db="genome", retmax=10, term=ncbi_search_term, sort='significance', report='full')
record = Entrez.read(handle)

handle = Entrez.esummary(db="genome", id=record['IdList'][0], report='full')
summary = Entrez.read(handle)

handle = Entrez.esummary(db="assembly", id=summary[0]['AssemblyID'], report='full')
summary = Entrez.read(handle)

url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']

label = os.path.basename(url)
link = join(url,label+'_protein.faa.gz')
urllib.request.urlretrieve(link, join(proteomes_dir, f'{label}_protein.faa.gz'))

!gunzip $proteomes_dir/*