-
Notifications
You must be signed in to change notification settings - Fork 0
/
genomefetcher.py
61 lines (52 loc) · 2.49 KB
/
genomefetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from Bio import Entrez
import os
# Contact e-mail that Bio.Entrez sends along with its NCBI E-utilities requests.
Entrez.email = "jba9@leicester.ac.uk"
def main(organism='Campylobacter fetus', intoDir="../PolyG/coli_fetus", completeOnly=True):
    """Search NCBI Assembly for an organism and download its linked records.

    Runs the three pipeline steps in order: search the assembly database,
    resolve the hits to nucleotide links, then download every linked record.

    Called without arguments it performs the run that used to be hard-coded
    here ('Campylobacter fetus', complete genomes only, written to
    "../PolyG/coli_fetus").

    Args:
        organism: Organism name handed to searchAssemblies.
        intoDir: Directory handed to downloadLinks as the download target.
        completeOnly: Forwarded to searchAssemblies as its completeOnly flag.
    """
    # Alternative search kept from the original script:
    # search = searchAssembliesSpecial("Neisseria lactamica", ["meningitidis", "gonorrhoeae"])
    search = searchAssemblies(organism, completeOnly)
    links = identifyLinks(search)
    downloadLinks(intoDir, links)
def searchAssemblies(organism, completeOnly=True):
    """Search the NCBI "assembly" database for assemblies of one organism.

    Args:
        organism: Organism name; it is double-quoted and tagged with
            [Organism] in the query.
        completeOnly: When true, the query additionally requires
            "complete genome"[Assembly Level].

    Returns:
        The parsed esearch result produced by Entrez.read. Other functions in
        this file read its 'IdList' entry. The request asks for at most 250
        ids (retmax).
    """
    searchText = '("' + organism + '"[Organism])'
    if completeOnly:
        # The leading space keeps the AND operator separated from the closing
        # parenthesis of the organism clause.
        searchText += ' AND ("complete genome"[Assembly Level])'
    searchHandle = Entrez.esearch(db="assembly", retmax=250, term=searchText)
    try:
        return Entrez.read(searchHandle)
    finally:
        # Release the response handle whether or not parsing succeeded.
        searchHandle.close()
def searchAssembliesSpecial(organism, completeFor):
    """Search the NCBI "assembly" database for scaffold-level assemblies.

    Args:
        organism: Organism name; it is double-quoted and tagged with
            [Organism] in the query.
        completeFor: Currently unused. It is only referenced by the disabled
            query kept in the comment below.

    Returns:
        The parsed esearch result produced by Entrez.read. The request asks
        for a single id only (retmax=1).
    """
    # Disabled earlier query, kept for reference. It matched complete genomes
    # of the organism, OR assemblies of the organism excluding every
    # '<organism> <x>' for x in completeFor:
    # searchText = '("'+ organism + '"[Organism] AND "complete genome"[Assembly Level])'
    # searchText += ' OR ("' + organism + '"[Organism] NOT '+' NOT '.join('"'+organism+' '+x+'"[Organism]' for x in completeFor)+')'
    searchText = '("' + organism + '"[Organism] AND "scaffold"[Assembly Level])'
    # A disabled variant of this call used retmax = 250 instead of 1.
    searchHandle = Entrez.esearch(db="assembly", retmax=1, term=searchText)
    try:
        return Entrez.read(searchHandle)
    finally:
        # Release the response handle whether or not parsing succeeded.
        searchHandle.close()
def identifyLinks(searchRecord):
    """Resolve the assembly ids of a search result to nucleotide links.

    Args:
        searchRecord: Parsed search result; only its 'IdList' entry is read.

    Returns:
        The parsed elink result (assembly -> nucleotide) produced by
        Entrez.read, or an empty list when 'IdList' is empty.
    """
    assemblyIds = searchRecord['IdList']
    if not assemblyIds:
        # Nothing to link: skip the request rather than call elink without ids.
        return []
    linkHandle = Entrez.elink(dbfrom="assembly", db="nucleotide", id=assemblyIds)
    try:
        return Entrez.read(linkHandle)
    finally:
        # Release the response handle whether or not parsing succeeded.
        linkHandle.close()
def downloadLinks(intoDir, linkRecord):
    """Download the INSDC nucleotide records linked to each assembly.

    For every entry of linkRecord the link set named
    'assembly_nuccore_insdc' is looked up. Each nucleotide id in it is
    fetched from the "nuccore" database in GenBank text format and written to
    ``<intoDir>/<assembly id>-<nucleotide id>.gb``. Entries without such a
    link set are reported on stdout and skipped.

    Args:
        intoDir: Output directory; created (including parents) if missing.
        linkRecord: Iterable of mappings, each with an 'IdList' entry (the
            first element is used as the assembly id) and a 'LinkSetDb'
            entry holding link sets with 'LinkName' and 'Link' keys.
    """
    # Create the directory; exist_ok avoids the check-then-create race.
    os.makedirs(intoDir, exist_ok=True)

    # Create the download list as (assembly id, nucleotide id) pairs.
    downloads = []
    for link in linkRecord:
        assemblyId = link['IdList'][0]
        for linkSet in link['LinkSetDb']:
            if linkSet['LinkName'] == 'assembly_nuccore_insdc':
                downloads.extend(
                    (assemblyId, entry['Id']) for entry in linkSet['Link']
                )
                break
        else:
            # for/else: only reached when no link set matched (no break).
            print("Error: no INSDC assembly found for ", assemblyId)

    total = len(downloads)
    for position, (assemblyId, nucleotideId) in enumerate(downloads, start=1):
        label = assemblyId + '-' + nucleotideId
        print('Downloading ' + str(position) + '/' + str(total) + ' ' + label)
        fetchHandle = Entrez.efetch(db="nuccore", id=nucleotideId, rettype="gb", retmode="text")
        try:
            record = fetchHandle.read()
        finally:
            # Always release the response handle, even if reading fails.
            fetchHandle.close()
        # The file is opened only after the fetch succeeded, so a failed
        # download does not leave an empty .gb file behind.
        with open(os.path.join(intoDir, label + ".gb"), "w") as gbOut:
            gbOut.write(record)
# Script entry point: main() runs only when this file is executed directly.
# NOTE(review): the earlier comment here read "Remove auto-run" - presumably a
# reminder to drop this guard at some point; confirm before removing.
if __name__ == "__main__":
    main()