In [4]:
# install biopython on Jupyter server.
import sys
!python -m pip install biopython



In [None]:
import time
from Bio import Entrez
from urllib.error import HTTPError

In [12]:
# Provide your user email.
Entrez.email = "lszeliga@ethz.ch"
# RECOMMENDED: apply for an API key from NCBI
# (https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/).
# With a valid API key 10 queries per second are allowed; without one ('None') only 3 per second.
# Entrez.api_key = None

# dbSNP supported query terms (https://www.ncbi.nlm.nih.gov/snp/docs/entrez_help/) can be built
# and tested online with the web query builder (https://www.ncbi.nlm.nih.gov/snp/advanced).
esearch_params = {
    "db": "snp",  # search dbSNP
    # records matching Y[Allele] whose clinical significance is pathogenic
    "term": 'Y[Allele] AND pathogenic[Clinical_Significance]',
    "usehistory": "y",  # cache the result on the server so it can be downloaded in batches
    "retmax": 20,  # return at most 20 RSIDs in the search response
}

# esearch handle
eShandle = Entrez.esearch(**esearch_params)


In [13]:
# Parse the esearch response into a dict-like result.
# The handle is closed afterwards (even if parsing fails) so the underlying
# connection is released instead of lingering for the life of the kernel.
try:
    eSresult = Entrez.read(eShandle)
finally:
    eShandle.close()

In [14]:
# Review the parsed search result, one "key : value" line per field.
for field, value in eSresult.items():
    print(field, ":", value)

# Output: the Web environment (&WebEnv) and query key (&query_key) parameters give the location
# on the Entrez history server of the list of UIDs matching the Entrez query.
# https://www.ncbi.nlm.nih.gov/books/NBK25500/#chapter1.Storing_Search_Results
    

Count : 14139
RetMax : 20
RetStart : 0
QueryKey : 1
WebEnv : MCID_6745a31fa4330cc02a08967c
IdList : ['2154580601', '2154533497', '2154257791', '2154190685', '2154178615', '2154175371', '2154157186', '2154147664', '2154135091', '2154133792', '2154115529', '2154072180', '2154035107', '2154018964', '2153999602', '2153960793', '2153960638', '2153953739', '2153939368', '2153935815']
TranslationSet : []
TranslationStack : [{'Term': 'Y[Allele]', 'Field': 'Allele', 'Count': '274073470', 'Explode': 'N'}, {'Term': 'pathogenic[Clinical_Significance]', 'Field': 'Clinical_Significance', 'Count': '130269', 'Explode': 'N'}, 'AND']
QueryTranslation : Y[Allele] AND pathogenic[Clinical_Significance]


In [15]:
# RSIDs listed in the search response, taken from the 'IdList' field.
rslist = eSresult["IdList"]

In [16]:
# Values needed to page through the result cached on the Entrez history server:
# the WebEnv session cookie and the QueryKey identify the stored result set.
webenv, query_key = eSresult["WebEnv"], eSresult["QueryKey"]
# 'Count' is converted to int so it can be used as the bound of the batch loop.
total_count = int(eSresult["Count"])
# Batch size for efetch: return 2 rs per batch (example).
retmax = 2

In [17]:
# Download the cached esearch result from the Entrez history server in batches
# of `retmax` records and print each batch's raw XML.
# Sample code adopted with modifications from
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139.
#
# Requires the imports cell (time, Entrez, HTTPError) to have been run first;
# otherwise the `except HTTPError` clause itself fails with a NameError.
# NOTE: the loop walks all `total_count` matches, not only the IDs listed in
# the esearch response.
MAX_ATTEMPTS = 3          # tries per batch before giving up
RETRY_DELAY_SECONDS = 15  # pause between tries after a server-side error

fetch_count = 0  # number of batches downloaded successfully
for start in range(0, total_count, retmax):
    end = min(total_count, start + retmax)
    print("Going to download record %i to %i" % (start + 1, end))

    # Reset per batch so a handle left over from the previous batch can never
    # be mistaken for this batch's response.
    fetch_handle = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            fetch_handle = Entrez.efetch(db="snp",
                                         # rettype="uilist",  # available types: uilist | xml (use retmode=xml)
                                         retmode="xml",
                                         retstart=start,
                                         retmax=retmax,
                                         webenv=webenv,
                                         query_key=query_key)
            # Success: stop retrying. Without this the same batch would be
            # requested MAX_ATTEMPTS times and the earlier handles leaked.
            break
        except HTTPError as err:
            # Only server-side (5xx) errors are retried; anything else, or a
            # 5xx on the final attempt, is propagated to the caller.
            if not 500 <= err.code <= 599 or attempt == MAX_ATTEMPTS:
                raise
            print("Received error from server %s" % err)
            print("Attempt %i of %i" % (attempt, MAX_ATTEMPTS))
            time.sleep(RETRY_DELAY_SECONDS)

    # Always release the connection, even if reading the response fails.
    try:
        data = fetch_handle.read()
    finally:
        fetch_handle.close()
    fetch_count += 1
    print(data)



Going to download record 1 to 2
b'<?xml version="1.0" ?>\n<ExchangeSet xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xmlns="https://www.ncbi.nlm.nih.gov/SNP/docsum" xsi:schemaLocation="https://www.ncbi.nlm.nih.gov/SNP/docsum ftp://ftp.ncbi.nlm.nih.gov/snp/specs/docsum_eutils.xsd" ><DocumentSummary uid="2154580601"><SNP_ID>2154580601</SNP_ID><ALLELE_ORIGIN/><GLOBAL_MAFS/><GLOBAL_POPULATION/><GLOBAL_SAMPLESIZE>0</GLOBAL_SAMPLESIZE><SUSPECTED/><CLINICAL_SIGNIFICANCE>pathogenic</CLINICAL_SIGNIFICANCE><GENES><GENE_E><NAME>PCDHGC3</NAME><GENE_ID>5098</GENE_ID></GENE_E><GENE_E><NAME>PCDHGB4</NAME><GENE_ID>8641</GENE_ID></GENE_E><GENE_E><NAME>PCDHGA8</NAME><GENE_ID>9708</GENE_ID></GENE_E><GENE_E><NAME>PCDHGA12</NAME><GENE_ID>26025</GENE_ID></GENE_E><GENE_E><NAME>PCDHGC4</NAME><GENE_ID>56098</GENE_ID></GENE_E><GENE_E><NAME>PCDHGB7</NAME><GENE_ID>56099</GENE_ID></GENE_E><GENE_E><NAME>PCDHGB6</NAME><GENE_ID>56100</GENE_ID></GENE_E><GENE_E><NAME>PCDHGB5</NAME><GENE_ID>56101</GENE_ID></GEN

NameError: name 'HTTPError' is not defined