In [2]:
import urllib
from lxml import etree

# Auto ingestion of SRA metadata

1. Get a list of SRA UIDs to scrape
    1. get all publicly accessible SRA (SRX) accessions
    2. remove SRA UIDs that have already been ingested into MetaSeek DB (all 'db_source_uid' where 'db_source' = 'SRA')
    
2. Split UIDs to scrape into batches of 500

3. For each batch, scrape metadata:
    1. efetch scrape SRA
    2. elink to BioSample, Pubmed, Nuccore
    3. efetch scrape BioSample, if exists
    4. efetch scrape Pubmed, if exists
    5. efetch scrape nuccore, if exists
    
4. Merge redundant cols to MIxS-compliant MetaSeek fields (separate script)

5. Parse fields with controlled vocabularies to MIxS CV, if possible (NOTE: future error parsing)

6. Insert scraped data into MetaSeek DB directly

# 1.A - Get a list of SRA UIDs to scrape

* find out how many publicly available samples there are; make a list of retstarts to use
* Call esearch api, with rotating retstarts, to get full UID list

NOTE - the most comprehensive API call I can find for getting ALL the SRA samples (since an empty search term returns no accessions, not all) is term=public&field=ACS; it returns all publicly accessible SRA UIDs

In [3]:
def get_retstart_list(url, page_size=100000):
    """Build the list of ``retstart`` offsets needed to page through an esearch result.

    Fetches `url`, reads the ``Count`` element of the XML response and returns one
    offset per page of `page_size` UIDs. Multiple pages are needed because only
    100,000 UIDs can be retrieved per request.

    url       -- esearch URL whose XML response contains a <Count> element
    page_size -- number of UIDs requested per query; must match the ``retmax``
                 used when the UIDs are actually downloaded (default 100000)

    Returns a list of ints, e.g. [0, 100000, 200000, ...]; empty when Count is 0.
    Raises ValueError if the response has no <Count> element.
    """
    g = urllib.urlopen(url)
    try:
        count_tree = etree.parse(g)
    finally:
        # release the connection even when the response is not parseable XML
        g.close()
    num_uids = count_tree.getroot().findtext("Count")
    if num_uids is None:
        raise ValueError('no <Count> element in the response from %s' % url)
    print('number of publicly available UIDs in SRA: %s' % num_uids)
    # Ceiling division: exactly as many queries as there are (partial) pages, so a
    # count that is an exact multiple of page_size does not schedule an empty query.
    num_queries = (int(num_uids) + page_size - 1) // page_size
    retstart_list = [i*page_size for i in range(num_queries)]
    print('retstarts to use: %s' % (retstart_list,))
    return retstart_list

def get_uid_list(ret_list):
    """Download the SRA UIDs returned by the ``term=public&field=ACS`` esearch query.

    ret_list -- iterable of ``retstart`` offsets; one request with retmax=100000
                is issued per offset

    Returns a flat list with the text of every child of each response's <IdList>,
    in the order the responses were received.
    Raises ValueError if a response has no <IdList> element, so a failed page is
    reported with its offset instead of surfacing as an AttributeError on None.
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&tool=metaseq&email=metaseekcloud%40gmail.com&retmax=100000&retstart='
    uid_list = []
    for retstart in ret_list:
        f = urllib.urlopen(base_url+str(retstart))
        try:
            uid_tree = etree.parse(f)
        finally:
            # release the connection even when the response is not parseable XML
            f.close()
        id_list = uid_tree.getroot().find("IdList")
        if id_list is None:
            raise ValueError('no <IdList> element in the esearch response for retstart=%s' % retstart)
        print("appending %s accessions" % len(id_list.findall("Id")))
        #add uids to list of accessions
        for uid_node in id_list.iterchildren():
            uid_list.append(uid_node.text)
    return uid_list

In [4]:
# Ask esearch for the record count only (rettype=count) and derive the retstart offsets from it.
retstart_list = get_retstart_list(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&rettype=count&tool=metaseq&email=metaseekcloud%40gmail.com')

number of publicly available UIDs in SRA: 2694787
retstarts to use: [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000, 1300000, 1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000, 2100000, 2200000, 2300000, 2400000, 2500000, 2600000]


In [5]:
# One esearch request per retstart offset; the UIDs of all pages end up in a single flat list.
uid_list = get_uid_list(ret_list=retstart_list)

appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 94787 accessions


# TODO 1.B - Remove SRA UIDs already in MetaSeek DB

db_source_uid where db_source = 'SRA'

# 2 - Split UIDs to scrape into batches of 500

In [6]:
def get_batches(uid_list, batch_size=500):
    """Split the index range of `uid_list` into consecutive [start, end] slices.

    uid_list   -- any sized sequence; only its length is used
    batch_size -- maximum number of items per batch (default 500)

    Returns a list of two-element lists ``[start, end]`` such that
    ``uid_list[start:end]`` is one batch; the last batch may be shorter.
    An empty `uid_list` yields an empty list.
    Raises ValueError if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got %s' % batch_size)
    total = len(uid_list)
    # the end of each batch is clipped to the list length, so no separate
    # "append the final end" step is needed
    return [[start, min(start+batch_size, total)] for start in range(0, total, batch_size)]

In [7]:
# [start, end] index pairs into uid_list, one pair per batch
batches = get_batches(uid_list)

In [8]:
# sanity check: number of batches, then the first and the last ten index pairs
print(len(batches))
print(batches[:10])
print(batches[-10:])


5390
[[0, 500], [500, 1000], [1000, 1500], [1500, 2000], [2000, 2500], [2500, 3000], [3000, 3500], [3500, 4000], [4000, 4500], [4500, 5000]]
[[2690000, 2690500], [2690500, 2691000], [2691000, 2691500], [2691500, 2692000], [2692000, 2692500], [2692500, 2693000], [2693000, 2693500], [2693500, 2694000], [2694000, 2694500], [2694500, 2694787]]


# 3.A - Efetch scrape SRA metadata

In [68]:
def _find_path(node, *tags):
    """Follow a chain of child tags from `node`; return None as soon as a step is missing."""
    for tag in tags:
        if node is None:
            return None
        node = node.find(tag)
    return node


def _text_path(node, *tags):
    """Return findtext() of the last tag after walking the preceding ones, or None if any step is missing."""
    parent = _find_path(node, *tags[:-1])
    if parent is None:
        return None
    return parent.findtext(tags[-1])


def _store(record, key, value):
    """Set record[key] only when a value was actually found (value is not None)."""
    if value is not None:
        record[key] = value


def _attr_as(node, name, cast):
    """Return attribute `name` of `node` converted with `cast`, or None when node or attribute is absent."""
    if node is None:
        return None
    raw = node.get(name)
    if raw is None:
        return None
    return cast(raw)


def _parse_experiment(package, srx_dict):
    """Copy the EXPERIMENT fields (ids, title, design, library, platform) into srx_dict."""
    expt = package.find("EXPERIMENT")
    _store(srx_dict, 'expt_id', _text_path(expt, "IDENTIFIERS", "PRIMARY_ID"))
    _store(srx_dict, 'expt_title', _text_path(expt, "TITLE"))
    _store(srx_dict, 'project_id', _text_path(expt, "STUDY_REF", "IDENTIFIERS", "PRIMARY_ID"))
    _store(srx_dict, 'expt_design_description', _text_path(expt, "DESIGN", "DESIGN_DESCRIPTION"))
    _store(srx_dict, 'sample_id', _text_path(expt, "DESIGN", "SAMPLE_DESCRIPTOR", "IDENTIFIERS", "PRIMARY_ID"))

    library = _find_path(expt, "DESIGN", "LIBRARY_DESCRIPTOR")
    _store(srx_dict, 'library_name', _text_path(library, "LIBRARY_NAME"))
    _store(srx_dict, 'library_strategy', _text_path(library, "LIBRARY_STRATEGY"))
    library_source = _text_path(library, "LIBRARY_SOURCE")
    if library_source is not None:
        # lower-case only after the None check; a missing LIBRARY_SOURCE must not raise
        srx_dict['library_source'] = library_source.lower()
    ###change library_selection to MIxS field library_screening_strategy (cv for SRA, not for MIxS)
    _store(srx_dict, 'library_screening_strategy', _text_path(library, "LIBRARY_SELECTION"))
    ###change library_layout to MIxS field library_construction_method - cv single | paired
    layout = _find_path(library, "LIBRARY_LAYOUT")
    layout_children = list(layout) if layout is not None else []
    if layout_children:
        # the layout is encoded as the tag name of the first child element
        srx_dict['library_construction_method'] = layout_children[0].tag.lower()
    _store(srx_dict, 'library_construction_protocol', _text_path(library, "LIBRARY_CONSTRUCTION_PROTOCOL"))

    ###change platform to MIxS field sequencing_method - cv in SRA (not in MIxS)
    platform = _find_path(expt, "PLATFORM")
    platform_children = list(platform) if platform is not None else []
    if platform_children:
        # the platform name is the tag of PLATFORM's first child, which also holds the model
        srx_dict['sequencing_method'] = platform_children[0].tag.lower()
        _store(srx_dict, 'instrument_model', platform_children[0].findtext("INSTRUMENT_MODEL"))


def _parse_organization(package, srx_dict):
    """Copy the Organization name, address (children joined with ', ') and contacts into srx_dict."""
    org = package.find("Organization")
    if org is None:
        return
    _store(srx_dict, 'organization_name', org.findtext("Name"))
    address = org.find("Address")
    if address is not None:
        # address parts without text are skipped instead of breaking the concatenation
        parts = [line.text for line in address.iterchildren() if line.text is not None]
        srx_dict['organization_address'] = ', '.join(parts)
    contacts = []
    for contact in org.findall("Contact"):
        name_parts = (_text_path(contact, "Name", "First"), _text_path(contact, "Name", "Last"))
        name = ' '.join(part for part in name_parts if part)
        # "First Last, email"; any missing piece is left out rather than raising
        entry = ', '.join(part for part in (name, contact.get('email')) if part)
        if entry:
            contacts.append(entry)
    if contacts:
        srx_dict['organization_contacts'] = contacts


def _parse_study(package, srx_dict):
    """Copy the STUDY id, title, type, abstract, links and attributes into srx_dict."""
    study = package.find("STUDY")
    _store(srx_dict, 'study_id', _text_path(study, "IDENTIFIERS", "PRIMARY_ID"))
    _store(srx_dict, 'study_title', _text_path(study, "DESCRIPTOR", "STUDY_TITLE"))
    ###rename existing_study_type to study_type
    study_type = _find_path(study, "DESCRIPTOR", "STUDY_TYPE")
    if study_type is not None:
        existing = study_type.get("existing_study_type")
        # stored even when the attribute is absent (value None), as before
        srx_dict['study_type'] = existing
        if existing == "Other":
            _store(srx_dict, 'study_type_other', study_type.get("add_study_type"))
    abstract = _text_path(study, "DESCRIPTOR", "STUDY_ABSTRACT")
    if abstract:
        # empty abstracts are not stored
        srx_dict['study_abstract'] = abstract

    # STUDY_LINKS and STUDY_ATTRIBUTES may be absent altogether, hence the None checks
    links_node = _find_path(study, "STUDY_LINKS")
    if links_node is not None and len(list(links_node)) > 0:
        study_links = {}
        for study_link in links_node.iterchildren():
            xref = study_link.find("XREF_LINK")
            if xref is not None:
                study_links[xref.findtext("DB")] = xref.findtext("ID")
            url_link = study_link.find("URL_LINK")
            if url_link is not None:
                study_links[url_link.findtext("LABEL")] = url_link.findtext("URL")
        srx_dict['study_links'] = study_links
    attrs_node = _find_path(study, "STUDY_ATTRIBUTES")
    if attrs_node is not None and len(list(attrs_node)) > 0:
        study_attributes = {}
        for attr in attrs_node.iterchildren():
            study_attributes[attr.findtext("TAG")] = attr.findtext("VALUE")
        srx_dict['study_attributes'] = study_attributes


# per-run statistics; each is stored as a list plus a '<name>_maxrun' single value
_RUN_LIST_FIELDS = ('run_ids', 'total_num_reads', 'total_num_bases', 'download_size',
                    'avg_read_length', 'baseA_count', 'baseC_count', 'baseG_count',
                    'baseT_count', 'baseN_count', 'gc_percent', 'read_quality_counts')


def _parse_run_set(package, srx_dict):
    """Add per-run statistics (one list entry per RUN) and the values of the run with the most reads.

    Every list has exactly one entry per run; a statistic that cannot be read for a
    run is recorded as None so the lists stay index-aligned with 'run_ids'.
    """
    run_set = package.find("RUN_SET")
    runs = run_set.findall("RUN") if run_set is not None else []
    if len(runs) == 0:
        return

    stats = dict((field, []) for field in _RUN_LIST_FIELDS)
    for run in runs:
        run_stats = run.find("Run")
        stats['run_ids'].append(run.get("accession"))
        stats['total_num_reads'].append(_attr_as(run, "total_spots", int))
        stats['total_num_bases'].append(_attr_as(run, "total_bases", int))
        stats['download_size'].append(_attr_as(run, "size", int))

        # average read length = total bases / (spots + mate spots)
        total_bases = _attr_as(run, "total_bases", float)
        spots = _attr_as(run_stats, "spot_count", float)
        # NOTE(review): a missing spot_count_mates is treated as 0 mates - confirm against SRA XML
        mates = _attr_as(run_stats, "spot_count_mates", float) or 0.0
        if total_bases is not None and spots is not None and (spots+mates) > 0:
            stats['avg_read_length'].append(total_bases/(spots+mates))
        else:
            stats['avg_read_length'].append(None)

        # counts are collected per run in a fresh dict, so a base missing from this
        # run can never silently reuse the value of a previous run or sample
        base_counts = {}
        bases = run.find("Bases")
        if bases is not None:
            for base in bases.findall("Base"):
                count = _attr_as(base, "count", int)
                if count is not None:
                    base_counts[base.get("value")] = count
        for letter in "ACGTN":
            stats['base%s_count' % letter].append(base_counts.get(letter))
        acgt = [base_counts.get(letter) for letter in "ACGT"]
        if None not in acgt and sum(acgt) > 0:
            # despite the key name this is a fraction (G+C)/(A+C+G+T), not a percentage
            stats['gc_percent'].append(float(base_counts["G"]+base_counts["C"])/float(sum(acgt)))
        else:
            stats['gc_percent'].append(None)

        qual_count = {}
        quality = _find_path(run_stats, "QualityCount")
        if quality is not None:
            for qual in quality.findall("Quality"):
                qual_count[qual.get("value")] = int(qual.get("count"))
        stats['read_quality_counts'].append(qual_count)

    # first run with the largest read count; runs without a read count rank lowest
    reads = stats['total_num_reads']
    max_index = max(range(len(reads)), key=lambda i: -1 if reads[i] is None else reads[i])

    srx_dict['num_runs_in_accession'] = len(runs)
    for field in _RUN_LIST_FIELDS:
        srx_dict[field] = stats[field]
        srx_dict[field+'_maxrun'] = stats[field][max_index]


def get_srx_metadata(batch_uid_list):
    """Fetch the SRA records for a batch of UIDs with efetch and flatten them into dicts.

    batch_uid_list -- sequence of SRA UIDs (ints or strings) to request in one call

    Returns a dict mapping ``str(uid)`` to a dict of metadata fields. Optional
    elements that are missing from a record are simply left out of its dict
    instead of raising.

    NOTE(review): packages are matched to UIDs purely by position (the i-th
    EXPERIMENT_PACKAGE is assumed to belong to the i-th UID). If efetch ever
    returns fewer packages than requested, later records would be mislabelled -
    confirm, or match on an identifier contained in the package.
    """
    # join the ids explicitly; str(list)[1:-1] would also embed quotes (string ids)
    # or an 'L' suffix (long ids) into the query string
    id_param = ','.join(str(uid) for uid in batch_uid_list)
    srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='+id_param
    s = urllib.urlopen(srx_url)
    try:
        sra_tree = etree.parse(s)
    finally:
        # release the connection even when the response is not parseable XML
        s.close()
    sra_samples = sra_tree.getroot().findall("EXPERIMENT_PACKAGE")

    sdict = {}
    for which, sra_sample in enumerate(sra_samples):
        sdict_id = str(batch_uid_list[which])
        srx_dict = {
            'db_source_uid': sdict_id,
            'db_source': 'SRA',
            'expt_link': "https://www.ncbi.nlm.nih.gov/sra/"+sdict_id,
        }

        #There are 7 top tag groups: ['EXPERIMENT','SUBMISSION','Organization','STUDY','SAMPLE','Pool','RUN_SET']
        _parse_experiment(sra_sample, srx_dict)
        ###SUBMISSION - just need the submission id
        _store(srx_dict, 'submission_id', _text_path(sra_sample, "SUBMISSION", "IDENTIFIERS", "PRIMARY_ID"))
        _parse_organization(sra_sample, srx_dict)
        _parse_study(sra_sample, srx_dict)
        ###SAMPLE - just need sample id (replaces the id taken from EXPERIMENT/DESIGN when both exist)
        _store(srx_dict, 'sample_id', _text_path(sra_sample, "SAMPLE", "IDENTIFIERS", "PRIMARY_ID"))
        ###Pool - skip, redundant
        _parse_run_set(sra_sample, srx_dict)

        sdict[sdict_id] = srx_dict

    return sdict


In [9]:
# first batch only: slice its UIDs out of uid_list and convert them from text to int
batch = batches[0]
first, last = batch
batch_uid_list = [int(uid) for uid in uid_list[first:last]]
batch_uid_list[:10]

[4124571,
 4124570,
 4124569,
 4124568,
 4124567,
 4124566,
 4124565,
 4124564,
 4124563,
 4124562]

In [69]:
# efetch URL for only the first two UIDs of the batch, small enough to inspect the XML by hand.
# The UIDs are selected as list items rather than by slicing a fixed number of characters,
# which only gave two whole UIDs while every UID had exactly 7 digits.
# (joined with ', ' so the string matches what str(batch_uid_list)[1:-1] produced)
srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='+', '.join(str(uid) for uid in batch_uid_list[:2])
srx_url

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id=4124571, 4124570'

In [12]:
# fetch the two-record test URL and keep the root element of the parsed XML
s = urllib.urlopen(srx_url)
try:
    sra_tree = etree.parse(s)
finally:
    # release the connection even when the response is not parseable XML
    s.close()
sra_xml = sra_tree.getroot()

In [13]:
# direct children of the response root
list(sra_xml)

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [14]:
# the EXPERIMENT_PACKAGE elements directly under the root; sdict starts out empty,
# like the result dict of get_srx_metadata
sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
sdict = {}
sra_samples

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [15]:
# work with the first package only while prototyping the parsing
sra_sample = sra_samples[0]
sra_sample

<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>

In [20]:
# top-level tag groups of one EXPERIMENT_PACKAGE
list(sra_sample)

[<Element EXPERIMENT at 0x103647cf8>,
 <Element SUBMISSION at 0x103789518>,
 <Element Organization at 0x103789248>,
 <Element STUDY at 0x10332d5a8>,
 <Element SAMPLE at 0x10332d908>,
 <Element Pool at 0x10332d488>,
 <Element RUN_SET at 0x10332da70>]

In [21]:
# the RUN elements inside this package's RUN_SET
sra_sample.find("RUN_SET").findall("RUN")

[<Element RUN at 0x1037899e0>]

In [60]:
# Scratch walk-through of the RUN_SET parsing used in get_srx_metadata, applied to the
# single `sra_sample` selected above: one list entry per run, plus the values of the
# run with the most reads ("maxrun").
run_ids = []
total_num_reads = []
total_num_bases = []
download_size = []
avg_read_length = []
baseA_count = []
baseC_count = []
baseG_count = []
baseT_count = []
baseN_count = []
gc_percent = []
read_quality_counts = []

runs = sra_sample.find("RUN_SET").findall("RUN")
num_runs_in_accession = len(runs)
for run in runs:
    run_stats = run.find("Run")
    run_ids.append(run.get("accession"))
    total_num_reads.append(int(run.get("total_spots")))
    total_num_bases.append(int(run.get("total_bases")))
    download_size.append(int(run.get("size")))
    # average read length = total bases / (spots + mate spots)
    avg_read_length.append(float(run.get("total_bases"))/(float(run_stats.get("spot_count"))+float(run_stats.get("spot_count_mates"))))

    # collect this run's counts in a fresh dict: a base missing from one run must not
    # reuse the previous run's value, and every list gets exactly one entry per run
    base_counts = {}
    for base in run.find("Bases").findall("Base"):
        base_counts[base.get("value")] = int(base.get("count"))
    baseA_count.append(base_counts.get("A"))
    baseC_count.append(base_counts.get("C"))
    baseG_count.append(base_counts.get("G"))
    baseT_count.append(base_counts.get("T"))
    baseN_count.append(base_counts.get("N"))
    acgt = [base_counts.get(letter) for letter in "ACGT"]
    if None not in acgt and sum(acgt) > 0:
        # despite the name this is a fraction (G+C)/(A+C+G+T), not a percentage
        gc_percent.append(float(base_counts["G"]+base_counts["C"])/float(sum(acgt)))
    else:
        gc_percent.append(None)

    qual_count = {}
    # same guard as in get_srx_metadata: QualityCount may be absent
    if run_stats.find("QualityCount") is not None:
        for qual in run_stats.find("QualityCount").findall("Quality"):
            qual_count[qual.get("value")] = int(qual.get("count"))
    read_quality_counts.append(qual_count)

# index of the (first) run with the largest number of reads
max_index = total_num_reads.index(max(total_num_reads))

run_ids_maxrun = run_ids[max_index]
total_num_reads_maxrun = total_num_reads[max_index]
total_num_bases_maxrun = total_num_bases[max_index]
download_size_maxrun = download_size[max_index]
avg_read_length_maxrun = avg_read_length[max_index]
baseA_count_maxrun = baseA_count[max_index]
baseC_count_maxrun = baseC_count[max_index]
baseG_count_maxrun = baseG_count[max_index]
baseT_count_maxrun = baseT_count[max_index]
baseN_count_maxrun = baseN_count[max_index]
gc_percent_maxrun = gc_percent[max_index]
read_quality_counts_maxrun = read_quality_counts[max_index]

In [62]:
# first the single values of the run with the most reads ...
for maxrun_value in (run_ids_maxrun, total_num_reads_maxrun, total_num_bases_maxrun,
                     download_size_maxrun, avg_read_length_maxrun,
                     baseA_count_maxrun, baseC_count_maxrun, baseG_count_maxrun,
                     baseT_count_maxrun, baseN_count_maxrun,
                     gc_percent_maxrun, read_quality_counts_maxrun):
    print(maxrun_value)

# ... then the per-run lists, in the same order
for per_run_values in (run_ids, total_num_reads, total_num_bases,
                       download_size, avg_read_length,
                       baseA_count, baseC_count, baseG_count,
                       baseT_count, baseN_count,
                       gc_percent, read_quality_counts):
    print(per_run_values)


ERR1462501
25106994
3138374250
1163516874
125.0
859155672
753913849
654510698
870745228
48803
0.448782183615
{'27': 98304024, '14': 97672803, '22': 13830744, '33': 283013313, '37': 2619421880, '2': 26131486}
['ERR1462501']
[25106994]
[3138374250]
[1163516874]
[125.0]
[859155672]
[753913849]
[654510698]
[870745228]
[48803]
[0.448782183615261]
[{'27': 98304024, '14': 97672803, '22': 13830744, '33': 283013313, '37': 2619421880, '2': 26131486}]


In [203]:
#asking for a friend - how many of the sra uids have elinks to biosample?
# every UID of the current batch is appended as its own "&id=" parameter
id_params = ['&id='+str(key) for key in batch_uid_list]
elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'+''.join(id_params)

In [None]:
# For every batch, ask elink which BioSample / PubMed records its SRA UIDs link to and
# collect the linked ids across all batches.
elink_base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'
biosample_ids = []
pubmed_ids = []
for batch in batches:
    print("processing batch %s" % (batch,))
    batch_uid_list = [int(uid) for uid in uid_list[batch[0]:batch[1]]]

    # build the URL from the base for each batch; appending to one ever-growing URL
    # would re-send the ids of all previous batches with every request
    elink_url = elink_base_url+''.join('&id='+str(key) for key in batch_uid_list)
    try:
        e = urllib.urlopen(elink_url)
        try:
            link_tree = etree.parse(e)
        finally:
            # release the connection even when the response is not parseable XML
            e.close()
    except etree.XMLSyntaxError as detail:
        print(detail.error_log)
        # nothing usable for this batch; do not fall through to a stale/undefined link_xml
        continue
    link_xml = link_tree.getroot()
    print(link_xml)

    #note if there's no biosample link, <LinkSetDb> with <DbTo>=='biosample' just won't exist
    for link in link_xml.findall("LinkSet"):
        for db in link.findall("LinkSetDb"):
            # only the first <Link> of each <LinkSetDb> is recorded
            first_link = db.find("Link")
            if first_link is None or first_link.findtext("Id") is None:
                continue
            if db.findtext("DbTo")=='biosample':
                biosample_ids.append(int(first_link.findtext("Id")))
            if db.findtext("DbTo")=='pubmed':
                pubmed_ids.append(int(first_link.findtext("Id")))



In [1]:
# needs the import cell and a cell defining elink_url to have been run in this kernel first
e = urllib.urlopen(elink_url)
try:
    link_tree = etree.parse(e)
finally:
    # release the connection whether or not the response parses
    e.close()

NameError: name 'urllib' is not defined