In [2]:
import urllib
from lxml import etree

# Auto ingestion of SRA metadata

1. Get a list of SRA UIDs to scrape
    1. get all publicly accessible SRA (SRX) accessions
    2. remove SRA UIDs that have already been ingested into MetaSeek DB (all 'db_source_uid' where 'db_source' = 'SRA')
    
2. Split UIDs to scrape into batches of 500

3. For each batch, scrape metadata:
    1. efetch scrape SRA
    2. elink to BioSample, Pubmed, Nuccore
    3. efetch scrape BioSample, if exists
    4. efetch scrape Pubmed, if exists
    5. efetch scrape nuccore, if exists
    
4. Merge redundant cols to MIxS-compliant MetaSeek fields (separate script)

5. Parse fields with controlled vocabularies to MIxS CV, if possible (NOTE: future error parsing)

6. Insert scraped data into MetaSeek DB directly

# 1.A - Get a list of SRA UIDs to scrape

* find out how many publicly available samples there are; make a list of retstarts to use
* call the esearch API once per retstart to get the full UID list

NOTE - the most comprehensive API call I can find for retrieving ALL the SRA samples (an empty search term returns no accessions rather than all of them) is term=public&field=ACS; it returns all publicly accessible SRA UIDs

In [3]:
def get_retstart_list(url, page_size=100000):
    """Work out the `retstart` offsets needed to page through an esearch result.

    A single esearch request only returns a limited number of UIDs, so the
    full list has to be fetched page by page. This function asks the API for
    the total hit count and returns the offset of every page.

    Parameters
    ----------
    url : str
        esearch URL whose XML response has a top-level <Count> element.
    page_size : int, optional
        Number of UIDs fetched per page. Defaults to 100000, the retmax
        value that get_uid_list requests.

    Returns
    -------
    list of int
        [0, page_size, 2*page_size, ...] with exactly as many entries as
        pages are needed; empty when the count is 0.

    Raises
    ------
    ValueError
        If the response carries no <Count> element.
    """
    g = urllib.urlopen(url)
    try:
        count_xml = etree.parse(g).getroot()
    finally:
        # close the connection even when the response is not parseable XML
        g.close()

    num_uids = count_xml.findtext("Count")
    if num_uids is None:
        raise ValueError("no <Count> element in esearch response from %s" % url)
    print('number of publicly available UIDs in SRA: %s' % num_uids)

    # ceiling division: no empty trailing page when the count is an exact
    # multiple of page_size, and no reliance on Python 2 integer '/'
    num_queries = (int(num_uids) + page_size - 1) // page_size
    retstart_list = [i * page_size for i in range(num_queries)]
    print('retstarts to use: %s' % retstart_list)
    return retstart_list

def get_uid_list(ret_list):
    """Fetch every publicly accessible SRA UID, one esearch page per offset.

    Parameters
    ----------
    ret_list : iterable of int
        `retstart` offsets to request (as produced by get_retstart_list).
        Each request asks for up to 100000 UIDs (retmax in the URL).

    Returns
    -------
    list of str
        UID strings in the order the pages returned them.
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&tool=metaseq&email=metaseekcloud%40gmail.com&retmax=100000&retstart='
    uid_list = []
    for retstart in ret_list:
        f = urllib.urlopen(base_url + str(retstart))
        try:
            uid_xml = etree.parse(f).getroot()
        finally:
            # close the connection even when the response is not parseable XML
            f.close()

        # a response without <IdList> contributes nothing instead of raising
        # AttributeError on None
        id_list = uid_xml.find("IdList")
        if id_list is None:
            page_uids = []
        else:
            page_uids = [node.text for node in id_list.findall("Id")]

        print("appending %s accessions" % len(page_uids))
        uid_list.extend(page_uids)
    return uid_list

In [4]:
# Ask esearch for the hit count only (rettype=count) and derive the retstart offsets to page through.
retstart_list = get_retstart_list(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&rettype=count&tool=metaseq&email=metaseekcloud%40gmail.com')

number of publicly available UIDs in SRA: 2694787
retstarts to use: [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000, 1300000, 1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000, 2100000, 2200000, 2300000, 2400000, 2500000, 2600000]


In [5]:
# Download the full UID list: one esearch request per retstart offset.
uid_list = get_uid_list(ret_list=retstart_list)

appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 94787 accessions


# TODO 1.B - Remove SRA UIDs already in MetaSeek DB

db_source_uid where db_source = 'SRA'

# 2 - Split UIDs to scrape into batches of 500

In [6]:
def get_batches(uid_list, batch_size=500):
    """Split a UID list into consecutive [start, end) index pairs.

    Parameters
    ----------
    uid_list : sequence
        The UIDs to split; only its length is used.
    batch_size : int, optional
        Maximum number of UIDs per batch (default 500).

    Returns
    -------
    list of [int, int]
        Slice bounds such that uid_list[start:end] is one batch. The last
        batch ends at len(uid_list) and may be shorter than batch_size.
        Empty when uid_list is empty.

    Raises
    ------
    ValueError
        If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    total = len(uid_list)
    # min() clamps the final batch; no list-mutating range() tricks needed,
    # so this works whether range() returns a list or a lazy sequence
    return [[start, min(start + batch_size, total)]
            for start in range(0, total, batch_size)]

In [7]:
# Index pairs [start, end) into uid_list, one pair per batch.
batches = get_batches(uid_list)

In [8]:
# Sanity check: number of batches, plus the first and last ten index pairs.
print len(batches)
print batches[0:10]
print batches[-10:]


5390
[[0, 500], [500, 1000], [1000, 1500], [1500, 2000], [2000, 2500], [2500, 3000], [3000, 3500], [3500, 4000], [4000, 4500], [4500, 5000]]
[[2690000, 2690500], [2690500, 2691000], [2691000, 2691500], [2691500, 2692000], [2692000, 2692500], [2692500, 2693000], [2693000, 2693500], [2693500, 2694000], [2694000, 2694500], [2694500, 2694787]]


# 3.A - Efetch scrape SRA metadata

In [149]:
def _srx_find(node, *tags):
    """Follow nested child tags from `node`; return the last element, or None if any step is missing."""
    for tag in tags:
        if node is None:
            return None
        node = node.find(tag)
    return node


def _srx_text(node, *tags):
    """Return findtext() of the last tag under the nested path, or None if the path is broken."""
    parent = _srx_find(node, *tags[:-1])
    if parent is None:
        return None
    return parent.findtext(tags[-1])


def _srx_store(record, key, value):
    """Set record[key] = value unless value is None."""
    if value is not None:
        record[key] = value


def _srx_store_external_id(record, key, identifiers, namespace):
    """Store the text of each EXTERNAL_ID child with the given namespace under key (last match wins)."""
    if identifiers is None:
        return
    for external in identifiers.iterchildren("EXTERNAL_ID"):
        if external.get("namespace") == namespace:
            record[key] = external.text


def _srx_int_attr(node, name):
    """Return attribute `name` of `node` as an int, or None when the attribute is absent."""
    raw = node.get(name)
    if raw is None:
        return None
    return int(raw)


def _srx_scrape_experiment(expt, record):
    """Copy EXPERIMENT ids, title, design, library and platform fields into record."""
    _srx_store(record, 'expt_id', _srx_text(expt, "IDENTIFIERS", "PRIMARY_ID"))
    _srx_store(record, 'expt_title', _srx_text(expt, "TITLE"))
    _srx_store(record, 'project_id', _srx_text(expt, "STUDY_REF", "IDENTIFIERS", "PRIMARY_ID"))
    _srx_store(record, 'expt_design_description', _srx_text(expt, "DESIGN", "DESIGN_DESCRIPTION"))
    _srx_store(record, 'sample_id',
               _srx_text(expt, "DESIGN", "SAMPLE_DESCRIPTOR", "IDENTIFIERS", "PRIMARY_ID"))

    library = _srx_find(expt, "DESIGN", "LIBRARY_DESCRIPTOR")
    _srx_store(record, 'library_name', _srx_text(library, "LIBRARY_NAME"))
    _srx_store(record, 'library_strategy', _srx_text(library, "LIBRARY_STRATEGY"))
    # check for None BEFORE lower-casing (the reverse order raised AttributeError)
    library_source = _srx_text(library, "LIBRARY_SOURCE")
    if library_source is not None:
        record['library_source'] = library_source.lower()
    # library_selection -> MIxS field library_screening_strategy (cv for SRA, not for MIxS)
    _srx_store(record, 'library_screening_strategy', _srx_text(library, "LIBRARY_SELECTION"))
    # library_layout -> MIxS field library_construction_method - cv single | paired;
    # the value is the tag name of the first child of LIBRARY_LAYOUT
    layout = _srx_find(library, "LIBRARY_LAYOUT")
    if layout is not None and len(layout) > 0:
        record['library_construction_method'] = layout[0].tag.lower()
    _srx_store(record, 'library_construction_protocol',
               _srx_text(library, "LIBRARY_CONSTRUCTION_PROTOCOL"))

    # platform -> MIxS field sequencing_method - cv in SRA (not in MIxS);
    # the value is the tag name of the first child of PLATFORM
    platform = _srx_find(expt, "PLATFORM")
    if platform is not None and len(platform) > 0:
        method = platform[0]
        record['sequencing_method'] = method.tag.lower()
        _srx_store(record, 'instrument_model', method.findtext("INSTRUMENT_MODEL"))


def _srx_scrape_organization(org, record):
    """Copy the Organization name, address and contacts into record."""
    if org is None:
        return
    _srx_store(record, 'organization_name', org.findtext("Name"))

    address = org.find("Address")
    if address is not None:
        # address lines without text are skipped instead of breaking the join
        record['organization_address'] = ', '.join(
            line.text for line in address.iterchildren() if line.text is not None)

    contact_nodes = org.findall("Contact")
    if len(contact_nodes) > 0:
        contacts = []
        for contact in contact_nodes:
            # "First Last, email"; any missing piece is left out rather than crashing
            name = ' '.join(part for part in (_srx_text(contact, "Name", "First"),
                                              _srx_text(contact, "Name", "Last")) if part)
            entry = ', '.join(part for part in (name, contact.get('email')) if part)
            if entry:
                contacts.append(entry)
        record['organization_contacts'] = contacts


def _srx_scrape_study(study, record):
    """Copy STUDY ids, title, type, abstract, links and attributes into record."""
    identifiers = _srx_find(study, "IDENTIFIERS")
    _srx_store(record, 'study_id', _srx_text(identifiers, "PRIMARY_ID"))
    _srx_store_external_id(record, 'bioproject_id', identifiers, 'BioProject')

    descriptor = _srx_find(study, "DESCRIPTOR")
    _srx_store(record, 'study_title', _srx_text(descriptor, "STUDY_TITLE"))
    # existing_study_type is renamed to study_type
    study_type = _srx_find(descriptor, "STUDY_TYPE")
    if study_type is not None:
        existing = study_type.get("existing_study_type")
        if existing == "Other":
            record['study_type'] = 'Other'
            _srx_store(record, 'study_type_other', study_type.get("add_study_type"))
        else:
            record['study_type'] = existing
    # truthiness test on purpose: an empty abstract is not recorded
    abstract = _srx_text(descriptor, "STUDY_ABSTRACT")
    if abstract:
        record['study_abstract'] = abstract

    links_node = _srx_find(study, "STUDY_LINKS")
    if links_node is not None:
        study_links = {}
        for study_link in links_node.iterchildren():
            xref = study_link.find("XREF_LINK")
            if xref is not None:
                study_links[xref.findtext("DB")] = xref.findtext("ID")
            url_link = study_link.find("URL_LINK")
            if url_link is not None:
                study_links[url_link.findtext("LABEL")] = url_link.findtext("URL")
        record['study_links'] = study_links

    attributes_node = _srx_find(study, "STUDY_ATTRIBUTES")
    if attributes_node is not None:
        study_attributes = {}
        for attr in attributes_node.iterchildren():
            study_attributes[attr.findtext("TAG")] = attr.findtext("VALUE")
        record['study_attributes'] = study_attributes


def _srx_scrape_sample(sample, record):
    """Copy SAMPLE ids, title, taxon names and description into record."""
    identifiers = _srx_find(sample, "IDENTIFIERS")
    # overrides the sample_id taken from the EXPERIMENT's SAMPLE_DESCRIPTOR when both exist
    _srx_store(record, 'sample_id', _srx_text(identifiers, "PRIMARY_ID"))
    _srx_store_external_id(record, 'biosample_id', identifiers, 'BioSample')
    _srx_store(record, 'sample_title', _srx_text(sample, "TITLE"))
    _srx_store(record, 'ncbi_taxon_id', _srx_text(sample, "SAMPLE_NAME", "TAXON_ID"))
    _srx_store(record, 'taxon_scientific_name', _srx_text(sample, "SAMPLE_NAME", "SCIENTIFIC_NAME"))
    _srx_store(record, 'taxon_common_name', _srx_text(sample, "SAMPLE_NAME", "COMMON_NAME"))
    _srx_store(record, 'sample_description', _srx_text(sample, "DESCRIPTION"))


def _srx_avg_read_length(run):
    """Return total_bases / (spot_count + spot_count_mates) for a RUN, or None if it cannot be computed."""
    run_stats = run.find("Run")
    if run_stats is None:
        return None
    raw = [run.get("total_bases"), run_stats.get("spot_count"), run_stats.get("spot_count_mates")]
    if None in raw:
        return None
    total_bases, spots, mates = [float(value) for value in raw]
    if spots + mates == 0:
        return None
    return total_bases / (spots + mates)


def _srx_scrape_runs(run_set, record):
    """Record per-run statistics as lists, plus the values of the run with the most reads (`*_maxrun`)."""
    runs = [] if run_set is None else run_set.findall("RUN")
    if len(runs) == 0:
        return
    record['num_runs_in_accession'] = len(runs)

    stat_keys = ['run_ids', 'total_num_reads', 'total_num_bases', 'download_size',
                 'avg_read_length', 'baseA_count', 'baseC_count', 'baseG_count',
                 'baseT_count', 'baseN_count', 'gc_percent', 'read_quality_counts']
    stats = dict((key, []) for key in stat_keys)

    for run in runs:
        # every list gets exactly one entry per run (None when unavailable) so
        # that index i always refers to the same run in all of them
        stats['run_ids'].append(run.get("accession"))
        stats['total_num_reads'].append(_srx_int_attr(run, "total_spots"))
        stats['total_num_bases'].append(_srx_int_attr(run, "total_bases"))
        stats['download_size'].append(_srx_int_attr(run, "size"))
        stats['avg_read_length'].append(_srx_avg_read_length(run))

        # base counts are collected per run; nothing carries over from the previous run
        counts = {}
        bases = run.find("Bases")
        if bases is not None:
            for base in bases.findall("Base"):
                if base.get("count") is not None:
                    counts[base.get("value")] = int(base.get("count"))
        for letter in "ACGTN":
            stats['base%s_count' % letter].append(counts.get(letter))

        acgt = [counts.get(letter) for letter in "ACGT"]
        if None in acgt or sum(acgt) == 0:
            stats['gc_percent'].append(None)
        else:
            stats['gc_percent'].append(float(counts["G"] + counts["C"]) / float(sum(acgt)))

        qual_count = {}
        quality_node = _srx_find(run, "Run", "QualityCount")
        if quality_node is not None:
            for qual in quality_node.findall("Quality"):
                qual_count[qual.get("value")] = int(qual.get("count"))
        stats['read_quality_counts'].append(qual_count)

    # first run with the largest read count; runs without a count rank lowest
    reads = stats['total_num_reads']
    max_index = max(range(len(reads)),
                    key=lambda i: -1 if reads[i] is None else reads[i])

    for key in stat_keys:
        record[key] = stats[key]
        record[key + '_maxrun'] = stats[key][max_index]


def get_srx_metadata(batch_uid_list):
    """Fetch and flatten SRA experiment metadata for one batch of UIDs.

    Sends a single efetch request for all UIDs in the batch and turns each
    returned EXPERIMENT_PACKAGE into a flat dict. Optional XML elements that
    are absent are simply left out of the dict instead of aborting the batch.

    Parameters
    ----------
    batch_uid_list : list
        SRA UIDs (ints or strings) to fetch.

    Returns
    -------
    dict
        Maps str(uid) to that experiment's metadata dict. Always contains
        'db_source_uid', 'db_source' and 'expt_link'; every other key is
        present only when the corresponding XML was found. Per-run statistics
        are lists with one entry per run (None where a value is unavailable),
        and each has a `<key>_maxrun` companion holding the value for the run
        with the most reads.

    Notes
    -----
    Packages are matched to UIDs purely by position: the i-th
    EXPERIMENT_PACKAGE is assumed to belong to batch_uid_list[i]. A warning is
    printed when the counts differ, because the mapping is then unreliable.
    """
    print("Querying API and parsing XML...")
    # comma-separated ids with no spaces; str(list)[1:-1] put ", " (and quotes
    # for string UIDs) into the URL
    srx_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='
               + ','.join(str(uid) for uid in batch_uid_list))
    s = urllib.urlopen(srx_url)
    try:
        sra_xml = etree.parse(s).getroot()
    finally:
        # close the connection even when the response is not parseable XML
        s.close()
    print("...parsing done!")

    sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
    if len(sra_samples) != len(batch_uid_list):
        print("WARNING: requested %s UIDs but received %s experiment packages; "
              "UID-to-package mapping may be wrong" % (len(batch_uid_list), len(sra_samples)))
    sdict = {}

    for which, sra_sample in enumerate(sra_samples):
        print("--scraping srx metadata for sample %s out of %s" % (which+1, len(sra_samples)))
        srx_uid = str(batch_uid_list[which])
        srx_dict = {
            'db_source_uid': srx_uid,
            'db_source': 'SRA',
            'expt_link': "https://www.ncbi.nlm.nih.gov/sra/" + srx_uid,
        }

        # There are 7 top tag groups: EXPERIMENT, SUBMISSION, Organization,
        # STUDY, SAMPLE, Pool, RUN_SET. Pool is skipped as redundant.
        _srx_scrape_experiment(sra_sample.find("EXPERIMENT"), srx_dict)
        # SUBMISSION - only the submission id is needed
        _srx_store(srx_dict, 'submission_id',
                   _srx_text(sra_sample, "SUBMISSION", "IDENTIFIERS", "PRIMARY_ID"))
        _srx_scrape_organization(sra_sample.find("Organization"), srx_dict)
        _srx_scrape_study(sra_sample.find("STUDY"), srx_dict)
        _srx_scrape_sample(sra_sample.find("SAMPLE"), srx_dict)
        _srx_scrape_runs(sra_sample.find("RUN_SET"), srx_dict)

        sdict[srx_uid] = srx_dict

    return sdict


In [150]:
# Take the first batch and convert its UID strings to ints.
batch = batches[0]
batch_uid_list = map(int,uid_list[batch[0]:batch[1]])
print len(batch_uid_list)
# last expression of the cell: displayed as the cell output
batch_uid_list[0:10]

500


[4124571,
 4124570,
 4124569,
 4124568,
 4124567,
 4124566,
 4124565,
 4124564,
 4124563,
 4124562]

In [151]:
# Scrape SRA metadata for the first batch; sdict is keyed by the UID as a string.
sdict = get_srx_metadata(batch_uid_list=batch_uid_list)

Querying API and parsing XML...
...parsing done!
--scraping srx metadata for sample 1 out of 500
--scraping srx metadata for sample 2 out of 500
--scraping srx metadata for sample 3 out of 500
--scraping srx metadata for sample 4 out of 500
--scraping srx metadata for sample 5 out of 500
--scraping srx metadata for sample 6 out of 500
--scraping srx metadata for sample 7 out of 500
--scraping srx metadata for sample 8 out of 500
--scraping srx metadata for sample 9 out of 500
--scraping srx metadata for sample 10 out of 500
--scraping srx metadata for sample 11 out of 500
--scraping srx metadata for sample 12 out of 500
--scraping srx metadata for sample 13 out of 500
--scraping srx metadata for sample 14 out of 500
--scraping srx metadata for sample 15 out of 500
--scraping srx metadata for sample 16 out of 500
--scraping srx metadata for sample 17 out of 500
--scraping srx metadata for sample 18 out of 500
--scraping srx metadata for sample 19 out of 500
--scraping srx metadata for s

In [160]:
# Only the last expression of a cell is displayed, so the key slice below produces no output.
sdict.keys()[0:5]
# Inspect one scraped record.
sdict['4123344']

{'avg_read_length': [243.1248156205851],
 'avg_read_length_maxrun': 243.1248156205851,
 'baseA_count': [26718113],
 'baseA_count_maxrun': 26718113,
 'baseC_count': [35901728],
 'baseC_count_maxrun': 35901728,
 'baseG_count': [35257176],
 'baseG_count_maxrun': 35257176,
 'baseN_count': [54440],
 'baseN_count_maxrun': 54440,
 'baseT_count': [27006982],
 'baseT_count_maxrun': 27006982,
 'bioproject_id': 'PRJNA388450',
 'biosample_id': 'SAMN07189570',
 'db_source': 'SRA',
 'db_source_uid': '4123344',
 'download_size': [71321164],
 'download_size_maxrun': 71321164,
 'expt_design_description': '250 paired ends reads of fragmented DNA',
 'expt_id': 'SRX2880321',
 'expt_link': 'https://www.ncbi.nlm.nih.gov/sra/4123344',
 'expt_title': 'whole genome sequencing of gram-negative bacteria',
 'gc_percent': [0.5698000109685789],
 'gc_percent_maxrun': 0.5698000109685789,
 'instrument_model': 'Illumina MiSeq',
 'library_construction_method': 'paired',
 'library_name': 'MB750',
 'library_screening_stra

In [13]:
# NOTE(review): in the code shown, sra_xml is only assigned inside get_srx_metadata;
# this cell relies on a module-level sra_xml from a cell that is not visible here.
sra_xml.getchildren()

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [14]:
# Exploration: list the EXPERIMENT_PACKAGE elements under the efetch root.
sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
# NOTE(review): this rebinds sdict to an empty dict, discarding any previously scraped results.
sdict = {}
sra_samples

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [15]:
# Exploration: pick the first package to inspect by hand.
sra_sample = sra_samples[0]
sra_sample

<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>

In [20]:
# Exploration: the top-level tag groups inside one EXPERIMENT_PACKAGE.
sra_sample.getchildren()

[<Element EXPERIMENT at 0x103647cf8>,
 <Element SUBMISSION at 0x103789518>,
 <Element Organization at 0x103789248>,
 <Element STUDY at 0x10332d5a8>,
 <Element SAMPLE at 0x10332d908>,
 <Element Pool at 0x10332d488>,
 <Element RUN_SET at 0x10332da70>]

In [84]:
# Exploration: print the SAMPLE description when the element is present.
if sra_sample.find("SAMPLE").findtext("DESCRIPTION") is not None: 
    print sra_sample.find("SAMPLE").findtext("DESCRIPTION")

rkd mutant ovules late phase A


# 3.B - elink to BioSample, Pubmed, Nuccore

In [203]:
#asking for a friend - how many of the sra uids have elinks to biosample?
elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'

for key in batch_uid_list:
    #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
    elink_url = elink_url+'&id='+str(key)

In [None]:
# Draft loop: run an elink request (sra -> biosample) for every batch and
# collect the linked BioSample / PubMed UIDs of that batch.
elink_base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'
for batch in batches:
    print("processing batch %s" % batch)
    batch_uid_list = map(int,uid_list[batch[0]:batch[1]])

    # Build the URL from the base for EACH batch. Appending to one shared
    # string made every request also carry the ids of all earlier batches.
    elink_url = elink_base_url + ''.join('&id='+str(key) for key in batch_uid_list)
    try:
        e = urllib.urlopen(elink_url)
        try:
            link_tree = etree.parse(e)
        finally:
            # close the connection even when parsing fails
            e.close()
        link_xml = link_tree.getroot()
        print(link_xml)
    except etree.XMLSyntaxError as detail:
        print(detail.error_log)
        # nothing was parsed for this batch; do not fall through and re-use
        # the previous batch's link_xml
        continue

    #note if there's no biosample link, <LinkSetDb> with <DbTo>=='biosample' just won't exist
    # NOTE(review): these lists are reset for every batch and not stored anywhere,
    # so only the last batch's ids survive the loop.
    biosample_ids = []
    pubmed_ids = []
    linksets = link_xml.findall("LinkSet")
    for link in linksets:
        for db in link.findall("LinkSetDb"):
            if db.findtext("DbTo")=='biosample':
                biosample_ids.append(int(db.find("Link").findtext("Id")))
            if db.findtext("DbTo")=='pubmed':
                pubmed_ids.append(int(db.find("Link").findtext("Id")))



In [1]:
# Scratch request against the current elink_url.
# NOTE(review): the recorded output is a NameError for urllib and the execution count is 1,
# so this presumably ran on a fresh kernel before the import cell; e is also never closed here.
e = urllib.urlopen(elink_url)
link_tree = etree.parse(e)

NameError: name 'urllib' is not defined

In [168]:
#takes list of SRX UIDs to query (batch_uid_list), and sdict into which to insert link uids; 
#return sdict with 'biosample_uid', 'pubmed_uids', and/or 'nuccore_uids' inserted; and link dict with lists of biosample_uids, pubmed_uids, and nuccore_uids to scrape
def get_links(batch_uid_list, sdict):
    """Look up BioSample, PubMed and Nuccore links for a batch of SRA UIDs.

    Sends one elink request for the whole batch (one &id= parameter per UID,
    so each UID gets its own LinkSet in the response) and attaches the linked
    UIDs to the matching records in `sdict`.

    Parameters
    ----------
    batch_uid_list : iterable
        SRA UIDs to query.
    sdict : dict
        Maps str(SRA UID) to that experiment's metadata dict. Modified in
        place: 'biosample_uid', 'pubmed_uids' and/or 'nuccore_uids' are set
        to a list of int UIDs on every record that has such links.

    Returns
    -------
    (dict, dict)
        `sdict` (the same object that was passed in) and `linkdict`, which
        has the keys 'biosample_uids', 'pubmed_uids' and 'nuccore_uids', each
        a de-duplicated list of every linked UID found in the batch.
    """
    print("sending elink request and parsing XML...")
    elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample,pubmed,nuccore&tool=metaseq&email=metaseekcloud%40gmail.com'
    #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
    elink_url = elink_url + ''.join('&id='+str(key) for key in batch_uid_list)
    #run api request and parse xml
    e = urllib.urlopen(elink_url)
    try:
        link_xml = etree.parse(e).getroot()
    finally:
        # close the connection even when the response is not parseable XML
        e.close()
    print("...parsing done!")

    print("finding links...")
    # target db (<DbTo>) -> key under which the ids are stored on the sdict record
    record_keys = {'biosample': 'biosample_uid',
                   'pubmed': 'pubmed_uids',
                   'nuccore': 'nuccore_uids'}
    found = dict((db_name, []) for db_name in record_keys)

    for linkset in link_xml.findall("LinkSet"):
        id_list = linkset.find("IdList")
        srx_uid = None if id_list is None else id_list.findtext("Id")
        #links from each target db will be in a tab called "LinkSetDb";
        #if there's no link to a db, its <LinkSetDb> just won't exist
        link_dbs = linkset.findall("LinkSetDb")
        record = sdict.get(srx_uid)
        if record is None and len(link_dbs) > 0:
            # still count the links, but there is no record to attach them to
            print("WARNING: no metadata record for SRA UID %s; links not attached" % srx_uid)
        for link in link_dbs:
            db_name = link.findtext("DbTo")
            if db_name not in record_keys:
                continue
            id_set = [int(uid.findtext("Id")) for uid in link.findall("Link")]
            found[db_name].extend(id_set)
            if record is not None:
                record[record_keys[db_name]] = id_set

    linkdict = {'biosample_uids': list(set(found['biosample'])),
                'pubmed_uids': list(set(found['pubmed'])),
                'nuccore_uids': list(set(found['nuccore']))}
    print("number of biosamples to scrape: %s" % len(linkdict['biosample_uids']))
    print("number of pubmeds to scrape: %s" % len(linkdict['pubmed_uids']))
    print("number of nuccores to scrape: %s" % len(linkdict['nuccore_uids']))

    return sdict,linkdict

In [128]:
# Scratch version of the link-collection logic in get_links, run on a module-level link_xml.
# It only collects the linked UIDs; the writes into sdict are commented out.
biosample_uids = []
pubmed_uids = []
nuccore_uids = []

linksets = link_xml.findall("LinkSet")
for linkset in linksets:
    # the SRA UID this LinkSet belongs to
    srx_uid = linkset.find("IdList").findtext("Id")
    #links from each target db will be in a tab called "LinkSetDb"
    if len(linkset.findall("LinkSetDb"))>0:
        for link in linkset.findall("LinkSetDb"):
            # ids of this one LinkSetDb; <DbTo> decides which list they go to
            id_set = []
            if link.findtext("DbTo")=='biosample':
                #for all Links, get Ids
                for uid in link.findall("Link"):
                    id_set.append(int(uid.findtext("Id")))
                biosample_uids.extend(id_set)
                #sdict[srx_uid]['biosample_uid'] = id_set
            elif link.findtext("DbTo")=='pubmed':
                for uid in link.findall("Link"):
                    id_set.append(int(uid.findtext("Id")))
                pubmed_uids.extend(id_set)
                #sdict[srx_uid]['pubmed_uids'] = id_set
            elif link.findtext("DbTo")=='nuccore':
                for uid in link.findall("Link"):
                    id_set.append(int(uid.findtext("Id")))
                nuccore_uids.extend(id_set)
                #sdict[srx_uid]['nuccore_uids'] = id_set

# de-duplicate: the same target UID can be linked from several SRA UIDs
biosample_uids = list(set(biosample_uids))
pubmed_uids = list(set(pubmed_uids))
nuccore_uids = list(set(nuccore_uids))

# count and first ten UIDs per target database
print len(biosample_uids), biosample_uids[0:10]
print len(pubmed_uids), pubmed_uids[0:10]
print len(nuccore_uids), nuccore_uids[0:10]


380 [7151616, 7184736, 5570637, 7176304, 7176307, 7176308, 7176309, 7176310, 7176311, 7176313]
7 [28472260, 26044422, 27365352, 25103754, 25013133, 26679598, 25540336]
44 [1123333122, 651352708, 1028628997, 763503881, 763528202, 1194697611, 1003446637, 1003442578, 1003436440, 1003458205]


In [169]:
# Attach link UIDs to the scraped records and collect the unique UIDs to scrape next.
sdict, linkdict = get_links(batch_uid_list=batch_uid_list,sdict=sdict)

sending elink request and parsing XML...
...parsing done!
finding links...
number of biosamples to scrape: 380
number of pubmeds to scrape: 7
number of nuccores to scrape: 44


In [171]:
print linkdict.keys()
print len(linkdict['pubmed_uids'])
print len(linkdict['nuccore_uids'])
print len(linkdict['biosample_uids'])
pubmeds = []
nuccores = []
biosamples = []
for key in sdict.keys():
    if 'pubmed_uids' in sdict[key].keys():
        pubmeds.append(key)
    if 'nuccore_uids' in sdict[key].keys():
        nuccores.append(key)
    if 'biosample_uid' in sdict[key].keys():
        biosamples.append(key)
print len(pubmeds),len(nuccores),len(biosamples)
        

['pubmed_uids', 'nuccore_uids', 'biosample_uids']
7
44
380
166 99 445


In [99]:

# Scratch: how dict.update merges two dicts.
d3 = {'this':12,'that':'for'}
d4 = {'this':24,'those':'next'}
print d3, d4
# dict1.update(dict2) adds dict2's items to dict1 in place; where keys overlap, dict2's value replaces dict1's
d3.update(d4)
print d3, d4

{'this': 12, 'that': 'for'} {'this': 24, 'those': 'next'}
{'this': 24, 'those': 'next', 'that': 'for'} {'this': 24, 'those': 'next'}


In [103]:
# Scratch: value comparison and key-membership test on the merged dict.
if d3['this']==24:
    print "yes"
if 'wait' in d3.keys():
    print "in"
else:
    print "not in"

yes
not in


In [109]:
# Scratch: + builds a new list and leaves both operands unchanged.
old = [2,4,6]
new = [7,8,9]
old+new

[2, 4, 6, 7, 8, 9]

In [111]:
# Scratch: extend mutates old in place, so re-running this cell appends new again.
old.extend(new)
old

[2, 4, 6, 7, 8, 9]