In [199]:
import urllib
from lxml import etree
from datetime import datetime

# Auto ingestion of SRA metadata

1. Get a list of SRA UIDs to scrape
    1. get all publically accessible SRA (SRX) accessions
    2. remove SRA UIDs already been ingested into MetaSeek DB (all 'source_db_uid' where 'source_db' = 'SRA'
    
2. Split UIDs to scrape into batches of 500

3. For each batch, scrape metadata:
    1. efetch scrape SRA
    2. elink to BioSample, Pubmed, Nuccore
    3. efetch scrape BioSamples
    4. efetch scrape Pubmed
    5. efetch scrape nuccore
    6. merge biosample, pubmed, and nuccore scrapes into SRX metadata
    
4. Merge redundant cols to MIxS-compliant MetaSeek fields (separate script)

5. Parse fields with controlled vocabularies to MIxS CV, if possible (NOTE: future error parsing)

6. Insert scraped data into MetaSeek DB directly

# 1.A - Get a list of SRA UIDs to scrape

* find out how many publicly available samples there are; make list retstarts to use 
* Call esearch api, with rotating retstarts, to get full UID list

NOTE - most comprehensive API call to find ALL the SRA samples I can find (since an empty search term returns no accessions, not all) is term=public&field=ACS; returns all publically accessible SRA UIDs

In [3]:
def get_retstart_list(url):
    #define retstarts need for get_uid_list eutilities requests - since can only get 100,000 at a time, need to make multiple queries to get total list
    #find out count of UIDs going to pull from SRA
    g = urllib.urlopen(url)
    count_tree = etree.parse(g)
    g.close()
    count_xml = count_tree.getroot()
    num_uids = count_xml.findtext("Count")
    print 'number of publicly available UIDs in SRA: %s' % num_uids
    num_queries = 1+int(num_uids)/100000  #number of queries to do, with shifting retstart
    retstart_list = [i*100000 for i in range(num_queries)]
    print 'retstarts to use: %s' % retstart_list
    return retstart_list

def get_uid_list(ret_list):
    #scrape UIDs into list
    uid_list = []
    for retstart in ret_list:
        f = urllib.urlopen('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&tool=metaseq&email=metaseekcloud%40gmail.com&retmax=100000&retstart='+str(retstart))
        uid_tree = etree.parse(f)
        f.close()
        uid_xml = uid_tree.getroot()
        print "appending %s accessions" % len(uid_xml.find("IdList").findall("Id"))
        #add uids to list of accessions
        for id in uid_xml.find("IdList").iterchildren():
            value = id.text
            uid_list.append(value)
    return uid_list

In [4]:
retstart_list = get_retstart_list(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&rettype=count&tool=metaseq&email=metaseekcloud%40gmail.com')

number of publicly available UIDs in SRA: 2694787
retstarts to use: [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000, 1300000, 1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000, 2100000, 2200000, 2300000, 2400000, 2500000, 2600000]


In [5]:
uid_list = get_uid_list(ret_list=retstart_list)

appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 94787 accessions


# TODO 1.B - Remove SRA UIDs already in in MetaSeek DB

db_source_uid where db_source = 'SRA'

# 2 - Split UIDs to scrape into batches of 500

In [6]:
def get_batches(uid_list):
    starts = range(0,len(uid_list),500)
    ends = range(500,len(uid_list),500)
    ends.append(len(uid_list))
    batches = [list(a) for a in zip(starts, ends)]
    return batches

In [7]:
batches = get_batches(uid_list)

In [8]:
print len(batches)
print batches[0:10]
print batches[-10:]


5390
[[0, 500], [500, 1000], [1000, 1500], [1500, 2000], [2000, 2500], [2500, 3000], [3000, 3500], [3500, 4000], [4000, 4500], [4500, 5000]]
[[2690000, 2690500], [2690500, 2691000], [2691000, 2691500], [2691500, 2692000], [2692000, 2692500], [2692500, 2693000], [2693000, 2693500], [2693500, 2694000], [2694000, 2694500], [2694500, 2694787]]


# 3.A - Efetch scrape SRA metadata

In [486]:
def get_srx_metadata(batch_uid_list):
    print "Querying API and parsing XML..."
    s_parse_time = datetime.now()
    srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com'
    for key in batch_uid_list:
        #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
        srx_url = srx_url+'&id='+str(key)
    #srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(batch_uid_list)[1:-1]
    s = urllib.urlopen(srx_url)
    sra_tree = etree.parse(s)
    s.close()
    sra_xml = sra_tree.getroot()
    print "...parsing done for %s srxs in %s" % (len(batch_uid_list),(datetime.now()-s_parse_time))

    sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
    sdict = {}

    for which,sra_sample in enumerate(sra_samples): #the order of experiment_packages ARE in order of sra ids given - that's good
        print "--scraping srx metadata for sample %s out of %s" % (which+1,len(sra_samples))
        srx_dict = {}

        srx_uid = str(batch_uid_list[which])
        srx_dict['db_source_uid'] = srx_uid
        srx_dict['db_source'] = 'SRA'
        srx_dict['expt_link'] = "https://www.ncbi.nlm.nih.gov/sra/"+str(srx_uid)

        #There are 7 top tag groups. Have to scrape data a little different for each: ['EXPERIMENT','SUBMISSION','Organization','STUDY','SAMPLE','Pool','RUN_SET']

        ###EXPERIMENT -
        if sra_sample.find("EXPERIMENT").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None:
            srx_dict['expt_id'] = sra_sample.find("EXPERIMENT").find("IDENTIFIERS").findtext("PRIMARY_ID")
        if sra_sample.find("EXPERIMENT").findtext("TITLE") is not None:
            srx_dict['expt_title'] = sra_sample.find("EXPERIMENT").findtext("TITLE")
        if sra_sample.find("EXPERIMENT").find("STUDY_REF").find("IDENTIFIERS") is not None:
            if sra_sample.find("EXPERIMENT").find("STUDY_REF").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None:
                srx_dict["study_id"] = sra_sample.find("EXPERIMENT").find("STUDY_REF").find("IDENTIFIERS").findtext("PRIMARY_ID")   
        if sra_sample.find("EXPERIMENT").find("DESIGN").findtext("DESIGN_DESCRIPTION") is not None:
            srx_dict['expt_design_description'] = sra_sample.find("EXPERIMENT").find("DESIGN").findtext("DESIGN_DESCRIPTION")
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("SAMPLE_DESCRIPTOR").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None:
            srx_dict['sample_id'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("SAMPLE_DESCRIPTOR").find("IDENTIFIERS").findtext("PRIMARY_ID")
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_NAME") is not None:
            srx_dict['library_name'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_NAME")
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_STRATEGY") is not None:
            srx_dict['library_strategy'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_STRATEGY")
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_SOURCE").lower() is not None:
            srx_dict['library_source'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_SOURCE").lower()
        ###change library_selection to MIxS field library_screening_strategy (cv for SRA, not for MIxS)
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_SELECTION") is not None:
            srx_dict['library_screening_strategy'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_SELECTION")
        ###change library_layout to MIxS field library_construction_method - cv single | paired
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_LAYOUT") is not None:
            srx_dict['library_construction_method'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").find("LIBRARY_LAYOUT").getchildren()[0].tag.lower()
        if sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_CONSTRUCTION_PROTOCOL") is not None:
            srx_dict['library_construction_protocol'] = sra_sample.find("EXPERIMENT").find("DESIGN").find("LIBRARY_DESCRIPTOR").findtext("LIBRARY_CONSTRUCTION_PROTOCOL")
        ###change platform to MIxS field sequencing_method - cv in SRA (not in MIxS)
        if sra_sample.find("EXPERIMENT").find("PLATFORM").getchildren() is not None:
            srx_dict['sequencing_method'] = sra_sample.find("EXPERIMENT").find("PLATFORM").getchildren()[0].tag.lower()
            if sra_sample.find("EXPERIMENT").find("PLATFORM").getchildren()[0].findtext("INSTRUMENT_MODEL") is not None:
                srx_dict['instrument_model'] = sra_sample.find("EXPERIMENT").find("PLATFORM").getchildren()[0].findtext("INSTRUMENT_MODEL")
        
        ###SUBMISSION - just need the submission id
        if sra_sample.find("SUBMISSION").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None:
            srx_dict['submission_id'] = sra_sample.find("SUBMISSION").find("IDENTIFIERS").findtext("PRIMARY_ID")

        ###Organization - name, address, and contact
        if sra_sample.find("Organization").findtext("Name") is not None:
            srx_dict['organization_name'] = sra_sample.find("Organization").findtext("Name")
        if sra_sample.find("Organization").find("Address") is not None:
            address = ''
            for line in sra_sample.find("Organization").find("Address").iterchildren():
                address = address+line.text+', '
            address = address[:-2]
            srx_dict['organization_address'] = address
        if len(sra_sample.find("Organization").findall("Contact"))>0:
            contacts = []
            for contact in sra_sample.find("Organization").findall("Contact"):
                name = contact.find("Name").find("First").text+' '+contact.find("Name").find("Last").text
                email = contact.get('email')
                contacts.append(name+', '+email)
            srx_dict['organization_contacts'] = contacts
        
        ###STUDY -
        if sra_sample.find("STUDY").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None: 
            srx_dict['study_id'] = sra_sample.find("STUDY").find("IDENTIFIERS").findtext("PRIMARY_ID")
        if len(sra_sample.find("STUDY").find("IDENTIFIERS").findall("EXTERNAL_ID"))>0:
            for external in sra_sample.find("STUDY").find("IDENTIFIERS").iterchildren("EXTERNAL_ID"):
                if external.get("namespace")=='BioProject':
                    srx_dict['bioproject_id'] = external.text
        if sra_sample.find("STUDY").find("DESCRIPTOR").findtext("STUDY_TITLE") is not None:
            srx_dict['study_title'] = sra_sample.find("STUDY").find("DESCRIPTOR").findtext("STUDY_TITLE")
        ###rename existing_study_type to study_type
        if sra_sample.find("STUDY").find("DESCRIPTOR").find("STUDY_TYPE") is not None:
            if sra_sample.find("STUDY").find("DESCRIPTOR").find("STUDY_TYPE").get("existing_study_type")=="Other":
                srx_dict['study_type'] = 'Other'
                if sra_sample.find("STUDY").find("DESCRIPTOR").find("STUDY_TYPE").get("add_study_type") is not None:
                    srx_dict['study_type_other'] = sra_sample.find("STUDY").find("DESCRIPTOR").find("STUDY_TYPE").get("add_study_type")
            else:
                srx_dict['study_type'] = sra_sample.find("STUDY").find("DESCRIPTOR").find("STUDY_TYPE").get("existing_study_type")
        if sra_sample.find("STUDY").find("DESCRIPTOR").findtext("STUDY_ABSTRACT"):
            srx_dict['study_abstract'] = sra_sample.find("STUDY").find("DESCRIPTOR").findtext("STUDY_ABSTRACT")
        if sra_sample.find("STUDY").find("STUDY_LINKS") is not None:
            study_links = {}
            for study_link in sra_sample.find("STUDY").find("STUDY_LINKS").iterchildren():
                if study_link.find("XREF_LINK") is not None:
                    study_links[study_link.find("XREF_LINK").findtext("DB")] = study_link.find("XREF_LINK").findtext("ID")
                if study_link.find("URL_LINK") is not None:
                    study_links[study_link.find("URL_LINK").findtext("LABEL")] = study_link.find("URL_LINK").findtext("URL")
            srx_dict['study_links'] = study_links
        if sra_sample.find("STUDY").find("STUDY_ATTRIBUTES") is not None:
            study_attributes = {}
            for attr in sra_sample.find("STUDY").find("STUDY_ATTRIBUTES").iterchildren():
                study_attributes[attr.findtext("TAG")] = attr.findtext("VALUE")
            srx_dict['study_attributes'] = study_attributes

        ###SAMPLE - get some BioSample stuff that's in easier format here: sample id, biosample id (if exists; it should but sometimes doesn't); also title, sample name stuff, and description; rest get from biosample scraping
        if sra_sample.find("SAMPLE").find("IDENTIFIERS").findtext("PRIMARY_ID") is not None:
            srx_dict['sample_id'] = sra_sample.find("SAMPLE").find("IDENTIFIERS").findtext("PRIMARY_ID")
        if len(sra_sample.find("SAMPLE").find("IDENTIFIERS").findall("EXTERNAL_ID"))>0:
            for external in sra_sample.find("SAMPLE").find("IDENTIFIERS").iterchildren("EXTERNAL_ID"):
                if external.get("namespace")=='BioSample':
                    srx_dict['biosample_id'] = external.text
        if sra_sample.find("SAMPLE").findtext("TITLE") is not None:
            srx_dict['sample_title'] = sra_sample.find("SAMPLE").findtext("TITLE")
        if sra_sample.find("SAMPLE").find("SAMPLE_NAME") is not None: 
            if sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("TAXON_ID") is not None:
                srx_dict['ncbi_taxon_id'] = sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("TAXON_ID")
            if sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("SCIENTIFIC_NAME") is not None:
                srx_dict['taxon_scientific_name'] = sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("SCIENTIFIC_NAME")
            if sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("COMMON_NAME") is not None:
                srx_dict['taxon_common_name'] = sra_sample.find("SAMPLE").find("SAMPLE_NAME").findtext("COMMON_NAME")
        if sra_sample.find("SAMPLE").findtext("DESCRIPTION") is not None: 
            srx_dict['sample_description'] = sra_sample.find("SAMPLE").findtext("DESCRIPTION")


        ###Pool - skip, redundant

        ###RUN_SET - record stats for each run as list, for best run (maxrun, run for which total_num_reads is largest) as single value
        run_ids = []
        total_num_reads = []
        total_num_bases = []
        download_size = []
        avg_read_length = []
        baseA_count = []
        baseC_count = []
        baseG_count = []
        baseT_count = []
        baseN_count = []
        gc_percent = []
        read_quality_counts = []

        if len(sra_sample.find("RUN_SET").findall("RUN"))>0:
            srx_dict['num_runs_in_accession'] = len(sra_sample.find("RUN_SET").findall("RUN"))
            for run in sra_sample.find("RUN_SET").findall("RUN"):
                run_ids.append(run.get("accession"))
                total_num_reads.append(int(run.get("total_spots")))
                total_num_bases.append(int(run.get("total_bases")))
                download_size.append(int(run.get("size")))
                avg_read_length.append(float(run.get("total_bases"))/(float(run.find("Run").get("spot_count"))+float(run.find("Run").get("spot_count_mates"))))
                for base in run.find("Bases").findall("Base"):
                    if base.get("value")=="A":
                        baseA_count.append(int(base.get("count")))
                        countA = int(base.get("count"))
                    if base.get("value")=="C":
                        baseC_count.append(int(base.get("count")))
                        countC = int(base.get("count"))
                    if base.get("value")=="G":
                        baseG_count.append(int(base.get("count")))
                        countG = int(base.get("count"))
                    if base.get("value")=="T":
                        baseT_count.append(int(base.get("count")))
                        countT = int(base.get("count"))
                    if base.get("value")=="N":
                        baseN_count.append(int(base.get("count")))
                        countN = int(base.get("count"))
                gc_percent.append(float(countG+countC)/float(countC+countG+countA+countT))
                qual_count = {}
                if run.find("Run").find("QualityCount") is not None:
                    for qual in run.find("Run").find("QualityCount").findall("Quality"):
                        qual_count[qual.get("value")] = int(qual.get("count"))
                read_quality_counts.append(qual_count)


            max_index = total_num_reads.index(max(total_num_reads))

            srx_dict['run_ids'] = run_ids
            srx_dict['run_ids_maxrun'] = run_ids[max_index]
            srx_dict['total_num_reads'] = total_num_reads
            srx_dict['total_num_reads_maxrun'] = total_num_reads[max_index]
            srx_dict['total_num_bases'] = total_num_bases
            srx_dict['total_num_bases_maxrun'] = total_num_bases[max_index]
            srx_dict['download_size'] = download_size
            srx_dict['download_size_maxrun'] = download_size[max_index]
            srx_dict['avg_read_length'] = avg_read_length
            srx_dict['avg_read_length_maxrun'] = avg_read_length[max_index]
            srx_dict['baseA_count'] = baseA_count
            srx_dict['baseA_count_maxrun'] = baseA_count[max_index]
            srx_dict['baseC_count'] = baseC_count
            srx_dict['baseC_count_maxrun'] = baseC_count[max_index]
            srx_dict['baseG_count'] = baseG_count
            srx_dict['baseG_count_maxrun'] = baseG_count[max_index]
            srx_dict['baseT_count'] = baseT_count
            srx_dict['baseT_count_maxrun'] = baseT_count[max_index]
            srx_dict['baseN_count'] = baseN_count
            srx_dict['baseN_count_maxrun'] = baseN_count[max_index]
            srx_dict['gc_percent'] = gc_percent
            srx_dict['gc_percent_maxrun'] = gc_percent[max_index]
            srx_dict['read_quality_counts'] = read_quality_counts
            srx_dict['read_quality_counts_maxrun'] = read_quality_counts[max_index]


        sdict[srx_uid] = srx_dict

    return sdict


In [150]:
batch = batches[0]
batch_uid_list = map(int,uid_list[batch[0]:batch[1]])
print len(batch_uid_list)
batch_uid_list[0:10]

500


[4124571,
 4124570,
 4124569,
 4124568,
 4124567,
 4124566,
 4124565,
 4124564,
 4124563,
 4124562]

In [562]:
sdict = get_srx_metadata(batch_uid_list=batch_uid_list)

Querying API and parsing XML...
...parsing done for 500 srxs in 0:00:24.339173
--scraping srx metadata for sample 1 out of 500
--scraping srx metadata for sample 2 out of 500
--scraping srx metadata for sample 3 out of 500
--scraping srx metadata for sample 4 out of 500
--scraping srx metadata for sample 5 out of 500
--scraping srx metadata for sample 6 out of 500
--scraping srx metadata for sample 7 out of 500
--scraping srx metadata for sample 8 out of 500
--scraping srx metadata for sample 9 out of 500
--scraping srx metadata for sample 10 out of 500
--scraping srx metadata for sample 11 out of 500
--scraping srx metadata for sample 12 out of 500
--scraping srx metadata for sample 13 out of 500
--scraping srx metadata for sample 14 out of 500
--scraping srx metadata for sample 15 out of 500
--scraping srx metadata for sample 16 out of 500
--scraping srx metadata for sample 17 out of 500
--scraping srx metadata for sample 18 out of 500
--scraping srx metadata for sample 19 out of 500

In [160]:
sdict.keys()[0:5]
sdict['4123344']

{'avg_read_length': [243.1248156205851],
 'avg_read_length_maxrun': 243.1248156205851,
 'baseA_count': [26718113],
 'baseA_count_maxrun': 26718113,
 'baseC_count': [35901728],
 'baseC_count_maxrun': 35901728,
 'baseG_count': [35257176],
 'baseG_count_maxrun': 35257176,
 'baseN_count': [54440],
 'baseN_count_maxrun': 54440,
 'baseT_count': [27006982],
 'baseT_count_maxrun': 27006982,
 'bioproject_id': 'PRJNA388450',
 'biosample_id': 'SAMN07189570',
 'db_source': 'SRA',
 'db_source_uid': '4123344',
 'download_size': [71321164],
 'download_size_maxrun': 71321164,
 'expt_design_description': '250 paired ends reads of fragmented DNA',
 'expt_id': 'SRX2880321',
 'expt_link': 'https://www.ncbi.nlm.nih.gov/sra/4123344',
 'expt_title': 'whole genome sequencing of gram-negative bacteria',
 'gc_percent': [0.5698000109685789],
 'gc_percent_maxrun': 0.5698000109685789,
 'instrument_model': 'Illumina MiSeq',
 'library_construction_method': 'paired',
 'library_name': 'MB750',
 'library_screening_stra

In [472]:
[key for key in batch_uid_list if str(key) not in sdict.keys()]

[4123219,
 4123218,
 4123217,
 4123216,
 4122975,
 4122974,
 4122973,
 4122972,
 4122971,
 4122970,
 4122969]

In [13]:
sra_xml.getchildren()

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [14]:
sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
sdict = {}
sra_samples

[<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>,
 <Element EXPERIMENT_PACKAGE at 0x1038a1878>]

In [15]:
sra_sample = sra_samples[0]
sra_sample

<Element EXPERIMENT_PACKAGE at 0x1045f5cb0>

In [20]:
sra_sample.getchildren()

[<Element EXPERIMENT at 0x103647cf8>,
 <Element SUBMISSION at 0x103789518>,
 <Element Organization at 0x103789248>,
 <Element STUDY at 0x10332d5a8>,
 <Element SAMPLE at 0x10332d908>,
 <Element Pool at 0x10332d488>,
 <Element RUN_SET at 0x10332da70>]

In [84]:
if sra_sample.find("SAMPLE").findtext("DESCRIPTION") is not None: 
    print sra_sample.find("SAMPLE").findtext("DESCRIPTION")

rkd mutant ovules late phase A


In [515]:
print "Querying API and parsing XML..."
s_parse_time = datetime.now()
srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com'
for key in batch_uid_list:
    #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
    srx_url = srx_url+'&id='+str(key)
#srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(batch_uid_list)[1:-1]

s = urllib.urlopen(srx_url)
sra_tree = etree.parse(s)
s.close()
sra_xml = sra_tree.getroot()
print "...parsing done for %s srxs in %s" % (len(batch_uid_list),(datetime.now()-s_parse_time))

sra_samples = sra_xml.findall("EXPERIMENT_PACKAGE")
experiment_ids = []
for sample in sra_samples:
    experiment_ids.append(sample.find("EXPERIMENT").get("accession"))

Querying API and parsing XML...
...parsing done for 500 srxs in 0:00:23.755727


In [516]:
print len(sra_samples)
for which,xpt in enumerate(sra_samples):
    if 'SRX2880196' in xpt.find("EXPERIMENT").get("accession"):
        print xpt, which
        print batch_uid_list[which]

500


In [509]:
print len(set(batch_uid_list))
all_experiment_ids = []
for key in batch_uid_list:
    srx_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(key)
    s = urllib.urlopen(srx_url)
    sra_tree = etree.parse(s)
    s.close()
    sra_xml = sra_tree.getroot()
    xpt = sra_xml.find("EXPERIMENT_PACKAGE").find("EXPERIMENT").get("accession")
    all_experiment_ids.append(xpt)

500


AttributeError: 'NoneType' object has no attribute 'find'

In [512]:
retstart_list = get_retstart_list(url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=public&field=ACS&rettype=count&tool=metaseq&email=metaseekcloud%40gmail.com')
uid_list = get_uid_list(ret_list=retstart_list)
batches = get_batches(uid_list)

number of publicly available UIDs in SRA: 2705494
retstarts to use: [0, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000, 1300000, 1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000, 2100000, 2200000, 2300000, 2400000, 2500000, 2600000, 2700000]
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 accessions
appending 100000 acces

In [514]:
batch_uid_list = map(int,uid_list[batches[0][0]:batches[0][1]])
print len(batch_uid_list)
print batch_uid_list[0:10]

500
[4133271, 4133269, 4133268, 4133267, 4133266, 4133265, 4133263, 4133262, 4133261, 4133260]


# 3.B - elink to BioSample, Pubmed, Nuccore

In [203]:
#asking for a friend - how many of the sra uids have elinks to biosample?
elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'

for key in batch_uid_list:
    #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
    elink_url = elink_url+'&id='+str(key)

In [None]:
elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com'
for batch in batches:
    print "processing batch %s" % batch
    batch_uid_list = map(int,uid_list[batch[0]:batch[1]])

    for key in batch_uid_list:
        elink_url = elink_url+'&id='+str(key)
    try:

        e = urllib.urlopen(elink_url)
        link_tree = etree.parse(e)
        e.close()
        link_xml = link_tree.getroot()
        print link_xml
    except etree.XMLSyntaxError, detail:
        print detail.error_log

    #note if there's no biosample link, <LinkSetDb> with <DbTo>=='biosample' just won't exist
    biosample_ids = []
    pubmed_ids = []
    linksets = link_xml.findall("LinkSet")
    for link in linksets:
        linkdb = link.findall("LinkSetDb")
        if len(linkdb)>0:
            for db in linkdb:
                if db.findtext("DbTo")=='biosample':
                    biosample_ids.append(int(db.find("Link").findtext("Id")))
                if db.findtext("DbTo")=='pubmed':
                    pubmed_ids.append(int(db.find("Link").findtext("Id")))



In [1]:
e = urllib.urlopen(elink_url)
link_tree = etree.parse(e)

NameError: name 'urllib' is not defined

In [569]:
#takes list of SRX UIDs to query (batch_uid_list), and sdict into which to insert link uids; 
#return sdict with 'biosample_uid', 'pubmed_uids', and/or 'nuccore_uids' inserted; and link dict with lists of biosample_uids, pubmed_uids, and nuccore_uids to scrape
def get_links(batch_uid_list, sdict):
    print "sending elink request and parsing XML..."
    e_parse_time = datetime.now()
    elink_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=sra&db=biosample,pubmed,nuccore&tool=metaseq&email=metaseekcloud%40gmail.com'
    for key in batch_uid_list:
        #this makes url with end &id=###&id=###&id=### - returns a set of links in order of sra uids
        elink_url = elink_url+'&id='+str(key)
    #run api request and parse xml
    e = urllib.urlopen(elink_url)
    link_tree = etree.parse(e)
    e.close()
    link_xml = link_tree.getroot()
    print "...parsing done in %s" % (datetime.now()-e_parse_time)

    print "finding links..."
    #scrape elink info
    #note if there's no biosample link, <LinkSetDb> with <DbTo>=='biosample' just won't exist
    biosample_uids = []
    pubmed_uids = []
    nuccore_uids = []

    linksets = link_xml.findall("LinkSet")
    for linkset in linksets:
        srx_uid = linkset.find("IdList").findtext("Id")
        #links from each target db will be in a tab called "LinkSetDb"
        if len(linkset.findall("LinkSetDb"))>0:
            for link in linkset.findall("LinkSetDb"):
                id_set = []
                if link.findtext("DbTo")=='biosample':
                    #for all Links, get Ids
                    for uid in link.findall("Link"):
                        id_set.append(int(uid.findtext("Id")))
                    biosample_uids.extend(id_set)
                    sdict[srx_uid]['biosample_uid'] = id_set
                elif link.findtext("DbTo")=='pubmed':
                    for uid in link.findall("Link"):
                        id_set.append(int(uid.findtext("Id")))
                    pubmed_uids.extend(id_set)
                    sdict[srx_uid]['pubmed_uids'] = id_set
                elif link.findtext("DbTo")=='nuccore':
                    for uid in link.findall("Link"):
                        id_set.append(int(uid.findtext("Id")))
                    nuccore_uids.extend(id_set)
                    sdict[srx_uid]['nuccore_uids'] = id_set

    biosample_uids = list(set(biosample_uids))
    pubmed_uids = list(set(pubmed_uids))
    nuccore_uids = list(set(nuccore_uids))
    
    linkdict = {'biosample_uids':biosample_uids,'pubmed_uids':pubmed_uids,'nuccore_uids':nuccore_uids}
    print "number of biosamples to scrape: %s" % len(linkdict['biosample_uids'])
    print "number of pubmeds to scrape: %s" % len(linkdict['pubmed_uids'])
    print "number of nuccores to scrape: %s" % len(linkdict['nuccore_uids'])
    
    return sdict,linkdict

In [461]:
linkset.find("IdList").findtext("Id")

'4122969'

In [463]:
len(sdict.keys())

489

In [568]:
sdict, linkdict = get_links(batch_uid_list=batch_uid_list,sdict=sdict)

sending elink request and parsing XML...
...parsing done in 0:01:52.811588
...parsing done!
finding links...
number of biosamples to scrape: 446
number of pubmeds to scrape: 3
number of nuccores to scrape: 61


In [449]:
srx_uid in elink_url

True

In [171]:
print linkdict.keys()
print len(linkdict['pubmed_uids'])
print len(linkdict['nuccore_uids'])
print len(linkdict['biosample_uids'])
pubmeds = []
nuccores = []
biosamples = []
for key in sdict.keys():
    if 'pubmed_uids' in sdict[key].keys():
        pubmeds.append(key)
    if 'nuccore_uids' in sdict[key].keys():
        nuccores.append(key)
    if 'biosample_uid' in sdict[key].keys():
        biosamples.append(key)
print len(pubmeds),len(nuccores),len(biosamples)
        

['pubmed_uids', 'nuccore_uids', 'biosample_uids']
7
44
380
166 99 445


# 3.C - efetch scrape BioSamples

In [522]:
def get_biosample_metadata(batch_uid_list,bdict):
    #some stuff already captured with SRA - just get publication_date, Models, Package, and Attributes
    #and links?
    print "Querying API and parsing XML..."
    b_parse_time = datetime.now()
    biosample_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=biosample&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(batch_uid_list)[1:-1]
    b = urllib.urlopen(biosample_url)
    bio_tree = etree.parse(b)
    b.close()
    bio_xml = bio_tree.getroot()
    print "...parsing done for %s biosamples in %s" % (len(batch_uid_list),(datetime.now()-b_parse_time))
    
    biosamples = bio_xml.findall("BioSample")
    
    for which,biosample in enumerate(biosamples): 
        print "--scraping biosample metadata for sample %s out of %s" % (which+1,len(biosamples))
        bio_dict = {}

        bio_id = str(batch_uid_list[which])
        bio_dict['biosample_uid'] = bio_id
        bio_dict['biosample_link'] = "https://www.ncbi.nlm.nih.gov/biosample/"+str(bio_id)
        #publication date
        if biosample.get('publication_date') is not None:
            bio_dict['metadata_publication_date'] = datetime.strptime(biosample.get('publication_date'), '%Y-%m-%dT%H:%M:%S.%f')
        #if Package exists, probably don't need Models (but get them anyway); from package/models will parse investigation_type and env_package
        if biosample.findtext("Package") is not None:
            bio_dict['biosample_package'] = biosample.findtext("Package")
        if biosample.find("Models") is not None:
            models = []
            for model in biosample.find("Models").findall("Model"):
                models.append(model.text)
            bio_dict['biosample_models'] = models

        #Attributes - loop through attributes; save all as dict in single column (parse later)
        if biosample.find("Attributes") is not None:
            attr = {}
            for attribute in biosample.find("Attributes").findall("Attribute"):
                attr_value = attribute.text
                if attr_value=="missing" or attr_value=="Missing" or attr_value=="not applicable" or attr_value=="Not Applicable" or attr_value=="N/A": #if value is any of several None standins, change to None
                    attr_value = None
                if attribute.get("harmonized_name") is not None:
                    attr[attribute.get("harmonized_name")] = attr_value
                elif attribute.get("attribute_name") is not None:
                    attr[attribute.get("attribute_name")] = attr_value
            bio_dict['sample_attributes'] = attr
    
        bdict[bio_id] = bio_dict
        
    return bdict
    

In [556]:
#split into batches just in case there are >500 UIDs (note if there are <500 will just be a list of one batch)
biosample_batches = get_batches(uid_list=linkdict['biosample_uids'])
bdict = {}
for b_batch_ix,b_batch in enumerate(biosample_batches):
    print "processing biosample batch %s out of %s......" % (b_batch_ix+1,len(biosample_batches))
    biosample_batch_uids = map(int,linkdict['biosample_uids'][b_batch[0]:b_batch[1]])
    bdict = get_biosample_metadata(batch_uid_list=biosample_batch_uids,bdict=bdict)

processing biosample batch 1 out of 1......
Querying API and parsing XML...
...parsing done for 446 biosamples in 0:00:01.390574
--scraping biosample metadata for sample 1 out of 446
--scraping biosample metadata for sample 2 out of 446
--scraping biosample metadata for sample 3 out of 446
--scraping biosample metadata for sample 4 out of 446
--scraping biosample metadata for sample 5 out of 446
--scraping biosample metadata for sample 6 out of 446
--scraping biosample metadata for sample 7 out of 446
--scraping biosample metadata for sample 8 out of 446
--scraping biosample metadata for sample 9 out of 446
--scraping biosample metadata for sample 10 out of 446
--scraping biosample metadata for sample 11 out of 446
--scraping biosample metadata for sample 12 out of 446
--scraping biosample metadata for sample 13 out of 446
--scraping biosample metadata for sample 14 out of 446
--scraping biosample metadata for sample 15 out of 446
--scraping biosample metadata for sample 16 out of 446


In [524]:
bdict[bdict.keys()[250]]

{'biosample_link': 'https://www.ncbi.nlm.nih.gov/biosample/7173343',
 'biosample_models': ['Pathogen.env'],
 'biosample_package': 'Pathogen.env.1.0',
 'biosample_uid': '7173343',
 'metadata_publication_date': datetime.datetime(2017, 5, 26, 0, 0),
 'sample_attributes': {'attribute_package': 'environmental/food/other',
  'collected_by': 'NYAG',
  'collection_date': '2008-09-15',
  'description': 'cntn_cf_publicComments',
  'geo_loc_name': 'USA:NY',
  'host': None,
  'host_disease': None,
  'isolate_name_alias': 'CFSAN064645',
  'isolation_source': 'sponge',
  'lat_lon': None,
  'strain': '08B08300L-6'}}

# 3.D - efetch scrape Pubmeds

In [351]:
def get_pubmed_metadata(batch_uid_list,pdict):
    #some stuff already captured with SRA - just get publication_date, Models, Package, and Attributes
    #and links?
    print "Querying API and parsing XML..."
    p_parse_time = datetime.now()
    pub_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(pubmed_batch_uids)[1:-1]
    p = urllib.urlopen(pub_url)
    pub_tree = etree.parse(p)
    p.close()
    pub_xml = pub_tree.getroot()
    print "...parsing done for %s pubmeds in %s" % (len(pubmed_batch_uids),(datetime.now()-p_parse_time))
    
    pubmeds = pub_xml.findall("DocSum")
    
    for which,pubmed in enumerate(pubmeds):
        print "--scraping pubmed metadata for sample %s out of %s" % (which+1,len(pubmeds))
        pub_dict = {}
        
        pub_id = str(batch_uid_list[which])
        pub_dict['pubmed_uid'] = pub_id
        pub_dict['pubmed_link'] = "https://www.ncbi.nlm.nih.gov/pubmed/"+str(pub_id)
        
        if pubmed.find("Item[@Name='PubDate']").text is not None:
            pub_dict['pub_publication_date'] = datetime.strptime(pubmed.find("Item[@Name='PubDate']").text,"%Y %b %d")
        
        if pubmed.find("Item[@Name='AuthorList']") is not None:
            authors = []
            for author in pubmed.find("Item[@Name='AuthorList']").findall("Item[@Name='Author']"):
                authors.append(author.text)
            pub_dict['pub_authors'] = authors
            
        if pubmed.findtext("Item[@Name='Title']") is not None:
            pub_dict['pub_title'] = pubmed.findtext("Item[@Name='Title']")
        if pubmed.findtext("Item[@Name='Volume']") is not None:
            pub_dict['pub_volume'] = pubmed.findtext("Item[@Name='Volume']")
        if pubmed.findtext("Item[@Name='Issue']") is not None:
            pub_dict['pub_issue'] = pubmed.findtext("Item[@Name='Issue']")
        if pubmed.findtext("Item[@Name='Pages']") is not None:
            pub_dict['pub_pages'] = pubmed.findtext("Item[@Name='Pages']")
        if pubmed.findtext("Item[@Name='Source']") is not None:
            pub_dict['pub_journal'] = pubmed.findtext("Item[@Name='Source']")
        if pubmed.findtext("Item[@Name='DOI']") is not None:
            pub_dict['pub_doi'] = pubmed.findtext("Item[@Name='DOI']")

        pdict[pub_id] = pub_dict
    
    return pdict

In [557]:
pubmed_batches = get_batches(uid_list=linkdict['pubmed_uids'])
pdict = {}
for p_batch_ix,p_batch in enumerate(pubmed_batches):
    print "processing pubmed batch %s out of %s......" % (p_batch_ix+1,len(pubmed_batches))
    pubmed_batch_uids = map(int,linkdict['pubmed_uids'][p_batch[0]:p_batch[1]])
    pdict = get_pubmed_metadata(batch_uid_list=pubmed_batch_uids,pdict=pdict)

processing pubmed batch 1 out of 1......
Querying API and parsing XML...
...parsing done for 3 pubmeds in 0:00:00.107448
--scraping pubmed metadata for sample 1 out of 3
--scraping pubmed metadata for sample 2 out of 3
--scraping pubmed metadata for sample 3 out of 3


In [307]:
pub_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&tool=metaseq&email=metaseekcloud%40gmail.com&id='+str(pubmed_batch_uids)[1:-1]
p = urllib.urlopen(pub_url)
pub_tree = etree.parse(p)
p.close()
pub_xml = pub_tree.getroot()
pub_xml


<Element eSummaryResult at 0x10d624cb0>

In [357]:
pdict[pdict.keys()[6]]

{'pub_authors': ['Lindsey RL',
  'Trees E',
  'Sammons S',
  'Loparev V',
  'Frace M',
  'Strockbine N',
  'Sabol AL',
  'Sowers E',
  'Stripling D',
  'Martin H',
  'Knipe K',
  'Rowe L',
  'Gerner-Smidt P'],
 'pub_doi': '10.1128/genomeA.00501-14',
 'pub_issue': '4',
 'pub_journal': 'Genome Announc',
 'pub_pages': '',
 'pub_publication_date': datetime.datetime(2014, 7, 10, 0, 0),
 'pub_title': 'Draft Whole-Genome Sequences of Nine Non-O157 Shiga Toxin-Producing Escherichia coli Strains.',
 'pub_volume': '2',
 'pubmed_link': 'https://www.ncbi.nlm.nih.gov/pubmed/25013133',
 'pubmed_uid': '25013133'}

# 3.E - efetch scrape Nuccore

In [364]:
def get_nuccore_metadata(batch_uid_list,ndict):
    for which,nuccore in enumerate(batch_uid_list):
        print "--scraping nuccore metadata for sample %s out of %s" % (which+1,len(batch_uid_list))
        nuc_dict = {}
        
        nuc_id = str(nuccore)
        nuc_dict['nuccore_uid'] = nuc_id
        nuc_dict['nuccore_link'] = 'https://www.ncbi.nlm.nih.gov/nuccore/'+nuc_id
        
        ndict[nuc_id] = nuc_dict
        
    return ndict


In [558]:
nuccore_batches = get_batches(uid_list=linkdict['nuccore_uids'])
ndict = {}
for n_batch_ix,n_batch in enumerate(nuccore_batches):
    print "processing nuccore batch %s out of %s......" % (n_batch_ix+1,len(nuccore_batches))
    nuccore_batch_uids = map(int,linkdict['nuccore_uids'][n_batch[0]:n_batch[1]])
    ndict = get_nuccore_metadata(batch_uid_list=nuccore_batch_uids,ndict=ndict)

processing nuccore batch 1 out of 1......
--scraping nuccore metadata for sample 1 out of 61
--scraping nuccore metadata for sample 2 out of 61
--scraping nuccore metadata for sample 3 out of 61
--scraping nuccore metadata for sample 4 out of 61
--scraping nuccore metadata for sample 5 out of 61
--scraping nuccore metadata for sample 6 out of 61
--scraping nuccore metadata for sample 7 out of 61
--scraping nuccore metadata for sample 8 out of 61
--scraping nuccore metadata for sample 9 out of 61
--scraping nuccore metadata for sample 10 out of 61
--scraping nuccore metadata for sample 11 out of 61
--scraping nuccore metadata for sample 12 out of 61
--scraping nuccore metadata for sample 13 out of 61
--scraping nuccore metadata for sample 14 out of 61
--scraping nuccore metadata for sample 15 out of 61
--scraping nuccore metadata for sample 16 out of 61
--scraping nuccore metadata for sample 17 out of 61
--scraping nuccore metadata for sample 18 out of 61
--scraping nuccore metadata for

# Merge biosample, pubmed, and nuccore scrapes into SRX metadata

In [369]:
bdict

{'3850954': {'biosample_link': 'https://www.ncbi.nlm.nih.gov/biosample/3850954',
  'biosample_models': ['Metagenome or environmental'],
  'biosample_package': 'Metagenome.environmental.1.0',
  'biosample_uid': '3850954',
  'metadata_publication_date': datetime.datetime(2015, 7, 9, 11, 35, 49, 237000),
  'sample_attributes': {'collection_date': '18-Aug-2012',
   'geo_loc_name': 'USA: Coast Range Ophiolite, California',
   'isolation_source': 'Serpentinite rock and fluid',
   'lat_lon': '38.862 N 122.414 W'}},
 '5570637': {'biosample_link': 'https://www.ncbi.nlm.nih.gov/biosample/5570637',
  'biosample_models': ['Pathogen.cl'],
  'biosample_package': 'Pathogen.cl.1.0',
  'biosample_uid': '5570637',
  'metadata_publication_date': datetime.datetime(2016, 8, 12, 15, 35, 3, 640000),
  'sample_attributes': {'collected_by': 'CDC',
   'collection_date': 'Missing',
   'geo_loc_name': 'USA',
   'host': 'Missing',
   'host_disease': 'Missing',
   'isolation_source': 'Missing',
   'lat_lon': 'Missi

In [388]:
for srx in sdict.keys()[0:10]:
    if 'pubmed_uids' in sdict[srx].keys():
        print sdict[srx]['pubmed_uids']
        for bid in sdict[srx]['biosample_uid']:
            bdict[str(bid)]

[28472260]
[28472260]
[28472260]
[28472260]
[27365352, 26679598, 26044422, 25103754, 25013133]
[27365352, 26679598, 26044422, 25103754, 25013133]
[27365352, 26679598, 26044422, 25103754, 25013133]
[27365352, 26679598, 26044422, 25103754, 25013133]
[27365352, 26679598, 26044422, 25103754, 25013133]
[27365352, 26679598, 26044422, 25103754, 25013133]


In [570]:
def merge_scrapes(sdict,bdict,pdict,ndict):
    for srx in sdict.keys():
        if 'biosample_uid' in sdict[srx].keys():
            #append biosample metadata values to list metadata values for that field in sdict[srx] (if multiple pubmeds, e.g., for each field have list value for all pubmeds, like with run stuff)
            for bio in sdict[srx]['biosample_uid']:
                bio = str(bio)
                #don't need biosample_uid since already there 
                if 'biosample_link' in bdict[bio].keys():
                    if 'biosample_link' in sdict[srx].keys(): #biosample_link
                        sdict[srx]['biosample_link'].append(bdict[bio]['biosample_link'])
                    else:
                        sdict[srx]['biosample_link'] = [bdict[bio]['biosample_link']]
                if 'metadata_publication_date' in bdict[bio].keys():
                    if 'metadata_publication_date' in sdict[srx].keys(): #metadata_publication_date
                        sdict[srx]['metadata_publication_date'].append(bdict[bio]['metadata_publication_date'])
                    else:
                        sdict[srx]['metadata_publication_date'] = [bdict[bio]['metadata_publication_date']]
                if 'biosample_package' in bdict[bio].keys():
                    if 'biosample_package' in sdict[srx].keys(): #biosample_package
                        sdict[srx]['biosample_package'].append(bdict[bio]['biosample_package'])
                    else:
                        sdict[srx]['biosample_package'] = [bdict[bio]['biosample_package']]
                if 'biosample_models' in bdict[bio].keys():
                    if 'biosample_models' in sdict[srx].keys(): #biosample_models
                        sdict[srx]['biosample_models'].append(bdict[bio]['biosample_models'])
                    else:
                        sdict[srx]['biosample_models'] = [bdict[bio]['biosample_models']]
                if 'sample_attributes' in bdict[bio].keys():
                    if 'sample_attributes' in sdict[srx].keys(): #sample_attributes
                        sdict[srx]['sample_attributes'].append(bdict[bio]['sample_attributes'])
                    else:
                        sdict[srx]['sample_attributes'] = [bdict[bio]['sample_attributes']]
    
        if 'pubmed_uids' in sdict[srx].keys():
            #append pubmed metadata values to list metadata values for that field in sdict[srx]
            for pub in sdict[srx]['pubmed_uids']:
                pub = str(pub)
                #don't need pubmed_uid since already there 
                if 'pubmed_link' in pdict[pub].keys():
                    if 'pubmed_link' in sdict[srx].keys(): 
                        sdict[srx]['pubmed_link'].append(pdict[pub]['pubmed_link'])
                    else:
                        sdict[srx]['pubmed_link'] = [pdict[pub]['pubmed_link']]
                if 'pub_publication_date' in pdict[pub].keys():
                    if 'pub_publication_date' in sdict[srx].keys(): 
                        sdict[srx]['pub_publication_date'].append(pdict[pub]['pub_publication_date'])
                    else:
                        sdict[srx]['pub_publication_date'] = [pdict[pub]['pub_publication_date']]
                if 'pub_authors' in pdict[pub].keys():
                    if 'pub_authors' in sdict[srx].keys(): 
                        sdict[srx]['pub_authors'].append(pdict[pub]['pub_authors'])
                    else:
                        sdict[srx]['pub_authors'] = [pdict[pub]['pub_authors']]                        
                if 'pub_title' in pdict[pub].keys():
                    if 'pub_title' in sdict[srx].keys(): 
                        sdict[srx]['pub_title'].append(pdict[pub]['pub_title'])
                    else:
                        sdict[srx]['pub_title'] = [pdict[pub]['pub_title']]     
                if 'pub_volume' in pdict[pub].keys():
                    if 'pub_volume' in sdict[srx].keys(): 
                        sdict[srx]['pub_volume'].append(pdict[pub]['pub_volume'])
                    else:
                        sdict[srx]['pub_volume'] = [pdict[pub]['pub_volume']]  
                if 'pub_issue' in pdict[pub].keys():
                    if 'pub_issue' in sdict[srx].keys(): 
                        sdict[srx]['pub_issue'].append(pdict[pub]['pub_issue'])
                    else:
                        sdict[srx]['pub_issue'] = [pdict[pub]['pub_issue']]  
                if 'pub_pages' in pdict[pub].keys():
                    if 'pub_pages' in sdict[srx].keys(): 
                        sdict[srx]['pub_pages'].append(pdict[pub]['pub_pages'])
                    else:
                        sdict[srx]['pub_pages'] = [pdict[pub]['pub_pages']]  
                if 'pub_journal' in pdict[pub].keys():
                    if 'pub_journal' in sdict[srx].keys(): 
                        sdict[srx]['pub_journal'].append(pdict[pub]['pub_journal'])
                    else:
                        sdict[srx]['pub_journal'] = [pdict[pub]['pub_journal']]  
                if 'pub_doi' in pdict[pub].keys():
                    if 'pub_doi' in sdict[srx].keys(): 
                        sdict[srx]['pub_doi'].append(pdict[pub]['pub_doi'])
                    else:
                        sdict[srx]['pub_doi'] = [pdict[pub]['pub_doi']]  

        if 'nuccore_uids' in sdict[srx].keys():
            #append nuccore metadata values to list metadata values for that field in sdict[srx]
            for nuc in sdict[srx]['nuccore_uids']:
                nuc = str(nuc)
                #don't need nuccore_uid since already there - just add nuccore_link
                if 'nuccore_link' in ndict[nuc].keys():
                    if 'nuccore_link' in sdict[srx].keys(): 
                        sdict[srx]['nuccore_link'].append(ndict[nuc]['nuccore_link'])
                    else:
                        sdict[srx]['nuccore_link'] = [ndict[nuc]['nuccore_link']]
    return sdict
                        
                        

In [571]:
sdict = merge_scrapes(sdict=sdict,bdict=bdict,pdict=pdict,ndict=ndict)

In [572]:
sdict[sdict.keys()[10]]

{'avg_read_length': [149.4721691755932],
 'avg_read_length_maxrun': 149.4721691755932,
 'baseA_count': [125779901],
 'baseA_count_maxrun': 125779901,
 'baseC_count': [137830134],
 'baseC_count_maxrun': 137830134,
 'baseG_count': [139253096],
 'baseG_count_maxrun': 139253096,
 'baseN_count': [18900],
 'baseN_count_maxrun': 18900,
 'baseT_count': [125915826],
 'baseT_count_maxrun': 125915826,
 'bioproject_id': 'PRJNA242847',
 'biosample_id': 'SAMN07193952',
 'biosample_link': ['https://www.ncbi.nlm.nih.gov/biosample/7193952'],
 'biosample_models': [['Pathogen.env']],
 'biosample_package': ['Pathogen.env.1.0'],
 'biosample_uid': [7193952],
 'db_source': 'SRA',
 'db_source_uid': '4132449',
 'download_size': [315943734],
 'download_size_maxrun': 315943734,
 'expt_design_description': 'MiSeq deep shotgun sequencing of cultured isolate.',
 'expt_id': 'SRX2884651',
 'expt_link': 'https://www.ncbi.nlm.nih.gov/sra/4132449',
 'expt_title': 'Whole genome shotgun sequencing of FSIS1701045 by Illumi

In [545]:
sdict[sdict.keys()[100]]['biosample_uid']  = [6899308]

In [526]:

for key in bdict[bio].keys():
    print key, sdict[srx][key]

KeyError: '7159562'

In [423]:
sdict[srx]['biosample_uid']

KeyError: 'biosample_uid'

In [518]:
batch_uid_list = map(int,uid_list[batch[0]:batch[1]])
print "%s UIDs to scrape...... %s..." % (len(batch_uid_list),batch_uid_list[0:10])
#scrape sra metadata, return as dictionary of dictionaries; each sdict key is the SRA UID, value is a dictionary of srx metadata key/value pairs
print "scraping SRX metadata..."
sdict = get_srx_metadata(batch_uid_list=batch_uid_list)
#get link uids for any links to biosample, pubmed, and nuccore databases so can go scrape those too
print "getting elinks..."
sdict, linkdict = get_links(batch_uid_list=batch_uid_list,sdict=sdict)


500 UIDs to scrape...... [4133271, 4133269, 4133268, 4133267, 4133266, 4133265, 4133263, 4133262, 4133261, 4133260]...
scraping SRX metadata...
Querying API and parsing XML...
...parsing done for 500 srxs in 0:00:21.800152
--scraping srx metadata for sample 1 out of 500
--scraping srx metadata for sample 2 out of 500
--scraping srx metadata for sample 3 out of 500
--scraping srx metadata for sample 4 out of 500
--scraping srx metadata for sample 5 out of 500
--scraping srx metadata for sample 6 out of 500
--scraping srx metadata for sample 7 out of 500
--scraping srx metadata for sample 8 out of 500
--scraping srx metadata for sample 9 out of 500
--scraping srx metadata for sample 10 out of 500
--scraping srx metadata for sample 11 out of 500
--scraping srx metadata for sample 12 out of 500
--scraping srx metadata for sample 13 out of 500
--scraping srx metadata for sample 14 out of 500
--scraping srx metadata for sample 15 out of 500
--scraping srx metadata for sample 16 out of 500
--

In [269]:
import pandas as pd
#for batch of 500, make dataframe with correct 
testy = pd.DataFrame(columns=['this','that','other'])
line = {'this':12,'that':24}
testy.append(line, ignore_index=True)
testy.to_csv("testy.csv")

In [99]:

d3 = {'this':12,'that':'for'}
d4 = {'this':24,'those':'next'}
print d3, d4
#when run dict1.update(dict2), dict2 is added to dict1. If there's overlap in the keys, dict2 val replaces dict1
#e.g. sdict[srx].update(bdict[bio])
d3.update(d4)
print d3, d4

{'this': 12, 'that': 'for'} {'this': 24, 'those': 'next'}
{'this': 24, 'those': 'next', 'that': 'for'} {'this': 24, 'those': 'next'}


In [103]:
if d3['this']==24:
    print "yes"
if 'wait' in d3.keys():
    print "in"
else:
    print "not in"

yes
not in


In [109]:
old = [2,4,6]
new = [7,8,9]
old+new

[2, 4, 6, 7, 8, 9]

In [111]:
old.extend(new)
old

[2, 4, 6, 7, 8, 9]