In [1]:
import urllib, pandas as pd, re
from bs4 import BeautifulSoup

# Get a list of Biosample UIDs to scrape data for

### For now, only going to look for MIMS environmental metagenomes that correspond to sediment, soil, or water (prokaryotic only)

see [this link](https://submit.ncbi.nlm.nih.gov/biosample/template/) for other biosample types and attributes

note -- at experiment accession page, can see results in xml form and there's a lot more info in it. maybe revisit later
e.g. see https://www.ncbi.nlm.nih.gov/sra/SRX2177857[accn]?report=FullXML

In [3]:
#remember to change retmax to 100,000 
f = urllib.urlopen('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=biosample&term=MIMS+AND+sediment+OR+MIMS+AND+soil+OR+MIMS+AND+sediment&retmax=100000') 
biosample_id_html = BeautifulSoup(f, 'html.parser')
f.close()

ids = biosample_id_html.find_all('id')
id_list = []
for item in ids:
    id_list.append(item.string)
print id_list[0:10]
print "number of biosamples found: %s" % len(id_list)
if len(id_list)>10e5:
    print "there may be more UIDs to retrieve; adjust retstart and retmax"

id_list_10 = id_list[0:3]+id_list[300:310]+id_list[100:104]
print id_list_10

[u'5859286', u'4537377', u'4537376', u'4537375', u'4537374', u'4537373', u'3565657', u'5792885', u'5770782', u'5770781']
number of biosamples found: 3940
[u'5859286', u'4537377', u'4537376', u'4001714', u'4001656', u'4158626', u'4158625', u'4396782', u'3838322', u'3458958', u'4335888', u'4335887', u'4335886', u'5507008', u'5507007', u'5507006', u'5507005']


# Extract metadata 

## Start with biosample data

But first, check whether there is an experimental sra accession corresponding with it - if there isn't, skip this uid; if there's more than one, loop through each one and gather srx and biosample metadata for each; if there's exactly one, go ahead and gather srx and biosample metadata for it

if data in a certain field doesn't exist, field should be left blank

### first define some functions:
   * get_biosample_metadata scrapes metadata from the biosample page 
   * get_srx_metadata scrapes metadata from the srx accession page

In [52]:
#define function to scrape biosample metadata, given biosample id from html list and dict to insert results into 

def get_biosample_metadata(url,dict):
    b = urllib.urlopen(url)
    biosample_page = BeautifulSoup(b,'html.parser')
    b.close()
    
    uid = biosample_page.find('dl',{'class':'rprtid'})
    biosample_uid = uid.dd.string
    dict["biosample_uid"] = biosample_uid
    
    biosample_link = "https://www.ncbi.nlm.nih.gov/biosample/"+id
    dict["biosample_link"] = biosample_link

    biosample_title = biosample_page.find_all('h2',{'class':'title'})[0].string
    if biosample_title is not None:
        dict["biosample_title"] = biosample_title
    
    identifiers = biosample_page.find(text=re.compile('Sample name:',re.IGNORECASE))
    if identifiers is not None:
        identifiers_parsed = identifiers.split(';')
        sample_name = [s for s in identifiers_parsed if "ample name" in s][0].split(':')[1]
        dict["sample_name"] = sample_name
    
    docsum = biosample_page.find('div',{'class':'docsum'})
    org = docsum.find('dt',text="Organism")
    if org is not None:
        organism = org.next_sibling.a.get_text()
        dict["organism"] = organism
    env = docsum.find('dt',text="Package")
    if env is not None:
        environmental_package = env.next_sibling.a.get_text()
        dict["environmental_package"] = environmental_package
    
    attribute_table = docsum.find('dt',text='Attributes').next_sibling.find_all('tr')
    for row in range(0,len(attribute_table)):
        key = attribute_table[row].th.string
        value = attribute_table[row].td.string
        dict[key] = value

    des = docsum.find('dt',text="Description")
    if des is not None:
        description = des.next_sibling.get_text()
        dict["description"] = description
    
    proj = docsum.find('dt',text="BioProject")
    if proj is not None:
        bioproject = proj.next_sibling.a
        bioproject_link = list(["https://www.ncbi.nlm.nih.gov"+bioproject.get('href')])
        bioproject_id = list([bioproject.string])
        dict["bioproject_link"] = bioproject_link
        dict["bioproject_id"] = bioproject_id

    sub = docsum.find('dt',text="Submission")
    if sub is not None:
        submission = sub.next_sibling.string
        dict["submission"] = submission


In [62]:
def get_srx_metadata(url,dict):
    x = urllib.urlopen(url)
    srx_page_new = BeautifulSoup(x,'html.parser')
    x.close()
    
    #add experiment accession id and url info
    srx_id = srx_page_new.find('b').a.get_text()
    dict["experiment_id"] = srx_id

    srx_link = "https://www.ncbi.nlm.nih.gov"+srx_page_new.find('b').a.get('href')+"?report=Full"
    dict["experiment_link"] = srx_link
    
    #add experimental design info
    descriptor = srx_page_new.find_all('div',{'class':"sra-full-data"})
    for text in range(0,len(descriptor)):
        if "Design" in descriptor[text].get_text():
            key = descriptor[text].get_text().split(':')[0]
            value = descriptor[text].get_text().split(':')[1]
            dict[key] = value
            
    #add library info (details about sequencing technology, approach, etc)
    library_table = srx_page_new.find('div',{'class':'expand showed sra-full-data'}).find('div',{'class':'expand-body'})

    for field in range(0,len(library_table.find_all('div'))):
        key = "experiment_"+library_table.find_all('div')[field].get_text().split(":")[0].lower()
        value = library_table.find_all('div')[field].get_text().split(":")[1].lower()
        dict[key] = value
        
    #add sequence stats info from run table
    run_table = srx_page_new.find_all('table')
    
    for table in range(0,len(run_table)):

        #https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=
        table_head = run_table[table].find('thead').find_all('th')
        table_rows = run_table[table].find('tbody').find_all('tr')

        for column in range(0,len(table_head)):
            key = table_head[column].get_text()
            values = []
            for row in range(0,len(table_rows)):
                table_body = table_rows[row].find_all('td')
                value = table_body[column].get_text()
                values.append(value)
            dict[key] = values

    #rename some dictionary keys to be more descriptive
    if "Run" in dict_metadata.keys():
        dict["run_id"] = dict.pop("Run")
    if "# of Spots" in dict_metadata.keys():
        dict["no_of_runs"] = dict.pop("# of Spots")
    if "# of Bases" in dict_metadata.keys():
        dict["no_of_bases"] = dict.pop("# of Bases")
    if "Published" in dict_metadata.keys():
        dict["date_sequences_published"] = dict.pop("Published")
    

### For each id in our id list, scrape biosample and SRX accession metadata and insert into dictionary as key:value pair for that id. 

#### Then append dictionary for that uid to metadata dataframe

In [None]:
metadata = pd.DataFrame()

for index, id in enumerate(id_list):
    print "currently processing biosample UID #%s - " % index + "%s" % id.string
    
    dict_metadata = {}
    
    #determine url to use to scrape biosample metadata
    biosample_search_term = "https://www.ncbi.nlm.nih.gov/biosample/?term="+id.string+"[uid]"
    
    ##check for srx accession corresponding to biosample uid
    sra_search_term = "https://www.ncbi.nlm.nih.gov/sra?LinkName=biosample_sra&from_uid="+id.string
    s = urllib.urlopen(sra_search_term)
    srx_page = BeautifulSoup(s,'html.parser')
    s.close()
    
    accessions = srx_page.find_all('dl',{'class':'rprtid'})
    
    if len(accessions)==0:     #srx accession doesn't exist, skip this biosample uid
        continue
    elif len(accessions)>1:    #if more than one item, means got to multiple accessions page; find each srx accession no and loop through each
        for accession in range(0,len(accessions)):
            #define url to srx accession to scrape metadata from
            srx_url = "https://www.ncbi.nlm.nih.gov/sra/"+accessions[accession].dd.string+"[accn]?report=Full"
            #scrape biosample metadata into dict
            get_biosample_metadata(url=biosample_search_term,dict=dict_metadata)
            #scrape srx additional metadata
            get_srx_metadata(url=srx_url,dict=dict_metadata)
            #write dict_metadata with metadata fields to pandas dataframe, with each row being a diff id
            #dict_df = pd.DataFrame.from_dict(dict_metadata)
            metadata = metadata.append(dict_metadata,ignore_index=True) #metadata.append(dict_df,ignore_index=True)

    elif len(accessions)==1:
        #define url where srx accession is
        srx_url = "https://www.ncbi.nlm.nih.gov"+srx_page.find_all('b')[0].a.get('href')+"?report=Full"
        #scrape biosample metadata into dict
        get_biosample_metadata(url=biosample_search_term,dict=dict_metadata)
        #scrape srx additional metadata
        get_srx_metadata(url=srx_url,dict=dict_metadata)
        #write dict_metadata with metadata fields to pandas dataframe, with each row being a diff id
        #dict_df = pd.DataFrame.from_dict(dict_metadata)
        metadata = metadata.append(dict_metadata,ignore_index=True) #metadata.append(dict_df,ignore_index=True)


currently processing biosample UID #0 - 5859286
currently processing biosample UID #1 - 4537377
currently processing biosample UID #2 - 4537376
currently processing biosample UID #3 - 4537375
currently processing biosample UID #4 - 4537374
currently processing biosample UID #5 - 4537373
currently processing biosample UID #6 - 3565657
currently processing biosample UID #7 - 5792885
currently processing biosample UID #8 - 5770782
currently processing biosample UID #9 - 5770781

In [64]:
print "number of SRX experiment IDs with metadata saved: %s" % len(metadata)


number of SRX experiment IDs with metadata saved: 202


In [68]:
#save metadata dataframe to csv
metadata.to_csv("metadata_200SRA.csv",encoding='utf-8')

In [82]:
url = "https://www.ncbi.nlm.nih.gov/sra?LinkName=biosample_sra&from_uid="+"4461595"
s = urllib.urlopen(url)
search_page = BeautifulSoup(s,'html.parser')
s.close()

In [83]:
print search_page.prettify()

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head xmlns:xi="http://www.w3.org/2001/XInclude">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <!-- meta -->
  <meta content="index,nofollow,noarchive" name="robots"/>
  <meta content="entrez" name="ncbi_app"/>
  <meta content="sra" name="ncbi_db"/>
  <meta content="full" name="ncbi_report"/>
  <meta content="html" name="ncbi_format"/>
  <meta content="20" name="ncbi_pagesize"/>
  <meta content="default" name="ncbi_sortorder"/>
  <meta content="1" name="ncbi_pageno"/>
  <meta content="1" name="ncbi_resultcount"/>
  <meta content="biosample_sra" name="ncbi_linkname"/>
  <meta content="biosample" name="ncbi_dbfrom"/>
  <meta content="link" name="ncbi_op"/>
  <meta content="full" name="ncbi_pdid"/>
  <meta content="3969304F7FC01191_0018SID" name="ncbi_session

In [74]:
url2 = "https://www.ncbi.nlm.nih.gov/biosample/?term="+"4461596"+"[uid]"
b = urllib.urlopen(url2)
bio_page2 = BeautifulSoup(b,'html.parser')
b.close()
print bio_page2.prettify()

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head xmlns:xi="http://www.w3.org/2001/XInclude">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <!-- meta -->
  <meta content="index,nofollow,noarchive" name="robots"/>
  <meta content="entrez" name="ncbi_app"/>
  <meta content="biosample" name="ncbi_db"/>
  <meta content="4461596[uid]" name="ncbi_term"/>
  <meta content="full" name="ncbi_report"/>
  <meta content="html" name="ncbi_format"/>
  <meta content="20" name="ncbi_pagesize"/>
  <meta content="default" name="ncbi_sortorder"/>
  <meta content="1" name="ncbi_pageno"/>
  <meta content="1" name="ncbi_resultcount"/>
  <meta content="search" name="ncbi_op"/>
  <meta content="full" name="ncbi_pdid"/>
  <meta content="396A5CFA7FC003C1_0129SID" name="ncbi_sessionid"/>
  <meta content="all" name="ncbi_filter"

In [78]:
srx_url

u'https://www.ncbi.nlm.nih.gov/sra/SRX1639104[accn]?report=Full'