## Write article ids to a file

In [14]:
import xml.etree.ElementTree as ET
import urllib2
# Maximum number of PubMed ids requested from the esearch endpoint.
retmax = 100
ids_page_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=bcl2&retmax=" + str(retmax)

# Download the esearch XML result; close the HTTP response even if the
# read fails so the connection is not left open.
ids_response = urllib2.urlopen(ids_page_url)
try:
    ids_xml_page = ids_response.read()
finally:
    ids_response.close()

# Parsed esearch result; later cells read the ids from `ids_root`.
ids_root = ET.fromstring(ids_xml_page)

# write ids to article-ids.txt, one id per line
with open('article-ids.txt', 'w') as my_file:
    for id_list in ids_root.iter('IdList'):
        # `id_node` rather than `id`, so the builtin id() is not shadowed
        # for the rest of the session.
        for id_node in id_list.iter('Id'):
            my_file.write(id_node.text.encode('utf-8') + '\n')

# Option 1: Non-Spark way to obtain the URLs

Loop through the ids, extract each article's URL, and then write the article URLs to a file.

In [None]:
abstract_url_base = "https://www.ncbi.nlm.nih.gov/pubmed/"
# write urls to article-urls.txt, one external article link per line;
# ids whose abstract page exposes no link are skipped
with open('article-urls.txt', 'w') as my_file:
    for id_list in ids_root.iter('IdList'):
        # `id_node` rather than `id`, so the builtin id() is not shadowed
        for id_node in id_list.iter('Id'):
            # build the full pubmed url
            url = abstract_url_base + id_node.text.encode('utf-8')
            source_page = urllib2.urlopen(url)
            try:
                source_page_soup = BeautifulSoup(source_page)
            finally:
                # release the HTTP connection even if parsing fails
                source_page.close()
            body = source_page_soup.body
            if body is None:
                continue
            icons_portlet = body.find('div', attrs={'class': 'icons portlet'})
            # if there is external link to the article, find the external link to the article
            if icons_portlet is None:
                continue
            link = icons_portlet.find('a')
            # Look the link up by attribute name: attrs[0][1] returns the
            # value of whichever attribute happens to come first on the tag.
            article_url = link.get('href') if link is not None else None
            if article_url:
                my_file.write(article_url.encode('utf-8') + '\n')

# Option 2: Spark way to obtain the URLs
Load the ids from a file and create a data frame. Define a UDF (user-defined function) that extracts the article URL for a given id.

In [None]:
# Read the id file (one id per line) into a single-column data frame and
# name that column 'id'.
article_ids = (
    spark.read
    .csv('article-ids.txt', sep=' ', inferSchema=True)
    .toDF('id')
)
# Preview the first five rows.
article_ids.show(5)

In [None]:
from pyspark.sql.functions import udf
from BeautifulSoup import BeautifulSoup
def get_article_url(id):
    """Return the external article link found on a PubMed abstract page.

    Fetches ``https://www.ncbi.nlm.nih.gov/pubmed/<id>``, looks for the
    ``<div class="icons portlet">`` element and returns the ``href`` of
    the first ``<a>`` inside it.

    Args:
        id: PubMed article id; converted with ``str()`` to build the url.

    Returns:
        The link target, or ``None`` when the page has no body, no
        ``icons portlet`` div, no anchor inside it, or the anchor has no
        ``href`` attribute.
    """
    # build the full pubmed url
    abstract_url_base = "https://www.ncbi.nlm.nih.gov/pubmed/"
    url = abstract_url_base + str(id)
    # extract page content from url
    source_page = urllib2.urlopen(url)
    try:
        source_page_soup = BeautifulSoup(source_page)
    finally:
        # release the HTTP connection even if parsing fails
        source_page.close()
    body = source_page_soup.body
    if body is None:
        return None
    icons_portlet = body.find('div', attrs={'class': 'icons portlet'})
    # if there is external link to the article, find the external link to the article
    if icons_portlet is None:
        return None
    link = icons_portlet.find('a')
    if link is None:
        return None
    # Look the link up by attribute name: attrs[0][1] returns the value of
    # whichever attribute happens to come first on the tag.
    return link.get('href')

from pyspark.sql.types import StringType
# Wrap the plain Python function as a Spark UDF that yields a string column.
get_article_url_udf = udf(get_article_url, returnType=StringType())

In [None]:
## Test the user defined function
# Calls the plain Python function directly (not the Spark UDF wrapper)
# on a single PubMed id.
test_id = 28386842
get_article_url(id=test_id)

In [None]:
# Apply the UDF to the 'id' column; the result is a one-column data frame
# named 'article-url'.
article_url_column = get_article_url_udf(article_ids['id']).alias('article-url')
url_df = article_ids.select(article_url_column)
# Show the first five urls without truncating long values.
url_df.show(5, truncate=False)

# Get article abstract links from "Full text links"

In [19]:
abstract_url_base = "https://www.ncbi.nlm.nih.gov/pubmed/"

# Write one external article link per line to article-urls.txt; ids whose
# abstract page exposes no link are skipped.
with open('article-urls.txt', 'w') as my_file:
    for id_list in ids_root.iter('IdList'):
        # `id_node` rather than `id`, so the builtin id() is not shadowed
        for id_node in id_list.iter('Id'):
            # build the full pubmed url
            url = abstract_url_base + id_node.text.encode('utf-8')
            source_page = urllib2.urlopen(url)
            try:
                source_page_soup = BeautifulSoup(source_page)
            finally:
                # release the HTTP connection even if parsing fails
                source_page.close()
            body = source_page_soup.body
            if body is None:
                continue
            icons_portlet = body.find('div', attrs={'class': 'icons portlet'})
            # if there is external link to the article, find the external link to the article
            if icons_portlet is None:
                continue
            link = icons_portlet.find('a')
            if link is None:
                continue
            # Look the link up by attribute name: attrs[0][1] returns the
            # value of whichever attribute happens to come first on the tag.
            article_url = link.get('href')
            if article_url:
                my_file.write(article_url.encode('utf-8') + '\n')

In [77]:
# Scratch check on an already-parsed page held in `soup` (defined outside
# this cell): show the link inside the 'icons portlet' div. The attribute
# is looked up by name instead of by position in the attrs list.
soup.body.find('div', attrs={'class': 'icons portlet'}).find('a').get('href')

u'https://linkinghub.elsevier.com/retrieve/pii/S0168-0102(08)00148-X'

In [34]:
import xml.etree.ElementTree as ET
# Parse the locally saved article XML into an ElementTree.
tree = ET.parse(source='pmc4304705.xml')

In [35]:
# Root element of the parsed article; the content-extraction cell iterates over it.
root = tree.getroot()

## Get article content

In [36]:
import io

# Dump the article text to pmc4304705.txt: every <p> under <abstract>
# elements first, then every <p> under <body> elements, each followed by
# a single space.
#
# io.open with an explicit encoding writes UTF-8 for both sections (the
# abstract branch previously wrote unencoded text).
with io.open('pmc4304705.txt', 'w', encoding='utf-8') as my_file:
    for section_tag in ('abstract', 'body'):
        for section in root.iter(section_tag):
            for p in section.iter('p'):
                # itertext() gathers the paragraph's full text, including
                # text inside and after inline child elements. p.text alone
                # stops at the first child and is None when the paragraph
                # starts with one.
                # NOTE(review): a <p> nested inside another <p>'s subtree
                # would have its text written twice - confirm against the
                # input XML.
                paragraph_text = u''.join(p.itertext())
                my_file.write(paragraph_text + u' ')