In [None]:
import pandas as pd
import sys
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [None]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Args:
        url: A `string` with the address to request.
        timeout: Number of seconds to wait on the server before giving up.
            It is handed straight to `requests.get`; pass None to wait
            indefinitely.

    Returns:
        The raw body of the response (`resp.content`) when
        `is_good_response` accepts the response, otherwise None. None is
        also returned, after the error has been passed to `log_error`,
        when the request raises a `RequestException`.
    """
    try:
        #without a timeout a server that never answers would block forever;
        #a timeout surfaces as a RequestException and is handled below
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    Args:
        resp: A response object exposing `status_code` and a `headers`
            mapping.

    Returns:
        True only when the status code is 200 and the Content-Type header
        is present and contains 'html' (compared case-insensitively).
    """
    #use .get so that a response without a Content-Type header is reported
    #as "not good" instead of raising KeyError
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error by printing its string form to standard output.

    This is the single place where errors are reported, so swapping the
    print for another sink changes the behaviour everywhere at once.

    Args:
        e: The error, or error message, to report.
    """
    message = str(e)
    print(message)

In [None]:
#fetch the example paper; simple_get returns None when the request fails or
#the response is not accepted, so check before measuring the length
html_raw = simple_get('https://pubs.acs.org/doi/10.1021/acs.nanolett.9b00357')
if html_raw is None:
    print('Could not retrieve html for the requested paper')
else:
    print(len(html_raw))

In [None]:
#class to contain the tags and names of various parts of the retrieved paper
#tags are specific to a journal or family of journals, so the initialization
#    of the object must specify the family or every attribute will be None
##
#class journal_format has attributes: abstract, body, figure_captions,
#reference_title, and reference_abstract
#these attributes are all dicts specifying the html tag and name of the
#part of the html code to be retrieved
class journal_format:
    """
    Describes where each part of a paper is found in a journal family's html.

    Each of the attributes abstract, body, figure_captions, reference_title
    and reference_abstract is a dict with the keys 'tag' and 'name', or None
    when the requested family is not recognised. The only recognised family
    is 'acs' (matched case-insensitively).
    """

    def __init__(self, family):
        #one entry per supported family; the table is rebuilt on every call
        #so that each object owns its own dicts
        layouts = {
            'acs': {
                'abstract': {'tag': 'p', 'name': 'articleBody_abstractText'},
                'body': {'tag': 'div', 'name': 'NLM_p'},
                'figure_captions': {'tag': 'p', 'name': 'first last'},
                'reference_title': {'tag': 'span', 'name': 'NLM_cas:atitle'},
                'reference_abstract': {'tag': 'div', 'name': 'casAbstract'},
            },
        }
        #attributes are always created, and always in this order
        part_names = (
            'abstract',
            'body',
            'figure_captions',
            'reference_title',
            'reference_abstract',
        )

        layout = layouts.get(family.lower())
        for part in part_names:
            setattr(self, part, None if layout is None else layout[part])


In [None]:
#function to clean dirty html of objects that aren't plain text
#    or that we don't care about
def clean_html(dirty_html_list):
    """
    This function processes lists of dirty_html items into a single
    string of plaintext. This may not be the most appropriate for
    things like reference_titles, but should be okay.

    Only bare text and simply formatted text (italic, bold, subscript
    or superscript) is kept, and the formatting itself is dropped. This
    leaves some ambiguity of subscript vs. superscript vs. numbering,
    which is preferred over dropping the formatted text. Everything
    else is discarded, and non-breaking spaces become plain spaces.

    Args:
        dirty_html_list: An iterable of dirty html items (e.g.
            reference_titles, or body paragraphs). Each item is itself
            iterated, and every object it yields must expose `name`
            and `string`.

    Returns:
        A `string` containing the concatenated cleaned text of every
        item in dirty_html_list ('' when there is nothing to clean).
    """

    #names of the objects that are kept; None is the name of bare text
    kept_names = (None, 'i', 'b', 'sub', 'sup')

    #collect the pieces and join once at the end
    clean_pieces = []

    #loop through the dirty items
    for dirty_html_item in dirty_html_list:
        #loop through objects in dirty items
        for obj in dirty_html_item:
            if obj.name not in kept_names:
                continue

            text = obj.string
            if text is None:
                #.string is None when a formatted tag does not hold exactly
                #one piece of text (e.g. nested formatting); fall back to
                #all of the text inside it rather than crashing on None
                text = obj.get_text()
            clean_pieces.append(text.replace(u'\xa0', u' '))
    return ''.join(clean_pieces)

In [None]:
def html_process(raw_html,journal_family):
    """
    This processes html from a journal to plain text using BeautifulSoup.
    All html tags for the 'acs' family work as of 20190726

    Args:
        raw_html: A `string` of the webpage html to be processed
        journal_family: A `string` specifying the family of the journal which
            indicates the format of the raw_html.

    Returns:
        cleaned_html_dict: A `dict` with one entry per attribute of
            `journal_format` ('abstract', 'body', 'figure_captions',
            'reference_title' and 'reference_abstract'). Each value is the
            plaintext `string` that `clean_html` builds from all matching
            elements, or '' when the journal family defines no tag for
            that part.
    """

    ##
    #Use BeautifulSoup and lxml packages to parse html and make it searchable, etc
    #the documentation recommended lxml parser as something fast and lenient
    html = BeautifulSoup(raw_html, 'lxml')
    ##

    #initialize journal object that contains the html tags and names of relevant
    #    parts of the paper we wish to retrieve as attributes
    #    (abstract, body, figure_captions, reference_title, and reference_abstract)
    journal_format_obj = journal_format(journal_family)

    #loop through attributes of journal_format and obtain cleaned html of elements
    #specified by those attributes
    cleaned_html_dict = {}
    for attr, dict_spec in vars(journal_format_obj).items():
        #an unrecognised family leaves every spec as None: nothing to search for
        if dict_spec is None:
            cleaned_html_dict[attr] = ''
            continue

        #returns a list of raw_html items matching the tag and name specified
        #(e.g. abstracts of references or paragraphs); a string passed as the
        #second positional argument of find_all is matched against the CSS class
        raw_html_items = html.find_all(dict_spec['tag'], dict_spec['name'])

        #loops through each item, cleans them, and concatenates the result
        #(e.g. paragraphs->body or reference_title->big string of all titles)
        cleaned_html_dict[attr] = clean_html(raw_html_items)

    return cleaned_html_dict

In [None]:
#run the whole pipeline on the page fetched into html_raw and show the
#cleaned text of each part of the paper
print(html_process(html_raw,'acs').values())