## Note
- Author: Luke Arunanondchai
- Last version: 1/30/2019
- This version: 1/30/2019
- Changes since last version: 
	- None

- Software: Python 3.7
- Purpose of Work: Scrape the bold auditing words from 'https://www.ais-cpa.com/glosa/' 
- Project: ARM

#### Load all the necessary packages

In [1]:
from pandas import DataFrame
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

#### Basic functions for downloading the HTML - Credit: DataCamp

In [2]:
def simple_get(url, timeout=30):
    """Fetch the body of `url` with an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        `get`. Defaults to 30 so that a stalled server cannot hang the
        notebook indefinitely.

    Returns
    -------
    The response body (`resp.content`) when `is_good_response` accepts the
    response. None when it does not, or when the request raises a
    `RequestException` (the error is reported through `log_error`).
    """
    try:
        # closing() makes sure the streamed response is released on exit.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """Return True if the response seems to be HTML, False otherwise.

    A response is accepted when its status code is 200 and its
    Content-Type header contains the substring 'html' (case-insensitive).
    A response without a Content-Type header is rejected rather than
    raising KeyError.
    """
    # .get() plus the `or ''` fallback covers both a missing header and a
    # header whose value is None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def log_error(e):
    """Report an error (if any) by printing it to standard output."""
    message = str(e)
    print(message)

### **Input Here**

In [3]:
# Address of the page to scrape; it is passed to simple_get() further down.
url = 'https://www.ais-cpa.com/glosa'

In [5]:
# Display the parsed page.
# NOTE(review): `html` is assigned in the cell that follows this one (In [4]),
# so on a fresh kernel run top-to-bottom this cell would raise NameError.
# Move it below that cell.
html

<!DOCTYPE html>
<html lang="en-US" prefix="og: http://ogp.me/ns#"><head><meta charset="utf-8"/><link href="http://gmpg.org/xfn/11" rel="profile"/><link href="https://www.ais-cpa.com/xmlrpc.php" rel="pingback"/> <script type="text/javascript">window.__lo_site_id = 132194;
(function() {
var wa = document.createElement('script'); wa.type = 'text/javascript'; wa.async = true;
wa.src = 'https://d10lpsik1i8c69.cloudfront.net/w.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(wa, s);
})();</script>
<link href="//www.ais-cpa.com/wp-content/cache/wpfc-minified/5g55621/g561j.css" media="all" rel="stylesheet" type="text/css"><title>Auditing Dictionary of Terms</title><link href="https://www.ais-cpa.com/glosa/" rel="canonical"><meta content="en_US" property="og:locale"><meta content="article" property="og:type"/><meta content="Auditing Dictionary of Terms" property="og:title"/><meta content="The terms defined on this page have all appeared in past CPA exam questio

In [4]:
# Download the page; simple_get() returns None when the request fails or the
# response is not accepted as HTML.
raw_html = simple_get(url)
if raw_html is None:
    # Stop here with a clear message instead of letting BeautifulSoup fail
    # on None with an unrelated-looking TypeError.
    raise RuntimeError('Could not retrieve HTML from {0}'.format(url))
html = BeautifulSoup(raw_html, 'html.parser')

# Collect the text of every <strong> element on the page
auditing = [s.text for s in html.select('strong')]

# One-column table holding the collected words
df1 = DataFrame(auditing, columns=['Word'])

In [None]:
# Show the collected words (single 'Word' column) for a visual check.
df1

### Exporting the words

In [None]:
# Save the word table as an Excel workbook. The path is relative, so it is
# resolved against the notebook's working directory.
# NOTE(review): presumably the ../output folder must already exist - confirm.
df1.to_excel('../output/fromWeb.xlsx')