# The BeautifulSoup library is required

In [1]:
# pip install beautifulsoup4

In [2]:
import os
def conditionalMkdir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # ``if not exists: makedirs`` when several workers create the same path.
    os.makedirs(directory, exist_ok=True)

In [3]:
import os
import argparse
import urllib.request
import re
import json
import functools
from multiprocessing import Pool
import time

try:
        from BeautifulSoup import BeautifulSoup
except ImportError:
        from bs4 import BeautifulSoup


def parsePageXPT(html_source):
    """Find all links to XPT files in source HTML.

    Args:
        html_source: HTML markup of the page to scan.

    Returns:
        list: The ``href`` value of every ``<a>`` element whose ``href``
        ends in ``.XPT``.
    """
    # Parse HTML source code with BeautifulSoup library
    soup = BeautifulSoup(html_source, 'html.parser')

    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    anchors = soup.findAll('a', href=re.compile(r'\.XPT$'))
    return [anchor['href'] for anchor in anchors]


def parsePageLabels(html_source):
    """Parse a page for the 'CodebookLinks' div describing column labels.

    Each link text inside the div is split once on ``' - '``; the part before
    the separator becomes the key and the part after it the value. A link
    text without the separator maps to itself.

    Args:
        html_source: HTML markup of the documentation page.

    Returns:
        dict: Mapping built from the link texts, or an empty dict (after
        printing a message) when the page has no ``CodebookLinks`` div.
    """
    # Parse HTML source code with BeautifulSoup library
    soup = BeautifulSoup(html_source, 'html.parser')

    # Find div element with codebook; test for "not found" explicitly instead
    # of hiding every possible error behind a bare ``except``.
    codebook_divs = soup.findAll('div', id="CodebookLinks")
    if not codebook_divs:
        print('Error, no CodebookLinks Div')
        return {}
    div = codebook_divs[0]

    labels = {}
    for link in div.findAll('a'):
        # get_text() instead of .string: .string is None when the anchor
        # contains nested markup, which would make .rstrip() fail.
        text = link.get_text().rstrip()

        # The capture group keeps the separator, so the pieces are
        # [name, ' - ', description]; first and last are what we need.
        parts = re.split('( - )', text, maxsplit=1)
        labels[parts[0]] = parts[-1]
    return labels


def getFileYear(file_url):
    """Get the year range associated with a file URL.

    Args:
        file_url: URL of the file; the year range is expected as a path
            segment of the form ``/<digits>-<digits>/``.

    Returns:
        str: The first matching ``<digits>-<digits>`` segment, or ``'Other'``
        when the URL contains none.
    """
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences ('\/' and '\d'), which newer Python versions warn about.
    match = re.search(r'/(\d+-\d+)/', file_url)
    return match.group(1) if match else 'Other'


def getFile(file_dir, file_url, file_type):
    """Create the directory for a file and download it from the provided URL.

    The file is stored as ``<file_dir>/<year>/<file_type>/<name>``, where
    ``<year>`` comes from ``getFileYear(file_url)`` and ``<name>`` is the
    last path segment of ``file_url``. Nothing is downloaded when that file
    already exists.

    Args:
        file_dir: Root output directory.
        file_url: URL of the file to download.
        file_type: Sub-directory name used to group the file.
    """
    # Get data year
    year = getFileYear(file_url)

    # Compile file location
    file_dir = os.path.join(file_dir, year, file_type)

    # Make directory for file if necessary
    conditionalMkdir(file_dir)

    # Get name for file
    file_name = file_url.split('/')[-1]
    file_loc = os.path.join(file_dir, file_name)

    # Skip files that were already downloaded
    if os.path.isfile(file_loc):
        return

    print('Getting file: %s' % file_url)

    # Download to a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that later runs would mistake
    # for a finished download.
    partial_loc = file_loc + '.part'
    try:
        urllib.request.urlretrieve(file_url, partial_loc)
        os.replace(partial_loc, file_loc)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_loc):
            os.remove(partial_loc)


def getLabel(file_dir, file_url, file_type):
    """Obtain column labels from the NHANES website and save them to JSON.

    The labels page URL is derived from ``file_url`` by replacing ``.XPT``
    with ``.htm``. The result is written to
    ``<file_dir>/<year>/<file_type>/<name>.JSON``; nothing is fetched when
    that file already exists.

    Args:
        file_dir: Root output directory.
        file_url: URL of the XPT data file the labels belong to.
        file_type: Sub-directory name used to group the file.
    """
    # Get data year
    year = getFileYear(file_url)

    # Compile file location
    file_dir = os.path.join(file_dir, year, file_type)

    # Get name for file
    file_name = file_url.split('/')[-1].replace('.XPT', '.JSON')
    file_loc = os.path.join(file_dir, file_name)

    # Modify XPT file_url to load page with labels
    file_url = file_url.replace('.XPT', '.htm')

    # Check that file does not already exist
    if not os.path.isfile(file_loc):
        # Open the website and download source HTML
        with urllib.request.urlopen(file_url) as page:
            html_source = page.read()

        # Parse the website for column label
        file_labels = parsePageLabels(html_source)

        # Make sure the target directory exists, so this function does not
        # depend on getFile() having been called first for the same URL.
        conditionalMkdir(file_dir)

        # Save the file to JSON
        print('Saving label data: %s' % file_loc)
        with open(file_loc, 'w') as open_file:
            json.dump(file_labels, open_file)


def parseWebSite(url, output_dir):
    """Read a listing page, find its XPT file links, and save the files.

    For every ``.XPT`` link found on the page, the data file is downloaded
    with ``getFile`` and its column labels are saved with ``getLabel``.

    Args:
        url: URL of the page listing the data files. The value of its
            ``Component=`` query parameter (letters only) is used as the
            file type; ``'Other'`` is used when it is absent.
        output_dir: Root directory under which files are stored.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import urljoin

    # Get file type for this URL
    component = re.search('Component=([a-zA-Z]+)', url)
    file_type = component.group(1) if component else 'Other'

    # Open the website and download source HTML
    with urllib.request.urlopen(url) as page:
        html_source = page.read()

    # Resolve each link against the page URL. The previous
    # ``url.lstrip('http://')`` stripped a *set of characters* rather than a
    # prefix, corrupting hosts that start with 'h', 't' or 'p' and any
    # https:// URL; urljoin also copes with absolute and relative hrefs.
    file_urls = [urljoin(url, href) for href in parsePageXPT(html_source)]

    # Download each file and store locally
    for file_url in file_urls:
        getFile(output_dir, file_url, file_type)
        getLabel(output_dir, file_url, file_type)

In [4]:
# One data-listing page per NHANES component; only the ``Component`` query
# value differs between the entries.
url = [
    'http://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=' + component
    for component in (
        'Demographics',
        'Dietary',
        'Examination',
        'Laboratory',
        'Questionnaire',
        'Non-Public',
    )
]

In [1]:
# Fetch the first five listing pages in order (Demographics, Dietary,
# Examination, Laboratory, Questionnaire) into the same raw-data tree.
# The sixth entry of ``url`` (Non-Public) is not downloaded.
output_dir = './data/raw_data/'
for component_url in url[:5]:
    parseWebSite(component_url, output_dir)
