# Library BeautifulSoup is required

In [1]:
# pip install beautifulsoup4

In [2]:
import os
def conditionalMkdir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    ``exist_ok=True`` is used instead of a separate ``os.path.exists`` check
    so that a directory created by someone else between the check and the
    creation does not raise.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

In [None]:
def getLabel(file_dir, file_url, file_type):
    """Obtain column labels from the NHANES website and save them to JSON.

    The labels are parsed from the page at ``file_url`` with ``.XPT``
    replaced by ``.htm`` and written to
    ``<file_dir>/<year>/<file_type>/<name>.JSON``. Nothing is fetched when
    that JSON file already exists.

    Args:
        file_dir: Root output directory.
        file_url: URL of the ``.XPT`` data file.
        file_type: Component name, used as a sub-directory.
    """
    # Get data year
    year = getFileYear(file_url)

    # Compile file location
    file_dir = os.path.join(file_dir, year, file_type)

    # Get name for file
    file_name = file_url.split('/')[-1].replace('.XPT', '.JSON')
    file_loc = os.path.join(file_dir, file_name)

    # Labels were already saved on an earlier run; nothing to do
    if os.path.isfile(file_loc):
        return

    # Modify XPT file_url to load page with labels
    label_url = file_url.replace('.XPT', '.htm')

    # Open the website and download source HTML
    with urllib.request.urlopen(label_url) as page:
        html_source = page.read()

    # Parse the website for column labels
    file_labels = parsePageLabels(html_source)

    # The target directory may not exist yet (e.g. when the data file itself
    # was never downloaded), so create it before writing.
    os.makedirs(file_dir, exist_ok=True)

    # Save the labels to JSON
    print('Saving label data: %s' % file_loc)
    with open(file_loc, 'w') as open_file:
        json.dump(file_labels, open_file)
            
def parsePageLabels(html_source):
    """Parse a page's 'CodebookLinks' div into a dict of column labels.

    Each link text inside the div is split on the first ``' - '``: the part
    before it becomes the key and the part after it the value. A link text
    without that separator maps to itself. Links with no text are skipped.

    Args:
        html_source: HTML source of the documentation page.

    Returns:
        Dict of label name -> label description; empty when the page has no
        ``CodebookLinks`` div.
    """
    # Parse HTML source code with BeautifulSoup library
    soup = BeautifulSoup(html_source, 'html.parser')

    # Find div element with codebook. Test for an empty result explicitly
    # rather than catching every exception, so real errors are not hidden.
    codebook_divs = soup.findAll('div', id="CodebookLinks")
    if not codebook_divs:
        print('Error, no CodebookLinks Div')
        return {}

    labels = {}
    for link in codebook_divs[0].findAll('a'):
        # get_text() also works for links with nested markup, where
        # link.string would be None.
        text = link.get_text().rstrip()
        if not text:
            continue

        # Split on the first ' - ' only; descriptions may contain more.
        name, separator, description = text.partition(' - ')
        labels[name] = description if separator else name
    return labels


In [3]:
import os
import argparse
import urllib.request
import re
import json
import functools
from multiprocessing import Pool
import time



try:
        from BeautifulSoup import BeautifulSoup
except ImportError:
        from bs4 import BeautifulSoup


def parsePageXPT(html_source):
    """Find all links to XPT files in source HTML.

    Args:
        html_source: HTML source of the page to scan.

    Returns:
        List with the ``href`` value of every ``<a>`` tag whose ``href``
        matches ``\\.XPT$``, in document order.
    """
    # Parse HTML source code with BeautifulSoup library
    soup = BeautifulSoup(html_source, 'html.parser')

    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    xpt_links = soup.findAll('a', href=re.compile(r'\.XPT$'))
    return [link['href'] for link in xpt_links]

def getFileYear(file_url):
    """Get the year range associated with a file URL.

    Args:
        file_url: URL (or path) of the file.

    Returns:
        The first ``/<digits>-<digits>/`` path segment found in ``file_url``
        (without the slashes), or ``'Other'`` when there is none.
    """
    # Raw string avoids the invalid '\/' and '\d' escapes of a plain literal;
    # '/' needs no escaping in a Python regex.
    match = re.search(r'/(\d+-\d+)/', file_url)
    return match.group(1) if match else 'Other'

def getFile(file_dir, file_url, file_type, years=('2017-2018',)):
    """Create the directory for a file and download it from the provided URL.

    The file is stored as ``<file_dir>/<year>/<file_type>/<name>``, where
    ``<year>`` comes from ``getFileYear(file_url)`` and ``<name>`` is the last
    path segment of the URL. Files that already exist locally are skipped.

    Args:
        file_dir: Root output directory.
        file_url: URL of the file to download.
        file_type: Component name, used as a sub-directory.
        years: Collection of year strings to download; URLs for any other
            year are ignored. ``None`` disables the filter. Defaults to
            ``('2017-2018',)``.
    """
    # Get data year and skip files outside the requested years
    year = getFileYear(file_url)
    if years is not None and year not in years:
        return

    # Compile file location
    file_dir = os.path.join(file_dir, year, file_type)

    # Make directory for file if necessary
    conditionalMkdir(file_dir)

    # Get name for file
    file_name = file_url.split('/')[-1]
    file_loc = os.path.join(file_dir, file_name)

    # Check that file does not already exist
    if os.path.isfile(file_loc):
        return

    print('Getting file: %s' % file_url)

    # Download to a temporary name and rename on success. An interrupted
    # download would otherwise leave a truncated file at file_loc that later
    # runs treat as already downloaded.
    partial_loc = file_loc + '.part'
    try:
        urllib.request.urlretrieve(file_url, partial_loc)
        os.replace(partial_loc, file_loc)
    finally:
        # Only present here if the download or the rename failed
        if os.path.exists(partial_loc):
            os.remove(partial_loc)

def parseWebSite(url, output_dir):
    """Read HTML from ``url``, find its XPT file links, and save the files.

    Args:
        url: URL of the listing page. The text after ``Component=`` (letters
            only) is used as the file type; ``'Other'`` when absent.
        output_dir: Root directory passed on to ``getFile``.
    """
    from urllib.parse import urljoin

    # Get file type for this URL
    # NOTE(review): the pattern stops at the first non-letter, so a value
    # such as 'Non-Public' yields 'Non' - confirm whether that is intended.
    match = re.search('Component=([a-zA-Z]+)', url)
    file_type = match.group(1) if match else 'Other'

    # Open the website and download source HTML
    with urllib.request.urlopen(url) as page:
        html_source = page.read()

    # Resolve each link against the page URL. str.lstrip('http://') strips a
    # set of characters rather than a prefix, which corrupts https:// URLs
    # and host names that start with 'h', 't' or 'p'.
    file_urls = [urljoin(url, href) for href in parsePageXPT(html_source)]

    # Download each file and store locally
    for file_url in file_urls:
        getFile(output_dir, file_url, file_type)


In [4]:
# NHANES data listing pages, one per survey component
url = [
    'http://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=%s' % component
    for component in (
        'Demographics',
        'Dietary',
        'Examination',
        'Laboratory',
        'Questionnaire',
        'Non-Public',
    )
]

In [5]:
# Demographics: scan the url[0] listing page and store its XPT files
# under ./data/raw_data/
output_dir = './data/raw_data/'
parseWebSite(url[0], output_dir)

Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT


KeyboardInterrupt: 

In [6]:
# Dietary: scan the url[1] listing page and store its XPT files
# under ./data/raw_data/
output_dir = './data/raw_data/'
parseWebSite(url[1], output_dir)

Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DR1IFF_J.XPT


KeyboardInterrupt: 

In [None]:
# Examination: scan the url[2] listing page and store its XPT files
# under ./data/raw_data/
output_dir = './data/raw_data/'
parseWebSite(url[2], output_dir)

In [15]:
# Laboratory: scan the url[3] listing page and store its XPT files
# under ./data/raw_data/
output_dir = './data/raw_data/'
parseWebSite(url[3], output_dir)

Saving label data: ./data/raw_data/2003-2004/Laboratory/L06AGE_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/AMDGYD_D.XPT
Saving label data: ./data/raw_data/2005-2006/Laboratory/AMDGYD_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/AMDGYD_H.XPT
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2013-2014/Laboratory/AMDGYD_H.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/AMDGYD_I.XPT
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2015-2016/Laboratory/AMDGYD_I.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/AMDGDS_H.XPT
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2013-2014/Laboratory/AMDGDS_H.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/SSAFB_A.XPT
Saving label data: ./data/raw_data/1999-2000/Laboratory/SSAFB_A.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/ALB_CR_E.XPT
Saving label data: ./data/raw_data/2007-2008/Laboratory/ALB_CR_E.JSON
Getting file: 

KeyboardInterrupt: 

In [12]:
# Questionnaire: scan the url[4] listing page and store its XPT files
# under ./data/raw_data/
output_dir = './data/raw_data/'
parseWebSite(url[4], output_dir)


Saving label data: ./data/raw_data/2005-2006/Questionnaire/ACQ_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/ACQ_E.XPT
sleep
Saving label data: ./data/raw_data/2007-2008/Questionnaire/ACQ_E.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/ACQ.XPT
sleep
Saving label data: ./data/raw_data/1999-2000/Questionnaire/ACQ.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/ACQ_C.XPT
sleep
Saving label data: ./data/raw_data/2003-2004/Questionnaire/ACQ_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/ACQ_B.XPT
sleep
Saving label data: ./data/raw_data/2001-2002/Questionnaire/ACQ_B.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/ACQ_F.XPT
sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/ACQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/ACQ_G.XPT
sleep
Saving label data: ./data/raw_data/2011-2012/Questionnaire/ACQ_G.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/ACQ_H.XPT
sleep
Saving lab

sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/BHQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/CDQ_D.XPT
sleep
Saving label data: ./data/raw_data/2005-2006/Questionnaire/CDQ_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/CDQ_E.XPT
sleep
Saving label data: ./data/raw_data/2007-2008/Questionnaire/CDQ_E.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/CDQ.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/1999-2000/Questionnaire/CDQ.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/CDQ_C.XPT
sleep
Saving label data: ./data/raw_data/2003-2004/Questionnaire/CDQ_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/CDQ_B.XPT
sleep
Saving label data: ./data/raw_data/2001-2002/Questionnaire/CDQ_B.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/CDQ_F.XPT
sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/CDQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/20

sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2003-2004/Questionnaire/DIQ_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DIQ_B.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2001-2002/Questionnaire/DIQ_B.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DIQ_F.XPT
sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/DIQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DIQ_G.XPT
sleep
Saving label data: ./data/raw_data/2011-2012/Questionnaire/DIQ_G.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DIQ_H.XPT
sleep
Saving label data: ./data/raw_data/2013-2014/Questionnaire/DIQ_H.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DIQ_I.XPT
sleep
Saving label data: ./data/raw_data/2015-2016/Questionnaire/DIQ_I.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DIQ_J.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2017-2018/Questionnaire

sleep
Saving label data: ./data/raw_data/2007-2008/Questionnaire/HIQ_E.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/HIQ.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/1999-2000/Questionnaire/HIQ.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/HIQ_D.XPT
sleep
Saving label data: ./data/raw_data/2005-2006/Questionnaire/HIQ_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/HIQ_C.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2003-2004/Questionnaire/HIQ_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/HIQ_B.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2001-2002/Questionnaire/HIQ_B.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/HIQ_F.XPT
sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/HIQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/HIQ_G.XPT
sleep
Saving label data: ./data/raw_data/2011-2012/Questionnaire/HIQ

sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2015-2016/Questionnaire/INQ_I.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/INQ_J.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2017-2018/Questionnaire/INQ_J.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/KIQ.XPT
sleep
Saving label data: ./data/raw_data/1999-2000/Questionnaire/KIQ.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/KIQ_U_E.XPT
sleep
Saving label data: ./data/raw_data/2007-2008/Questionnaire/KIQ_U_E.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/KIQ_U_D.XPT
sleep
Saving label data: ./data/raw_data/2005-2006/Questionnaire/KIQ_U_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/KIQ_U_C.XPT
sleep
Saving label data: ./data/raw_data/2003-2004/Questionnaire/KIQ_U_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/KIQ_U_B.XPT
sleep
Saving label data: ./data/raw_data/2001-2002/Questionnaire/KIQ_U_B.JSON
Gett

sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/1999-2000/Questionnaire/OHQ.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/OHQ_D.XPT
sleep
Saving label data: ./data/raw_data/2005-2006/Questionnaire/OHQ_D.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/OHQ_C.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2003-2004/Questionnaire/OHQ_C.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/OHQ_B.XPT
sleep
Error, no CodebookLinks Div
Saving label data: ./data/raw_data/2001-2002/Questionnaire/OHQ_B.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/OHQ_F.XPT
sleep
Saving label data: ./data/raw_data/2009-2010/Questionnaire/OHQ_F.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/OHQ_G.XPT
sleep
Saving label data: ./data/raw_data/2011-2012/Questionnaire/OHQ_G.JSON
Getting file: http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/OHQ_H.XPT
sleep
Saving label data: ./data/raw_data/2013-2014/Questionnaire/O

ConnectionResetError: [Errno 54] Connection reset by peer