In [76]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time
from urllib.parse import urlparse
from utils import update_bq_table, generate_filename, upload_blob_from_string
# Name of the GCS bucket; interpolated into the gs:// paths built below and
# passed to upload_blob_from_string as its bucket_name argument.
bucket_name = 'nhanes'

In [145]:
def scrape_nhanes_table(soup, component):
    """Turn the ``GridView1`` listing table of a parsed page into a DataFrame.

    Column names are the table's ``<th>`` texts, lower-cased with runs of
    whitespace replaced by ``_``. Every ``<td>`` contributes its text; cells
    that contain an anchor additionally contribute ``base_url + href`` at the
    end of the row. The link columns are named from the shape of the first
    data row: two extra values -> ``doc_file_url`` and ``data_file_url``,
    one extra value -> ``doc_file_url``, otherwise no link columns.

    Args:
        soup: Parsed page exposing ``find('table', id=...)`` (a BeautifulSoup
            document in this notebook).
        component: Label of the page being scraped. It does not influence
            parsing; it is only used to make the error message traceable.

    Returns:
        pandas.DataFrame with one row per table row after the header row.
        When the table has no data rows an empty frame carrying the
        normalised header names is returned.

    Raises:
        ValueError: if the page has no table with id ``GridView1``.
    """
    table = soup.find('table', id='GridView1')
    if table is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"No table with id 'GridView1' found for component {component!r}")

    headers = ['_'.join(th.text.lower().split()) for th in table.find_all('th')]

    base_url = "https://wwwn.cdc.gov"

    data = []
    # The first <tr> is the header row, so it is skipped.
    for tr in table.find_all('tr')[1:]:
        cells = tr.find_all('td')
        link_urls = [base_url + td.a['href'] for td in cells if td.find('a')]
        data.append([td.text for td in cells] + link_urls)

    if not data:
        # Nothing to inspect for link columns; avoid IndexError on data[0].
        return pd.DataFrame(columns=headers)

    # The number of link columns is inferred from the first row only; rows
    # with a different number of anchors will make the DataFrame constructor
    # raise, exactly as before.
    extra_columns = len(data[0]) - len(headers)
    if extra_columns == 2:
        headers = headers + ['doc_file_url', 'data_file_url']
    elif extra_columns == 1:
        headers = headers + ['doc_file_url']

    return pd.DataFrame(columns=headers, data=data)

### Scrape Continuous NHANES

In [114]:
# Build one metadata frame per Continuous NHANES component page, concatenate
# them, and hand the result to update_bq_table.
dfs = []

components = ["Demographics","Dietary","Examination","Laboratory","Questionnaire","LimitedAccess"]

for component in components:
    # A timeout keeps the cell from hanging forever on a stalled connection,
    # and raise_for_status stops an HTTP error page from being parsed.
    r = requests.get(
        f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={component}",
        timeout=20,
    )
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'lxml')

    # Same table parsing as the yearly scrape: the helper appends
    # doc_file_url / data_file_url according to how many links a row carries.
    df = scrape_nhanes_table(soup, component)

    # Cell texts come with surrounding whitespace.
    for column in ('years', 'doc_file', 'data_file'):
        df[column] = df[column].str.strip()
    df['date_published'] = df['date_published'].str.strip().str.replace('Updated ', '', regex=False)

    # 'years' is split on '-' into its two integer halves.
    year_parts = df['years'].str.split('-')
    df['start_year'] = year_parts.str[0].astype(int)
    df['end_year'] = year_parts.str[1].astype(int)

    # NOTE(review): errors='ignore' returns the input unchanged when any value
    # fails to parse and is deprecated in recent pandas; kept so the uploaded
    # column does not change. Consider errors='coerce'.
    df['published_date'] = pd.to_datetime(df['date_published'],format='%B %Y',errors='ignore')
    df['page_component'] = component
    df['last_updated'] = datetime.datetime.utcnow()

    # Shared "<name> <start> <end>" part of both target file names.
    name_and_years = df['data_file_name'] + ' ' + df['start_year'].astype(str) + ' ' + df['end_year'].astype(str)
    df['gcs_doc_filename'] = (f'gs://{bucket_name}/docs/' + name_and_years + ' Documentation').apply(
        lambda x: generate_filename(x,extension='.html'))
    df['gcs_data_filename'] = (f'gs://{bucket_name}/data/' + name_and_years + ' Data').apply(
        lambda x: generate_filename(x,extension='.XPT'))

    dfs.append(df)

final_df = pd.concat(dfs,ignore_index=True)

alias = 'nhanes_continuous_file_metadata'
update_bq_table(final_df, alias, dataset='nhanes',bucket='nhanes', truncate = True, max_error=0, schema=None)

nhanes_continuous_file_metadata_20231107_203909.csv uploaded to nhanes / nhanes_continuous_file_metadata/
Starting job b013b250-2144-4057-b9ed-cce75b93fdc2
Job finished.
Table Row Count 1737 rows.


### Download Data and Docs to GCS

In [96]:
# For every row of final_df: upload the documentation page to the bucket's
# docs/ folder and write the SAS transport data file as parquet under data/.
# Failures are printed and the loop moves on to the next row.
start_time = time.time()

for index, row in final_df.iterrows():

    # Documentation links ending in .aspx are skipped.
    if not row['doc_file_url'].endswith('.aspx'):
        try:
            r = requests.get(row['doc_file_url'],timeout=20)
            # Without this an HTTP error page would be uploaded as if it
            # were the documentation.
            r.raise_for_status()
            html_content = r.content
            file_name = row['gcs_doc_filename']
            upload_blob_from_string(bucket_name = bucket_name,
                                    bucket_folder = 'docs/',
                                    file_name = file_name,
                                    blob_string = html_content,
                                    encoding='text/html')
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {row['gcs_doc_filename']}")
    else:
        print(f"Skipping {row['doc_file_url']}")


    # Rows without a data link (NaN after concat) and links containing 'RDC'
    # are skipped.
    if not pd.isnull(row['data_file_url']) and 'RDC' not in row['data_file_url']:
        try:
            data_file_df = pd.read_sas(row['data_file_url'])
            data_file_df.to_parquet(f"gs://{bucket_name}/data/{row['gcs_data_filename'].replace('.XPT','.parquet')}")
            print(f"{row['data_file_url']} uploaded to gs://{bucket_name}/data/")
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {row['gcs_data_filename']}")

    else:
        print(f"Skipping {row['gcs_data_filename']}")

print(f"Entire process took {time.time() - start_time} seconds")
    
    
    

### Scrape Yearly Files

In [179]:
# Discover the NHANES groups from the landing-page sidebar, scrape every
# "... Data" page of each group, and load the combined metadata via
# update_bq_table.
response = requests.get("https://wwwn.cdc.gov/nchs/nhanes/default.aspx", timeout=20)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

# Second-level sidebar entries that own a nested <ul>; the nested list's id
# identifies the group. Only ids containing 'nhanes' are kept.
lis = soup.find('ul',id='nav-primary').find_all('li',{"class": "list-group-item nav-lvl2"})
ids = [el.ul['id'] for el in lis if el.ul]
ids = [i for i in ids if 'nhanes' in i]

# Example value of ids:
# ['nav-group-all-continuous-nhanes',
#  'nav-group-nhanes-2021-2023',
#  'nav-group-nhanes-2017-2020',
#  'nav-group-nhanes-2019-2020']

start_time = time.time()
dfs = []

base_url = "https://wwwn.cdc.gov"

# Sidebar entries mentioning 'Data' are scraped unless they also contain one
# of these words.
excluded_words = ('Overview', 'Errata', 'Issues', 'Files')

for nhanes_id in ids:
    dataset = nhanes_id.replace('nav-group-','')
    # `soup` must remain the landing page here: each data page is parsed into
    # `page_soup` below so the navigation lookup is not done against whatever
    # page happened to be scraped last.
    lis_to_scrape = soup.find('ul',id=nhanes_id).find_all('li')
    for li in lis_to_scrape:
        label = li.text.strip()
        if 'Data' not in label or any(word in label for word in excluded_words):
            continue

        component_text = label.replace('All ','').replace(' Data','')
        url = base_url + li.a['href']
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        # Name the parser explicitly, matching the other cells.
        page_soup = BeautifulSoup(r.text, 'lxml')
        df = scrape_nhanes_table(page_soup, component_text)

        # Columns that came from the page itself; the tables do not all have
        # the same columns, so each clean-up step is conditional.
        columns = df.columns.tolist()
        df['page_component'] = component_text
        df['dataset'] = dataset
        df['last_updated'] = datetime.datetime.utcnow()

        if 'years' in columns:
            df['years'] = df['years'].str.strip()
            df['start_year'] = df['years'].apply(lambda x: x.split('-')[0]).astype(int)
            df['end_year'] = df['years'].apply(lambda x: x.split('-')[1]).astype(int)
        if 'doc_file' in columns:
            df['doc_file'] = df['doc_file'].str.strip()
        if 'data_file' in columns:
            df['data_file'] = df['data_file'].str.strip()
        if 'date_published' in columns:
            df['date_published'] = df['date_published'].str.strip().str.replace('Updated ','')
            # NOTE(review): errors='ignore' returns the input unchanged when
            # any value fails to parse and is deprecated in recent pandas;
            # kept so the uploaded column does not change.
            df['published_date'] = pd.to_datetime(df['date_published'],format='%B %Y',errors='ignore')
        if 'data_file_name' in columns:
            name_stem = df['data_file_name'] + ' ' + df['dataset'].astype(str)
            # The continuous group adds the survey years to the file names;
            # start_year/end_year only exist when the page had a 'years'
            # column, so guard against a KeyError.
            if 'continuous' in dataset and 'years' in columns:
                name_stem = name_stem + ' ' + df['start_year'].astype(str) + ' ' + df['end_year'].astype(str)
            df['gcs_doc_filename'] = (name_stem + ' Documentation').apply(lambda x: generate_filename(x,extension='.html'))
            df['gcs_data_filename'] = (name_stem + ' Data').apply(lambda x: generate_filename(x,extension='.XPT'))

        dfs.append(df)

yearly_df = pd.concat(dfs,ignore_index=True)

alias = 'nhanes_file_metadata'
update_bq_table(yearly_df, alias, dataset='nhanes',bucket='nhanes', truncate = True, max_error=0, schema=None)

print(f"Entire process took {time.time() - start_time} seconds")
        
        

nhanes_file_metadata_20231107_212632.csv uploaded to nhanes / nhanes_file_metadata/
Starting job 07240954-8fc0-4ad3-a8e2-3d2d0f9b9c8c
Job finished.
Table Row Count 3578 rows.
Entire process took 58.127763509750366 seconds


In [180]:
# yearly_df

In [None]:
# For every row of yearly_df: upload the documentation page under
# <dataset>/docs/ and write the data file as parquet under <dataset>/data/.
# Failures are printed and the loop continues; a timing line is printed every
# 100 rows.
start_time = time.time()
new_time = time.time()
for index, row in yearly_df.iterrows():

    doc_url = row['doc_file_url']
    # Pages scraped without link columns leave NaN here after the concat, so
    # check for a real string before calling .endswith.
    if isinstance(doc_url, str) and not doc_url.endswith('.aspx'):
        try:
            r = requests.get(doc_url,timeout=20)
            # Without this an HTTP error page would be uploaded as if it
            # were the documentation.
            r.raise_for_status()
            html_content = r.content
            file_name = row['gcs_doc_filename']
            upload_blob_from_string(bucket_name = bucket_name,
                                    bucket_folder = row['dataset']+'/docs/',
                                    file_name = file_name,
                                    blob_string = html_content,
                                    encoding='text/html')
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {row['gcs_doc_filename']}")
    else:
        print(f"Skipping {doc_url}")


    data_url = row['data_file_url']
    data_name = row['gcs_data_filename']
    # Only rows with a data link that does not contain 'RDC' and whose target
    # name ends in .xpt are converted.
    if (isinstance(data_url, str) and 'RDC' not in data_url
            and isinstance(data_name, str) and data_name.lower().endswith('.xpt')):
        try:
            data_file_df = pd.read_sas(data_url)
            data_file_df.to_parquet(f"gs://{bucket_name}/{row['dataset']}/data/{data_name.replace('.XPT','.parquet')}")
            print(f"{data_name} uploaded to gs://{bucket_name}/{row['dataset']}/data/")
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {data_name}")

    else:
        print(f"Skipping {data_name}")

    if index % 100 == 0 and index > 0:
        # new_time is a float timestamp, not a callable: subtract it directly.
        print(f"Last 100 datasets completed in {time.time() - new_time}")
        new_time = time.time()

print(f"Entire process took {time.time() - start_time} seconds")

demographic_variables_sample_weights_all_continuous_nhanes_2005_2006_documentation.html uploaded to nhanes / all-continuous-nhanes/docs/
demographic_variables_sample_weights_all_continuous_nhanes_2005_2006_data.XPT uploaded to gs://nhanes/data/
demographic_variables_sample_weights_all_continuous_nhanes_2007_2008_documentation.html uploaded to nhanes / all-continuous-nhanes/docs/
demographic_variables_sample_weights_all_continuous_nhanes_2007_2008_data.XPT uploaded to gs://nhanes/data/
demographic_variables_sample_weights_all_continuous_nhanes_2003_2004_documentation.html uploaded to nhanes / all-continuous-nhanes/docs/
demographic_variables_sample_weights_all_continuous_nhanes_2003_2004_data.XPT uploaded to gs://nhanes/data/
demographic_variables_sample_weights_all_continuous_nhanes_2001_2002_documentation.html uploaded to nhanes / all-continuous-nhanes/docs/
demographic_variables_sample_weights_all_continuous_nhanes_2001_2002_data.XPT uploaded to gs://nhanes/data/
demographic_variable

In [163]:
# table = soup.find('table',id='GridView1')

# headers = []
# for i in table.find_all('th'):
#     title = i.text
#     headers.append(title)

# base_url = "https://wwwn.cdc.gov"

# data = []

# for j in table.find_all('tr')[1:]:
#     row_data = j.find_all('td')
#     row = [i.text for i in row_data] + [base_url + i.a['href'] for i in row_data if i.find('a')]
#     data.append(row)
    
# if len(data[0]) == len(headers) + 2:
#     headers = ['_'.join(h.lower().split()) for h in headers] + ['doc_file_url','data_file_url']
# elif len(data[0]) == len(headers) + 1:
#     headers = ['_'.join(h.lower().split()) for h in headers] + ['doc_file_url']
# else:
#     headers = ['_'.join(h.lower().split()) for h in headers]

# df = pd.DataFrame(columns = headers,data=data)

In [None]:
# table

In [31]:
# final_df[index:][final_df['doc_file_url'].str.contains('.aspx')]