In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
from utils import update_bq_table, generate_filename, upload_blob_from_string
# Bucket name passed to upload_blob_from_string and used in the gs:// parquet paths further down.
bucket_name = 'nhanes'

In [2]:
# Listing pages to scrape; each entry is sent as the `Component` query parameter.
components = [
    "Demographics",
    "Dietary",
    "Examination",
    "Laboratory",
    "Questionnaire",
    "LimitedAccess",
]

In [3]:
base_url = "https://wwwn.cdc.gov"

# One naive-UTC timestamp for the whole run so every scraped row carries the same
# `last_updated` value (equivalent to the deprecated datetime.utcnow()).
run_timestamp = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


def _cell_link(cell):
    """Return the absolute URL of the first link inside a table cell, or None if it has no link."""
    anchor = cell.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return None
    # Relative hrefs are prefixed with the site root; already-absolute ones are kept as-is.
    return href if href.startswith(('http://', 'https://')) else base_url + href


def scrape_component(component):
    """Scrape the file listing table for one component into a DataFrame.

    Parameters
    ----------
    component : str
        Value for the `Component` query parameter of the listing page.

    Returns
    -------
    pandas.DataFrame
        One row per table row. Columns are the table headers (lower-cased,
        whitespace joined with underscores), followed by `doc_file_url` and,
        for every component except 'LimitedAccess', `data_file_url`, plus the
        derived columns `start_year`, `end_year`, `published_date`,
        `page_component`, `last_updated`, `gcs_doc_filename` and
        `gcs_data_filename`.

    Raises
    ------
    requests.HTTPError
        If the listing page responds with an error status.
    ValueError
        If the page has no table with id 'GridView1', or the table lacks a
        `doc_file` / `data_file` header that a link column depends on.
    """
    r = requests.get(
        f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={component}",
        timeout=60,
    )
    # Fail loudly instead of parsing an error page as if it were the listing.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table', id='GridView1')
    if table is None:
        raise ValueError(f"No table with id 'GridView1' found for component {component!r}")

    headers = ['_'.join(th.text.lower().split()) for th in table.find_all('th')]

    # Map each URL column to the text column whose cell holds the link, so a URL is
    # always read from its own cell rather than assigned by the order links appear in.
    link_sources = {'doc_file_url': 'doc_file'}
    if component != 'LimitedAccess':
        link_sources['data_file_url'] = 'data_file'
    link_positions = {url_col: headers.index(src) for url_col, src in link_sources.items()}

    records = []
    for tr in table.find_all('tr')[1:]:
        cells = tr.find_all('td')
        if not cells:
            # Rows without data cells carry nothing to record.
            continue
        record = dict(zip(headers, (cell.text for cell in cells)))
        for url_col, position in link_positions.items():
            record[url_col] = _cell_link(cells[position]) if position < len(cells) else None
        records.append(record)

    df = pd.DataFrame(records, columns=headers + list(link_sources))

    for column in ('years', 'doc_file', 'data_file'):
        df[column] = df[column].str.strip()
    df['date_published'] = df['date_published'].str.strip().str.replace('Updated ', '', regex=False)

    year_parts = df['years'].str.split('-')
    df['start_year'] = year_parts.str[0].astype(int)
    df['end_year'] = year_parts.str[1].astype(int)

    # Values that do not match 'Month YYYY' become NaT so the column is always datetime;
    # the raw text stays available in `date_published`.
    df['published_date'] = pd.to_datetime(df['date_published'], format='%B %Y', errors='coerce')
    df['page_component'] = component
    df['last_updated'] = run_timestamp

    name_stem = df['data_file_name'] + ' ' + df['start_year'].astype(str) + ' ' + df['end_year'].astype(str)
    df['gcs_doc_filename'] = (name_stem + ' Documentation').apply(lambda x: generate_filename(x, extension='.html'))
    df['gcs_data_filename'] = (name_stem + ' Data').apply(lambda x: generate_filename(x, extension='.XPT'))

    return df


dfs = [scrape_component(component) for component in components]

In [4]:
# Stack the per-component frames into a single table with a fresh 0..n-1 index.
final_df = pd.concat(dfs, ignore_index=True)

# Load the combined metadata via the project helper, using the table alias below.
alias = 'nhanes_file_metadata'
update_bq_table(
    final_df,
    alias,
    dataset='nhanes',
    bucket='nhanes',
    truncate=True,
    max_error=0,
    schema=None,
)

nhanes_file_metadata_20231107_041958.csv uploaded to nhanes / nhanes_file_metadata/
Starting job ce74d6a9-0118-4227-af90-d1e31b09010a
Job finished.
Table Row Count 1737 rows.


In [6]:
# final_df

In [None]:
# Mirror every documentation page and data file listed in `final_df` into the bucket.
# Failures are collected instead of aborting the run, then reported at the end.
failed_downloads = []  # (kind, url, error message) for each file that could not be mirrored

for index, row in final_df.iterrows():
    doc_url = row['doc_file_url']
    # Missing links are None/NaN; NaN is truthy, so check with pd.notna before using the value.
    if pd.notna(doc_url) and doc_url:
        try:
            r = requests.get(doc_url, timeout=60)
            # Do not upload an HTTP error page as if it were the documentation.
            r.raise_for_status()
        except requests.RequestException as exc:
            failed_downloads.append(('doc', doc_url, str(exc)))
        else:
            html_content = r.content
            file_name = row['gcs_doc_filename']
            upload_blob_from_string(bucket_name = bucket_name,
                                    bucket_folder = 'docs/',
                                    file_name = file_name,
                                    blob_string = html_content,
                                    encoding='text/html')

    data_url = row['data_file_url']
    # Rows from frames without a `data_file_url` column hold NaN here after the concat.
    if pd.notna(data_url) and data_url:
        try:
            data_file_df = pd.read_sas(data_url)
        except (OSError, ValueError) as exc:
            # OSError covers download failures; ValueError covers files read_sas cannot parse.
            failed_downloads.append(('data', data_url, str(exc)))
        else:
            data_file_df.to_parquet(f"gs://{bucket_name}/data/{row['gcs_data_filename'].replace('.XPT','.parquet')}")

if failed_downloads:
    print(f"{len(failed_downloads)} file(s) could not be mirrored:")
    for kind, url, message in failed_downloads:
        print(f"  [{kind}] {url}: {message}")
    
    
    
    

demographic_variables_sample_weights_2005_2006_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_2007_2008_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_2003_2004_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_2001_2002_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_1999_2000_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_2009_2010_documentation.html uploaded to nhanes / docs/
demographic_variables_sample_weights_2011_2012_documentation.html uploaded to nhanes / docs/
demographic_variables_and_sample_weights_2013_2014_documentation.html uploaded to nhanes / docs/
demographic_variables_and_sample_weights_2015_2016_documentation.html uploaded to nhanes / docs/
demographic_variables_and_sample_weights_2017_2018_documentation.html uploaded to nhanes / docs/
demographic_variables_and_sample_weights_2017_2020_documen

In [23]:
# Debug inspection of the last `file_name` assigned by the download loop; only defined after that loop has run.
# NOTE(review): the recorded output below embeds a whole Series repr and does not match the per-row
# names shown in the loop's output — it appears to come from an earlier run; re-execute before relying on it.
file_name

'demographic_variables_sample_weights0_2005_1_2007_2_2003_3_2001_4_1999_5_2009_6_2011_7_2013_8_2015_9_2017_10_2017_name_start_year_dtype_int64_0_2006_1_2008_2_2004_3_2002_4_2000_5_2010_6_2012_7_2014_8_2016_9_2018_10_2020_name_end_year_dtype_int64_documentation.html'