In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time
from utils import update_bq_table, generate_filename, upload_blob_from_string
bucket_name = 'nhanes'

In [2]:
components = ["Demographics","Dietary","Examination","Laboratory","Questionnaire","LimitedAccess"]

In [3]:
dfs = []

for component in components: 
    r = requests.get(f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={component}")

    soup = BeautifulSoup(r.text, 'lxml')

    table = soup.find('table',id='GridView1')

    headers = []
    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    
    if component != 'LimitedAccess':
        headers = ['_'.join(h.lower().split()) for h in headers] + ['doc_file_url','data_file_url']
    else:
        headers = ['_'.join(h.lower().split()) for h in headers] + ['doc_file_url']

    base_url = "https://wwwn.cdc.gov"

    data = []

    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data] + [base_url + i.a['href'] for i in row_data if i.find('a')]
        data.append(row)

    df = pd.DataFrame(columns = headers,data=data)

    df['years'] = df['years'].str.strip()
    df['doc_file'] = df['doc_file'].str.strip()
    df['data_file'] = df['data_file'].str.strip()
    df['date_published'] = df['date_published'].str.strip().str.replace('Updated ','')
    df['start_year'] = df['years'].apply(lambda x: x.split('-')[0]).astype(int)
    df['end_year'] = df['years'].apply(lambda x: x.split('-')[1]).astype(int)
    df['published_date'] = pd.to_datetime(df['date_published'],format='%B %Y',errors='ignore')
    df['page_component'] = component
    df['last_updated'] = datetime.datetime.utcnow()
    df['gcs_doc_filename'] = (df['data_file_name'] + ' ' + df['start_year'].astype(str) + ' ' + df['end_year'].astype(str) + ' Documentation').apply(lambda x: generate_filename(x,extension='.html'))
    df['gcs_data_filename'] = (df['data_file_name'] + ' ' + df['start_year'].astype(str) + ' ' + df['end_year'].astype(str) + ' Data').apply(lambda x: generate_filename(x,extension='.XPT'))
    
    dfs.append(df)

In [4]:
final_df = pd.concat(dfs,ignore_index=True)

alias = 'nhanes_file_metadata'
update_bq_table(final_df, alias, dataset='nhanes',bucket='nhanes', truncate = True, max_error=0, schema=None)

nhanes_file_metadata_20231107_041958.csv uploaded to nhanes / nhanes_file_metadata/
Starting job ce74d6a9-0118-4227-af90-d1e31b09010a
Job finished.
Table Row Count 1737 rows.


In [6]:
# final_df

In [None]:
start_time = time.time()

for index, row in final_df[230:].iterrows():
    
    if not row['doc_file_url'].endswith('.aspx'):
        try:
            r = requests.get(row['doc_file_url'],stream=True,timeout=20)
            html_content = r.content
            file_name = row['gcs_doc_filename']
            upload_blob_from_string(bucket_name = bucket_name,
                                    bucket_folder = 'docs/',
                                    file_name = file_name,
                                    blob_string = html_content,
                                    encoding='text/html')
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {row['gcs_doc_filename']}")
    else:
        print(f"Skipping {row['doc_file_url']}")
    

    if not 'RDC' in row['data_file_url'] and row['data_file_url']:
        try:
            data_file_df = pd.read_sas(row['data_file_url'])
            data_file_df.to_parquet(f"gs://{bucket_name}/data/{row['gcs_data_filename'].replace('.XPT','.parquet')}")
            print(f"{row['data_file_url']} uploaded to gs://{bucket_name}/data/")
        except Exception as ex:
            print(ex)
            print(f"Unable to upload {row['gcs_data_filename']}")
            
    else:
        print(f"Skipping {row['gcs_data_filename']}")
    
print(f"Entire process took {time.time() - start_time} seconds")
    
    
    

Skipping https://wwwn.cdc.gov/Nchs/Nhanes/Dxa/Dxa.aspx
unable to infer format of SAS file from filename: 'https://wwwn.cdc.gov/nchs/nhanes/dxa/dxa.aspx'
Unable to upload dual_energy_x_ray_absorptiometry_whole_body_1999_2006_data.XPT
dual_energy_x_ray_absorptiometry_whole_body_2011_2012_documentation.html uploaded to nhanes / docs/
https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DXX_G.XPT uploaded to gs://nhanes/data/
dual_energy_x_ray_absorptiometry_whole_body_2015_2016_documentation.html uploaded to nhanes / docs/
https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DXX_I.XPT uploaded to gs://nhanes/data/
dual_energy_x_ray_absorptiometry_whole_body_2013_2014_documentation.html uploaded to nhanes / docs/
https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DXX_H.XPT uploaded to gs://nhanes/data/
dual_energy_x_ray_absorptiometry_whole_body_2017_2018_documentation.html uploaded to nhanes / docs/
https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DXX_J.XPT uploaded to gs://nhanes/data/
dual_energy_x_ray_absorptiometry_wh

In [15]:
index

230

In [21]:
# final_df[index:][final_df['doc_file_url'].str.contains('.aspx')]

  final_df[index:][final_df['doc_file_url'].str.contains('.aspx')]


Unnamed: 0,years,data_file_name,doc_file,data_file,date_published,doc_file_url,data_file_url,start_year,end_year,published_date,page_component,last_updated,gcs_doc_filename,gcs_data_filename
230,1999-2006,Dual-Energy X-ray Absorptiometry - Whole Body,All Years Doc,All Years Data,December 2016,https://wwwn.cdc.gov/Nchs/Nhanes/Dxa/Dxa.aspx,https://wwwn.cdc.gov/Nchs/Nhanes/Dxa/Dxa.aspx,1999,2006,2016-12-01 00:00:00,Examination,2023-11-07 04:19:55.075614,dual_energy_x_ray_absorptiometry_whole_body_19...,dual_energy_x_ray_absorptiometry_whole_body_19...
790,2009-2012,Oral Microbiome Project,All Years Doc,All Years Data,October 2022,https://wwwn.cdc.gov/Nchs/Nhanes/Omp/Default.aspx,https://wwwn.cdc.gov/Nchs/Nhanes/Omp/Default.aspx,2009,2012,October 2022,Laboratory,2023-11-07 04:19:56.321392,oral_microbiome_project_2009_2012_documentatio...,oral_microbiome_project_2009_2012_data.XPT
1000,2001-2002,Vitamin D,VID_B Doc,VID_B Data [XPT - 136.2 KB],October 2015,https://wwwn.cdc.gov../vitamind/analyticalnote...,https://wwwn.cdc.gov../vitamind/analyticalnote...,2001,2002,October 2015,Laboratory,2023-11-07 04:19:56.321392,vitamin_d_2001_2002_documentation.html,vitamin_d_2001_2002_data.XPT
1001,2003-2004,Vitamin D,VID_C Doc,VID_C Data [XPT - 144.5 KB],October 2015,https://wwwn.cdc.gov../vitamind/analyticalnote...,https://wwwn.cdc.gov../vitamind/analyticalnote...,2003,2004,October 2015,Laboratory,2023-11-07 04:19:56.321392,vitamin_d_2003_2004_documentation.html,vitamin_d_2003_2004_data.XPT
1002,2005-2006,Vitamin D,VID_D Doc,VID_D Data [XPT - 148.5 KB],October 2015,https://wwwn.cdc.gov../vitamind/analyticalnote...,https://wwwn.cdc.gov../vitamind/analyticalnote...,2005,2006,October 2015,Laboratory,2023-11-07 04:19:56.321392,vitamin_d_2005_2006_documentation.html,vitamin_d_2005_2006_data.XPT
1003,2007-2008,Vitamin D,VID_E Doc,VID_E Data [XPT - 656.4 KB],October 2015,https://wwwn.cdc.gov../vitamind/analyticalnote...,https://wwwn.cdc.gov../vitamind/analyticalnote...,2007,2008,October 2015,Laboratory,2023-11-07 04:19:56.321392,vitamin_d_2007_2008_documentation.html,vitamin_d_2007_2008_data.XPT
1004,2009-2010,Vitamin D,VID_F Doc,VID_F Data [XPT - 693.5 KB],October 2015,https://wwwn.cdc.gov../vitamind/analyticalnote...,https://wwwn.cdc.gov../vitamind/analyticalnote...,2009,2010,October 2015,Laboratory,2023-11-07 04:19:56.321392,vitamin_d_2009_2010_documentation.html,vitamin_d_2009_2010_data.XPT
1688,2009-2012,Oral Microbiome Project,All Years Doc,RDC Only,October 2022,https://wwwn.cdc.gov/Nchs/Nhanes/Omp/Default.aspx,,2009,2012,October 2022,LimitedAccess,2023-11-07 04:19:58.300398,oral_microbiome_project_2009_2012_documentatio...,oral_microbiome_project_2009_2012_data.XPT
