In [3]:
import duckdb
import pandas as pd
import pandas_gbq
import re
import time
import numpy as np
import os
from utils import generate_filename, update_bq_table
from dotenv import load_dotenv

load_dotenv('../myenv.env')

BUCKET_NAME = os.getenv("BUCKET_NAME")
PROJECT_ID = os.getenv("PROJECT_ID")
GCP_KEY_ID = os.getenv("GCP_KEY_ID")
GCP_ACCESS_KEY = os.getenv("GCP_ACCESS_KEY")
DATASET = "nhanes"

bucket_name = 'nhanes_clean'

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [6]:
def configure_duckdb_gcp():
    """
    Configures DuckDB for use with GCP by setting necessary S3 configurations.
    Retrieves GCP access keys from environment variables.
    Raises EnvironmentError if keys are not found.
    """
    gcp_access_key_id = GCP_KEY_ID
    gcp_secret_access_key = GCP_ACCESS_KEY

    if not gcp_access_key_id or not gcp_secret_access_key:
        logging.error("GCP access key or secret key not set in environment variables.")
        raise EnvironmentError("GCP keys not found in environment variables.")

    duckdb.sql("SET s3_endpoint='storage.googleapis.com';")
    duckdb.sql(f"SET s3_access_key_id='{gcp_access_key_id}';")
    duckdb.sql(f"SET s3_secret_access_key='{gcp_secret_access_key}';")

In [4]:
%sql duckdb:///:default:

In [22]:
def get_metadata_df():
    """
    Retrieves metadata dataframe from BigQuery.

    Returns:
        DataFrame containing metadata information.
    """
    query = f"""
    SELECT *,
        replace(gcs_data_filename,'.XPT','.parquet') as parquet_filename
    FROM `{PROJECT_ID}.nhanes.nhanes_file_metadata`
    WHERE dataset = 'all-continuous-nhanes'
    AND page_component != 'Limited Access'
    """
    return pd.read_gbq(query, 
                       project_id=PROJECT_ID,
                       dialect="standard")

In [23]:
df = get_metadata_df()

In [21]:
# df

In [3]:
### GET METADATA DATAFRAME
metadata_df = pd.read_gbq(
    """SELECT *,
    replace(gcs_data_filename,'.XPT','.parquet') as parquet_filename
    FROM nhanes.nhanes_file_metadata
    WHERE dataset = 'all-continuous-nhanes'
    AND page_component != 'Limited Access'
    """,
    project_id="nhanes-genai",
    dialect="standard",
)

In [100]:
# metadata_df

In [4]:
file_subset_df = metadata_df[["data_file_name","page_component"]].drop_duplicates()
file_subset_df["data_file_name"] = file_subset_df["data_file_name"].str.strip()
file_subset_df["page_component"] = file_subset_df["page_component"].str.strip()

file_subset_df.drop_duplicates(inplace=True)


In [103]:
# file_subset_df.sort_values(by=['data_file_name','page_component'])[file_subset_df['data_file_name'].str.contains('Acrylamide')]['data_file_name'].unique().tolist()


  file_subset_df.sort_values(by=['data_file_name','page_component'])[file_subset_df['data_file_name'].str.contains('Acrylamide')]['data_file_name'].unique().tolist()


['Acrylamide & Glycidamide', 'Acrylamide & Glycidamide - Special Sample']

Unnamed: 0,Success


In [25]:
# %%sql
# output_df << SELECT * FROM read_parquet('s3://nhanes_clean/all-continuous-nhanes/data/acculturation_questionnaire_*.parquet',
#                                            union_by_name=True,
#                                            filename=True);

In [34]:
# duckdb.rollback()

In [7]:
configure_duckdb_gcp()

In [15]:
# df = duckdb.read_parquet('s3://nhanes_clean/all-continuous-nhanes/data/dual_energy_x_ray_absorptiometry_whole_body*.parquet',
#                     union_by_name=True,
#                     filename=True).to_df()

In [14]:
# df['filename'].unique().tolist()

In [46]:
# df['SEQN'] = df['SEQN'].astype('int64')

In [51]:
# for column in df.columns.tolist():
#     if column != 'filename':
#         df[column] = df[column].astype('int64')

In [None]:
# alias = generate_filename("Dietary Supplement Use 30-Day - File 1,".lower() + " " + page_component.lower(),'')
# alias

In [55]:
# metadata_df['data_file_name'].unique().tolist()

In [96]:
# metadata_df[(metadata_df['data_file_name'].str.contains('Hepatitis B'))].sort_values(by=['data_file_name','page_component'])

In [90]:
# file_subset_df.sort_values(by='data_file_name').reset_index(inplace=True)

In [99]:
# file_subset_df[file_subset_df['data_file_name'].str.contains("Hepatitis B")]['data_file_name'].unique()

In [8]:
i = 0

start_time = time.time()
new_time = time.time()

metadata_cols = ['start_year','end_year','last_updated','published_date','parquet_filename','data_file_url','doc_file_url','dataset']

for data_file_name, page_component in file_subset_df.sort_values(by='data_file_name').values:
    file_df = metadata_df[(metadata_df['data_file_name'] == data_file_name.strip()) &
                          (metadata_df['page_component'].str.contains(page_component.strip()))]
    if file_df['start_year'].max() > 2015:
        
        alias = generate_filename(data_file_name.lower() + " " + page_component.lower(),'')
        
        df = duckdb.read_parquet(f's3://{bucket_name}/all-continuous-nhanes/data/{alias}*.parquet',
                    union_by_name=True,
                    filename=True).to_df()
        
        df['survey'] = data_file_name
        df['survey_type'] = page_component
        if 'SEQN' in df.columns:
            df['SEQN'] = df['SEQN'].astype('Int64')  
        
        df['filename_only'] = df['filename'].apply(lambda x: x.split('/')[-1])
        
        df = df.merge(metadata_df[metadata_cols],
                 how='left',left_on='filename_only',right_on='parquet_filename')
        
        df['start_year'] = df['start_year'].astype('Int64')
        df['end_year'] = df['end_year'].astype('Int64')
        df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')
        
        df.drop(['filename_only'],axis=1, inplace=True)
        
        try:
            str_df = df.select_dtypes([object])
            str_df = str_df.stack().str.decode('utf-8').unstack()
        
            for col in str_df.columns:
                if col not in metadata_cols and col != 'filename':
                    df[col] = str_df[col]
        except Exception as ex:
            print(ex)
            print(f"Unable to convert data types from bytes to string for: {data_file_name} - {page_component}") 

        update_bq_table(
                        df,
                        alias,
                        dataset="nhanes",
                        bucket="nhanes_clean",
                        truncate=True,
                        max_error=0,
                        schema=None,
                    )
        i += 1
    
        if i % 10 == 0 and i > 0:
            print(f"Last 10 datasets took {time.time() - new_time} seconds")
            
print(f"Entire process took {time.time() - start_time} seconds")

acculturation_questionnaire_20231110_185042.csv uploaded to nhanes_clean / acculturation_questionnaire/
Starting job 09084aac-724b-4c28-a8ed-529c3769db19
Job finished.
Table Row Count 92158 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


albumin_creatinine_urine_laboratory_20231110_185055.csv uploaded to nhanes_clean / albumin_creatinine_urine_laboratory/
Starting job 966ac994-c854-403d-9aea-9e2dd16b4824
Job finished.
Table Row Count 94940 rows.
alcohol_use_questionnaire_20231110_185109.csv uploaded to nhanes_clean / alcohol_use_questionnaire/
Starting job 2424b5d0-7daa-48ad-be84-e0c0fc86a364
Job finished.
Table Row Count 62524 rows.
alpha_1_acid_glycoprotein_serum_surplus_laboratory_20231110_185116.csv uploaded to nhanes_clean / alpha_1_acid_glycoprotein_serum_surplus_laboratory/
Starting job 030ee158-990f-4175-834f-ab20279d4cb6
Job finished.
Table Row Count 6002 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


arsenic_total_urine_laboratory_20231110_185121.csv uploaded to nhanes_clean / arsenic_total_urine_laboratory/
Starting job 194cebe8-cfa8-43d7-b17e-4136f109e245
Job finished.
Table Row Count 13903 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


arsenics_speciated_urine_laboratory_20231110_185126.csv uploaded to nhanes_clean / arsenics_speciated_urine_laboratory/
Starting job 2e67964c-2355-43ed-a008-9ca0b9346452
Job finished.
Table Row Count 10624 rows.
audiometry_questionnaire_20231110_185135.csv uploaded to nhanes_clean / audiometry_questionnaire/
Starting job 875124b8-685d-4a60-97ac-585af5f868d5
Job finished.
Table Row Count 102027 rows.
audiometry_examination_20231110_185153.csv uploaded to nhanes_clean / audiometry_examination/
Starting job 0a30c6e3-c5e5-4297-a5d2-bece73618ae7
Job finished.
Table Row Count 29714 rows.
audiometry_acoustic_reflex_examination_20231110_185215.csv uploaded to nhanes_clean / audiometry_acoustic_reflex_examination/
Starting job 924d3e89-9139-434a-932c-45d663d34bd6
Job finished.
Table Row Count 156493 rows.
audiometry_wideband_reflectance_examination_20231110_185407.csv uploaded to nhanes_clean / audiometry_wideband_reflectance_examination/
Starting job e6e7e043-6bdf-4815-bbdf-ad626b6eae6e
Job fi

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


cholesterol_high_density_lipoprotein_hdl_laboratory_20231110_185513.csv uploaded to nhanes_clean / cholesterol_high_density_lipoprotein_hdl_laboratory/
Starting job 431aff3b-9c4b-4cfc-912f-c81338ab3311
Job finished.
Table Row Count 27654 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


cholesterol_low_density_lipoproteins_ldl_triglycerides_laboratory_20231110_185520.csv uploaded to nhanes_clean / cholesterol_low_density_lipoproteins_ldl_triglycerides_laboratory/
Starting job eeaab540-1890-46d2-875d-ef7afb7e7ce2
Job finished.
Table Row Count 8126 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


cholesterol_total_laboratory_20231110_185528.csv uploaded to nhanes_clean / cholesterol_total_laboratory/
Starting job 5ae0953b-6f47-4471-97ec-759a5ad04d36
Job finished.
Table Row Count 68575 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


chromium_cobalt_laboratory_20231110_185535.csv uploaded to nhanes_clean / chromium_cobalt_laboratory/
Starting job 8f21ddf4-0c60-4f2d-90bc-540c3759d04b
Job finished.
Table Row Count 13235 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


chromium_urine_laboratory_20231110_185540.csv uploaded to nhanes_clean / chromium_urine_laboratory/
Starting job 2b7b5b60-a674-4c31-88c0-6c93716cd894
Job finished.
Table Row Count 7869 rows.
complete_blood_count_with_5_part_differential_laboratory_20231110_185545.csv uploaded to nhanes_clean / complete_blood_count_with_5_part_differential_laboratory/
Starting job 344a3d09-c24f-4735-805b-76df3e2de805
Job finished.
Table Row Count 8366 rows.
complete_blood_count_with_5_part_differential_in_whole_blood_laboratory_20231110_185550.csv uploaded to nhanes_clean / complete_blood_count_with_5_part_differential_in_whole_blood_laboratory/
Starting job 234bb84f-75ca-417b-b3d8-1fac7f724a96
Job finished.
Table Row Count 13772 rows.
Last 10 datasets took 318.44487619400024 seconds
consumer_behavior_questionnaire_20231110_185558.csv uploaded to nhanes_clean / consumer_behavior_questionnaire/
Starting job 6ab94c2f-5e47-4656-b6fa-c65d32212082
Job finished.
Table Row Count 59842 rows.
consumer_behavior_p

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


cotinine_and_hydroxycotinine_serum_laboratory_20231110_185628.csv uploaded to nhanes_clean / cotinine_and_hydroxycotinine_serum_laboratory/
Starting job 77c01477-2dac-4a3c-8e3e-516f918543a5
Job finished.
Table Row Count 38484 rows.
current_health_status_questionnaire_20231110_185639.csv uploaded to nhanes_clean / current_health_status_questionnaire/
Starting job 7f8f8488-830d-4ae0-962f-83966c33c1a1
Job finished.
Table Row Count 102685 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


cytomegalovirus_igg_igm_antibodies_serum_laboratory_20231110_185650.csv uploaded to nhanes_clean / cytomegalovirus_igg_igm_antibodies_serum_laboratory/
Starting job 541ce115-36aa-421b-b358-a9799aa8869b
Job finished.
Table Row Count 3640 rows.
demographic_variables_and_sample_weights_demographics_20231110_185656.csv uploaded to nhanes_clean / demographic_variables_and_sample_weights_demographics/
Starting job fe82e71a-5371-4b83-adee-b5b47ae58fef
Job finished.
Table Row Count 44960 rows.
dermatology_questionnaire_20231110_185712.csv uploaded to nhanes_clean / dermatology_questionnaire/
Starting job 1f688c41-77f2-4e50-b782-5d18202a3630
Job finished.
Table Row Count 43385 rows.
diabetes_questionnaire_20231110_185724.csv uploaded to nhanes_clean / diabetes_questionnaire/
Starting job 34a5dfb1-7acd-45ee-b3bf-359b8aef9c80
Job finished.
Table Row Count 111797 rows.
diet_behavior_nutrition_questionnaire_20231110_185742.csv uploaded to nhanes_clean / diet_behavior_nutrition_questionnaire/
Starti

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


ethylene_oxide_laboratory_20231110_191100.csv uploaded to nhanes_clean / ethylene_oxide_laboratory/
Starting job 4fbefe49-ede6-4f1a-8809-4720cb1d3101
Job finished.
Table Row Count 11986 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


fasting_questionnaire_laboratory_20231110_191110.csv uploaded to nhanes_clean / fasting_questionnaire_laboratory/
Starting job 5b86ff13-f562-47b7-af4a-349f7de21d8d
Job finished.
Table Row Count 106203 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


ferritin_laboratory_20231110_191124.csv uploaded to nhanes_clean / ferritin_laboratory/
Starting job dcc26707-b3c8-4f28-a54a-854f21fb5e46
Job finished.
Table Row Count 33176 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


flame_retardants_urine_laboratory_20231110_191129.csv uploaded to nhanes_clean / flame_retardants_urine_laboratory/
Starting job 9c8fa031-d737-4853-954b-0d1309f46869
Job finished.
Table Row Count 7915 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


folate_rbc_laboratory_20231110_191137.csv uploaded to nhanes_clean / folate_rbc_laboratory/
Starting job 24b0d47d-d11f-4b8d-8dee-65131b546b4b
Job finished.
Table Row Count 41021 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


folate_forms_total_individual_serum_laboratory_20231110_191144.csv uploaded to nhanes_clean / folate_forms_total_individual_serum_laboratory/
Starting job c7550e0d-216a-4658-bd09-696f0f0868a9
Job finished.
Table Row Count 41021 rows.
Last 10 datasets took 1273.016511440277 seconds
food_security_questionnaire_20231110_191155.csv uploaded to nhanes_clean / food_security_questionnaire/
Starting job 8e62a6a8-4706-4a41-a5ff-1721ccd0dea3
Job finished.
Table Row Count 116876 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


glycohemoglobin_laboratory_20231110_191215.csv uploaded to nhanes_clean / glycohemoglobin_laboratory/
Starting job d26ce792-5d02-4935-8a91-649c761e7fd3
Job finished.
Table Row Count 79541 rows.
health_insurance_questionnaire_20231110_191228.csv uploaded to nhanes_clean / health_insurance_questionnaire/
Starting job 009eb132-76ef-47da-b56a-fa5fefd67c30
Job finished.
Table Row Count 116876 rows.
hepatitis_questionnaire_20231110_191240.csv uploaded to nhanes_clean / hepatitis_questionnaire/
Starting job 979d3c88-8799-44af-9aac-8d7eb83b6573
Job finished.
Table Row Count 38035 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


hepatitis_a_laboratory_20231110_191248.csv uploaded to nhanes_clean / hepatitis_a_laboratory/
Starting job c217e949-855d-4831-8027-d351f6754c3f
Job finished.
Table Row Count 39630 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


hepatitis_b_surface_antibody_laboratory_20231110_191301.csv uploaded to nhanes_clean / hepatitis_b_surface_antibody_laboratory/
Starting job 855b0f3f-a556-4d00-a43f-d167f4e48fc6
Job finished.
Table Row Count 103028 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


hepatitis_b_surface_antibody_laboratory_20231110_191314.csv uploaded to nhanes_clean / hepatitis_b_surface_antibody_laboratory/
Starting job cda8b70f-5a83-4ca3-9d26-fcfff0024b29
Job finished.
Table Row Count 103028 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


hepatitis_c_rna_hcv_rna_confirmed_antibody_inno_lia_genotype_laboratory_20231110_191323.csv uploaded to nhanes_clean / hepatitis_c_rna_hcv_rna_confirmed_antibody_inno_lia_genotype_laboratory/
Starting job baf587e8-9ab4-427f-8c61-a7ed5d463ad1
Job finished.
Table Row Count 19633 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


hepatitis_e_igg_igm_antibodies_laboratory_20231110_191330.csv uploaded to nhanes_clean / hepatitis_e_igg_igm_antibodies_laboratory/
Starting job 1644b5fb-0bcc-400b-b20b-5d05ca89dae0
Job finished.
Table Row Count 52357 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


high_sensitivity_c_reactive_protein_laboratory_20231110_191336.csv uploaded to nhanes_clean / high_sensitivity_c_reactive_protein_laboratory/
Starting job b03fcd2f-7a76-496b-afcc-b32939b9e085
Job finished.
Table Row Count 22138 rows.
Last 10 datasets took 1386.0373814105988 seconds
hospital_utilization_access_to_care_questionnaire_20231110_191348.csv uploaded to nhanes_clean / hospital_utilization_access_to_care_questionnaire/
Starting job 5101db9b-1c97-4399-b8ba-901f91a77448
Job finished.
Table Row Count 116876 rows.
housing_characteristics_questionnaire_20231110_191404.csv uploaded to nhanes_clean / housing_characteristics_questionnaire/
Starting job cc7fc7a2-5f85-4102-9aa8-5f1e0c9b5ff7
Job finished.
Table Row Count 101316 rows.
immunization_questionnaire_20231110_191418.csv uploaded to nhanes_clean / immunization_questionnaire/
Starting job c83ba971-b687-4f8b-94a5-b00557d892f2
Job finished.
Table Row Count 116876 rows.
income_questionnaire_20231110_191432.csv uploaded to nhanes_clea

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


insulin_laboratory_20231110_191441.csv uploaded to nhanes_clean / insulin_laboratory/
Starting job 3c6d90bb-860c-4b6b-b890-f7e3575a5ad7
Job finished.
Table Row Count 14646 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


iodine_urine_laboratory_20231110_191450.csv uploaded to nhanes_clean / iodine_urine_laboratory/
Starting job 8195386b-2d6d-42a0-a8e2-1014a2c0b297
Job finished.
Table Row Count 35880 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


iron_status_serum_laboratory_20231110_191456.csv uploaded to nhanes_clean / iron_status_serum_laboratory/
Starting job c8fdd710-343f-4dca-afbd-58d19bf48ecb
Job finished.
Table Row Count 16810 rows.
kidney_conditions_urology_questionnaire_20231110_191505.csv uploaded to nhanes_clean / kidney_conditions_urology_questionnaire/
Starting job ff94b4fe-f919-474d-87dd-85c94cc77735
Job finished.
Table Row Count 59433 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


lead_cadmium_total_mercury_selenium_manganese_blood_laboratory_20231110_191514.csv uploaded to nhanes_clean / lead_cadmium_total_mercury_selenium_manganese_blood_laboratory/
Starting job 8688416a-3a71-4ec6-9ad7-046f0627a11e
Job finished.
Table Row Count 27953 rows.
medical_conditions_questionnaire_20231110_191526.csv uploaded to nhanes_clean / medical_conditions_questionnaire/
Starting job 225ee409-8293-45f8-b592-c7da6c8b26a3
Job finished.
Table Row Count 111797 rows.
Last 10 datasets took 1507.734755039215 seconds
mental_health_depression_screener_questionnaire_20231110_191548.csv uploaded to nhanes_clean / mental_health_depression_screener_questionnaire/
Starting job 804504c6-dab8-4e16-ae9f-22fe84caa271
Job finished.
Table Row Count 49461 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


mercury_inorganic_urine_laboratory_20231110_191558.csv uploaded to nhanes_clean / mercury_inorganic_urine_laboratory/
Starting job a768b67e-53cc-48b0-b766-82c751c2939d
Job finished.
Table Row Count 27523 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


mercury_inorganic_ethyl_and_methyl_blood_laboratory_20231110_191605.csv uploaded to nhanes_clean / mercury_inorganic_ethyl_and_methyl_blood_laboratory/
Starting job 17482153-91e1-41fe-9e32-f6ac68893d52
Job finished.
Table Row Count 29069 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


metals_urine_laboratory_20231110_191614.csv uploaded to nhanes_clean / metals_urine_laboratory/
Starting job 510c6427-0b3e-4663-8ca2-0223c9378d42
Job finished.
Table Row Count 32874 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


nickel_urine_laboratory_20231110_191622.csv uploaded to nhanes_clean / nickel_urine_laboratory/
Starting job c3666df5-ed89-4b4b-80f6-291050b5c2e9
Job finished.
Table Row Count 7869 rows.
occupation_questionnaire_20231110_191630.csv uploaded to nhanes_clean / occupation_questionnaire/
Starting job 6857d1ae-9fdc-4b30-b2f1-097763cfa736
Job finished.
Table Row Count 75038 rows.
oral_health_questionnaire_20231110_191644.csv uploaded to nhanes_clean / oral_health_questionnaire/
Starting job 1dc44e61-6295-4fc7-8245-69e617850e5e
Job finished.
Table Row Count 98221 rows.
osteoporosis_questionnaire_20231110_191702.csv uploaded to nhanes_clean / osteoporosis_questionnaire/
Starting job d90ab0ae-ede4-43f3-b633-f4e985639df9
Job finished.
Table Row Count 44335 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


perchlorate_nitrate_thiocyanate_urine_laboratory_20231110_191716.csv uploaded to nhanes_clean / perchlorate_nitrate_thiocyanate_urine_laboratory/
Starting job 442bcab0-ea58-4ae6-bcd1-2a7aacb6009b
Job finished.
Table Row Count 35656 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


perfluoroalkyl_and_polyfluoroalkyl_substances_laboratory_20231110_191724.csv uploaded to nhanes_clean / perfluoroalkyl_and_polyfluoroalkyl_substances_laboratory/
Starting job 4429f307-0ca3-41a6-b6fa-d15c28767edb
Job finished.
Table Row Count 4461 rows.
Last 10 datasets took 1611.2208850383759 seconds
perfluoroalkyl_and_polyfluoroalkyl_substances_surplus_laboratory_20231110_191729.csv uploaded to nhanes_clean / perfluoroalkyl_and_polyfluoroalkyl_substances_surplus_laboratory/
Starting job 683e2b94-1fc2-43f8-829a-ee0efdb12403
Job finished.
Table Row Count 1672 rows.
pesticide_use_questionnaire_20231110_191737.csv uploaded to nhanes_clean / pesticide_use_questionnaire/
Starting job 7f55e5fe-d4be-4f15-9184-753c861f67b4
Job finished.
Table Row Count 99701 rows.
physical_activity_questionnaire_20231110_191753.csv uploaded to nhanes_clean / physical_activity_questionnaire/
Starting job 8b61da53-32d9-4726-b34c-a82541f49aa7
Job finished.
Table Row Count 100509 rows.
physical_activity_youth_ques

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


plasma_fasting_glucose_laboratory_20231110_191832.csv uploaded to nhanes_clean / plasma_fasting_glucose_laboratory/
Starting job 7cfaf1c5-8e82-430c-ba19-bb1170d70330
Job finished.
Table Row Count 14646 rows.
'utf-8' codec can't decode byte 0xf6 in position 18: invalid start byte
Unable to convert data types from bytes to string for: Prescription Medications - Questionnaire
prescription_medications_questionnaire_20231110_191846.csv uploaded to nhanes_clean / prescription_medications_questionnaire/
Starting job 683cacb8-3f49-4c7c-b25d-9233786f99f5
Job finished.
Table Row Count 223038 rows.
preventive_aspirin_use_questionnaire_20231110_191904.csv uploaded to nhanes_clean / preventive_aspirin_use_questionnaire/
Starting job f9ac7178-697c-4007-99ce-8ea26cf204fb
Job finished.
Table Row Count 21499 rows.
reproductive_health_questionnaire_20231110_191914.csv uploaded to nhanes_clean / reproductive_health_questionnaire/
Starting job fc50daf8-a3de-4f58-8dde-59063319807a
Job finished.
Table Row C

  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


standard_biochemistry_profile_laboratory_20231110_192024.csv uploaded to nhanes_clean / standard_biochemistry_profile_laboratory/
Starting job a0044f8e-e1eb-46e9-87d4-12f6634feaaf
Job finished.
Table Row Count 72783 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


transferrin_receptor_laboratory_20231110_192038.csv uploaded to nhanes_clean / transferrin_receptor_laboratory/
Starting job 5b7d0bfe-8769-4be8-b507-9ce912dff72e
Job finished.
Table Row Count 25761 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


urine_flow_rate_laboratory_20231110_192046.csv uploaded to nhanes_clean / urine_flow_rate_laboratory/
Starting job b8984a57-e7d7-40e5-bc1c-a0dd999a5344
Job finished.
Table Row Count 54274 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


urine_pregnancy_test_laboratory_20231110_192053.csv uploaded to nhanes_clean / urine_pregnancy_test_laboratory/
Starting job ca7f5e76-dfd0-4496-8535-3fb1ceed4eca
Job finished.
Table Row Count 2807 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


vitamin_c_laboratory_20231110_192059.csv uploaded to nhanes_clean / vitamin_c_laboratory/
Starting job 3b666405-87af-442d-9e2d-f473167155f6
Job finished.
Table Row Count 23503 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


volatile_organic_compound_voc_metabolites_urine_laboratory_20231110_192105.csv uploaded to nhanes_clean / volatile_organic_compound_voc_metabolites_urine_laboratory/
Starting job bc94279b-bff9-42ed-869f-a51685eb3260
Job finished.
Table Row Count 16454 rows.
Last 10 datasets took 1835.585443496704 seconds
volatile_organic_compound_voc_metabolites_ii_urine_laboratory_20231110_192113.csv uploaded to nhanes_clean / volatile_organic_compound_voc_metabolites_ii_urine_laboratory/
Starting job 800726cb-bbac-4c4c-ac7a-cb449aa03eae
Job finished.
Table Row Count 4890 rows.
volatile_organic_compound_voc_metabolites_ii_urine_surplus_laboratory_20231110_192117.csv uploaded to nhanes_clean / volatile_organic_compound_voc_metabolites_ii_urine_surplus_laboratory/
Starting job ec8578e6-d481-4f5d-94d6-9b88b098ae77
Job finished.
Table Row Count 2979 rows.


  df['published_date'] = pd.to_datetime(df['published_date'],errors='ignore')


volatile_organic_compounds_and_trihalomethanes_mtbe_blood_laboratory_20231110_192122.csv uploaded to nhanes_clean / volatile_organic_compounds_and_trihalomethanes_mtbe_blood_laboratory/
Starting job 94ebc1ee-7c96-421c-b574-796730981bd7
Job finished.
Table Row Count 15251 rows.
volatile_toxicant_questionnaire_20231110_192134.csv uploaded to nhanes_clean / volatile_toxicant_questionnaire/
Starting job 85ae842b-0409-47db-a37d-50c2f3870da3
Job finished.
Table Row Count 11762 rows.
weight_history_questionnaire_20231110_192145.csv uploaded to nhanes_clean / weight_history_questionnaire/
Starting job bd3fcaec-c06e-42de-b21a-fbb8730029e5
Job finished.
Table Row Count 73787 rows.
weight_history_youth_questionnaire_20231110_192159.csv uploaded to nhanes_clean / weight_history_youth_questionnaire/
Starting job c020ea7e-0df2-4518-bb60-3196eff4edd8
Job finished.
Table Row Count 12777 rows.
Entire process took 1885.296294927597 seconds


In [None]:
# str_df.values[2][1].decode('utf-8')

In [10]:
# df

In [24]:
col

'DRXFCSD'

In [15]:
# file_df['gcs_data_filename'].apply(lambda x: x.split('all_continuous')[0] + '*.parquet')

226     acculturation_questionnaire_*.parquet
449     acculturation_questionnaire_*.parquet
534     acculturation_questionnaire_*.parquet
1049    acculturation_questionnaire_*.parquet
1077    acculturation_questionnaire_*.parquet
1164    acculturation_questionnaire_*.parquet
1187    acculturation_questionnaire_*.parquet
1329    acculturation_questionnaire_*.parquet
1349    acculturation_questionnaire_*.parquet
1395    acculturation_questionnaire_*.parquet
1475    acculturation_questionnaire_*.parquet
Name: gcs_data_filename, dtype: object

In [57]:
df

Unnamed: 0,SEQN,ACD010A,ACD010B,ACD010C,ACQ020,ACQ030,ACD040,ACQ050,ACQ060,ACD070,ACD080,ACD011A,ACD011B,ACD011C,ACD110,filename
0,2,1.0,,,,,,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
1,5,1.0,,,,,,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
2,6,1.0,,,,,,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
3,7,1.0,,,,,,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
4,8,1.0,,,,,,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92153,124817,,,,,,2.0,,,,,,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
92154,124818,,,,,,,,,,,1.0,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
92155,124820,,,,,,,,,,,1.0,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...
92156,124821,,,,,,,,,,,1.0,,,,s3://nhanes_clean/all-continuous-nhanes/data/a...


In [59]:
# metadata_df['gcs_data_filename'].tolist()

In [56]:
# metadata_df[metadata_df['gcs_data_filename'].str.contains('audiometry_all_continuous_nhanes_1999_2000_data.parquet')]

In [149]:
file_df.sort_values(by=['start_year','end_year','data_file_name'], ascending=False).drop_duplicates(subset=['gcs_data_filename'])['gcs_data_filename'].tolist()



['physical_activity_monitor_raw_data_80hz_examination_all_continuous_nhanes_2013_2014_data.XPT',
 'physical_activity_monitor_raw_data_80hz_examination_all_continuous_nhanes_2011_2012_data.XPT']

Unnamed: 0,Success


In [None]:
%%sql
output_df << SELECT * FROM read_parquet(f's3://nhanes/all-continuous-nhanes/data/audiometry_all*.parquet',
                                           union_by_name=True,
                                           filename=true);

In [44]:
# output_df['filename_only'] = output_df['filename'].apply(lambda x: x.split('/')[-1])

In [54]:
# output_df['filename_only'].tolist()

In [61]:
output_df.merge(metadata_df[['start_year','end_year','last_updated','published_date','parquet_filename']],how='left',left_on='filename_only',right_on='parquet_filename')




Unnamed: 0,SEQN,AUQ130,AUQ140,AUQ150,AUQ160,AUQ170,AUQ180,AUQ190,AUQ200,AUQ210,...,AUQ490,AUQ500,AUQ510,filename,filename_only,start_year,end_year,last_updated,published_date,parquet_filename
0,1.0,1.0,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_1999_2000_dat...,1999.0,2000.0,2023-11-07 21:25:40.087858+00:00,2005-02-01 00:00:00,audiometry_all_continuous_nhanes_1999_2000_dat...
1,1.0,1.0,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_1999_2000_dat...,1999.0,2000.0,2023-11-07 21:25:42.493038+00:00,2002-06-01 00:00:00,audiometry_all_continuous_nhanes_1999_2000_dat...
2,2.0,2.0,4.0,2.0,,,,2.0,,2.0,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_1999_2000_dat...,1999.0,2000.0,2023-11-07 21:25:40.087858+00:00,2005-02-01 00:00:00,audiometry_all_continuous_nhanes_1999_2000_dat...
3,2.0,2.0,4.0,2.0,,,,2.0,,2.0,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_1999_2000_dat...,1999.0,2000.0,2023-11-07 21:25:42.493038+00:00,2002-06-01 00:00:00,audiometry_all_continuous_nhanes_1999_2000_dat...
4,3.0,1.0,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_1999_2000_dat...,1999.0,2000.0,2023-11-07 21:25:40.087858+00:00,2005-02-01 00:00:00,audiometry_all_continuous_nhanes_1999_2000_dat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204049,124820.0,,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_2017_2020_dat...,2017.0,2020.0,2023-11-07 21:25:42.493038+00:00,2021-09-01 00:00:00,audiometry_all_continuous_nhanes_2017_2020_dat...
204050,124821.0,,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_2017_2020_dat...,2017.0,2020.0,2023-11-07 21:25:40.087858+00:00,2022-03-01 00:00:00,audiometry_all_continuous_nhanes_2017_2020_dat...
204051,124821.0,,,,,,,,,,...,,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_2017_2020_dat...,2017.0,2020.0,2023-11-07 21:25:42.493038+00:00,2021-09-01 00:00:00,audiometry_all_continuous_nhanes_2017_2020_dat...
204052,124822.0,,,,,,,,,,...,2.0,,,s3://nhanes/all-continuous-nhanes/data/audiome...,audiometry_all_continuous_nhanes_2017_2020_dat...,2017.0,2020.0,2023-11-07 21:25:40.087858+00:00,2022-03-01 00:00:00,audiometry_all_continuous_nhanes_2017_2020_dat...
