## Import libraries

Import the libraries that are used throughout the rest of the notebook.


In [24]:
# @title Import libraries
%matplotlib inline

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from IPython.display import display

## Load data

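Load the NHANES 2017-2018 data tables and their data dictionary (variable descriptions and encodings) from the `RileyZurrin/NHANES_Extractor` GitHub repository. These are used in the rest of the notebook.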

In [25]:
# @title Load data


# Variable codes kept from each component table.
demographics_columns = [
    "RIAGENDR",
    "RIDAGEYR",
    "RIDAGEMN",
    "RIDRETH1",
    "RIDRETH3",
    "DMDEDUC3",
    "DMDEDUC2",
    "INDHHIN2",
    "INDFMIN2",
]
examination_columns = [
    "BPXCHR",
    "BPXPLS",
    "BPXSY1",
    "BPXDI1",
    "BPXSY2",
    "BPXDI2",
    "BPXSY3",
    "BPXDI3",
    "BPXSY4",
    "BPXDI4",
    "BMXWT",
    "BMXHT",
    "BMXBMI",
    "BMXWAIST",
    "BMXHIP",
]
lab_columns = [
    "LBDHDD",
    "LBXTC",
    "LBXGLU",
]
questionnaire_columns = [
    "DIQ010",
    "DIQ160",
    "DIQ170",
    "DIQ172",
    "DIQ175A",
]


def _read_component(url):
    """Read one component CSV from ``url`` and index it by its ``SEQN`` column."""
    return pd.read_csv(url).set_index('SEQN')


# Component tables, one row per SEQN.
demographics_df = _read_component('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Data_scraper/NHANES_2017-2018/demographics.csv')
dietary_df = _read_component('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Data_scraper/NHANES_2017-2018/dietary.csv')
examination_df = _read_component('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Data_scraper/NHANES_2017-2018/examination.csv')
labs_df = _read_component('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Data_scraper/NHANES_2017-2018/laboratory.csv')
questionnaire_df = _read_component('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Data_scraper/NHANES_2017-2018/questionnaire.csv')

# Data dictionary: encodings and descriptions, keyed by the "Variable" column.
encodings_df = pd.read_csv('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Dictionary_scraper/NHANES_2017-2018/encodings.csv')
description_df = pd.read_csv('https://raw.githubusercontent.com/RileyZurrin/NHANES_Extractor/main/Dictionary_scraper/NHANES_2017-2018/descriptions.csv')

# Outer-join the selected columns of every component onto the demographics
# selection, so an index value present in any of the tables is kept.
_other_components = [
    examination_df[examination_columns],
    labs_df[lab_columns],
    questionnaire_df[questionnaire_columns],
]
master_df = demographics_df[demographics_columns].join(other=_other_components, how="outer")

# Restrict the data dictionary to the selected variables and pair each
# description row with its encoding row.
final_columns = [
    *demographics_columns,
    *examination_columns,
    *lab_columns,
    *questionnaire_columns,
]
master_encodings = encodings_df.loc[encodings_df["Variable"].isin(final_columns)]
master_descriptions = description_df.loc[description_df["Variable"].isin(final_columns)]
master_meta_df = pd.merge(master_descriptions, master_encodings, on="Variable", how="outer")

master_meta_df

Unnamed: 0,Variable,DataType,Label,Plain Description,Target,Encoding
0,RIAGENDR,demographics,Gender,Gender of the participant.,Both males and females 0 YEARS -150 YEARS,"{1: 'Male', 2: 'Female', '.': 'Missing'}"
1,RIDAGEYR,demographics,Age in years at screening,Age in years of the participant at the time of...,Both males and females 0 YEARS -150 YEARS,"{80: '80 years of age and over', '.': 'Missing'}"
2,RIDAGEMN,demographics,Age in months at screening - 0 to 24 mos,Age in months of the participant at the time o...,Both males and females 0 YEARS -2 YEARS,{'.': 'Missing'}
3,RIDRETH1,demographics,Race/Hispanic origin,Recode of reported race and Hispanic origin in...,Both males and females 0 YEARS -150 YEARS,"{1: 'Mexican American', 2: 'Other Hispanic', 3..."
4,RIDRETH3,demographics,Race/Hispanic origin w/ NH Asian,Recode of reported race and Hispanic origin in...,Both males and females 0 YEARS -150 YEARS,"{1: 'Mexican American', 2: 'Other Hispanic', 3..."
5,DMDEDUC3,demographics,Education level - Children/Youth 6-19,What is the highest grade or level of school {...,Both males and females 6 YEARS -19 YEARS,"{0: 'Never attended / kindergarten only', 1: '..."
6,DMDEDUC2,demographics,Education level - Adults 20+,What is the highest grade or level of school {...,Both males and females 20 YEARS -150 YEARS,"{1: 'Less than 9th grade', 2: '9-11th grade (I..."
7,INDHHIN2,demographics,Annual household income,Total household income (reported as a range va...,Both males and females 0 YEARS -150 YEARS,"{1: '$ 0 to $ 4,999', 2: '$ 5,000 to $ 9,999',..."
8,INDFMIN2,demographics,Annual family income,Total family income (reported as a range value...,Both males and females 0 YEARS -150 YEARS,"{1: '$ 0 to $ 4,999', 2: '$ 5,000 to $ 9,999',..."
9,BPXCHR,examination,60 sec HR (30 sec HR * 2),60 sec HR (30 sec HR * 2),Both males and females 0 YEARS -7 YEARS,{'.': 'Missing'}


## Exploring the dataset

Use the cells below to get a feel for the dataset.

You can plot a histogram of the selected field, either as raw counts or normalised to probabilities, and optionally split it by a second field.

In [26]:
# @title Visualising data code

def plot_histogram(field_desc, split_on_desc, normalise, split):
    """Plot a histogram (with a KDE overlay) of one column of ``master_df``.

    Fields are selected by their human-readable ``Label`` in
    ``master_meta_df``; each label is translated to the ``Variable`` code that
    names the corresponding column of ``master_df``.

    Args:
        field_desc: Label of the variable drawn on the x-axis.
        split_on_desc: Label of the variable used to colour (``hue``) the
            histogram. Only looked up when ``split`` is true.
        normalise: If true, bars show probabilities and every hue group is
            normalised on its own (``stat='probability'``,
            ``common_norm=False``); otherwise seaborn's defaults are used.
        split: If true, split the histogram by ``split_on_desc``.

    Raises:
        IndexError: If a requested label does not occur in ``master_meta_df``.
    """
    def variable_for(label):
        # First variable code whose Label matches. IndexError is the exception
        # the bare ``.values[0]`` lookup raised, now with a readable message.
        codes = master_meta_df.loc[master_meta_df['Label'] == label, 'Variable'].values
        if len(codes) == 0:
            raise IndexError(f"no variable with label {label!r} in master_meta_df")
        return codes[0]

    field = variable_for(field_desc)
    split_on = variable_for(split_on_desc) if split else None

    # Only the normalised view overrides seaborn's defaults.
    norm_kwargs = {'stat': 'probability', 'common_norm': False} if normalise else {}
    ax = sns.histplot(master_df, x=field, hue=split_on, kde=True, **norm_kwargs)

    # Show the labels the user picked instead of the raw variable codes.
    ax.set_xlabel(field_desc)
    ax.set_title(f"Distribution of {field_desc}")
    legend = ax.get_legend()
    if legend is not None:  # a legend is only present when a hue was drawn
        legend.set_title(split_on_desc)

    plt.show()

# Every variable label is offered in both dropdowns.
field_labels = master_meta_df['Label'].tolist()

# Default "Split by" is the label of RIAGENDR, looked up by variable code
# rather than taken from row 0: the row order of an outer merge depends on the
# pandas version (keys are sorted from pandas 2.2 on), so row 0 is not
# guaranteed to stay the same variable.
gender_labels = master_meta_df.loc[master_meta_df['Variable'] == 'RIAGENDR', 'Label']
default_split_label = gender_labels.iloc[0] if len(gender_labels) else field_labels[0]

# Default "Field to plot" keeps the original positional choice (21st row),
# clamped so a shorter metadata table does not raise.
# NOTE(review): which variable sits at position 20 depends on the row order of
# master_meta_df -- consider pinning this default by variable code as well.
default_field_label = field_labels[min(20, len(field_labels) - 1)]

hist_widgets = widgets.interact(
    plot_histogram,
    field_desc=widgets.Dropdown(options=field_labels, value=default_field_label, description='Field to plot:', disabled=False),
    split_on_desc=widgets.Dropdown(options=field_labels, value=default_split_label, description='Split by:', disabled=False),
    normalise=widgets.Checkbox(value=False),
    split=widgets.Checkbox(value=True)
)

interactive(children=(Dropdown(description='Field to plot:', index=20, options=('Gender', 'Age in years at scr…